From 9fa274133883d59ff33534106300cf7b7bc52e92 Mon Sep 17 00:00:00 2001 From: Juraj Oravec Date: Thu, 13 Oct 2022 21:16:53 +0200 Subject: [PATCH] Update Readability.jks to latest version from git Signed-off-by: Juraj Oravec --- readability/data/Readability.js | 937 +++++++++++++++++++++++--------- 1 file changed, 684 insertions(+), 253 deletions(-) diff --git a/readability/data/Readability.js b/readability/data/Readability.js index 45048d5..9117a59 100644 --- a/readability/data/Readability.js +++ b/readability/data/Readability.js @@ -36,6 +36,7 @@ function Readability(doc, options) { options = options || {}; this._doc = doc; + this._docJSDOMParser = this._doc.firstChild.__JSDOMParser__; this._articleTitle = null; this._articleByline = null; this._articleDir = null; @@ -48,37 +49,43 @@ function Readability(doc, options) { this._nbTopCandidates = options.nbTopCandidates || this.DEFAULT_N_TOP_CANDIDATES; this._charThreshold = options.charThreshold || this.DEFAULT_CHAR_THRESHOLD; this._classesToPreserve = this.CLASSES_TO_PRESERVE.concat(options.classesToPreserve || []); + this._keepClasses = !!options.keepClasses; + this._serializer = options.serializer || function(el) { + return el.innerHTML; + }; + this._disableJSONLD = !!options.disableJSONLD; // Start with all flags set this._flags = this.FLAG_STRIP_UNLIKELYS | - this.FLAG_WEIGHT_CLASSES | - this.FLAG_CLEAN_CONDITIONALLY; + this.FLAG_WEIGHT_CLASSES | + this.FLAG_CLEAN_CONDITIONALLY; - var logEl; // Control whether log messages are sent to the console if (this._debug) { - logEl = function(e) { - var rv = e.nodeName + " "; - if (e.nodeType == e.TEXT_NODE) { - return rv + '("' + e.textContent + '")'; + let logNode = function(node) { + if (node.nodeType == node.TEXT_NODE) { + return `${node.nodeName} ("${node.textContent}")`; } - var classDesc = e.className && ("." + e.className.replace(/ /g, ".")); - var elDesc = ""; - if (e.id) - elDesc = "(#" + e.id + classDesc + ")"; - else if (classDesc) - elDesc = "(" + classDesc + ")"; - return rv + elDesc; + let attrPairs = Array.from(node.attributes || [], function(attr) { + return `${attr.name}="${attr.value}"`; + }).join(" "); + return `<${node.localName} ${attrPairs}>`; }; this.log = function () { if (typeof dump !== "undefined") { var msg = Array.prototype.map.call(arguments, function(x) { - return (x && x.nodeName) ? logEl(x) : x; + return (x && x.nodeName) ? logNode(x) : x; }).join(" "); dump("Reader: (Readability) " + msg + "\n"); } else if (typeof console !== "undefined") { - var args = ["Reader: (Readability) "].concat(arguments); + let args = Array.from(arguments, arg => { + if (arg && arg.nodeType == this.ELEMENT_NODE) { + return logNode(arg); + } + return arg; + }); + args.unshift("Reader: (Readability)"); console.log.apply(console, args); } }; @@ -114,11 +121,11 @@ Readability.prototype = { REGEXPS: { // NOTE: These two regular expressions are duplicated in // Readability-readerable.js. Please keep both copies in sync. - unlikelyCandidates: /-ad-|ai2html|banner|breadcrumbs|combx|comment|community|cover-wrap|disqus|extra|foot|gdpr|header|legends|menu|related|remark|replies|rss|shoutbox|sidebar|skyscraper|social|sponsor|supplemental|ad-break|agegate|pagination|pager|popup|yom-remote/i, - okMaybeItsACandidate: /and|article|body|column|main|shadow/i, + unlikelyCandidates: /-ad-|ai2html|banner|breadcrumbs|combx|comment|community|cover-wrap|disqus|extra|footer|gdpr|header|legends|menu|related|remark|replies|rss|shoutbox|sidebar|skyscraper|social|sponsor|supplemental|ad-break|agegate|pagination|pager|popup|yom-remote/i, + okMaybeItsACandidate: /and|article|body|column|content|main|shadow/i, positive: /article|body|content|entry|hentry|h-entry|main|page|pagination|post|text|blog|story/i, - negative: /hidden|^hid$| hid$| hid |^hid |banner|combx|comment|com-|contact|foot|footer|footnote|gdpr|masthead|media|meta|outbrain|promo|related|scroll|share|shoutbox|sidebar|skyscraper|sponsor|shopping|tags|tool|widget/i, + negative: /-ad-|hidden|^hid$| hid$| hid |^hid |banner|combx|comment|com-|contact|foot|footer|footnote|gdpr|masthead|media|meta|outbrain|promo|related|scroll|share|shoutbox|sidebar|skyscraper|sponsor|shopping|tags|tool|widget/i, extraneous: /print|archive|comment|discuss|e[\-]?mail|share|reply|all|login|sign|single|utility/i, byline: /byline|author|dateline|writtenby|p-author/i, replaceFonts: /<(\/?)font[^>]*>/gi, @@ -127,11 +134,19 @@ Readability.prototype = { shareElements: /(\b|_)(share|sharedaddy)(\b|_)/i, nextLink: /(next|weiter|continue|>([^\|]|$)|»([^\|]|$))/i, prevLink: /(prev|earl|old|new|<|«)/i, + tokenize: /\W+/g, whitespace: /^\s*$/, hasContent: /\S$/, + hashUrl: /^#.+/, + srcsetUrl: /(\S+)(\s+[\d.]+[xw])?(\s*(?:,|$))/g, + b64DataUrl: /^data:\s*([^\s;,]+)\s*;\s*base64\s*,/i, + // See: https://schema.org/Article + jsonLdArticleTypes: /^Article|AdvertiserContentArticle|NewsArticle|AnalysisNewsArticle|AskPublicNewsArticle|BackgroundNewsArticle|OpinionNewsArticle|ReportageNewsArticle|ReviewNewsArticle|Report|SatiricalArticle|ScholarlyArticle|MedicalScholarlyArticle|SocialMediaPosting|BlogPosting|LiveBlogPosting|DiscussionForumPosting|TechArticle|APIReference$/ }, - DIV_TO_P_ELEMS: [ "A", "BLOCKQUOTE", "DL", "DIV", "IMG", "OL", "P", "PRE", "TABLE", "UL", "SELECT" ], + UNLIKELY_ROLES: [ "menu", "menubar", "complementary", "navigation", "alert", "alertdialog", "dialog" ], + + DIV_TO_P_ELEMS: new Set([ "BLOCKQUOTE", "DL", "DIV", "IMG", "OL", "P", "PRE", "TABLE", "UL" ]), ALTER_TO_DIV_EXCEPTIONS: ["DIV", "ARTICLE", "SECTION", "P"], @@ -142,29 +157,42 @@ Readability.prototype = { // The commented out elements qualify as phrasing content but tend to be // removed by readability when put into paragraphs, so we ignore them here. PHRASING_ELEMS: [ - // "CANVAS", "IFRAME", "SVG", "VIDEO", - "ABBR", "AUDIO", "B", "BDO", "BR", "BUTTON", "CITE", "CODE", "DATA", - "DATALIST", "DFN", "EM", "EMBED", "I", "IMG", "INPUT", "KBD", "LABEL", - "MARK", "MATH", "METER", "NOSCRIPT", "OBJECT", "OUTPUT", "PROGRESS", "Q", - "RUBY", "SAMP", "SCRIPT", "SELECT", "SMALL", "SPAN", "STRONG", "SUB", - "SUP", "TEXTAREA", "TIME", "VAR", "WBR" + // "CANVAS", "IFRAME", "SVG", "VIDEO", + "ABBR", "AUDIO", "B", "BDO", "BR", "BUTTON", "CITE", "CODE", "DATA", + "DATALIST", "DFN", "EM", "EMBED", "I", "IMG", "INPUT", "KBD", "LABEL", + "MARK", "MATH", "METER", "NOSCRIPT", "OBJECT", "OUTPUT", "PROGRESS", "Q", + "RUBY", "SAMP", "SCRIPT", "SELECT", "SMALL", "SPAN", "STRONG", "SUB", + "SUP", "TEXTAREA", "TIME", "VAR", "WBR" ], // These are the classes that readability sets itself. CLASSES_TO_PRESERVE: [ "page" ], + // These are the list of HTML entities that need to be escaped. + HTML_ESCAPE_MAP: { + "lt": "<", + "gt": ">", + "amp": "&", + "quot": '"', + "apos": "'", + }, + /** * Run any post-process modifications to article content as necessary. * * @param Element * @return void - **/ + **/ _postProcessContent: function(articleContent) { // Readability cannot open relative uris so we convert them to absolute uris. this._fixRelativeUris(articleContent); - // Remove classes. - this._cleanClasses(articleContent); + this._simplifyNestedElements(articleContent); + + if (!this._keepClasses) { + // Remove classes. + this._cleanClasses(articleContent); + } }, /** @@ -178,6 +206,10 @@ Readability.prototype = { * @return void */ _removeNodes: function(nodeList, filterFn) { + // Avoid ever operating on live node lists. + if (this._docJSDOMParser && nodeList._isLiveNodeList) { + throw new Error("Do not pass live node lists to _removeNodes"); + } for (var i = nodeList.length - 1; i >= 0; i--) { var node = nodeList[i]; var parentNode = node.parentNode; @@ -197,8 +229,11 @@ Readability.prototype = { * @return void */ _replaceNodeTags: function(nodeList, newTagName) { - for (var i = nodeList.length - 1; i >= 0; i--) { - var node = nodeList[i]; + // Avoid ever operating on live node lists. + if (this._docJSDOMParser && nodeList._isLiveNodeList) { + throw new Error("Do not pass live node lists to _replaceNodeTags"); + } + for (const node of nodeList) { this._setNodeTag(node, newTagName); } }, @@ -218,6 +253,21 @@ Readability.prototype = { Array.prototype.forEach.call(nodeList, fn, this); }, + /** + * Iterate over a NodeList, and return the first node that passes + * the supplied test function + * + * For convenience, the current object context is applied to the provided + * test function. + * + * @param NodeList nodeList The NodeList. + * @param Function fn The test function. + * @return void + */ + _findNode: function(nodeList, fn) { + return Array.prototype.find.call(nodeList, fn, this); + }, + /** * Iterate over a NodeList, return true if any of the provided iterate * function calls returns true, false otherwise. @@ -284,11 +334,11 @@ Readability.prototype = { _cleanClasses: function(node) { var classesToPreserve = this._classesToPreserve; var className = (node.getAttribute("class") || "") - .split(/\s+/) - .filter(function(cls) { - return classesToPreserve.indexOf(cls) != -1; - }) - .join(" "); + .split(/\s+/) + .filter(function(cls) { + return classesToPreserve.indexOf(cls) != -1; + }) + .join(" "); if (className) { node.setAttribute("class", className); @@ -316,6 +366,7 @@ Readability.prototype = { if (baseURI == documentURI && uri.charAt(0) == "#") { return uri; } + // Otherwise, resolve against base URI: try { return new URL(uri, baseURI).href; @@ -329,30 +380,81 @@ Readability.prototype = { this._forEachNode(links, function(link) { var href = link.getAttribute("href"); if (href) { - // Replace links with javascript: URIs with text content, since + // Remove links with javascript: URIs, since // they won't work after scripts have been removed from the page. if (href.indexOf("javascript:") === 0) { - var text = this._doc.createTextNode(link.textContent); - link.parentNode.replaceChild(text, link); + // if the link only contains simple text content, it can be converted to a text node + if (link.childNodes.length === 1 && link.childNodes[0].nodeType === this.TEXT_NODE) { + var text = this._doc.createTextNode(link.textContent); + link.parentNode.replaceChild(text, link); + } else { + // if the link has multiple children, they should all be preserved + var container = this._doc.createElement("span"); + while (link.firstChild) { + container.appendChild(link.firstChild); + } + link.parentNode.replaceChild(container, link); + } } else { link.setAttribute("href", toAbsoluteURI(href)); } } }); - var imgs = this._getAllNodesWithTag(articleContent, ["img"]); - this._forEachNode(imgs, function(img) { - var src = img.getAttribute("src"); + var medias = this._getAllNodesWithTag(articleContent, [ + "img", "picture", "figure", "video", "audio", "source" + ]); + + this._forEachNode(medias, function(media) { + var src = media.getAttribute("src"); + var poster = media.getAttribute("poster"); + var srcset = media.getAttribute("srcset"); + if (src) { - img.setAttribute("src", toAbsoluteURI(src)); + media.setAttribute("src", toAbsoluteURI(src)); + } + + if (poster) { + media.setAttribute("poster", toAbsoluteURI(poster)); + } + + if (srcset) { + var newSrcset = srcset.replace(this.REGEXPS.srcsetUrl, function(_, p1, p2, p3) { + return toAbsoluteURI(p1) + (p2 || "") + p3; + }); + + media.setAttribute("srcset", newSrcset); } }); }, + _simplifyNestedElements: function(articleContent) { + var node = articleContent; + + while (node) { + if (node.parentNode && ["DIV", "SECTION"].includes(node.tagName) && !(node.id && node.id.startsWith("readability"))) { + if (this._isElementWithoutContent(node)) { + node = this._removeAndGetNext(node); + continue; + } else if (this._hasSingleTagInsideElement(node, "DIV") || this._hasSingleTagInsideElement(node, "SECTION")) { + var child = node.children[0]; + for (var i = 0; i < node.attributes.length; i++) { + child.setAttribute(node.attributes[i].name, node.attributes[i].value); + } + node.parentNode.replaceChild(child, node); + node = child; + continue; + } + } + + node = this._getNextNode(node); + } + }, + /** * Get the article title as an H1. * - * @return void + * @return string **/ _getArticleTitle: function() { var doc = this._doc; @@ -386,7 +488,7 @@ Readability.prototype = { // could assume it's the full title. var headings = this._concatNodeLists( doc.getElementsByTagName("h1"), - doc.getElementsByTagName("h2") + doc.getElementsByTagName("h2") ); var trimmedTitle = curTitle.trim(); var match = this._someNode(headings, function(heading) { @@ -420,12 +522,12 @@ Readability.prototype = { // the original title. var curTitleWordCount = wordCount(curTitle); if (curTitleWordCount <= 4 && - (!titleHadHierarchicalSeparators || - curTitleWordCount != wordCount(origTitle.replace(/[\|\-\\\/>»]+/g, "")) - 1)) { + (!titleHadHierarchicalSeparators || + curTitleWordCount != wordCount(origTitle.replace(/[\|\-\\\/>»]+/g, "")) - 1)) { curTitle = origTitle; - } + } - return curTitle; + return curTitle; }, /** @@ -438,28 +540,28 @@ Readability.prototype = { var doc = this._doc; // Remove all style tags in head - this._removeNodes(doc.getElementsByTagName("style")); + this._removeNodes(this._getAllNodesWithTag(doc, ["style"])); if (doc.body) { this._replaceBrs(doc.body); } - this._replaceNodeTags(doc.getElementsByTagName("font"), "SPAN"); + this._replaceNodeTags(this._getAllNodesWithTag(doc, ["font"]), "SPAN"); }, /** - * Finds the next element, starting from the given node, and ignoring + * Finds the next node, starting from the given node, and ignoring * whitespace in between. If the given node is an element, the same node is * returned. */ - _nextElement: function (node) { + _nextNode: function (node) { var next = node; while (next - && (next.nodeType != this.ELEMENT_NODE) - && this.REGEXPS.whitespace.test(next.textContent)) { + && (next.nodeType != this.ELEMENT_NODE) + && this.REGEXPS.whitespace.test(next.textContent)) { next = next.nextSibling; - } - return next; + } + return next; }, /** @@ -477,10 +579,10 @@ Readability.prototype = { //

block. var replaced = false; - // If we find a
chain, remove the
s until we hit another element + // If we find a
chain, remove the
s until we hit another node // or non-whitespace. This leaves behind the first
in the chain // (which will be replaced with a

later). - while ((next = this._nextElement(next)) && (next.tagName == "BR")) { + while ((next = this._nextNode(next)) && (next.tagName == "BR")) { replaced = true; var brSibling = next.nextSibling; next.parentNode.removeChild(next); @@ -498,7 +600,7 @@ Readability.prototype = { while (next) { // If we've hit another

, we're done adding children to this

. if (next.tagName == "BR") { - var nextElem = this._nextElement(next.nextSibling); + var nextElem = this._nextNode(next.nextSibling); if (nextElem && nextElem.tagName == "BR") break; } @@ -524,7 +626,7 @@ Readability.prototype = { _setNodeTag: function (node, tag) { this.log("_setNodeTag", node, tag); - if (node.__JSDOMParser__) { + if (this._docJSDOMParser) { node.localName = tag.toLowerCase(); node.tagName = tag.toUpperCase(); return node; @@ -575,7 +677,6 @@ Readability.prototype = { this._cleanConditionally(articleContent, "fieldset"); this._clean(articleContent, "object"); this._clean(articleContent, "embed"); - this._clean(articleContent, "h1"); this._clean(articleContent, "footer"); this._clean(articleContent, "link"); this._clean(articleContent, "aside"); @@ -591,25 +692,6 @@ Readability.prototype = { }); }); - // If there is only one h2 and its text content substantially equals article title, - // they are probably using it as a header and not a subheader, - // so remove it since we already extract the title separately. - var h2 = articleContent.getElementsByTagName("h2"); - if (h2.length === 1) { - var lengthSimilarRate = (h2[0].textContent.length - this._articleTitle.length) / this._articleTitle.length; - if (Math.abs(lengthSimilarRate) < 0.5) { - var titlesMatch = false; - if (lengthSimilarRate > 0) { - titlesMatch = h2[0].textContent.includes(this._articleTitle); - } else { - titlesMatch = this._articleTitle.includes(h2[0].textContent); - } - if (titlesMatch) { - this._clean(articleContent, "h2"); - } - } - } - this._clean(articleContent, "iframe"); this._clean(articleContent, "input"); this._clean(articleContent, "textarea"); @@ -623,8 +705,11 @@ Readability.prototype = { this._cleanConditionally(articleContent, "ul"); this._cleanConditionally(articleContent, "div"); + // replace H1 with H2 as H1 should be only title that is displayed separately + this._replaceNodeTags(this._getAllNodesWithTag(articleContent, ["h1"]), "h2"); + // Remove extra paragraphs - this._removeNodes(articleContent.getElementsByTagName("p"), function (paragraph) { + this._removeNodes(this._getAllNodesWithTag(articleContent, ["p"]), function (paragraph) { var imgCount = paragraph.getElementsByTagName("img").length; var embedCount = paragraph.getElementsByTagName("embed").length; var objectCount = paragraph.getElementsByTagName("object").length; @@ -636,7 +721,7 @@ Readability.prototype = { }); this._forEachNode(this._getAllNodesWithTag(articleContent, ["br"]), function(br) { - var next = this._nextElement(br.nextSibling); + var next = this._nextNode(br.nextSibling); if (next && next.tagName == "P") br.parentNode.removeChild(br); }); @@ -661,7 +746,7 @@ Readability.prototype = { * * @param Element * @return void - **/ + **/ _initializeNode: function(node) { node.readability = {"contentScore": 0}; @@ -732,6 +817,21 @@ Readability.prototype = { return node && node.nextElementSibling; }, + // compares second text to first one + // 1 = same text, 0 = completely different text + // works the way that it splits both texts into words and then finds words that are unique in second text + // the result is given by the lower length of unique parts + _textSimilarity: function(textA, textB) { + var tokensA = textA.toLowerCase().split(this.REGEXPS.tokenize).filter(Boolean); + var tokensB = textB.toLowerCase().split(this.REGEXPS.tokenize).filter(Boolean); + if (!tokensA.length || !tokensB.length) { + return 0; + } + var uniqTokensB = tokensB.filter(token => !tokensA.includes(token)); + var distanceB = uniqTokensB.join(" ").length / tokensB.join(" ").length; + return 1 - distanceB; + }, + _checkByline: function(node, matchString) { if (this._articleByline) { return false; @@ -768,11 +868,11 @@ Readability.prototype = { * * @param page a document to run upon. Needs to be a full document, complete with body. * @return Element - **/ + **/ _grabArticle: function (page) { this.log("**** grabArticle ****"); var doc = this._doc; - var isPaging = (page !== null ? true: false); + var isPaging = page !== null; page = page ? page : this._doc.body; // We can't grab an article if we don't have a page! @@ -784,6 +884,7 @@ Readability.prototype = { var pageCacheHtml = page.innerHTML; while (true) { + this.log("Starting grabArticle loop"); var stripUnlikelyCandidates = this._flagIsActive(this.FLAG_STRIP_UNLIKELYS); // First, node prepping. Trash nodes that look cruddy (like ones with the @@ -792,7 +893,14 @@ Readability.prototype = { var elementsToScore = []; var node = this._doc.documentElement; + let shouldRemoveTitleHeader = true; + while (node) { + + if (node.tagName === "HTML") { + this._articleLang = node.getAttribute("lang"); + } + var matchString = node.className + " " + node.id; if (!this._isProbablyVisible(node)) { @@ -807,71 +915,85 @@ Readability.prototype = { continue; } - // Remove unlikely candidates - if (stripUnlikelyCandidates) { - if (this.REGEXPS.unlikelyCandidates.test(matchString) && - !this.REGEXPS.okMaybeItsACandidate.test(matchString) && - !this._hasAncestorTag(node, "table") && - node.tagName !== "BODY" && - node.tagName !== "A") { - this.log("Removing unlikely candidate - " + matchString); - node = this._removeAndGetNext(node); - continue; - } - } - - // Remove DIV, SECTION, and HEADER nodes without any content(e.g. text, image, video, or iframe). - if ((node.tagName === "DIV" || node.tagName === "SECTION" || node.tagName === "HEADER" || - node.tagName === "H1" || node.tagName === "H2" || node.tagName === "H3" || - node.tagName === "H4" || node.tagName === "H5" || node.tagName === "H6") && - this._isElementWithoutContent(node)) { + if (shouldRemoveTitleHeader && this._headerDuplicatesTitle(node)) { + this.log("Removing header: ", node.textContent.trim(), this._articleTitle.trim()); + shouldRemoveTitleHeader = false; node = this._removeAndGetNext(node); continue; } - if (this.DEFAULT_TAGS_TO_SCORE.indexOf(node.tagName) !== -1) { - elementsToScore.push(node); - } - - // Turn all divs that don't have children block level elements into p's - if (node.tagName === "DIV") { - // Put phrasing content into paragraphs. - var p = null; - var childNode = node.firstChild; - while (childNode) { - var nextSibling = childNode.nextSibling; - if (this._isPhrasingContent(childNode)) { - if (p !== null) { - p.appendChild(childNode); - } else if (!this._isWhitespace(childNode)) { - p = doc.createElement("p"); - node.replaceChild(p, childNode); - p.appendChild(childNode); - } - } else if (p !== null) { - while (p.lastChild && this._isWhitespace(p.lastChild)) { - p.removeChild(p.lastChild); - } - p = null; + // Remove unlikely candidates + if (stripUnlikelyCandidates) { + if (this.REGEXPS.unlikelyCandidates.test(matchString) && + !this.REGEXPS.okMaybeItsACandidate.test(matchString) && + !this._hasAncestorTag(node, "table") && + !this._hasAncestorTag(node, "code") && + node.tagName !== "BODY" && + node.tagName !== "A") { + this.log("Removing unlikely candidate - " + matchString); + node = this._removeAndGetNext(node); + continue; } - childNode = nextSibling; + + if (this.UNLIKELY_ROLES.includes(node.getAttribute("role"))) { + this.log("Removing content with role " + node.getAttribute("role") + " - " + matchString); + node = this._removeAndGetNext(node); + continue; + } + } + + // Remove DIV, SECTION, and HEADER nodes without any content(e.g. text, image, video, or iframe). + if ((node.tagName === "DIV" || node.tagName === "SECTION" || node.tagName === "HEADER" || + node.tagName === "H1" || node.tagName === "H2" || node.tagName === "H3" || + node.tagName === "H4" || node.tagName === "H5" || node.tagName === "H6") && + this._isElementWithoutContent(node)) { + node = this._removeAndGetNext(node); + continue; } - // Sites like http://mobile.slate.com encloses each paragraph with a DIV - // element. DIVs with only a P element inside and no text content can be - // safely converted into plain P elements to avoid confusing the scoring - // algorithm with DIVs with are, in practice, paragraphs. - if (this._hasSingleTagInsideElement(node, "P") && this._getLinkDensity(node) < 0.25) { - var newNode = node.children[0]; - node.parentNode.replaceChild(newNode, node); - node = newNode; - elementsToScore.push(node); - } else if (!this._hasChildBlockElement(node)) { - node = this._setNodeTag(node, "P"); + if (this.DEFAULT_TAGS_TO_SCORE.indexOf(node.tagName) !== -1) { elementsToScore.push(node); } - } - node = this._getNextNode(node); + + // Turn all divs that don't have children block level elements into p's + if (node.tagName === "DIV") { + // Put phrasing content into paragraphs. + var p = null; + var childNode = node.firstChild; + while (childNode) { + var nextSibling = childNode.nextSibling; + if (this._isPhrasingContent(childNode)) { + if (p !== null) { + p.appendChild(childNode); + } else if (!this._isWhitespace(childNode)) { + p = doc.createElement("p"); + node.replaceChild(p, childNode); + p.appendChild(childNode); + } + } else if (p !== null) { + while (p.lastChild && this._isWhitespace(p.lastChild)) { + p.removeChild(p.lastChild); + } + p = null; + } + childNode = nextSibling; + } + + // Sites like http://mobile.slate.com encloses each paragraph with a DIV + // element. DIVs with only a P element inside and no text content can be + // safely converted into plain P elements to avoid confusing the scoring + // algorithm with DIVs with are, in practice, paragraphs. + if (this._hasSingleTagInsideElement(node, "P") && this._getLinkDensity(node) < 0.25) { + var newNode = node.children[0]; + node.parentNode.replaceChild(newNode, node); + node = newNode; + elementsToScore.push(node); + } else if (!this._hasChildBlockElement(node)) { + node = this._setNodeTag(node, "P"); + elementsToScore.push(node); + } + } + node = this._getNextNode(node); } /** @@ -879,7 +1001,7 @@ Readability.prototype = { * Then add their score to their parent node. * * A score is determined by things like number of commas, class names, etc. Maybe eventually link density. - **/ + **/ var candidates = []; this._forEachNode(elementsToScore, function(elementToScore) { if (!elementToScore.parentNode || typeof(elementToScore.parentNode.tagName) === "undefined") @@ -891,7 +1013,7 @@ Readability.prototype = { return; // Exclude nodes with no ancestor. - var ancestors = this._getNodeAncestors(elementToScore, 3); + var ancestors = this._getNodeAncestors(elementToScore, 5); if (ancestors.length === 0) return; @@ -968,10 +1090,9 @@ Readability.prototype = { neededToCreateTopCandidate = true; // Move everything (not just elements, also text nodes etc.) into the container // so we even include text directly in the body: - var kids = page.childNodes; - while (kids.length) { - this.log("Moving child out:", kids[0]); - topCandidate.appendChild(kids[0]); + while (page.firstChild) { + this.log("Moving child out:", page.firstChild); + topCandidate.appendChild(page.firstChild); } page.appendChild(topCandidate); @@ -1074,20 +1195,20 @@ Readability.prototype = { contentBonus += topCandidate.readability.contentScore * 0.2; if (sibling.readability && - ((sibling.readability.contentScore + contentBonus) >= siblingScoreThreshold)) { + ((sibling.readability.contentScore + contentBonus) >= siblingScoreThreshold)) { append = true; - } else if (sibling.nodeName === "P") { - var linkDensity = this._getLinkDensity(sibling); - var nodeContent = this._getInnerText(sibling); - var nodeLength = nodeContent.length; + } else if (sibling.nodeName === "P") { + var linkDensity = this._getLinkDensity(sibling); + var nodeContent = this._getInnerText(sibling); + var nodeLength = nodeContent.length; - if (nodeLength > 80 && linkDensity < 0.25) { - append = true; - } else if (nodeLength < 80 && nodeLength > 0 && linkDensity === 0 && - nodeContent.search(/\.( |$)/) !== -1) { - append = true; + if (nodeLength > 80 && linkDensity < 0.25) { + append = true; + } else if (nodeLength < 80 && nodeLength > 0 && linkDensity === 0 && + nodeContent.search(/\.( |$)/) !== -1) { + append = true; + } } - } } if (append) { @@ -1102,6 +1223,9 @@ Readability.prototype = { } articleContent.appendChild(sibling); + // Fetch children again to make it compatible + // with DOM parsers without live collection support. + siblings = parentOfTopCandidate.children; // siblings is a reference to the children array, and // sibling is removed from the array when we call appendChild(). // As a result, we must revisit this index since the nodes @@ -1129,9 +1253,8 @@ Readability.prototype = { var div = doc.createElement("DIV"); div.id = "readability-page-1"; div.className = "page"; - var children = articleContent.childNodes; - while (children.length) { - div.appendChild(children[0]); + while (articleContent.firstChild) { + div.appendChild(articleContent.firstChild); } articleContent.appendChild(div); } @@ -1211,12 +1334,127 @@ Readability.prototype = { return false; }, + /** + * Converts some of the common HTML entities in string to their corresponding characters. + * + * @param str {string} - a string to unescape. + * @return string without HTML entity. + */ + _unescapeHtmlEntities: function(str) { + if (!str) { + return str; + } + + var htmlEscapeMap = this.HTML_ESCAPE_MAP; + return str.replace(/&(quot|amp|apos|lt|gt);/g, function(_, tag) { + return htmlEscapeMap[tag]; + }).replace(/&#(?:x([0-9a-z]{1,4})|([0-9]{1,4}));/gi, function(_, hex, numStr) { + var num = parseInt(hex || numStr, hex ? 16 : 10); + return String.fromCharCode(num); + }); + }, + + /** + * Try to extract metadata from JSON-LD object. + * For now, only Schema.org objects of type Article or its subtypes are supported. + * @return Object with any metadata that could be extracted (possibly none) + */ + _getJSONLD: function (doc) { + var scripts = this._getAllNodesWithTag(doc, ["script"]); + + var metadata; + + this._forEachNode(scripts, function(jsonLdElement) { + if (!metadata && jsonLdElement.getAttribute("type") === "application/ld+json") { + try { + // Strip CDATA markers if present + var content = jsonLdElement.textContent.replace(/^\s*\s*$/g, ""); + var parsed = JSON.parse(content); + if ( + !parsed["@context"] || + !parsed["@context"].match(/^https?\:\/\/schema\.org$/) + ) { + return; + } + + if (!parsed["@type"] && Array.isArray(parsed["@graph"])) { + parsed = parsed["@graph"].find(function(it) { + return (it["@type"] || "").match( + this.REGEXPS.jsonLdArticleTypes + ); + }); + } + + if ( + !parsed || + !parsed["@type"] || + !parsed["@type"].match(this.REGEXPS.jsonLdArticleTypes) + ) { + return; + } + + metadata = {}; + + if (typeof parsed.name === "string" && typeof parsed.headline === "string" && parsed.name !== parsed.headline) { + // we have both name and headline element in the JSON-LD. They should both be the same but some websites like aktualne.cz + // put their own name into "name" and the article title to "headline" which confuses Readability. So we try to check if either + // "name" or "headline" closely matches the html title, and if so, use that one. If not, then we use "name" by default. + + var title = this._getArticleTitle(); + var nameMatches = this._textSimilarity(parsed.name, title) > 0.75; + var headlineMatches = this._textSimilarity(parsed.headline, title) > 0.75; + + if (headlineMatches && !nameMatches) { + metadata.title = parsed.headline; + } else { + metadata.title = parsed.name; + } + } else if (typeof parsed.name === "string") { + metadata.title = parsed.name.trim(); + } else if (typeof parsed.headline === "string") { + metadata.title = parsed.headline.trim(); + } + if (parsed.author) { + if (typeof parsed.author.name === "string") { + metadata.byline = parsed.author.name.trim(); + } else if (Array.isArray(parsed.author) && parsed.author[0] && typeof parsed.author[0].name === "string") { + metadata.byline = parsed.author + .filter(function(author) { + return author && typeof author.name === "string"; + }) + .map(function(author) { + return author.name.trim(); + }) + .join(", "); + } + } + if (typeof parsed.description === "string") { + metadata.excerpt = parsed.description.trim(); + } + if ( + parsed.publisher && + typeof parsed.publisher.name === "string" + ) { + metadata.siteName = parsed.publisher.name.trim(); + } + return; + } catch (err) { + this.log(err.message); + } + } + }); + return metadata ? metadata : {}; + }, + /** * Attempts to get excerpt and byline metadata for the article. * + * @param {Object} jsonld — object containing any metadata that + * could be extracted from JSON-LD object. + * * @return Object with optional "excerpt" and "byline" properties */ - _getArticleMetadata: function() { + _getArticleMetadata: function(jsonld) { var metadata = {}; var values = {}; var metaElements = this._doc.getElementsByTagName("meta"); @@ -1241,13 +1479,11 @@ Readability.prototype = { if (elementProperty) { matches = elementProperty.match(propertyPattern); if (matches) { - for (var i = matches.length - 1; i >= 0; i--) { - // Convert to lowercase, and remove any whitespace - // so we can match below. - name = matches[i].toLowerCase().replace(/\s/g, ""); - // multiple authors - values[name] = content.trim(); - } + // Convert to lowercase, and remove any whitespace + // so we can match below. + name = matches[0].toLowerCase().replace(/\s/g, ""); + // multiple authors + values[name] = content.trim(); } } if (!matches && elementName && namePattern.test(elementName)) { @@ -1262,50 +1498,156 @@ Readability.prototype = { }); // get title - metadata.title = values["dc:title"] || - values["dcterm:title"] || - values["og:title"] || - values["weibo:article:title"] || - values["weibo:webpage:title"] || - values["title"] || - values["twitter:title"]; + metadata.title = jsonld.title || + values["dc:title"] || + values["dcterm:title"] || + values["og:title"] || + values["weibo:article:title"] || + values["weibo:webpage:title"] || + values["title"] || + values["twitter:title"]; if (!metadata.title) { metadata.title = this._getArticleTitle(); } // get author - metadata.byline = values["dc:creator"] || - values["dcterm:creator"] || - values["author"]; + metadata.byline = jsonld.byline || + values["dc:creator"] || + values["dcterm:creator"] || + values["author"]; // get description - metadata.excerpt = values["dc:description"] || - values["dcterm:description"] || - values["og:description"] || - values["weibo:article:description"] || - values["weibo:webpage:description"] || - values["description"] || - values["twitter:description"]; + metadata.excerpt = jsonld.excerpt || + values["dc:description"] || + values["dcterm:description"] || + values["og:description"] || + values["weibo:article:description"] || + values["weibo:webpage:description"] || + values["description"] || + values["twitter:description"]; // get site name - metadata.siteName = values["og:site_name"]; + metadata.siteName = jsonld.siteName || + values["og:site_name"]; + + // in many sites the meta value is escaped with HTML entities, + // so here we need to unescape it + metadata.title = this._unescapeHtmlEntities(metadata.title); + metadata.byline = this._unescapeHtmlEntities(metadata.byline); + metadata.excerpt = this._unescapeHtmlEntities(metadata.excerpt); + metadata.siteName = this._unescapeHtmlEntities(metadata.siteName); return metadata; }, + /** + * Check if node is image, or if node contains exactly only one image + * whether as a direct child or as its descendants. + * + * @param Element + **/ + _isSingleImage: function(node) { + if (node.tagName === "IMG") { + return true; + } + + if (node.children.length !== 1 || node.textContent.trim() !== "") { + return false; + } + + return this._isSingleImage(node.children[0]); + }, + + /** + * Find all

that does not contain an image or picture, create one and place it inside the figure - //see the nytimes-3 testcase for an example - var img = this._doc.createElement("img"); - img.setAttribute(copyTo, attr.value); - elem.appendChild(img); - } + } + + // Here we assume if image is less than 100 bytes (or 133B after encoded to base64) + // it will be too small, therefore it might be placeholder image. + if (srcCouldBeRemoved) { + var b64starts = elem.src.search(/base64\s*/i) + 7; + var b64length = elem.src.length - b64starts; + if (b64length < 133) { + elem.removeAttribute("src"); + } + } + } + + // also check for "null" to work around https://github.com/jsdom/jsdom/issues/2580 + if ((elem.src || (elem.srcset && elem.srcset != "null")) && elem.className.toLowerCase().indexOf("lazy") === -1) { + return; + } + + for (var j = 0; j < elem.attributes.length; j++) { + attr = elem.attributes[j]; + if (attr.name === "src" || attr.name === "srcset" || attr.name === "alt") { + continue; + } + var copyTo = null; + if (/\.(jpg|jpeg|png|webp)\s+\d/.test(attr.value)) { + copyTo = "srcset"; + } else if (/^\s*\S+\.(jpg|jpeg|png|webp)\S*\s*$/.test(attr.value)) { + copyTo = "src"; + } + if (copyTo) { + //if this is an img or picture, set the attribute directly + if (elem.tagName === "IMG" || elem.tagName === "PICTURE") { + elem.setAttribute(copyTo, attr.value); + } else if (elem.tagName === "FIGURE" && !this._getAllNodesWithTag(elem, ["img", "picture"]).length) { + //if the item is a
that does not contain an image or picture, create one and place it inside the figure + //see the nytimes-3 testcase for an example + var img = this._doc.createElement("img"); + img.setAttribute(copyTo, attr.value); + elem.appendChild(img); } } } }); }, + _getTextDensity: function(e, tags) { + var textLength = this._getInnerText(e, true).length; + if (textLength === 0) { + return 0; + } + var childrenLength = 0; + var children = this._getAllNodesWithTag(e, tags); + this._forEachNode(children, (child) => childrenLength += this._getInnerText(child, true).length); + return childrenLength / textLength; + }, + /** * Clean an element of all tags of type "tag" if they look fishy. * "Fishy" is an algorithm based on content length, classnames, link density, number of images & embeds, etc. @@ -1662,19 +2054,25 @@ Readability.prototype = { if (!this._flagIsActive(this.FLAG_CLEAN_CONDITIONALLY)) return; - var isList = tag === "ul" || tag === "ol"; - // Gather counts for other typical elements embedded within. // Traverse backwards so we can remove nodes at the same time // without effecting the traversal. // // TODO: Consider taking into account original contentScore here. - this._removeNodes(e.getElementsByTagName(tag), function(node) { + this._removeNodes(this._getAllNodesWithTag(e, [tag]), function(node) { // First check if this node IS data table, in which case don't remove it. var isDataTable = function(t) { return t._readabilityDataTable; }; + var isList = tag === "ul" || tag === "ol"; + if (!isList) { + var listLength = 0; + var listNodes = this._getAllNodesWithTag(node, ["ul", "ol"]); + this._forEachNode(listNodes, (list) => listLength += this._getInnerText(list).length); + isList = listLength / this._getInnerText(node).length > 0.9; + } + if (tag === "table" && isDataTable(node)) { return false; } @@ -1684,11 +2082,16 @@ Readability.prototype = { return false; } + if (this._hasAncestorTag(node, "code")) { + return false; + } + var weight = this._getClassWeight(node); - var contentScore = 0; this.log("Cleaning Conditionally", node); + var contentScore = 0; + if (weight + contentScore < 0) { return true; } @@ -1701,12 +2104,10 @@ Readability.prototype = { var img = node.getElementsByTagName("img").length; var li = node.getElementsByTagName("li").length - 100; var input = node.getElementsByTagName("input").length; + var headingDensity = this._getTextDensity(node, ["h1", "h2", "h3", "h4", "h5", "h6"]); var embedCount = 0; - var embeds = this._concatNodeLists( - node.getElementsByTagName("object"), - node.getElementsByTagName("embed"), - node.getElementsByTagName("iframe")); + var embeds = this._getAllNodesWithTag(node, ["object", "embed", "iframe"]); for (var i = 0; i < embeds.length; i++) { // If this embed has attribute that matches video regex, don't delete it. @@ -1728,13 +2129,13 @@ Readability.prototype = { var contentLength = this._getInnerText(node).length; var haveToRemove = - (img > 1 && p / img < 0.5 && !this._hasAncestorTag(node, "figure")) || - (!isList && li > p) || - (input > Math.floor(p/3)) || - (!isList && contentLength < 25 && (img === 0 || img > 2) && !this._hasAncestorTag(node, "figure")) || - (!isList && weight < 25 && linkDensity > 0.2) || - (weight >= 25 && linkDensity > 0.5) || - ((embedCount === 1 && contentLength < 75) || embedCount > 1); + (img > 1 && p / img < 0.5 && !this._hasAncestorTag(node, "figure")) || + (!isList && li > p) || + (input > Math.floor(p/3)) || + (!isList && headingDensity < 0.9 && contentLength < 25 && (img === 0 || img > 2) && !this._hasAncestorTag(node, "figure")) || + (!isList && weight < 25 && linkDensity > 0.2) || + (weight >= 25 && linkDensity > 0.5) || + ((embedCount === 1 && contentLength < 75) || embedCount > 1); return haveToRemove; } return false; @@ -1761,17 +2162,36 @@ Readability.prototype = { }, /** - * Clean out spurious headers from an Element. Checks things like classnames and link density. + * Clean out spurious headers from an Element. * * @param Element * @return void - **/ + **/ _cleanHeaders: function(e) { - for (var headerIndex = 1; headerIndex < 3; headerIndex += 1) { - this._removeNodes(e.getElementsByTagName("h" + headerIndex), function (header) { - return this._getClassWeight(header) < 0; - }); + let headingNodes = this._getAllNodesWithTag(e, ["h1", "h2"]); + this._removeNodes(headingNodes, function(node) { + let shouldRemove = this._getClassWeight(node) < 0; + if (shouldRemove) { + this.log("Removing header with low class weight:", node); + } + return shouldRemove; + }); + }, + + /** + * Check if this node is an H1 or H2 element whose content is mostly + * the same as the article title. + * + * @param Element the node to check. + * @return boolean indicating whether this is a title-like header. + */ + _headerDuplicatesTitle: function(node) { + if (node.tagName != "H1" && node.tagName != "H2") { + return false; } + var heading = this._getInnerText(node, false); + this.log("Evaluating similarity of header:", heading, this._articleTitle); + return this._textSimilarity(this._articleTitle, heading) > 0.75; }, _flagIsActive: function(flag) { @@ -1783,7 +2203,11 @@ Readability.prototype = { }, _isProbablyVisible: function(node) { - return (!node.style || node.style.display != "none") && !node.hasAttribute("hidden"); + // Have to null-check node.style and node.className.indexOf to deal with SVG and MathML nodes. + return (!node.style || node.style.display != "none") + && !node.hasAttribute("hidden") + //check for "fallback-image" so that wikimedia math images are displayed + && (!node.hasAttribute("aria-hidden") || node.getAttribute("aria-hidden") != "true" || (node.className && node.className.indexOf && node.className.indexOf("fallback-image") !== -1)); }, /** @@ -1807,12 +2231,18 @@ Readability.prototype = { } } + // Unwrap image from noscript + this._unwrapNoscriptImages(this._doc); + + // Extract JSON-LD metadata before removing scripts + var jsonLd = this._disableJSONLD ? {} : this._getJSONLD(this._doc); + // Remove script tags from the document. this._removeScripts(this._doc); this._prepDocument(); - var metadata = this._getArticleMetadata(); + var metadata = this._getArticleMetadata(jsonLd); this._articleTitle = metadata.title; var articleContent = this._grabArticle(); @@ -1838,7 +2268,8 @@ Readability.prototype = { title: this._articleTitle, byline: metadata.byline || this._articleByline, dir: this._articleDir, - content: articleContent.innerHTML, + lang: this._articleLang, + content: this._serializer(articleContent), textContent: textContent, length: textContent.length, excerpt: metadata.excerpt,