This is page 25 of 45. Use http://codebase.md/dicklesworthstone/llm_gateway_mcp_server?lines=true&page={x} to view the full context.
# Directory Structure
```
├── .cursorignore
├── .env.example
├── .envrc
├── .gitignore
├── additional_features.md
├── check_api_keys.py
├── completion_support.py
├── comprehensive_test.py
├── docker-compose.yml
├── Dockerfile
├── empirically_measured_model_speeds.json
├── error_handling.py
├── example_structured_tool.py
├── examples
│ ├── __init__.py
│ ├── advanced_agent_flows_using_unified_memory_system_demo.py
│ ├── advanced_extraction_demo.py
│ ├── advanced_unified_memory_system_demo.py
│ ├── advanced_vector_search_demo.py
│ ├── analytics_reporting_demo.py
│ ├── audio_transcription_demo.py
│ ├── basic_completion_demo.py
│ ├── cache_demo.py
│ ├── claude_integration_demo.py
│ ├── compare_synthesize_demo.py
│ ├── cost_optimization.py
│ ├── data
│ │ ├── sample_event.txt
│ │ ├── Steve_Jobs_Introducing_The_iPhone_compressed.md
│ │ └── Steve_Jobs_Introducing_The_iPhone_compressed.mp3
│ ├── docstring_refiner_demo.py
│ ├── document_conversion_and_processing_demo.py
│ ├── entity_relation_graph_demo.py
│ ├── filesystem_operations_demo.py
│ ├── grok_integration_demo.py
│ ├── local_text_tools_demo.py
│ ├── marqo_fused_search_demo.py
│ ├── measure_model_speeds.py
│ ├── meta_api_demo.py
│ ├── multi_provider_demo.py
│ ├── ollama_integration_demo.py
│ ├── prompt_templates_demo.py
│ ├── python_sandbox_demo.py
│ ├── rag_example.py
│ ├── research_workflow_demo.py
│ ├── sample
│ │ ├── article.txt
│ │ ├── backprop_paper.pdf
│ │ ├── buffett.pdf
│ │ ├── contract_link.txt
│ │ ├── legal_contract.txt
│ │ ├── medical_case.txt
│ │ ├── northwind.db
│ │ ├── research_paper.txt
│ │ ├── sample_data.json
│ │ └── text_classification_samples
│ │ ├── email_classification.txt
│ │ ├── news_samples.txt
│ │ ├── product_reviews.txt
│ │ └── support_tickets.txt
│ ├── sample_docs
│ │ └── downloaded
│ │ └── attention_is_all_you_need.pdf
│ ├── sentiment_analysis_demo.py
│ ├── simple_completion_demo.py
│ ├── single_shot_synthesis_demo.py
│ ├── smart_browser_demo.py
│ ├── sql_database_demo.py
│ ├── sse_client_demo.py
│ ├── test_code_extraction.py
│ ├── test_content_detection.py
│ ├── test_ollama.py
│ ├── text_classification_demo.py
│ ├── text_redline_demo.py
│ ├── tool_composition_examples.py
│ ├── tournament_code_demo.py
│ ├── tournament_text_demo.py
│ ├── unified_memory_system_demo.py
│ ├── vector_search_demo.py
│ ├── web_automation_instruction_packs.py
│ └── workflow_delegation_demo.py
├── LICENSE
├── list_models.py
├── marqo_index_config.json.example
├── mcp_protocol_schema_2025-03-25_version.json
├── mcp_python_lib_docs.md
├── mcp_tool_context_estimator.py
├── model_preferences.py
├── pyproject.toml
├── quick_test.py
├── README.md
├── resource_annotations.py
├── run_all_demo_scripts_and_check_for_errors.py
├── storage
│ └── smart_browser_internal
│ ├── locator_cache.db
│ ├── readability.js
│ └── storage_state.enc
├── test_client.py
├── test_connection.py
├── TEST_README.md
├── test_sse_client.py
├── test_stdio_client.py
├── tests
│ ├── __init__.py
│ ├── conftest.py
│ ├── integration
│ │ ├── __init__.py
│ │ └── test_server.py
│ ├── manual
│ │ ├── test_extraction_advanced.py
│ │ └── test_extraction.py
│ └── unit
│ ├── __init__.py
│ ├── test_cache.py
│ ├── test_providers.py
│ └── test_tools.py
├── TODO.md
├── tool_annotations.py
├── tools_list.json
├── ultimate_mcp_banner.webp
├── ultimate_mcp_logo.webp
├── ultimate_mcp_server
│ ├── __init__.py
│ ├── __main__.py
│ ├── cli
│ │ ├── __init__.py
│ │ ├── __main__.py
│ │ ├── commands.py
│ │ ├── helpers.py
│ │ └── typer_cli.py
│ ├── clients
│ │ ├── __init__.py
│ │ ├── completion_client.py
│ │ └── rag_client.py
│ ├── config
│ │ └── examples
│ │ └── filesystem_config.yaml
│ ├── config.py
│ ├── constants.py
│ ├── core
│ │ ├── __init__.py
│ │ ├── evaluation
│ │ │ ├── base.py
│ │ │ └── evaluators.py
│ │ ├── providers
│ │ │ ├── __init__.py
│ │ │ ├── anthropic.py
│ │ │ ├── base.py
│ │ │ ├── deepseek.py
│ │ │ ├── gemini.py
│ │ │ ├── grok.py
│ │ │ ├── ollama.py
│ │ │ ├── openai.py
│ │ │ └── openrouter.py
│ │ ├── server.py
│ │ ├── state_store.py
│ │ ├── tournaments
│ │ │ ├── manager.py
│ │ │ ├── tasks.py
│ │ │ └── utils.py
│ │ └── ums_api
│ │ ├── __init__.py
│ │ ├── ums_database.py
│ │ ├── ums_endpoints.py
│ │ ├── ums_models.py
│ │ └── ums_services.py
│ ├── exceptions.py
│ ├── graceful_shutdown.py
│ ├── services
│ │ ├── __init__.py
│ │ ├── analytics
│ │ │ ├── __init__.py
│ │ │ ├── metrics.py
│ │ │ └── reporting.py
│ │ ├── cache
│ │ │ ├── __init__.py
│ │ │ ├── cache_service.py
│ │ │ ├── persistence.py
│ │ │ ├── strategies.py
│ │ │ └── utils.py
│ │ ├── cache.py
│ │ ├── document.py
│ │ ├── knowledge_base
│ │ │ ├── __init__.py
│ │ │ ├── feedback.py
│ │ │ ├── manager.py
│ │ │ ├── rag_engine.py
│ │ │ ├── retriever.py
│ │ │ └── utils.py
│ │ ├── prompts
│ │ │ ├── __init__.py
│ │ │ ├── repository.py
│ │ │ └── templates.py
│ │ ├── prompts.py
│ │ └── vector
│ │ ├── __init__.py
│ │ ├── embeddings.py
│ │ └── vector_service.py
│ ├── tool_token_counter.py
│ ├── tools
│ │ ├── __init__.py
│ │ ├── audio_transcription.py
│ │ ├── base.py
│ │ ├── completion.py
│ │ ├── docstring_refiner.py
│ │ ├── document_conversion_and_processing.py
│ │ ├── enhanced-ums-lookbook.html
│ │ ├── entity_relation_graph.py
│ │ ├── excel_spreadsheet_automation.py
│ │ ├── extraction.py
│ │ ├── filesystem.py
│ │ ├── html_to_markdown.py
│ │ ├── local_text_tools.py
│ │ ├── marqo_fused_search.py
│ │ ├── meta_api_tool.py
│ │ ├── ocr_tools.py
│ │ ├── optimization.py
│ │ ├── provider.py
│ │ ├── pyodide_boot_template.html
│ │ ├── python_sandbox.py
│ │ ├── rag.py
│ │ ├── redline-compiled.css
│ │ ├── sentiment_analysis.py
│ │ ├── single_shot_synthesis.py
│ │ ├── smart_browser.py
│ │ ├── sql_databases.py
│ │ ├── text_classification.py
│ │ ├── text_redline_tools.py
│ │ ├── tournament.py
│ │ ├── ums_explorer.html
│ │ └── unified_memory_system.py
│ ├── utils
│ │ ├── __init__.py
│ │ ├── async_utils.py
│ │ ├── display.py
│ │ ├── logging
│ │ │ ├── __init__.py
│ │ │ ├── console.py
│ │ │ ├── emojis.py
│ │ │ ├── formatter.py
│ │ │ ├── logger.py
│ │ │ ├── panels.py
│ │ │ ├── progress.py
│ │ │ └── themes.py
│ │ ├── parse_yaml.py
│ │ ├── parsing.py
│ │ ├── security.py
│ │ └── text.py
│ └── working_memory_api.py
├── unified_memory_system_technical_analysis.md
└── uv.lock
```
# Files
--------------------------------------------------------------------------------
/storage/smart_browser_internal/readability.js:
--------------------------------------------------------------------------------
```javascript
1 | /*
2 | * Copyright (c) 2010 Arc90 Inc
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | /*
18 | * This code is heavily based on Arc90's readability.js (1.7.1) script
19 | * available at: http://code.google.com/p/arc90labs-readability
20 | */
21 |
/**
 * Public constructor. Binds a parser instance to a document and resolves
 * the caller's options against the defaults stored on the prototype.
 *
 * @param {HTMLDocument} doc The document to parse.
 * @param {Object} options The options object. Keys read here: debug,
 *        maxElemsToParse, nbTopCandidates, charThreshold, classesToPreserve,
 *        keepClasses, serializer, disableJSONLD, allowedVideoRegex.
 * @throws {Error} When no document object is supplied.
 */
function Readability(doc, options) {
  // Older callers used the shape (uri, doc, options). Detect that by finding
  // the document in the second slot and shift the arguments over.
  var calledWithLegacySignature = options && options.documentElement;
  if (calledWithLegacySignature) {
    doc = options;
    options = arguments[2];
  } else if (!doc || !doc.documentElement) {
    throw new Error("First argument to Readability constructor should be a document object.");
  }
  if (!options) {
    options = {};
  }

  this._doc = doc;
  this._docJSDOMParser = this._doc.firstChild.__JSDOMParser__;
  this._articleTitle = null;
  this._articleByline = null;
  this._articleDir = null;
  this._articleSiteName = null;
  this._attempts = [];

  // Configurable options; falsy values fall back to the prototype defaults.
  this._debug = !!options.debug;
  this._maxElemsToParse = options.maxElemsToParse || this.DEFAULT_MAX_ELEMS_TO_PARSE;
  this._nbTopCandidates = options.nbTopCandidates || this.DEFAULT_N_TOP_CANDIDATES;
  this._charThreshold = options.charThreshold || this.DEFAULT_CHAR_THRESHOLD;
  this._classesToPreserve = this.CLASSES_TO_PRESERVE.concat(options.classesToPreserve || []);
  this._keepClasses = !!options.keepClasses;
  this._serializer = options.serializer || function(el) {
    return el.innerHTML;
  };
  this._disableJSONLD = !!options.disableJSONLD;
  this._allowedVideoRegex = options.allowedVideoRegex || this.REGEXPS.videos;

  // Every flag starts out enabled.
  this._flags = this.FLAG_STRIP_UNLIKELYS |
                this.FLAG_WEIGHT_CLASSES |
                this.FLAG_CLEAN_CONDITIONALLY;

  // Without debug, logging is a no-op and there is nothing more to set up.
  if (!this._debug) {
    this.log = function () {};
    return;
  }

  // Renders a node as a short human-readable string for log output.
  const describeNode = function(node) {
    if (node.nodeType == node.TEXT_NODE) {
      return `${node.nodeName} ("${node.textContent}")`;
    }
    const attrPairs = Array.from(node.attributes || [])
      .map(function(attr) {
        return `${attr.name}="${attr.value}"`;
      })
      .join(" ");
    return `<${node.localName} ${attrPairs}>`;
  };

  this.log = function () {
    if (typeof console !== "undefined") {
      const parts = ["Reader: (Readability)"];
      for (const arg of arguments) {
        const isElement = arg && arg.nodeType == this.ELEMENT_NODE;
        parts.push(isElement ? describeNode(arg) : arg);
      }
      console.log.apply(console, parts);
      return;
    }
    if (typeof dump !== "undefined") {
      /* global dump */
      const pieces = Array.prototype.map.call(arguments, function(x) {
        return (x && x.nodeName) ? describeNode(x) : x;
      });
      dump("Reader: (Readability) " + pieces.join(" ") + "\n");
    }
  };
}
97 |
98 | Readability.prototype = {
99 | FLAG_STRIP_UNLIKELYS: 0x1,
100 | FLAG_WEIGHT_CLASSES: 0x2,
101 | FLAG_CLEAN_CONDITIONALLY: 0x4,
102 |
103 | // https://developer.mozilla.org/en-US/docs/Web/API/Node/nodeType
104 | ELEMENT_NODE: 1,
105 | TEXT_NODE: 3,
106 |
107 | // Max number of nodes supported by this parser. Default: 0 (no limit)
108 | DEFAULT_MAX_ELEMS_TO_PARSE: 0,
109 |
110 | // The number of top candidates to consider when analysing how
111 | // tight the competition is among candidates.
112 | DEFAULT_N_TOP_CANDIDATES: 5,
113 |
114 | // Element tags to score by default.
115 | DEFAULT_TAGS_TO_SCORE: "section,h2,h3,h4,h5,h6,p,td,pre".toUpperCase().split(","),
116 |
117 | // The default number of chars an article must have in order to return a result
118 | DEFAULT_CHAR_THRESHOLD: 500,
119 |
120 | // All of the regular expressions in use within readability.
121 | // Defined up here so we don't instantiate them repeatedly in loops.
122 | REGEXPS: {
123 | // NOTE: These two regular expressions are duplicated in
124 | // Readability-readerable.js. Please keep both copies in sync.
125 | unlikelyCandidates: /-ad-|ai2html|banner|breadcrumbs|combx|comment|community|cover-wrap|disqus|extra|footer|gdpr|header|legends|menu|related|remark|replies|rss|shoutbox|sidebar|skyscraper|social|sponsor|supplemental|ad-break|agegate|pagination|pager|popup|yom-remote/i,
126 | okMaybeItsACandidate: /and|article|body|column|content|main|shadow/i,
127 |
128 | positive: /article|body|content|entry|hentry|h-entry|main|page|pagination|post|text|blog|story/i,
129 | negative: /-ad-|hidden|^hid$| hid$| hid |^hid |banner|combx|comment|com-|contact|foot|footer|footnote|gdpr|masthead|media|meta|outbrain|promo|related|scroll|share|shoutbox|sidebar|skyscraper|sponsor|shopping|tags|tool|widget/i,
130 | extraneous: /print|archive|comment|discuss|e[\-]?mail|share|reply|all|login|sign|single|utility/i,
131 | byline: /byline|author|dateline|writtenby|p-author/i,
132 | replaceFonts: /<(\/?)font[^>]*>/gi,
133 | normalize: /\s{2,}/g,
134 | videos: /\/\/(www\.)?((dailymotion|youtube|youtube-nocookie|player\.vimeo|v\.qq)\.com|(archive|upload\.wikimedia)\.org|player\.twitch\.tv)/i,
135 | shareElements: /(\b|_)(share|sharedaddy)(\b|_)/i,
136 | nextLink: /(next|weiter|continue|>([^\|]|$)|»([^\|]|$))/i,
137 | prevLink: /(prev|earl|old|new|<|«)/i,
138 | tokenize: /\W+/g,
139 | whitespace: /^\s*$/,
140 | hasContent: /\S$/,
141 | hashUrl: /^#.+/,
142 | srcsetUrl: /(\S+)(\s+[\d.]+[xw])?(\s*(?:,|$))/g,
143 | b64DataUrl: /^data:\s*([^\s;,]+)\s*;\s*base64\s*,/i,
144 | // Commas as used in Latin, Sindhi, Chinese and various other scripts.
145 | // see: https://en.wikipedia.org/wiki/Comma#Comma_variants
146 | commas: /\u002C|\u060C|\uFE50|\uFE10|\uFE11|\u2E41|\u2E34|\u2E32|\uFF0C/g,
147 | // See: https://schema.org/Article
148 | jsonLdArticleTypes: /^Article|AdvertiserContentArticle|NewsArticle|AnalysisNewsArticle|AskPublicNewsArticle|BackgroundNewsArticle|OpinionNewsArticle|ReportageNewsArticle|ReviewNewsArticle|Report|SatiricalArticle|ScholarlyArticle|MedicalScholarlyArticle|SocialMediaPosting|BlogPosting|LiveBlogPosting|DiscussionForumPosting|TechArticle|APIReference$/
149 | },
150 |
151 | UNLIKELY_ROLES: [ "menu", "menubar", "complementary", "navigation", "alert", "alertdialog", "dialog" ],
152 |
153 | DIV_TO_P_ELEMS: new Set([ "BLOCKQUOTE", "DL", "DIV", "IMG", "OL", "P", "PRE", "TABLE", "UL" ]),
154 |
155 | ALTER_TO_DIV_EXCEPTIONS: ["DIV", "ARTICLE", "SECTION", "P"],
156 |
157 | PRESENTATIONAL_ATTRIBUTES: [ "align", "background", "bgcolor", "border", "cellpadding", "cellspacing", "frame", "hspace", "rules", "style", "valign", "vspace" ],
158 |
159 | DEPRECATED_SIZE_ATTRIBUTE_ELEMS: [ "TABLE", "TH", "TD", "HR", "PRE" ],
160 |
161 | // The commented out elements qualify as phrasing content but tend to be
162 | // removed by readability when put into paragraphs, so we ignore them here.
163 | PHRASING_ELEMS: [
164 | // "CANVAS", "IFRAME", "SVG", "VIDEO",
165 | "ABBR", "AUDIO", "B", "BDO", "BR", "BUTTON", "CITE", "CODE", "DATA",
166 | "DATALIST", "DFN", "EM", "EMBED", "I", "IMG", "INPUT", "KBD", "LABEL",
167 | "MARK", "MATH", "METER", "NOSCRIPT", "OBJECT", "OUTPUT", "PROGRESS", "Q",
168 | "RUBY", "SAMP", "SCRIPT", "SELECT", "SMALL", "SPAN", "STRONG", "SUB",
169 | "SUP", "TEXTAREA", "TIME", "VAR", "WBR"
170 | ],
171 |
172 | // These are the classes that readability sets itself.
173 | CLASSES_TO_PRESERVE: [ "page" ],
174 |
175 | // This is the list of HTML entities that need to be escaped.
176 | HTML_ESCAPE_MAP: {
177 | "lt": "<",
178 | "gt": ">",
179 | "amp": "&",
180 | "quot": '"',
181 | "apos": "'",
182 | },
183 |
184 | /**
185 | * Run any post-process modifications to article content as necessary.
186 | *
187 | * @param Element
188 | * @return void
189 | **/
190 | _postProcessContent: function(articleContent) {
191 | // Readability cannot open relative uris so we convert them to absolute uris.
192 | this._fixRelativeUris(articleContent);
193 |
194 | this._simplifyNestedElements(articleContent);
195 |
196 | if (!this._keepClasses) {
197 | // Remove classes.
198 | this._cleanClasses(articleContent);
199 | }
200 | },
201 |
202 | /**
203 | * Iterates over a NodeList, calls `filterFn` for each node and removes node
204 | * if function returned `true`.
205 | *
206 | * If function is not passed, removes all the nodes in node list.
207 | *
208 | * @param NodeList nodeList The nodes to operate on
209 | * @param Function filterFn the function to use as a filter
210 | * @return void
211 | */
212 | _removeNodes: function(nodeList, filterFn) {
213 | // Avoid ever operating on live node lists.
214 | if (this._docJSDOMParser && nodeList._isLiveNodeList) {
215 | throw new Error("Do not pass live node lists to _removeNodes");
216 | }
217 | for (var i = nodeList.length - 1; i >= 0; i--) {
218 | var node = nodeList[i];
219 | var parentNode = node.parentNode;
220 | if (parentNode) {
221 | if (!filterFn || filterFn.call(this, node, i, nodeList)) {
222 | parentNode.removeChild(node);
223 | }
224 | }
225 | }
226 | },
227 |
228 | /**
229 | * Iterates over a NodeList, and calls _setNodeTag for each node.
230 | *
231 | * @param NodeList nodeList The nodes to operate on
232 | * @param String newTagName the new tag name to use
233 | * @return void
234 | */
235 | _replaceNodeTags: function(nodeList, newTagName) {
236 | // Avoid ever operating on live node lists.
237 | if (this._docJSDOMParser && nodeList._isLiveNodeList) {
238 | throw new Error("Do not pass live node lists to _replaceNodeTags");
239 | }
240 | for (const node of nodeList) {
241 | this._setNodeTag(node, newTagName);
242 | }
243 | },
244 |
245 | /**
246 | * Iterate over a NodeList, which doesn't natively fully implement the Array
247 | * interface.
248 | *
249 | * For convenience, the current object context is applied to the provided
250 | * iterate function.
251 | *
252 | * @param NodeList nodeList The NodeList.
253 | * @param Function fn The iterate function.
254 | * @return void
255 | */
256 | _forEachNode: function(nodeList, fn) {
257 | Array.prototype.forEach.call(nodeList, fn, this);
258 | },
259 |
260 | /**
261 | * Iterate over a NodeList, and return the first node that passes
262 | * the supplied test function
263 | *
264 | * For convenience, the current object context is applied to the provided
265 | * test function.
266 | *
267 | * @param NodeList nodeList The NodeList.
268 | * @param Function fn The test function.
269 | * @return void
270 | */
271 | _findNode: function(nodeList, fn) {
272 | return Array.prototype.find.call(nodeList, fn, this);
273 | },
274 |
275 | /**
276 | * Iterate over a NodeList, return true if any of the provided iterate
277 | * function calls returns true, false otherwise.
278 | *
279 | * For convenience, the current object context is applied to the
280 | * provided iterate function.
281 | *
282 | * @param NodeList nodeList The NodeList.
283 | * @param Function fn The iterate function.
284 | * @return Boolean
285 | */
286 | _someNode: function(nodeList, fn) {
287 | return Array.prototype.some.call(nodeList, fn, this);
288 | },
289 |
290 | /**
291 | * Iterate over a NodeList, return true if all of the provided iterate
292 | * function calls return true, false otherwise.
293 | *
294 | * For convenience, the current object context is applied to the
295 | * provided iterate function.
296 | *
297 | * @param NodeList nodeList The NodeList.
298 | * @param Function fn The iterate function.
299 | * @return Boolean
300 | */
301 | _everyNode: function(nodeList, fn) {
302 | return Array.prototype.every.call(nodeList, fn, this);
303 | },
304 |
305 | /**
306 | * Concat all nodelists passed as arguments.
307 | *
308 | * @return ...NodeList
309 | * @return Array
310 | */
311 | _concatNodeLists: function() {
312 | var slice = Array.prototype.slice;
313 | var args = slice.call(arguments);
314 | var nodeLists = args.map(function(list) {
315 | return slice.call(list);
316 | });
317 | return Array.prototype.concat.apply([], nodeLists);
318 | },
319 |
320 | _getAllNodesWithTag: function(node, tagNames) {
321 | if (node.querySelectorAll) {
322 | return node.querySelectorAll(tagNames.join(","));
323 | }
324 | return [].concat.apply([], tagNames.map(function(tag) {
325 | var collection = node.getElementsByTagName(tag);
326 | return Array.isArray(collection) ? collection : Array.from(collection);
327 | }));
328 | },
329 |
330 | /**
331 | * Removes the class="" attribute from every element in the given
332 | * subtree, except those that match CLASSES_TO_PRESERVE and
333 | * the classesToPreserve array from the options object.
334 | *
335 | * @param Element
336 | * @return void
337 | */
338 | _cleanClasses: function(node) {
339 | var classesToPreserve = this._classesToPreserve;
340 | var className = (node.getAttribute("class") || "")
341 | .split(/\s+/)
342 | .filter(function(cls) {
343 | return classesToPreserve.indexOf(cls) != -1;
344 | })
345 | .join(" ");
346 |
347 | if (className) {
348 | node.setAttribute("class", className);
349 | } else {
350 | node.removeAttribute("class");
351 | }
352 |
353 | for (node = node.firstElementChild; node; node = node.nextElementSibling) {
354 | this._cleanClasses(node);
355 | }
356 | },
357 |
358 | /**
359 | * Converts each <a> and <img> uri in the given element to an absolute URI,
360 | * ignoring #ref URIs.
361 | *
362 | * @param Element
363 | * @return void
364 | */
365 | _fixRelativeUris: function(articleContent) {
366 | var baseURI = this._doc.baseURI;
367 | var documentURI = this._doc.documentURI;
368 | function toAbsoluteURI(uri) {
369 | // Leave hash links alone if the base URI matches the document URI:
370 | if (baseURI == documentURI && uri.charAt(0) == "#") {
371 | return uri;
372 | }
373 |
374 | // Otherwise, resolve against base URI:
375 | try {
376 | return new URL(uri, baseURI).href;
377 | } catch (ex) {
378 | // Something went wrong, just return the original:
379 | }
380 | return uri;
381 | }
382 |
383 | var links = this._getAllNodesWithTag(articleContent, ["a"]);
384 | this._forEachNode(links, function(link) {
385 | var href = link.getAttribute("href");
386 | if (href) {
387 | // Remove links with javascript: URIs, since
388 | // they won't work after scripts have been removed from the page.
389 | if (href.indexOf("javascript:") === 0) {
390 | // if the link only contains simple text content, it can be converted to a text node
391 | if (link.childNodes.length === 1 && link.childNodes[0].nodeType === this.TEXT_NODE) {
392 | var text = this._doc.createTextNode(link.textContent);
393 | link.parentNode.replaceChild(text, link);
394 | } else {
395 | // if the link has multiple children, they should all be preserved
396 | var container = this._doc.createElement("span");
397 | while (link.firstChild) {
398 | container.appendChild(link.firstChild);
399 | }
400 | link.parentNode.replaceChild(container, link);
401 | }
402 | } else {
403 | link.setAttribute("href", toAbsoluteURI(href));
404 | }
405 | }
406 | });
407 |
408 | var medias = this._getAllNodesWithTag(articleContent, [
409 | "img", "picture", "figure", "video", "audio", "source"
410 | ]);
411 |
412 | this._forEachNode(medias, function(media) {
413 | var src = media.getAttribute("src");
414 | var poster = media.getAttribute("poster");
415 | var srcset = media.getAttribute("srcset");
416 |
417 | if (src) {
418 | media.setAttribute("src", toAbsoluteURI(src));
419 | }
420 |
421 | if (poster) {
422 | media.setAttribute("poster", toAbsoluteURI(poster));
423 | }
424 |
425 | if (srcset) {
426 | var newSrcset = srcset.replace(this.REGEXPS.srcsetUrl, function(_, p1, p2, p3) {
427 | return toAbsoluteURI(p1) + (p2 || "") + p3;
428 | });
429 |
430 | media.setAttribute("srcset", newSrcset);
431 | }
432 | });
433 | },
434 |
435 | _simplifyNestedElements: function(articleContent) {
436 | var node = articleContent;
437 |
438 | while (node) {
439 | if (node.parentNode && ["DIV", "SECTION"].includes(node.tagName) && !(node.id && node.id.startsWith("readability"))) {
440 | if (this._isElementWithoutContent(node)) {
441 | node = this._removeAndGetNext(node);
442 | continue;
443 | } else if (this._hasSingleTagInsideElement(node, "DIV") || this._hasSingleTagInsideElement(node, "SECTION")) {
444 | var child = node.children[0];
445 | for (var i = 0; i < node.attributes.length; i++) {
446 | child.setAttribute(node.attributes[i].name, node.attributes[i].value);
447 | }
448 | node.parentNode.replaceChild(child, node);
449 | node = child;
450 | continue;
451 | }
452 | }
453 |
454 | node = this._getNextNode(node);
455 | }
456 | },
457 |
458 | /**
459 | * Get the article title as an H1.
460 | *
461 | * @return string
462 | **/
463 | _getArticleTitle: function() {
464 | var doc = this._doc;
465 | var curTitle = "";
466 | var origTitle = "";
467 |
468 | try {
469 | curTitle = origTitle = doc.title.trim();
470 |
471 | // If they had an element with id "title" in their HTML
472 | if (typeof curTitle !== "string")
473 | curTitle = origTitle = this._getInnerText(doc.getElementsByTagName("title")[0]);
474 | } catch (e) {/* ignore exceptions setting the title. */}
475 |
476 | var titleHadHierarchicalSeparators = false;
477 | function wordCount(str) {
478 | return str.split(/\s+/).length;
479 | }
480 |
481 | // If there's a separator in the title, first remove the final part
482 | if ((/ [\|\-\\\/>»] /).test(curTitle)) {
483 | titleHadHierarchicalSeparators = / [\\\/>»] /.test(curTitle);
484 | curTitle = origTitle.replace(/(.*)[\|\-\\\/>»] .*/gi, "$1");
485 |
486 | // If the resulting title is too short (3 words or fewer), remove
487 | // the first part instead:
488 | if (wordCount(curTitle) < 3)
489 | curTitle = origTitle.replace(/[^\|\-\\\/>»]*[\|\-\\\/>»](.*)/gi, "$1");
490 | } else if (curTitle.indexOf(": ") !== -1) {
491 | // Check if we have an heading containing this exact string, so we
492 | // could assume it's the full title.
493 | var headings = this._concatNodeLists(
494 | doc.getElementsByTagName("h1"),
495 | doc.getElementsByTagName("h2")
496 | );
497 | var trimmedTitle = curTitle.trim();
498 | var match = this._someNode(headings, function(heading) {
499 | return heading.textContent.trim() === trimmedTitle;
500 | });
501 |
502 | // If we don't, let's extract the title out of the original title string.
503 | if (!match) {
504 | curTitle = origTitle.substring(origTitle.lastIndexOf(":") + 1);
505 |
506 | // If the title is now too short, try the first colon instead:
507 | if (wordCount(curTitle) < 3) {
508 | curTitle = origTitle.substring(origTitle.indexOf(":") + 1);
509 | // But if we have too many words before the colon there's something weird
510 | // with the titles and the H tags so let's just use the original title instead
511 | } else if (wordCount(origTitle.substr(0, origTitle.indexOf(":"))) > 5) {
512 | curTitle = origTitle;
513 | }
514 | }
515 | } else if (curTitle.length > 150 || curTitle.length < 15) {
516 | var hOnes = doc.getElementsByTagName("h1");
517 |
518 | if (hOnes.length === 1)
519 | curTitle = this._getInnerText(hOnes[0]);
520 | }
521 |
522 | curTitle = curTitle.trim().replace(this.REGEXPS.normalize, " ");
523 | // If we now have 4 words or fewer as our title, and either no
524 | // 'hierarchical' separators (\, /, > or ») were found in the original
525 | // title or we decreased the number of words by more than 1 word, use
526 | // the original title.
527 | var curTitleWordCount = wordCount(curTitle);
528 | if (curTitleWordCount <= 4 &&
529 | (!titleHadHierarchicalSeparators ||
530 | curTitleWordCount != wordCount(origTitle.replace(/[\|\-\\\/>»]+/g, "")) - 1)) {
531 | curTitle = origTitle;
532 | }
533 |
534 | return curTitle;
535 | },
536 |
537 | /**
538 | * Prepare the HTML document for readability to scrape it.
539 | * This includes things like stripping javascript, CSS, and handling terrible markup.
540 | *
541 | * @return void
542 | **/
543 | _prepDocument: function() {
544 | var doc = this._doc;
545 |
546 | // Remove all style tags in head
547 | this._removeNodes(this._getAllNodesWithTag(doc, ["style"]));
548 |
549 | if (doc.body) {
550 | this._replaceBrs(doc.body);
551 | }
552 |
553 | this._replaceNodeTags(this._getAllNodesWithTag(doc, ["font"]), "SPAN");
554 | },
555 |
556 | /**
557 | * Finds the next node, starting from the given node, and ignoring
558 | * whitespace in between. If the given node is an element, the same node is
559 | * returned.
560 | */
561 | _nextNode: function (node) {
562 | var next = node;
563 | while (next
564 | && (next.nodeType != this.ELEMENT_NODE)
565 | && this.REGEXPS.whitespace.test(next.textContent)) {
566 | next = next.nextSibling;
567 | }
568 | return next;
569 | },
570 |
571 | /**
572 | * Replaces 2 or more successive <br> elements with a single <p>.
573 | * Whitespace between <br> elements are ignored. For example:
574 | * <div>foo<br>bar<br> <br><br>abc</div>
575 | * will become:
576 | * <div>foo<br>bar<p>abc</p></div>
577 | */
578 | _replaceBrs: function (elem) {
579 | this._forEachNode(this._getAllNodesWithTag(elem, ["br"]), function(br) {
580 | var next = br.nextSibling;
581 |
582 | // Whether 2 or more <br> elements have been found and replaced with a
583 | // <p> block.
584 | var replaced = false;
585 |
586 | // If we find a <br> chain, remove the <br>s until we hit another node
587 | // or non-whitespace. This leaves behind the first <br> in the chain
588 | // (which will be replaced with a <p> later).
589 | while ((next = this._nextNode(next)) && (next.tagName == "BR")) {
590 | replaced = true;
591 | var brSibling = next.nextSibling;
592 | next.parentNode.removeChild(next);
593 | next = brSibling;
594 | }
595 |
596 | // If we removed a <br> chain, replace the remaining <br> with a <p>. Add
597 | // all sibling nodes as children of the <p> until we hit another <br>
598 | // chain.
599 | if (replaced) {
600 | var p = this._doc.createElement("p");
601 | br.parentNode.replaceChild(p, br);
602 |
603 | next = p.nextSibling;
604 | while (next) {
605 | // If we've hit another <br><br>, we're done adding children to this <p>.
606 | if (next.tagName == "BR") {
607 | var nextElem = this._nextNode(next.nextSibling);
608 | if (nextElem && nextElem.tagName == "BR")
609 | break;
610 | }
611 |
612 | if (!this._isPhrasingContent(next))
613 | break;
614 |
615 | // Otherwise, make this node a child of the new <p>.
616 | var sibling = next.nextSibling;
617 | p.appendChild(next);
618 | next = sibling;
619 | }
620 |
621 | while (p.lastChild && this._isWhitespace(p.lastChild)) {
622 | p.removeChild(p.lastChild);
623 | }
624 |
625 | if (p.parentNode.tagName === "P")
626 | this._setNodeTag(p.parentNode, "DIV");
627 | }
628 | });
629 | },
630 |
631 | _setNodeTag: function (node, tag) {
632 | this.log("_setNodeTag", node, tag);
633 | if (this._docJSDOMParser) {
634 | node.localName = tag.toLowerCase();
635 | node.tagName = tag.toUpperCase();
636 | return node;
637 | }
638 |
639 | var replacement = node.ownerDocument.createElement(tag);
640 | while (node.firstChild) {
641 | replacement.appendChild(node.firstChild);
642 | }
643 | node.parentNode.replaceChild(replacement, node);
644 | if (node.readability)
645 | replacement.readability = node.readability;
646 |
647 | for (var i = 0; i < node.attributes.length; i++) {
648 | try {
649 | replacement.setAttribute(node.attributes[i].name, node.attributes[i].value);
650 | } catch (ex) {
651 | /* it's possible for setAttribute() to throw if the attribute name
652 | * isn't a valid XML Name. Such attributes can however be parsed from
653 | * source in HTML docs, see https://github.com/whatwg/html/issues/4275,
654 | * so we can hit them here and then throw. We don't care about such
655 | * attributes so we ignore them.
656 | */
657 | }
658 | }
659 | return replacement;
660 | },
661 |
662 | /**
663 | * Prepare the article node for display. Clean out any inline styles,
664 | * iframes, forms, strip extraneous <p> tags, etc.
665 | *
666 | * @param Element
667 | * @return void
668 | **/
669 | _prepArticle: function(articleContent) {
670 | this._cleanStyles(articleContent);
671 |
672 | // Check for data tables before we continue, to avoid removing items in
673 | // those tables, which will often be isolated even though they're
674 | // visually linked to other content-ful elements (text, images, etc.).
675 | this._markDataTables(articleContent);
676 |
677 | this._fixLazyImages(articleContent);
678 |
679 | // Clean out junk from the article content
680 | this._cleanConditionally(articleContent, "form");
681 | this._cleanConditionally(articleContent, "fieldset");
682 | this._clean(articleContent, "object");
683 | this._clean(articleContent, "embed");
684 | this._clean(articleContent, "footer");
685 | this._clean(articleContent, "link");
686 | this._clean(articleContent, "aside");
687 |
688 | // Clean out elements with little content that have "share" in their id/class combinations from final top candidates,
689 | // which means we don't remove the top candidates even they have "share".
690 |
691 | var shareElementThreshold = this.DEFAULT_CHAR_THRESHOLD;
692 |
693 | this._forEachNode(articleContent.children, function (topCandidate) {
694 | this._cleanMatchedNodes(topCandidate, function (node, matchString) {
695 | return this.REGEXPS.shareElements.test(matchString) && node.textContent.length < shareElementThreshold;
696 | });
697 | });
698 |
699 | this._clean(articleContent, "iframe");
700 | this._clean(articleContent, "input");
701 | this._clean(articleContent, "textarea");
702 | this._clean(articleContent, "select");
703 | this._clean(articleContent, "button");
704 | this._cleanHeaders(articleContent);
705 |
706 | // Do these last as the previous stuff may have removed junk
707 | // that will affect these
708 | this._cleanConditionally(articleContent, "table");
709 | this._cleanConditionally(articleContent, "ul");
710 | this._cleanConditionally(articleContent, "div");
711 |
712 | // replace H1 with H2 as H1 should be only title that is displayed separately
713 | this._replaceNodeTags(this._getAllNodesWithTag(articleContent, ["h1"]), "h2");
714 |
715 | // Remove extra paragraphs
716 | this._removeNodes(this._getAllNodesWithTag(articleContent, ["p"]), function (paragraph) {
717 | var imgCount = paragraph.getElementsByTagName("img").length;
718 | var embedCount = paragraph.getElementsByTagName("embed").length;
719 | var objectCount = paragraph.getElementsByTagName("object").length;
720 | // At this point, nasty iframes have been removed, only remain embedded video ones.
721 | var iframeCount = paragraph.getElementsByTagName("iframe").length;
722 | var totalCount = imgCount + embedCount + objectCount + iframeCount;
723 |
724 | return totalCount === 0 && !this._getInnerText(paragraph, false);
725 | });
726 |
727 | this._forEachNode(this._getAllNodesWithTag(articleContent, ["br"]), function(br) {
728 | var next = this._nextNode(br.nextSibling);
729 | if (next && next.tagName == "P")
730 | br.parentNode.removeChild(br);
731 | });
732 |
733 | // Remove single-cell tables
734 | this._forEachNode(this._getAllNodesWithTag(articleContent, ["table"]), function(table) {
735 | var tbody = this._hasSingleTagInsideElement(table, "TBODY") ? table.firstElementChild : table;
736 | if (this._hasSingleTagInsideElement(tbody, "TR")) {
737 | var row = tbody.firstElementChild;
738 | if (this._hasSingleTagInsideElement(row, "TD")) {
739 | var cell = row.firstElementChild;
740 | cell = this._setNodeTag(cell, this._everyNode(cell.childNodes, this._isPhrasingContent) ? "P" : "DIV");
741 | table.parentNode.replaceChild(cell, table);
742 | }
743 | }
744 | });
745 | },
746 |
747 | /**
748 | * Initialize a node with the readability object. Also checks the
749 | * className/id for special names to add to its score.
750 | *
751 | * @param Element
752 | * @return void
753 | **/
754 | _initializeNode: function(node) {
755 | node.readability = {"contentScore": 0};
756 |
757 | switch (node.tagName) {
758 | case "DIV":
759 | node.readability.contentScore += 5;
760 | break;
761 |
762 | case "PRE":
763 | case "TD":
764 | case "BLOCKQUOTE":
765 | node.readability.contentScore += 3;
766 | break;
767 |
768 | case "ADDRESS":
769 | case "OL":
770 | case "UL":
771 | case "DL":
772 | case "DD":
773 | case "DT":
774 | case "LI":
775 | case "FORM":
776 | node.readability.contentScore -= 3;
777 | break;
778 |
779 | case "H1":
780 | case "H2":
781 | case "H3":
782 | case "H4":
783 | case "H5":
784 | case "H6":
785 | case "TH":
786 | node.readability.contentScore -= 5;
787 | break;
788 | }
789 |
790 | node.readability.contentScore += this._getClassWeight(node);
791 | },
792 |
793 | _removeAndGetNext: function(node) {
794 | var nextNode = this._getNextNode(node, true);
795 | node.parentNode.removeChild(node);
796 | return nextNode;
797 | },
798 |
799 | /**
800 | * Traverse the DOM from node to node, starting at the node passed in.
801 | * Pass true for the second parameter to indicate this node itself
802 | * (and its kids) are going away, and we want the next node over.
803 | *
804 | * Calling this in a loop will traverse the DOM depth-first.
805 | */
806 | _getNextNode: function(node, ignoreSelfAndKids) {
807 | // First check for kids if those aren't being ignored
808 | if (!ignoreSelfAndKids && node.firstElementChild) {
809 | return node.firstElementChild;
810 | }
811 | // Then for siblings...
812 | if (node.nextElementSibling) {
813 | return node.nextElementSibling;
814 | }
815 | // And finally, move up the parent chain *and* find a sibling
816 | // (because this is depth-first traversal, we will have already
817 | // seen the parent nodes themselves).
818 | do {
819 | node = node.parentNode;
820 | } while (node && !node.nextElementSibling);
821 | return node && node.nextElementSibling;
822 | },
823 |
824 | // compares second text to first one
825 | // 1 = same text, 0 = completely different text
826 | // works the way that it splits both texts into words and then finds words that are unique in second text
827 | // the result is given by the lower length of unique parts
828 | _textSimilarity: function(textA, textB) {
829 | var tokensA = textA.toLowerCase().split(this.REGEXPS.tokenize).filter(Boolean);
830 | var tokensB = textB.toLowerCase().split(this.REGEXPS.tokenize).filter(Boolean);
831 | if (!tokensA.length || !tokensB.length) {
832 | return 0;
833 | }
834 | var uniqTokensB = tokensB.filter(token => !tokensA.includes(token));
835 | var distanceB = uniqTokensB.join(" ").length / tokensB.join(" ").length;
836 | return 1 - distanceB;
837 | },
838 |
839 | _checkByline: function(node, matchString) {
840 | if (this._articleByline) {
841 | return false;
842 | }
843 |
844 | if (node.getAttribute !== undefined) {
845 | var rel = node.getAttribute("rel");
846 | var itemprop = node.getAttribute("itemprop");
847 | }
848 |
849 | if ((rel === "author" || (itemprop && itemprop.indexOf("author") !== -1) || this.REGEXPS.byline.test(matchString)) && this._isValidByline(node.textContent)) {
850 | this._articleByline = node.textContent.trim();
851 | return true;
852 | }
853 |
854 | return false;
855 | },
856 |
857 | _getNodeAncestors: function(node, maxDepth) {
858 | maxDepth = maxDepth || 0;
859 | var i = 0, ancestors = [];
860 | while (node.parentNode) {
861 | ancestors.push(node.parentNode);
862 | if (maxDepth && ++i === maxDepth)
863 | break;
864 | node = node.parentNode;
865 | }
866 | return ancestors;
867 | },
868 |
869 | /***
870 | * grabArticle - Using a variety of metrics (content score, classname, element types), find the content that is
871 | * most likely to be the stuff a user wants to read. Then return it wrapped up in a div.
872 | *
 * Each pass of the outer loop works in four steps:
 *   1. Walk the document from documentElement, removing hidden nodes, modal dialogs, the byline,
 *      one header that duplicates the title, unlikely candidates (only while FLAG_STRIP_UNLIKELYS
 *      is active) and empty containers/headings, and turning paragraph-like DIVs into P elements.
 *   2. Score every collected element and add that score, divided by distance, to up to five ancestors.
 *   3. Pick the best scoring ancestor as top candidate (a new DIV holding all of the page's children
 *      when there is none or it is the BODY), possibly moving up to a better suited parent.
 *   4. Move the top candidate and its qualifying siblings into a new DIV and run _prepArticle on it.
 * When the resulting text is shorter than this._charThreshold, page.innerHTML is restored from the
 * cached markup and the pass is repeated with one more flag removed (FLAG_STRIP_UNLIKELYS, then
 * FLAG_WEIGHT_CLASSES, then FLAG_CLEAN_CONDITIONALLY). Once no flag is left, the attempt with the
 * longest text is returned.
 *
 * Side effects visible in this method: the document is mutated, this._articleLang and
 * this._articleDir may be set, failed attempts are appended to this._attempts, and
 * _checkByline may store this._articleByline.
 *
873 | * @param page a document to run upon. Needs to be a full document, complete with body.
874 | * @return Element (null when no page/body is available or when no attempt produced any text)
875 | **/
876 | _grabArticle: function (page) {
877 | this.log("**** grabArticle ****");
878 | var doc = this._doc;
// NOTE(review): `page !== null` is also true when page is undefined, so isPaging is only
// false for an explicit null argument - confirm this is intended.
879 | var isPaging = page !== null;
880 | page = page ? page : this._doc.body;
881 |
882 | // We can't grab an article if we don't have a page!
883 | if (!page) {
884 | this.log("No body found in document. Abort.");
885 | return null;
886 | }
887 |
// Snapshot of the markup so that a failed pass can be undone before the next attempt.
888 | var pageCacheHtml = page.innerHTML;
889 |
890 | while (true) {
891 | this.log("Starting grabArticle loop");
892 | var stripUnlikelyCandidates = this._flagIsActive(this.FLAG_STRIP_UNLIKELYS);
893 |
894 | // First, node prepping. Trash nodes that look cruddy (like ones with the
895 | // class name "comment", etc), and turn divs into P tags where they have been
896 | // used inappropriately (as in, where they contain no other block level elements.)
897 | var elementsToScore = [];
898 | var node = this._doc.documentElement;
899 |
// Only the first header that duplicates the title is removed in each pass.
900 | let shouldRemoveTitleHeader = true;
901 |
902 | while (node) {
903 |
904 | if (node.tagName === "HTML") {
905 | this._articleLang = node.getAttribute("lang");
906 | }
907 |
908 | var matchString = node.className + " " + node.id;
909 |
910 | if (!this._isProbablyVisible(node)) {
911 | this.log("Removing hidden node - " + matchString);
912 | node = this._removeAndGetNext(node);
913 | continue;
914 | }
915 |
916 | // User is not able to see elements applied with both "aria-modal = true" and "role = dialog"
917 | if (node.getAttribute("aria-modal") == "true" && node.getAttribute("role") == "dialog") {
918 | node = this._removeAndGetNext(node);
919 | continue;
920 | }
921 |
922 | // Check to see if this node is a byline, and remove it if it is.
923 | if (this._checkByline(node, matchString)) {
924 | node = this._removeAndGetNext(node);
925 | continue;
926 | }
927 |
928 | if (shouldRemoveTitleHeader && this._headerDuplicatesTitle(node)) {
929 | this.log("Removing header: ", node.textContent.trim(), this._articleTitle.trim());
930 | shouldRemoveTitleHeader = false;
931 | node = this._removeAndGetNext(node);
932 | continue;
933 | }
934 |
935 | // Remove unlikely candidates
936 | if (stripUnlikelyCandidates) {
937 | if (this.REGEXPS.unlikelyCandidates.test(matchString) &&
938 | !this.REGEXPS.okMaybeItsACandidate.test(matchString) &&
939 | !this._hasAncestorTag(node, "table") &&
940 | !this._hasAncestorTag(node, "code") &&
941 | node.tagName !== "BODY" &&
942 | node.tagName !== "A") {
943 | this.log("Removing unlikely candidate - " + matchString);
944 | node = this._removeAndGetNext(node);
945 | continue;
946 | }
947 |
948 | if (this.UNLIKELY_ROLES.includes(node.getAttribute("role"))) {
949 | this.log("Removing content with role " + node.getAttribute("role") + " - " + matchString);
950 | node = this._removeAndGetNext(node);
951 | continue;
952 | }
953 | }
954 |
955 | // Remove DIV, SECTION, and HEADER nodes without any content(e.g. text, image, video, or iframe).
// The H1-H6 headings are covered by the same check.
956 | if ((node.tagName === "DIV" || node.tagName === "SECTION" || node.tagName === "HEADER" ||
957 | node.tagName === "H1" || node.tagName === "H2" || node.tagName === "H3" ||
958 | node.tagName === "H4" || node.tagName === "H5" || node.tagName === "H6") &&
959 | this._isElementWithoutContent(node)) {
960 | node = this._removeAndGetNext(node);
961 | continue;
962 | }
963 |
964 | if (this.DEFAULT_TAGS_TO_SCORE.indexOf(node.tagName) !== -1) {
965 | elementsToScore.push(node);
966 | }
967 |
968 | // Turn all divs that don't have children block level elements into p's
969 | if (node.tagName === "DIV") {
970 | // Put phrasing content into paragraphs.
// `p` is the paragraph currently being filled; it is closed (reset to null) by the
// first child that is not phrasing content.
971 | var p = null;
972 | var childNode = node.firstChild;
973 | while (childNode) {
974 | var nextSibling = childNode.nextSibling;
975 | if (this._isPhrasingContent(childNode)) {
976 | if (p !== null) {
977 | p.appendChild(childNode);
978 | } else if (!this._isWhitespace(childNode)) {
979 | p = doc.createElement("p");
980 | node.replaceChild(p, childNode);
981 | p.appendChild(childNode);
982 | }
983 | } else if (p !== null) {
984 | while (p.lastChild && this._isWhitespace(p.lastChild)) {
985 | p.removeChild(p.lastChild);
986 | }
987 | p = null;
988 | }
989 | childNode = nextSibling;
990 | }
991 |
992 | // Sites like http://mobile.slate.com encloses each paragraph with a DIV
993 | // element. DIVs with only a P element inside and no text content can be
994 | // safely converted into plain P elements to avoid confusing the scoring
995 | // algorithm with DIVs with are, in practice, paragraphs.
996 | if (this._hasSingleTagInsideElement(node, "P") && this._getLinkDensity(node) < 0.25) {
997 | var newNode = node.children[0];
998 | node.parentNode.replaceChild(newNode, node);
999 | node = newNode;
1000 | elementsToScore.push(node);
1001 | } else if (!this._hasChildBlockElement(node)) {
1002 | node = this._setNodeTag(node, "P");
1003 | elementsToScore.push(node);
1004 | }
1005 | }
1006 | node = this._getNextNode(node);
1007 | }
1008 |
1009 | /**
1010 | * Loop through all paragraphs, and assign a score to them based on how content-y they look.
1011 | * Then add their score to their parent node.
1012 | *
1013 | * A score is determined by things like number of commas, class names, etc. Maybe eventually link density.
1014 | **/
1015 | var candidates = [];
1016 | this._forEachNode(elementsToScore, function(elementToScore) {
1017 | if (!elementToScore.parentNode || typeof(elementToScore.parentNode.tagName) === "undefined")
1018 | return;
1019 |
1020 | // If this paragraph is less than 25 characters, don't even count it.
1021 | var innerText = this._getInnerText(elementToScore);
1022 | if (innerText.length < 25)
1023 | return;
1024 |
1025 | // Exclude nodes with no ancestor.
1026 | var ancestors = this._getNodeAncestors(elementToScore, 5);
1027 | if (ancestors.length === 0)
1028 | return;
1029 |
1030 | var contentScore = 0;
1031 |
1032 | // Add a point for the paragraph itself as a base.
1033 | contentScore += 1;
1034 |
1035 | // Add points for any commas within this paragraph.
// split() returns one more piece than there are matches, so this adds at least 1.
1036 | contentScore += innerText.split(this.REGEXPS.commas).length;
1037 |
1038 | // For every 100 characters in this paragraph, add another point. Up to 3 points.
1039 | contentScore += Math.min(Math.floor(innerText.length / 100), 3);
1040 |
1041 | // Initialize and score ancestors.
1042 | this._forEachNode(ancestors, function(ancestor, level) {
1043 | if (!ancestor.tagName || !ancestor.parentNode || typeof(ancestor.parentNode.tagName) === "undefined")
1044 | return;
1045 |
1046 | if (typeof(ancestor.readability) === "undefined") {
1047 | this._initializeNode(ancestor);
1048 | candidates.push(ancestor);
1049 | }
1050 |
1051 | // Node score divider:
1052 | // - parent: 1 (no division)
1053 | // - grandparent: 2
1054 | // - great grandparent+: ancestor level * 3
1055 | if (level === 0)
1056 | var scoreDivider = 1;
1057 | else if (level === 1)
1058 | scoreDivider = 2;
1059 | else
1060 | scoreDivider = level * 3;
1061 | ancestor.readability.contentScore += contentScore / scoreDivider;
1062 | });
1063 | });
1064 |
1065 | // After we've calculated scores, loop through all of the possible
1066 | // candidate nodes we found and find the one with the highest score.
// topCandidates is kept sorted by descending score and holds at most this._nbTopCandidates entries.
1067 | var topCandidates = [];
1068 | for (var c = 0, cl = candidates.length; c < cl; c += 1) {
1069 | var candidate = candidates[c];
1070 |
1071 | // Scale the final candidates score based on link density. Good content
1072 | // should have a relatively small link density (5% or less) and be mostly
1073 | // unaffected by this operation.
1074 | var candidateScore = candidate.readability.contentScore * (1 - this._getLinkDensity(candidate));
1075 | candidate.readability.contentScore = candidateScore;
1076 |
1077 | this.log("Candidate:", candidate, "with score " + candidateScore);
1078 |
1079 | for (var t = 0; t < this._nbTopCandidates; t++) {
1080 | var aTopCandidate = topCandidates[t];
1081 |
1082 | if (!aTopCandidate || candidateScore > aTopCandidate.readability.contentScore) {
1083 | topCandidates.splice(t, 0, candidate);
1084 | if (topCandidates.length > this._nbTopCandidates)
1085 | topCandidates.pop();
1086 | break;
1087 | }
1088 | }
1089 | }
1090 |
1091 | var topCandidate = topCandidates[0] || null;
1092 | var neededToCreateTopCandidate = false;
1093 | var parentOfTopCandidate;
1094 |
1095 | // If we still have no top candidate, just use the body as a last resort.
1096 | // We also have to copy the body node so it is something we can modify.
1097 | if (topCandidate === null || topCandidate.tagName === "BODY") {
1098 | // Move all of the page's children into topCandidate
1099 | topCandidate = doc.createElement("DIV");
1100 | neededToCreateTopCandidate = true;
1101 | // Move everything (not just elements, also text nodes etc.) into the container
1102 | // so we even include text directly in the body:
1103 | while (page.firstChild) {
1104 | this.log("Moving child out:", page.firstChild);
1105 | topCandidate.appendChild(page.firstChild);
1106 | }
1107 |
1108 | page.appendChild(topCandidate);
1109 |
1110 | this._initializeNode(topCandidate);
1111 | } else if (topCandidate) {
1112 | // Find a better top candidate node if it contains (at least three) nodes which belong to `topCandidates` array
1113 | // and whose scores are quite closed with current `topCandidate` node.
1114 | var alternativeCandidateAncestors = [];
1115 | for (var i = 1; i < topCandidates.length; i++) {
1116 | if (topCandidates[i].readability.contentScore / topCandidate.readability.contentScore >= 0.75) {
1117 | alternativeCandidateAncestors.push(this._getNodeAncestors(topCandidates[i]));
1118 | }
1119 | }
1120 | var MINIMUM_TOPCANDIDATES = 3;
1121 | if (alternativeCandidateAncestors.length >= MINIMUM_TOPCANDIDATES) {
1122 | parentOfTopCandidate = topCandidate.parentNode;
// NOTE(review): this walk (and the two similar ones below) only stops at BODY; it assumes
// the candidate sits somewhere below the BODY element.
1123 | while (parentOfTopCandidate.tagName !== "BODY") {
1124 | var listsContainingThisAncestor = 0;
1125 | for (var ancestorIndex = 0; ancestorIndex < alternativeCandidateAncestors.length && listsContainingThisAncestor < MINIMUM_TOPCANDIDATES; ancestorIndex++) {
1126 | listsContainingThisAncestor += Number(alternativeCandidateAncestors[ancestorIndex].includes(parentOfTopCandidate));
1127 | }
1128 | if (listsContainingThisAncestor >= MINIMUM_TOPCANDIDATES) {
1129 | topCandidate = parentOfTopCandidate;
1130 | break;
1131 | }
1132 | parentOfTopCandidate = parentOfTopCandidate.parentNode;
1133 | }
1134 | }
1135 | if (!topCandidate.readability) {
1136 | this._initializeNode(topCandidate);
1137 | }
1138 |
1139 | // Because of our bonus system, parents of candidates might have scores
1140 | // themselves. They get half of the node. There won't be nodes with higher
1141 | // scores than our topCandidate, but if we see the score going *up* in the first
1142 | // few steps up the tree, that's a decent sign that there might be more content
1143 | // lurking in other places that we want to unify in. The sibling stuff
1144 | // below does some of that - but only if we've looked high enough up the DOM
1145 | // tree.
1146 | parentOfTopCandidate = topCandidate.parentNode;
1147 | var lastScore = topCandidate.readability.contentScore;
1148 | // The scores shouldn't get too low.
1149 | var scoreThreshold = lastScore / 3;
1150 | while (parentOfTopCandidate.tagName !== "BODY") {
1151 | if (!parentOfTopCandidate.readability) {
1152 | parentOfTopCandidate = parentOfTopCandidate.parentNode;
1153 | continue;
1154 | }
1155 | var parentScore = parentOfTopCandidate.readability.contentScore;
1156 | if (parentScore < scoreThreshold)
1157 | break;
1158 | if (parentScore > lastScore) {
1159 | // Alright! We found a better parent to use.
1160 | topCandidate = parentOfTopCandidate;
1161 | break;
1162 | }
1163 | lastScore = parentOfTopCandidate.readability.contentScore;
1164 | parentOfTopCandidate = parentOfTopCandidate.parentNode;
1165 | }
1166 |
1167 | // If the top candidate is the only child, use parent instead. This will help sibling
1168 | // joining logic when adjacent content is actually located in parent's sibling node.
1169 | parentOfTopCandidate = topCandidate.parentNode;
1170 | while (parentOfTopCandidate.tagName != "BODY" && parentOfTopCandidate.children.length == 1) {
1171 | topCandidate = parentOfTopCandidate;
1172 | parentOfTopCandidate = topCandidate.parentNode;
1173 | }
1174 | if (!topCandidate.readability) {
1175 | this._initializeNode(topCandidate);
1176 | }
1177 | }
1178 |
1179 | // Now that we have the top candidate, look through its siblings for content
1180 | // that might also be related. Things like preambles, content split by ads
1181 | // that we removed, etc.
1182 | var articleContent = doc.createElement("DIV");
1183 | if (isPaging)
1184 | articleContent.id = "readability-content";
1185 |
1186 | var siblingScoreThreshold = Math.max(10, topCandidate.readability.contentScore * 0.2);
1187 | // Keep potential top candidate's parent node to try to get text direction of it later.
1188 | parentOfTopCandidate = topCandidate.parentNode;
1189 | var siblings = parentOfTopCandidate.children;
1190 |
1191 | for (var s = 0, sl = siblings.length; s < sl; s++) {
1192 | var sibling = siblings[s];
1193 | var append = false;
1194 |
1195 | this.log("Looking at sibling node:", sibling, sibling.readability ? ("with score " + sibling.readability.contentScore) : "");
// NOTE(review): the next log line repeats the score that was just logged above.
1196 | this.log("Sibling has score", sibling.readability ? sibling.readability.contentScore : "Unknown");
1197 |
1198 | if (sibling === topCandidate) {
1199 | append = true;
1200 | } else {
1201 | var contentBonus = 0;
1202 |
1203 | // Give a bonus if sibling nodes and top candidates have the example same classname
1204 | if (sibling.className === topCandidate.className && topCandidate.className !== "")
1205 | contentBonus += topCandidate.readability.contentScore * 0.2;
1206 |
1207 | if (sibling.readability &&
1208 | ((sibling.readability.contentScore + contentBonus) >= siblingScoreThreshold)) {
1209 | append = true;
1210 | } else if (sibling.nodeName === "P") {
1211 | var linkDensity = this._getLinkDensity(sibling);
1212 | var nodeContent = this._getInnerText(sibling);
1213 | var nodeLength = nodeContent.length;
1214 |
// Long paragraphs with few links are kept, as are short link-free ones that
// contain a period followed by a space or the end of the text.
1215 | if (nodeLength > 80 && linkDensity < 0.25) {
1216 | append = true;
1217 | } else if (nodeLength < 80 && nodeLength > 0 && linkDensity === 0 &&
1218 | nodeContent.search(/\.( |$)/) !== -1) {
1219 | append = true;
1220 | }
1221 | }
1222 | }
1223 |
1224 | if (append) {
1225 | this.log("Appending node:", sibling);
1226 |
1227 | if (this.ALTER_TO_DIV_EXCEPTIONS.indexOf(sibling.nodeName) === -1) {
1228 | // We have a node that isn't a common block level element, like a form or td tag.
1229 | // Turn it into a div so it doesn't get filtered out later by accident.
1230 | this.log("Altering sibling:", sibling, "to div.");
1231 |
1232 | sibling = this._setNodeTag(sibling, "DIV");
1233 | }
1234 |
1235 | articleContent.appendChild(sibling);
1236 | // Fetch children again to make it compatible
1237 | // with DOM parsers without live collection support.
1238 | siblings = parentOfTopCandidate.children;
1239 | // siblings is a reference to the children array, and
1240 | // sibling is removed from the array when we call appendChild().
1241 | // As a result, we must revisit this index since the nodes
1242 | // have been shifted.
1243 | s -= 1;
1244 | sl -= 1;
1245 | }
1246 | }
1247 |
1248 | if (this._debug)
1249 | this.log("Article content pre-prep: " + articleContent.innerHTML);
1250 | // So we have all of the content that we need. Now we clean it up for presentation.
1251 | this._prepArticle(articleContent);
1252 | if (this._debug)
1253 | this.log("Article content post-prep: " + articleContent.innerHTML);
1254 |
1255 | if (neededToCreateTopCandidate) {
1256 | // We already created a fake div thing, and there wouldn't have been any siblings left
1257 | // for the previous loop, so there's no point trying to create a new div, and then
1258 | // move all the children over. Just assign IDs and class names here. No need to append
1259 | // because that already happened anyway.
1260 | topCandidate.id = "readability-page-1";
1261 | topCandidate.className = "page";
1262 | } else {
1263 | var div = doc.createElement("DIV");
1264 | div.id = "readability-page-1";
1265 | div.className = "page";
1266 | while (articleContent.firstChild) {
1267 | div.appendChild(articleContent.firstChild);
1268 | }
1269 | articleContent.appendChild(div);
1270 | }
1271 |
1272 | if (this._debug)
1273 | this.log("Article content after paging: " + articleContent.innerHTML);
1274 |
1275 | var parseSuccessful = true;
1276 |
1277 | // Now that we've gone through the full algorithm, check to see if
1278 | // we got any meaningful content. If we didn't, we may need to re-run
1279 | // grabArticle with different flags set. This gives us a higher likelihood of
1280 | // finding the content, and the sieve approach gives us a higher likelihood of
1281 | // finding the -right- content.
1282 | var textLength = this._getInnerText(articleContent, true).length;
1283 | if (textLength < this._charThreshold) {
1284 | parseSuccessful = false;
1285 | page.innerHTML = pageCacheHtml;
1286 |
// Exactly one flag is dropped per failed pass, in this fixed order; every branch
// records the failed attempt so the longest one can be picked at the end.
1287 | if (this._flagIsActive(this.FLAG_STRIP_UNLIKELYS)) {
1288 | this._removeFlag(this.FLAG_STRIP_UNLIKELYS);
1289 | this._attempts.push({articleContent: articleContent, textLength: textLength});
1290 | } else if (this._flagIsActive(this.FLAG_WEIGHT_CLASSES)) {
1291 | this._removeFlag(this.FLAG_WEIGHT_CLASSES);
1292 | this._attempts.push({articleContent: articleContent, textLength: textLength});
1293 | } else if (this._flagIsActive(this.FLAG_CLEAN_CONDITIONALLY)) {
1294 | this._removeFlag(this.FLAG_CLEAN_CONDITIONALLY);
1295 | this._attempts.push({articleContent: articleContent, textLength: textLength});
1296 | } else {
1297 | this._attempts.push({articleContent: articleContent, textLength: textLength});
1298 | // No luck after removing flags, just return the longest text we found during the different loops
1299 | this._attempts.sort(function (a, b) {
1300 | return b.textLength - a.textLength;
1301 | });
1302 |
1303 | // But first check if we actually have something
1304 | if (!this._attempts[0].textLength) {
1305 | return null;
1306 | }
1307 |
1308 | articleContent = this._attempts[0].articleContent;
1309 | parseSuccessful = true;
1310 | }
1311 | }
1312 |
1313 | if (parseSuccessful) {
1314 | // Find out text direction from ancestors of final top candidate.
// The first node in this list that carries a `dir` attribute determines this._articleDir.
1315 | var ancestors = [parentOfTopCandidate, topCandidate].concat(this._getNodeAncestors(parentOfTopCandidate));
1316 | this._someNode(ancestors, function(ancestor) {
1317 | if (!ancestor.tagName)
1318 | return false;
1319 | var articleDir = ancestor.getAttribute("dir");
1320 | if (articleDir) {
1321 | this._articleDir = articleDir;
1322 | return true;
1323 | }
1324 | return false;
1325 | });
1326 | return articleContent;
1327 | }
1328 | }
1329 | },
1330 |
1331 | /**
1332 | * Check whether the input string could be a byline.
1333 | * This verifies that the input is a string, and that the length
1334 | * is less than 100 chars.
1335 | *
1336 | * @param possibleByline {string} - a string to check whether its a byline.
1337 | * @return Boolean - whether the input string is a byline.
1338 | */
1339 | _isValidByline: function(byline) {
1340 | if (typeof byline == "string" || byline instanceof String) {
1341 | byline = byline.trim();
1342 | return (byline.length > 0) && (byline.length < 100);
1343 | }
1344 | return false;
1345 | },
1346 |
1347 | /**
1348 | * Converts some of the common HTML entities in string to their corresponding characters.
1349 | *
1350 | * @param str {string} - a string to unescape.
1351 | * @return string without HTML entity.
1352 | */
1353 | _unescapeHtmlEntities: function(str) {
1354 | if (!str) {
1355 | return str;
1356 | }
1357 |
1358 | var htmlEscapeMap = this.HTML_ESCAPE_MAP;
1359 | return str.replace(/&(quot|amp|apos|lt|gt);/g, function(_, tag) {
1360 | return htmlEscapeMap[tag];
1361 | }).replace(/&#(?:x([0-9a-z]{1,4})|([0-9]{1,4}));/gi, function(_, hex, numStr) {
1362 | var num = parseInt(hex || numStr, hex ? 16 : 10);
1363 | return String.fromCharCode(num);
1364 | });
1365 | },
1366 |
1367 | /**
1368 | * Try to extract metadata from JSON-LD object.
1369 | * For now, only Schema.org objects of type Article or its subtypes are supported.
1370 | * @return Object with any metadata that could be extracted (possibly none)
1371 | */
1372 | _getJSONLD: function (doc) {
1373 | var scripts = this._getAllNodesWithTag(doc, ["script"]);
1374 |
1375 | var metadata;
1376 |
1377 | this._forEachNode(scripts, function(jsonLdElement) {
1378 | if (!metadata && jsonLdElement.getAttribute("type") === "application/ld+json") {
1379 | try {
1380 | // Strip CDATA markers if present
1381 | var content = jsonLdElement.textContent.replace(/^\s*<!\[CDATA\[|\]\]>\s*$/g, "");
1382 | var parsed = JSON.parse(content);
1383 | if (
1384 | !parsed["@context"] ||
1385 | !parsed["@context"].match(/^https?\:\/\/schema\.org$/)
1386 | ) {
1387 | return;
1388 | }
1389 |
1390 | if (!parsed["@type"] && Array.isArray(parsed["@graph"])) {
1391 | parsed = parsed["@graph"].find(function(it) {
1392 | return (it["@type"] || "").match(
1393 | this.REGEXPS.jsonLdArticleTypes
1394 | );
1395 | });
1396 | }
1397 |
1398 | if (
1399 | !parsed ||
1400 | !parsed["@type"] ||
1401 | !parsed["@type"].match(this.REGEXPS.jsonLdArticleTypes)
1402 | ) {
1403 | return;
1404 | }
1405 |
1406 | metadata = {};
1407 |
1408 | if (typeof parsed.name === "string" && typeof parsed.headline === "string" && parsed.name !== parsed.headline) {
1409 | // we have both name and headline element in the JSON-LD. They should both be the same but some websites like aktualne.cz
1410 | // put their own name into "name" and the article title to "headline" which confuses Readability. So we try to check if either
1411 | // "name" or "headline" closely matches the html title, and if so, use that one. If not, then we use "name" by default.
1412 |
1413 | var title = this._getArticleTitle();
1414 | var nameMatches = this._textSimilarity(parsed.name, title) > 0.75;
1415 | var headlineMatches = this._textSimilarity(parsed.headline, title) > 0.75;
1416 |
1417 | if (headlineMatches && !nameMatches) {
1418 | metadata.title = parsed.headline;
1419 | } else {
1420 | metadata.title = parsed.name;
1421 | }
1422 | } else if (typeof parsed.name === "string") {
1423 | metadata.title = parsed.name.trim();
1424 | } else if (typeof parsed.headline === "string") {
1425 | metadata.title = parsed.headline.trim();
1426 | }
1427 | if (parsed.author) {
1428 | if (typeof parsed.author.name === "string") {
1429 | metadata.byline = parsed.author.name.trim();
1430 | } else if (Array.isArray(parsed.author) && parsed.author[0] && typeof parsed.author[0].name === "string") {
1431 | metadata.byline = parsed.author
1432 | .filter(function(author) {
1433 | return author && typeof author.name === "string";
1434 | })
1435 | .map(function(author) {
1436 | return author.name.trim();
1437 | })
1438 | .join(", ");
1439 | }
1440 | }
1441 | if (typeof parsed.description === "string") {
1442 | metadata.excerpt = parsed.description.trim();
1443 | }
1444 | if (
1445 | parsed.publisher &&
1446 | typeof parsed.publisher.name === "string"
1447 | ) {
1448 | metadata.siteName = parsed.publisher.name.trim();
1449 | }
1450 | if (typeof parsed.datePublished === "string") {
1451 | metadata.datePublished = parsed.datePublished.trim();
1452 | }
1453 | return;
1454 | } catch (err) {
1455 | this.log(err.message);
1456 | }
1457 | }
1458 | });
1459 | return metadata ? metadata : {};
1460 | },
1461 |
1462 | /**
1463 | * Attempts to get excerpt and byline metadata for the article.
1464 | *
1465 | * @param {Object} jsonld — object containing any metadata that
1466 | * could be extracted from JSON-LD object.
1467 | *
1468 | * @return Object with optional "excerpt" and "byline" properties
1469 | */
1470 | _getArticleMetadata: function(jsonld) {
1471 | var metadata = {};
1472 | var values = {};
1473 | var metaElements = this._doc.getElementsByTagName("meta");
1474 |
1475 | // property is a space-separated list of values
1476 | var propertyPattern = /\s*(article|dc|dcterm|og|twitter)\s*:\s*(author|creator|description|published_time|title|site_name)\s*/gi;
1477 |
1478 | // name is a single value
1479 | var namePattern = /^\s*(?:(dc|dcterm|og|twitter|weibo:(article|webpage))\s*[\.:]\s*)?(author|creator|description|title|site_name)\s*$/i;
1480 |
1481 | // Find description tags.
1482 | this._forEachNode(metaElements, function(element) {
1483 | var elementName = element.getAttribute("name");
1484 | var elementProperty = element.getAttribute("property");
1485 | var content = element.getAttribute("content");
1486 | if (!content) {
1487 | return;
1488 | }
1489 | var matches = null;
1490 | var name = null;
1491 |
1492 | if (elementProperty) {
1493 | matches = elementProperty.match(propertyPattern);
1494 | if (matches) {
1495 | // Convert to lowercase, and remove any whitespace
1496 | // so we can match below.
1497 | name = matches[0].toLowerCase().replace(/\s/g, "");
1498 | // multiple authors
1499 | values[name] = content.trim();
1500 | }
1501 | }
1502 | if (!matches && elementName && namePattern.test(elementName)) {
1503 | name = elementName;
1504 | if (content) {
1505 | // Convert to lowercase, remove any whitespace, and convert dots
1506 | // to colons so we can match below.
1507 | name = name.toLowerCase().replace(/\s/g, "").replace(/\./g, ":");
1508 | values[name] = content.trim();
1509 | }
1510 | }
1511 | });
1512 |
1513 | // get title
1514 | metadata.title = jsonld.title ||
1515 | values["dc:title"] ||
1516 | values["dcterm:title"] ||
1517 | values["og:title"] ||
1518 | values["weibo:article:title"] ||
1519 | values["weibo:webpage:title"] ||
1520 | values["title"] ||
1521 | values["twitter:title"];
1522 |
1523 | if (!metadata.title) {
1524 | metadata.title = this._getArticleTitle();
1525 | }
1526 |
1527 | // get author
1528 | metadata.byline = jsonld.byline ||
1529 | values["dc:creator"] ||
1530 | values["dcterm:creator"] ||
1531 | values["author"];
1532 |
1533 | // get description
1534 | metadata.excerpt = jsonld.excerpt ||
1535 | values["dc:description"] ||
1536 | values["dcterm:description"] ||
1537 | values["og:description"] ||
1538 | values["weibo:article:description"] ||
1539 | values["weibo:webpage:description"] ||
1540 | values["description"] ||
1541 | values["twitter:description"];
1542 |
1543 | // get site name
1544 | metadata.siteName = jsonld.siteName ||
1545 | values["og:site_name"];
1546 |
1547 | // get article published time
1548 | metadata.publishedTime = jsonld.datePublished ||
1549 | values["article:published_time"] || null;
1550 |
1551 | // in many sites the meta value is escaped with HTML entities,
1552 | // so here we need to unescape it
1553 | metadata.title = this._unescapeHtmlEntities(metadata.title);
1554 | metadata.byline = this._unescapeHtmlEntities(metadata.byline);
1555 | metadata.excerpt = this._unescapeHtmlEntities(metadata.excerpt);
1556 | metadata.siteName = this._unescapeHtmlEntities(metadata.siteName);
1557 | metadata.publishedTime = this._unescapeHtmlEntities(metadata.publishedTime);
1558 |
1559 | return metadata;
1560 | },
1561 |
1562 | /**
1563 | * Check if node is image, or if node contains exactly only one image
1564 | * whether as a direct child or as its descendants.
1565 | *
1566 | * @param Element
1567 | **/
1568 | _isSingleImage: function(node) {
1569 | if (node.tagName === "IMG") {
1570 | return true;
1571 | }
1572 |
1573 | if (node.children.length !== 1 || node.textContent.trim() !== "") {
1574 | return false;
1575 | }
1576 |
1577 | return this._isSingleImage(node.children[0]);
1578 | },
1579 |
1580 | /**
1581 | * Find all <noscript> that are located after <img> nodes, and which contain only one
1582 | * <img> element. Replace the first image with the image from inside the <noscript> tag,
1583 | * and remove the <noscript> tag. This improves the quality of the images we use on
1584 | * some sites (e.g. Medium).
1585 | *
1586 | * @param Element
1587 | **/
1588 | _unwrapNoscriptImages: function(doc) {
1589 | // Find img without source or attributes that might contains image, and remove it.
1590 | // This is done to prevent a placeholder img is replaced by img from noscript in next step.
1591 | var imgs = Array.from(doc.getElementsByTagName("img"));
1592 | this._forEachNode(imgs, function(img) {
1593 | for (var i = 0; i < img.attributes.length; i++) {
1594 | var attr = img.attributes[i];
1595 | switch (attr.name) {
1596 | case "src":
1597 | case "srcset":
1598 | case "data-src":
1599 | case "data-srcset":
1600 | return;
1601 | }
1602 |
1603 | if (/\.(jpg|jpeg|png|webp)/i.test(attr.value)) {
1604 | return;
1605 | }
1606 | }
1607 |
1608 | img.parentNode.removeChild(img);
1609 | });
1610 |
1611 | // Next find noscript and try to extract its image
1612 | var noscripts = Array.from(doc.getElementsByTagName("noscript"));
1613 | this._forEachNode(noscripts, function(noscript) {
1614 | // Parse content of noscript and make sure it only contains image
1615 | var tmp = doc.createElement("div");
1616 | tmp.innerHTML = noscript.innerHTML;
1617 | if (!this._isSingleImage(tmp)) {
1618 | return;
1619 | }
1620 |
1621 | // If noscript has previous sibling and it only contains image,
1622 | // replace it with noscript content. However we also keep old
1623 | // attributes that might contains image.
1624 | var prevElement = noscript.previousElementSibling;
1625 | if (prevElement && this._isSingleImage(prevElement)) {
1626 | var prevImg = prevElement;
1627 | if (prevImg.tagName !== "IMG") {
1628 | prevImg = prevElement.getElementsByTagName("img")[0];
1629 | }
1630 |
1631 | var newImg = tmp.getElementsByTagName("img")[0];
1632 | for (var i = 0; i < prevImg.attributes.length; i++) {
1633 | var attr = prevImg.attributes[i];
1634 | if (attr.value === "") {
1635 | continue;
1636 | }
1637 |
1638 | if (attr.name === "src" || attr.name === "srcset" || /\.(jpg|jpeg|png|webp)/i.test(attr.value)) {
1639 | if (newImg.getAttribute(attr.name) === attr.value) {
1640 | continue;
1641 | }
1642 |
1643 | var attrName = attr.name;
1644 | if (newImg.hasAttribute(attrName)) {
1645 | attrName = "data-old-" + attrName;
1646 | }
1647 |
1648 | newImg.setAttribute(attrName, attr.value);
1649 | }
1650 | }
1651 |
1652 | noscript.parentNode.replaceChild(tmp.firstElementChild, prevElement);
1653 | }
1654 | });
1655 | },
1656 |
1657 | /**
1658 | * Removes script tags from the document.
1659 | *
1660 | * @param Element
1661 | **/
1662 | _removeScripts: function(doc) {
1663 | this._removeNodes(this._getAllNodesWithTag(doc, ["script", "noscript"]));
1664 | },
1665 |
1666 | /**
1667 | * Check if this node has only whitespace and a single element with given tag
1668 | * Returns false if the DIV node contains non-empty text nodes
1669 | * or if it contains no element with given tag or more than 1 element.
1670 | *
1671 | * @param Element
1672 | * @param string tag of child element
1673 | **/
1674 | _hasSingleTagInsideElement: function(element, tag) {
1675 | // There should be exactly 1 element child with given tag
1676 | if (element.children.length != 1 || element.children[0].tagName !== tag) {
1677 | return false;
1678 | }
1679 |
1680 | // And there should be no text nodes with real content
1681 | return !this._someNode(element.childNodes, function(node) {
1682 | return node.nodeType === this.TEXT_NODE &&
1683 | this.REGEXPS.hasContent.test(node.textContent);
1684 | });
1685 | },
1686 |
1687 | _isElementWithoutContent: function(node) {
1688 | return node.nodeType === this.ELEMENT_NODE &&
1689 | node.textContent.trim().length == 0 &&
1690 | (node.children.length == 0 ||
1691 | node.children.length == node.getElementsByTagName("br").length + node.getElementsByTagName("hr").length);
1692 | },
1693 |
1694 | /**
1695 | * Determine whether element has any children block level elements.
1696 | *
1697 | * @param Element
1698 | */
1699 | _hasChildBlockElement: function (element) {
1700 | return this._someNode(element.childNodes, function(node) {
1701 | return this.DIV_TO_P_ELEMS.has(node.tagName) ||
1702 | this._hasChildBlockElement(node);
1703 | });
1704 | },
1705 |
1706 | /***
1707 | * Determine if a node qualifies as phrasing content.
1708 | * https://developer.mozilla.org/en-US/docs/Web/Guide/HTML/Content_categories#Phrasing_content
1709 | **/
1710 | _isPhrasingContent: function(node) {
1711 | return node.nodeType === this.TEXT_NODE || this.PHRASING_ELEMS.indexOf(node.tagName) !== -1 ||
1712 | ((node.tagName === "A" || node.tagName === "DEL" || node.tagName === "INS") &&
1713 | this._everyNode(node.childNodes, this._isPhrasingContent));
1714 | },
1715 |
1716 | _isWhitespace: function(node) {
1717 | return (node.nodeType === this.TEXT_NODE && node.textContent.trim().length === 0) ||
1718 | (node.nodeType === this.ELEMENT_NODE && node.tagName === "BR");
1719 | },
1720 |
1721 | /**
1722 | * Get the inner text of a node - cross browser compatibly.
1723 | * This also strips out any excess whitespace to be found.
1724 | *
1725 | * @param Element
1726 | * @param Boolean normalizeSpaces (default: true)
1727 | * @return string
1728 | **/
1729 | _getInnerText: function(e, normalizeSpaces) {
1730 | normalizeSpaces = (typeof normalizeSpaces === "undefined") ? true : normalizeSpaces;
1731 | var textContent = e.textContent.trim();
1732 |
1733 | if (normalizeSpaces) {
1734 | return textContent.replace(this.REGEXPS.normalize, " ");
1735 | }
1736 | return textContent;
1737 | },
1738 |
1739 | /**
1740 | * Get the number of times a string s appears in the node e.
1741 | *
1742 | * @param Element
1743 | * @param string - what to split on. Default is ","
1744 | * @return number (integer)
1745 | **/
1746 | _getCharCount: function(e, s) {
1747 | s = s || ",";
1748 | return this._getInnerText(e).split(s).length - 1;
1749 | },
1750 |
1751 | /**
1752 | * Remove the style attribute on every e and under.
1753 | * TODO: Test if getElementsByTagName(*) is faster.
1754 | *
1755 | * @param Element
1756 | * @return void
1757 | **/
1758 | _cleanStyles: function(e) {
1759 | if (!e || e.tagName.toLowerCase() === "svg")
1760 | return;
1761 |
1762 | // Remove `style` and deprecated presentational attributes
1763 | for (var i = 0; i < this.PRESENTATIONAL_ATTRIBUTES.length; i++) {
1764 | e.removeAttribute(this.PRESENTATIONAL_ATTRIBUTES[i]);
1765 | }
1766 |
1767 | if (this.DEPRECATED_SIZE_ATTRIBUTE_ELEMS.indexOf(e.tagName) !== -1) {
1768 | e.removeAttribute("width");
1769 | e.removeAttribute("height");
1770 | }
1771 |
1772 | var cur = e.firstElementChild;
1773 | while (cur !== null) {
1774 | this._cleanStyles(cur);
1775 | cur = cur.nextElementSibling;
1776 | }
1777 | },
1778 |
1779 | /**
1780 | * Get the density of links as a percentage of the content
1781 | * This is the amount of text that is inside a link divided by the total text in the node.
1782 | *
1783 | * @param Element
1784 | * @return number (float)
1785 | **/
1786 | _getLinkDensity: function(element) {
1787 | var textLength = this._getInnerText(element).length;
1788 | if (textLength === 0)
1789 | return 0;
1790 |
1791 | var linkLength = 0;
1792 |
1793 | // XXX implement _reduceNodeList?
1794 | this._forEachNode(element.getElementsByTagName("a"), function(linkNode) {
1795 | var href = linkNode.getAttribute("href");
1796 | var coefficient = href && this.REGEXPS.hashUrl.test(href) ? 0.3 : 1;
1797 | linkLength += this._getInnerText(linkNode).length * coefficient;
1798 | });
1799 |
1800 | return linkLength / textLength;
1801 | },
1802 |
1803 | /**
1804 | * Get an elements class/id weight. Uses regular expressions to tell if this
1805 | * element looks good or bad.
1806 | *
1807 | * @param Element
1808 | * @return number (Integer)
1809 | **/
1810 | _getClassWeight: function(e) {
1811 | if (!this._flagIsActive(this.FLAG_WEIGHT_CLASSES))
1812 | return 0;
1813 |
1814 | var weight = 0;
1815 |
1816 | // Look for a special classname
1817 | if (typeof(e.className) === "string" && e.className !== "") {
1818 | if (this.REGEXPS.negative.test(e.className))
1819 | weight -= 25;
1820 |
1821 | if (this.REGEXPS.positive.test(e.className))
1822 | weight += 25;
1823 | }
1824 |
1825 | // Look for a special ID
1826 | if (typeof(e.id) === "string" && e.id !== "") {
1827 | if (this.REGEXPS.negative.test(e.id))
1828 | weight -= 25;
1829 |
1830 | if (this.REGEXPS.positive.test(e.id))
1831 | weight += 25;
1832 | }
1833 |
1834 | return weight;
1835 | },
1836 |
1837 | /**
1838 | * Clean a node of all elements of type "tag".
1839 | * (Unless it's a youtube/vimeo video. People love movies.)
1840 | *
1841 | * @param Element
1842 | * @param string tag to clean
1843 | * @return void
1844 | **/
1845 | _clean: function(e, tag) {
1846 | var isEmbed = ["object", "embed", "iframe"].indexOf(tag) !== -1;
1847 |
1848 | this._removeNodes(this._getAllNodesWithTag(e, [tag]), function(element) {
1849 | // Allow youtube and vimeo videos through as people usually want to see those.
1850 | if (isEmbed) {
1851 | // First, check the elements attributes to see if any of them contain youtube or vimeo
1852 | for (var i = 0; i < element.attributes.length; i++) {
1853 | if (this._allowedVideoRegex.test(element.attributes[i].value)) {
1854 | return false;
1855 | }
1856 | }
1857 |
1858 | // For embed with <object> tag, check inner HTML as well.
1859 | if (element.tagName === "object" && this._allowedVideoRegex.test(element.innerHTML)) {
1860 | return false;
1861 | }
1862 | }
1863 |
1864 | return true;
1865 | });
1866 | },
1867 |
1868 | /**
1869 | * Check if a given node has one of its ancestor tag name matching the
1870 | * provided one.
1871 | * @param HTMLElement node
1872 | * @param String tagName
1873 | * @param Number maxDepth
1874 | * @param Function filterFn a filter to invoke to determine whether this node 'counts'
1875 | * @return Boolean
1876 | */
1877 | _hasAncestorTag: function(node, tagName, maxDepth, filterFn) {
1878 | maxDepth = maxDepth || 3;
1879 | tagName = tagName.toUpperCase();
1880 | var depth = 0;
1881 | while (node.parentNode) {
1882 | if (maxDepth > 0 && depth > maxDepth)
1883 | return false;
1884 | if (node.parentNode.tagName === tagName && (!filterFn || filterFn(node.parentNode)))
1885 | return true;
1886 | node = node.parentNode;
1887 | depth++;
1888 | }
1889 | return false;
1890 | },
1891 |
1892 | /**
1893 | * Return an object indicating how many rows and columns this table has.
1894 | */
1895 | _getRowAndColumnCount: function(table) {
1896 | var rows = 0;
1897 | var columns = 0;
1898 | var trs = table.getElementsByTagName("tr");
1899 | for (var i = 0; i < trs.length; i++) {
1900 | var rowspan = trs[i].getAttribute("rowspan") || 0;
1901 | if (rowspan) {
1902 | rowspan = parseInt(rowspan, 10);
1903 | }
1904 | rows += (rowspan || 1);
1905 |
1906 | // Now look for column-related info
1907 | var columnsInThisRow = 0;
1908 | var cells = trs[i].getElementsByTagName("td");
1909 | for (var j = 0; j < cells.length; j++) {
1910 | var colspan = cells[j].getAttribute("colspan") || 0;
1911 | if (colspan) {
1912 | colspan = parseInt(colspan, 10);
1913 | }
1914 | columnsInThisRow += (colspan || 1);
1915 | }
1916 | columns = Math.max(columns, columnsInThisRow);
1917 | }
1918 | return {rows: rows, columns: columns};
1919 | },
1920 |
1921 | /**
1922 | * Look for 'data' (as opposed to 'layout') tables, for which we use
1923 | * similar checks as
1924 | * https://searchfox.org/mozilla-central/rev/f82d5c549f046cb64ce5602bfd894b7ae807c8f8/accessible/generic/TableAccessible.cpp#19
1925 | */
1926 | _markDataTables: function(root) {
1927 | var tables = root.getElementsByTagName("table");
1928 | for (var i = 0; i < tables.length; i++) {
1929 | var table = tables[i];
1930 | var role = table.getAttribute("role");
1931 | if (role == "presentation") {
1932 | table._readabilityDataTable = false;
1933 | continue;
1934 | }
1935 | var datatable = table.getAttribute("datatable");
1936 | if (datatable == "0") {
1937 | table._readabilityDataTable = false;
1938 | continue;
1939 | }
1940 | var summary = table.getAttribute("summary");
1941 | if (summary) {
1942 | table._readabilityDataTable = true;
1943 | continue;
1944 | }
1945 |
1946 | var caption = table.getElementsByTagName("caption")[0];
1947 | if (caption && caption.childNodes.length > 0) {
1948 | table._readabilityDataTable = true;
1949 | continue;
1950 | }
1951 |
1952 | // If the table has a descendant with any of these tags, consider a data table:
1953 | var dataTableDescendants = ["col", "colgroup", "tfoot", "thead", "th"];
1954 | var descendantExists = function(tag) {
1955 | return !!table.getElementsByTagName(tag)[0];
1956 | };
1957 | if (dataTableDescendants.some(descendantExists)) {
1958 | this.log("Data table because found data-y descendant");
1959 | table._readabilityDataTable = true;
1960 | continue;
1961 | }
1962 |
1963 | // Nested tables indicate a layout table:
1964 | if (table.getElementsByTagName("table")[0]) {
1965 | table._readabilityDataTable = false;
1966 | continue;
1967 | }
1968 |
1969 | var sizeInfo = this._getRowAndColumnCount(table);
1970 | if (sizeInfo.rows >= 10 || sizeInfo.columns > 4) {
1971 | table._readabilityDataTable = true;
1972 | continue;
1973 | }
1974 | // Now just go by size entirely:
1975 | table._readabilityDataTable = sizeInfo.rows * sizeInfo.columns > 10;
1976 | }
1977 | },
1978 |
1979 | /* convert images and figures that have properties like data-src into images that can be loaded without JS */
1980 | _fixLazyImages: function (root) {
1981 | this._forEachNode(this._getAllNodesWithTag(root, ["img", "picture", "figure"]), function (elem) {
1982 | // In some sites (e.g. Kotaku), they put 1px square image as base64 data uri in the src attribute.
1983 | // So, here we check if the data uri is too short, just might as well remove it.
1984 | if (elem.src && this.REGEXPS.b64DataUrl.test(elem.src)) {
1985 | // Make sure it's not SVG, because SVG can have a meaningful image in under 133 bytes.
1986 | var parts = this.REGEXPS.b64DataUrl.exec(elem.src);
1987 | if (parts[1] === "image/svg+xml") {
1988 | return;
1989 | }
1990 |
1991 | // Make sure this element has other attributes which contains image.
1992 | // If it doesn't, then this src is important and shouldn't be removed.
1993 | var srcCouldBeRemoved = false;
1994 | for (var i = 0; i < elem.attributes.length; i++) {
1995 | var attr = elem.attributes[i];
1996 | if (attr.name === "src") {
1997 | continue;
1998 | }
1999 |
2000 | if (/\.(jpg|jpeg|png|webp)/i.test(attr.value)) {
2001 | srcCouldBeRemoved = true;
2002 | break;
2003 | }
2004 | }
2005 |
2006 | // Here we assume if image is less than 100 bytes (or 133B after encoded to base64)
2007 | // it will be too small, therefore it might be placeholder image.
2008 | if (srcCouldBeRemoved) {
2009 | var b64starts = elem.src.search(/base64\s*/i) + 7;
2010 | var b64length = elem.src.length - b64starts;
2011 | if (b64length < 133) {
2012 | elem.removeAttribute("src");
2013 | }
2014 | }
2015 | }
2016 |
2017 | // also check for "null" to work around https://github.com/jsdom/jsdom/issues/2580
2018 | if ((elem.src || (elem.srcset && elem.srcset != "null")) && elem.className.toLowerCase().indexOf("lazy") === -1) {
2019 | return;
2020 | }
2021 |
2022 | for (var j = 0; j < elem.attributes.length; j++) {
2023 | attr = elem.attributes[j];
2024 | if (attr.name === "src" || attr.name === "srcset" || attr.name === "alt") {
2025 | continue;
2026 | }
2027 | var copyTo = null;
2028 | if (/\.(jpg|jpeg|png|webp)\s+\d/.test(attr.value)) {
2029 | copyTo = "srcset";
2030 | } else if (/^\s*\S+\.(jpg|jpeg|png|webp)\S*\s*$/.test(attr.value)) {
2031 | copyTo = "src";
2032 | }
2033 | if (copyTo) {
2034 | //if this is an img or picture, set the attribute directly
2035 | if (elem.tagName === "IMG" || elem.tagName === "PICTURE") {
2036 | elem.setAttribute(copyTo, attr.value);
2037 | } else if (elem.tagName === "FIGURE" && !this._getAllNodesWithTag(elem, ["img", "picture"]).length) {
2038 | //if the item is a <figure> that does not contain an image or picture, create one and place it inside the figure
2039 | //see the nytimes-3 testcase for an example
2040 | var img = this._doc.createElement("img");
2041 | img.setAttribute(copyTo, attr.value);
2042 | elem.appendChild(img);
2043 | }
2044 | }
2045 | }
2046 | });
2047 | },
2048 |
2049 | _getTextDensity: function(e, tags) {
2050 | var textLength = this._getInnerText(e, true).length;
2051 | if (textLength === 0) {
2052 | return 0;
2053 | }
2054 | var childrenLength = 0;
2055 | var children = this._getAllNodesWithTag(e, tags);
2056 | this._forEachNode(children, (child) => childrenLength += this._getInnerText(child, true).length);
2057 | return childrenLength / textLength;
2058 | },
2059 |
2060 | /**
2061 | * Clean an element of all tags of type "tag" if they look fishy.
2062 | * "Fishy" is an algorithm based on content length, classnames, link density, number of images & embeds, etc.
2063 | *
2064 | * @return void
2065 | **/
2066 | _cleanConditionally: function(e, tag) {
2067 | if (!this._flagIsActive(this.FLAG_CLEAN_CONDITIONALLY))
2068 | return;
2069 |
2070 | // Gather counts for other typical elements embedded within.
2071 | // Traverse backwards so we can remove nodes at the same time
2072 | // without effecting the traversal.
2073 | //
2074 | // TODO: Consider taking into account original contentScore here.
2075 | this._removeNodes(this._getAllNodesWithTag(e, [tag]), function(node) {
2076 | // First check if this node IS data table, in which case don't remove it.
2077 | var isDataTable = function(t) {
2078 | return t._readabilityDataTable;
2079 | };
2080 |
2081 | var isList = tag === "ul" || tag === "ol";
2082 | if (!isList) {
2083 | var listLength = 0;
2084 | var listNodes = this._getAllNodesWithTag(node, ["ul", "ol"]);
2085 | this._forEachNode(listNodes, (list) => listLength += this._getInnerText(list).length);
2086 | isList = listLength / this._getInnerText(node).length > 0.9;
2087 | }
2088 |
2089 | if (tag === "table" && isDataTable(node)) {
2090 | return false;
2091 | }
2092 |
2093 | // Next check if we're inside a data table, in which case don't remove it as well.
2094 | if (this._hasAncestorTag(node, "table", -1, isDataTable)) {
2095 | return false;
2096 | }
2097 |
2098 | if (this._hasAncestorTag(node, "code")) {
2099 | return false;
2100 | }
2101 |
2102 | var weight = this._getClassWeight(node);
2103 |
2104 | this.log("Cleaning Conditionally", node);
2105 |
2106 | var contentScore = 0;
2107 |
2108 | if (weight + contentScore < 0) {
2109 | return true;
2110 | }
2111 |
2112 | if (this._getCharCount(node, ",") < 10) {
2113 | // If there are not very many commas, and the number of
2114 | // non-paragraph elements is more than paragraphs or other
2115 | // ominous signs, remove the element.
2116 | var p = node.getElementsByTagName("p").length;
2117 | var img = node.getElementsByTagName("img").length;
2118 | var li = node.getElementsByTagName("li").length - 100;
2119 | var input = node.getElementsByTagName("input").length;
2120 | var headingDensity = this._getTextDensity(node, ["h1", "h2", "h3", "h4", "h5", "h6"]);
2121 |
2122 | var embedCount = 0;
2123 | var embeds = this._getAllNodesWithTag(node, ["object", "embed", "iframe"]);
2124 |
2125 | for (var i = 0; i < embeds.length; i++) {
2126 | // If this embed has attribute that matches video regex, don't delete it.
2127 | for (var j = 0; j < embeds[i].attributes.length; j++) {
2128 | if (this._allowedVideoRegex.test(embeds[i].attributes[j].value)) {
2129 | return false;
2130 | }
2131 | }
2132 |
2133 | // For embed with <object> tag, check inner HTML as well.
2134 | if (embeds[i].tagName === "object" && this._allowedVideoRegex.test(embeds[i].innerHTML)) {
2135 | return false;
2136 | }
2137 |
2138 | embedCount++;
2139 | }
2140 |
2141 | var linkDensity = this._getLinkDensity(node);
2142 | var contentLength = this._getInnerText(node).length;
2143 |
2144 | var haveToRemove =
2145 | (img > 1 && p / img < 0.5 && !this._hasAncestorTag(node, "figure")) ||
2146 | (!isList && li > p) ||
2147 | (input > Math.floor(p/3)) ||
2148 | (!isList && headingDensity < 0.9 && contentLength < 25 && (img === 0 || img > 2) && !this._hasAncestorTag(node, "figure")) ||
2149 | (!isList && weight < 25 && linkDensity > 0.2) ||
2150 | (weight >= 25 && linkDensity > 0.5) ||
2151 | ((embedCount === 1 && contentLength < 75) || embedCount > 1);
2152 | // Allow simple lists of images to remain in pages
2153 | if (isList && haveToRemove) {
2154 | for (var x = 0; x < node.children.length; x++) {
2155 | let child = node.children[x];
2156 | // Don't filter in lists with li's that contain more than one child
2157 | if (child.children.length > 1) {
2158 | return haveToRemove;
2159 | }
2160 | }
2161 | let li_count = node.getElementsByTagName("li").length;
2162 | // Only allow the list to remain if every li contains an image
2163 | if (img == li_count) {
2164 | return false;
2165 | }
2166 | }
2167 | return haveToRemove;
2168 | }
2169 | return false;
2170 | });
2171 | },
2172 |
2173 | /**
2174 | * Clean out elements that match the specified conditions
2175 | *
2176 | * @param Element
2177 | * @param Function determines whether a node should be removed
2178 | * @return void
2179 | **/
2180 | _cleanMatchedNodes: function(e, filter) {
2181 | var endOfSearchMarkerNode = this._getNextNode(e, true);
2182 | var next = this._getNextNode(e);
2183 | while (next && next != endOfSearchMarkerNode) {
2184 | if (filter.call(this, next, next.className + " " + next.id)) {
2185 | next = this._removeAndGetNext(next);
2186 | } else {
2187 | next = this._getNextNode(next);
2188 | }
2189 | }
2190 | },
2191 |
2192 | /**
2193 | * Clean out spurious headers from an Element.
2194 | *
2195 | * @param Element
2196 | * @return void
2197 | **/
2198 | _cleanHeaders: function(e) {
2199 | let headingNodes = this._getAllNodesWithTag(e, ["h1", "h2"]);
2200 | this._removeNodes(headingNodes, function(node) {
2201 | let shouldRemove = this._getClassWeight(node) < 0;
2202 | if (shouldRemove) {
2203 | this.log("Removing header with low class weight:", node);
2204 | }
2205 | return shouldRemove;
2206 | });
2207 | },
2208 |
2209 | /**
2210 | * Check if this node is an H1 or H2 element whose content is mostly
2211 | * the same as the article title.
2212 | *
2213 | * @param Element the node to check.
2214 | * @return boolean indicating whether this is a title-like header.
2215 | */
2216 | _headerDuplicatesTitle: function(node) {
2217 | if (node.tagName != "H1" && node.tagName != "H2") {
2218 | return false;
2219 | }
2220 | var heading = this._getInnerText(node, false);
2221 | this.log("Evaluating similarity of header:", heading, this._articleTitle);
2222 | return this._textSimilarity(this._articleTitle, heading) > 0.75;
2223 | },
2224 |
2225 | _flagIsActive: function(flag) {
2226 | return (this._flags & flag) > 0;
2227 | },
2228 |
2229 | _removeFlag: function(flag) {
2230 | this._flags = this._flags & ~flag;
2231 | },
2232 |
2233 | _isProbablyVisible: function(node) {
2234 | // Have to null-check node.style and node.className.indexOf to deal with SVG and MathML nodes.
2235 | return (!node.style || node.style.display != "none")
2236 | && (!node.style || node.style.visibility != "hidden")
2237 | && !node.hasAttribute("hidden")
2238 | //check for "fallback-image" so that wikimedia math images are displayed
2239 | && (!node.hasAttribute("aria-hidden") || node.getAttribute("aria-hidden") != "true" || (node.className && node.className.indexOf && node.className.indexOf("fallback-image") !== -1));
2240 | },
2241 |
2242 | /**
2243 | * Runs readability.
2244 | *
2245 | * Workflow:
2246 | * 1. Prep the document by removing script tags, css, etc.
2247 | * 2. Build readability's DOM tree.
2248 | * 3. Grab the article content from the current dom tree.
2249 | * 4. Replace the current DOM tree with the new one.
2250 | * 5. Read peacefully.
2251 | *
2252 | * @return void
2253 | **/
2254 | parse: function () {
2255 | // Avoid parsing too large documents, as per configuration option
2256 | if (this._maxElemsToParse > 0) {
2257 | var numTags = this._doc.getElementsByTagName("*").length;
2258 | if (numTags > this._maxElemsToParse) {
2259 | throw new Error("Aborting parsing document; " + numTags + " elements found");
2260 | }
2261 | }
2262 |
2263 | // Unwrap image from noscript
2264 | this._unwrapNoscriptImages(this._doc);
2265 |
2266 | // Extract JSON-LD metadata before removing scripts
2267 | var jsonLd = this._disableJSONLD ? {} : this._getJSONLD(this._doc);
2268 |
2269 | // Remove script tags from the document.
2270 | this._removeScripts(this._doc);
2271 |
2272 | this._prepDocument();
2273 |
2274 | var metadata = this._getArticleMetadata(jsonLd);
2275 | this._articleTitle = metadata.title;
2276 |
2277 | var articleContent = this._grabArticle();
2278 | if (!articleContent)
2279 | return null;
2280 |
2281 | this.log("Grabbed: " + articleContent.innerHTML);
2282 |
2283 | this._postProcessContent(articleContent);
2284 |
2285 | // If we haven't found an excerpt in the article's metadata, use the article's
2286 | // first paragraph as the excerpt. This is used for displaying a preview of
2287 | // the article's content.
2288 | if (!metadata.excerpt) {
2289 | var paragraphs = articleContent.getElementsByTagName("p");
2290 | if (paragraphs.length > 0) {
2291 | metadata.excerpt = paragraphs[0].textContent.trim();
2292 | }
2293 | }
2294 |
2295 | var textContent = articleContent.textContent;
2296 | return {
2297 | title: this._articleTitle,
2298 | byline: metadata.byline || this._articleByline,
2299 | dir: this._articleDir,
2300 | lang: this._articleLang,
2301 | content: this._serializer(articleContent),
2302 | textContent: textContent,
2303 | length: textContent.length,
2304 | excerpt: metadata.excerpt,
2305 | siteName: metadata.siteName || this._articleSiteName,
2306 | publishedTime: metadata.publishedTime
2307 | };
2308 | }
2309 | };
2310 |
// Export the constructor when a CommonJS-style `module` object is available;
// otherwise this is a no-op.
/* global module */
var hasModuleObject = typeof module === "object";
if (hasModuleObject) {
  module.exports = Readability;
}
2315 |
```