dicklesworthstone/llm_gateway_mcp

This is page 25 of 45. Use http://codebase.md/dicklesworthstone/llm_gateway_mcp_server?lines=true&page={x} to view the full context.

# Directory Structure

```
├── .cursorignore
├── .env.example
├── .envrc
├── .gitignore
├── additional_features.md
├── check_api_keys.py
├── completion_support.py
├── comprehensive_test.py
├── docker-compose.yml
├── Dockerfile
├── empirically_measured_model_speeds.json
├── error_handling.py
├── example_structured_tool.py
├── examples
│   ├── __init__.py
│   ├── advanced_agent_flows_using_unified_memory_system_demo.py
│   ├── advanced_extraction_demo.py
│   ├── advanced_unified_memory_system_demo.py
│   ├── advanced_vector_search_demo.py
│   ├── analytics_reporting_demo.py
│   ├── audio_transcription_demo.py
│   ├── basic_completion_demo.py
│   ├── cache_demo.py
│   ├── claude_integration_demo.py
│   ├── compare_synthesize_demo.py
│   ├── cost_optimization.py
│   ├── data
│   │   ├── sample_event.txt
│   │   ├── Steve_Jobs_Introducing_The_iPhone_compressed.md
│   │   └── Steve_Jobs_Introducing_The_iPhone_compressed.mp3
│   ├── docstring_refiner_demo.py
│   ├── document_conversion_and_processing_demo.py
│   ├── entity_relation_graph_demo.py
│   ├── filesystem_operations_demo.py
│   ├── grok_integration_demo.py
│   ├── local_text_tools_demo.py
│   ├── marqo_fused_search_demo.py
│   ├── measure_model_speeds.py
│   ├── meta_api_demo.py
│   ├── multi_provider_demo.py
│   ├── ollama_integration_demo.py
│   ├── prompt_templates_demo.py
│   ├── python_sandbox_demo.py
│   ├── rag_example.py
│   ├── research_workflow_demo.py
│   ├── sample
│   │   ├── article.txt
│   │   ├── backprop_paper.pdf
│   │   ├── buffett.pdf
│   │   ├── contract_link.txt
│   │   ├── legal_contract.txt
│   │   ├── medical_case.txt
│   │   ├── northwind.db
│   │   ├── research_paper.txt
│   │   ├── sample_data.json
│   │   └── text_classification_samples
│   │       ├── email_classification.txt
│   │       ├── news_samples.txt
│   │       ├── product_reviews.txt
│   │       └── support_tickets.txt
│   ├── sample_docs
│   │   └── downloaded
│   │       └── attention_is_all_you_need.pdf
│   ├── sentiment_analysis_demo.py
│   ├── simple_completion_demo.py
│   ├── single_shot_synthesis_demo.py
│   ├── smart_browser_demo.py
│   ├── sql_database_demo.py
│   ├── sse_client_demo.py
│   ├── test_code_extraction.py
│   ├── test_content_detection.py
│   ├── test_ollama.py
│   ├── text_classification_demo.py
│   ├── text_redline_demo.py
│   ├── tool_composition_examples.py
│   ├── tournament_code_demo.py
│   ├── tournament_text_demo.py
│   ├── unified_memory_system_demo.py
│   ├── vector_search_demo.py
│   ├── web_automation_instruction_packs.py
│   └── workflow_delegation_demo.py
├── LICENSE
├── list_models.py
├── marqo_index_config.json.example
├── mcp_protocol_schema_2025-03-25_version.json
├── mcp_python_lib_docs.md
├── mcp_tool_context_estimator.py
├── model_preferences.py
├── pyproject.toml
├── quick_test.py
├── README.md
├── resource_annotations.py
├── run_all_demo_scripts_and_check_for_errors.py
├── storage
│   └── smart_browser_internal
│       ├── locator_cache.db
│       ├── readability.js
│       └── storage_state.enc
├── test_client.py
├── test_connection.py
├── TEST_README.md
├── test_sse_client.py
├── test_stdio_client.py
├── tests
│   ├── __init__.py
│   ├── conftest.py
│   ├── integration
│   │   ├── __init__.py
│   │   └── test_server.py
│   ├── manual
│   │   ├── test_extraction_advanced.py
│   │   └── test_extraction.py
│   └── unit
│       ├── __init__.py
│       ├── test_cache.py
│       ├── test_providers.py
│       └── test_tools.py
├── TODO.md
├── tool_annotations.py
├── tools_list.json
├── ultimate_mcp_banner.webp
├── ultimate_mcp_logo.webp
├── ultimate_mcp_server
│   ├── __init__.py
│   ├── __main__.py
│   ├── cli
│   │   ├── __init__.py
│   │   ├── __main__.py
│   │   ├── commands.py
│   │   ├── helpers.py
│   │   └── typer_cli.py
│   ├── clients
│   │   ├── __init__.py
│   │   ├── completion_client.py
│   │   └── rag_client.py
│   ├── config
│   │   └── examples
│   │       └── filesystem_config.yaml
│   ├── config.py
│   ├── constants.py
│   ├── core
│   │   ├── __init__.py
│   │   ├── evaluation
│   │   │   ├── base.py
│   │   │   └── evaluators.py
│   │   ├── providers
│   │   │   ├── __init__.py
│   │   │   ├── anthropic.py
│   │   │   ├── base.py
│   │   │   ├── deepseek.py
│   │   │   ├── gemini.py
│   │   │   ├── grok.py
│   │   │   ├── ollama.py
│   │   │   ├── openai.py
│   │   │   └── openrouter.py
│   │   ├── server.py
│   │   ├── state_store.py
│   │   ├── tournaments
│   │   │   ├── manager.py
│   │   │   ├── tasks.py
│   │   │   └── utils.py
│   │   └── ums_api
│   │       ├── __init__.py
│   │       ├── ums_database.py
│   │       ├── ums_endpoints.py
│   │       ├── ums_models.py
│   │       └── ums_services.py
│   ├── exceptions.py
│   ├── graceful_shutdown.py
│   ├── services
│   │   ├── __init__.py
│   │   ├── analytics
│   │   │   ├── __init__.py
│   │   │   ├── metrics.py
│   │   │   └── reporting.py
│   │   ├── cache
│   │   │   ├── __init__.py
│   │   │   ├── cache_service.py
│   │   │   ├── persistence.py
│   │   │   ├── strategies.py
│   │   │   └── utils.py
│   │   ├── cache.py
│   │   ├── document.py
│   │   ├── knowledge_base
│   │   │   ├── __init__.py
│   │   │   ├── feedback.py
│   │   │   ├── manager.py
│   │   │   ├── rag_engine.py
│   │   │   ├── retriever.py
│   │   │   └── utils.py
│   │   ├── prompts
│   │   │   ├── __init__.py
│   │   │   ├── repository.py
│   │   │   └── templates.py
│   │   ├── prompts.py
│   │   └── vector
│   │       ├── __init__.py
│   │       ├── embeddings.py
│   │       └── vector_service.py
│   ├── tool_token_counter.py
│   ├── tools
│   │   ├── __init__.py
│   │   ├── audio_transcription.py
│   │   ├── base.py
│   │   ├── completion.py
│   │   ├── docstring_refiner.py
│   │   ├── document_conversion_and_processing.py
│   │   ├── enhanced-ums-lookbook.html
│   │   ├── entity_relation_graph.py
│   │   ├── excel_spreadsheet_automation.py
│   │   ├── extraction.py
│   │   ├── filesystem.py
│   │   ├── html_to_markdown.py
│   │   ├── local_text_tools.py
│   │   ├── marqo_fused_search.py
│   │   ├── meta_api_tool.py
│   │   ├── ocr_tools.py
│   │   ├── optimization.py
│   │   ├── provider.py
│   │   ├── pyodide_boot_template.html
│   │   ├── python_sandbox.py
│   │   ├── rag.py
│   │   ├── redline-compiled.css
│   │   ├── sentiment_analysis.py
│   │   ├── single_shot_synthesis.py
│   │   ├── smart_browser.py
│   │   ├── sql_databases.py
│   │   ├── text_classification.py
│   │   ├── text_redline_tools.py
│   │   ├── tournament.py
│   │   ├── ums_explorer.html
│   │   └── unified_memory_system.py
│   ├── utils
│   │   ├── __init__.py
│   │   ├── async_utils.py
│   │   ├── display.py
│   │   ├── logging
│   │   │   ├── __init__.py
│   │   │   ├── console.py
│   │   │   ├── emojis.py
│   │   │   ├── formatter.py
│   │   │   ├── logger.py
│   │   │   ├── panels.py
│   │   │   ├── progress.py
│   │   │   └── themes.py
│   │   ├── parse_yaml.py
│   │   ├── parsing.py
│   │   ├── security.py
│   │   └── text.py
│   └── working_memory_api.py
├── unified_memory_system_technical_analysis.md
└── uv.lock
```

# Files

--------------------------------------------------------------------------------
/storage/smart_browser_internal/readability.js:
--------------------------------------------------------------------------------

```javascript
   1 | /*
   2 |  * Copyright (c) 2010 Arc90 Inc
   3 |  *
   4 |  * Licensed under the Apache License, Version 2.0 (the "License");
   5 |  * you may not use this file except in compliance with the License.
   6 |  * You may obtain a copy of the License at
   7 |  *
   8 |  *     http://www.apache.org/licenses/LICENSE-2.0
   9 |  *
  10 |  * Unless required by applicable law or agreed to in writing, software
  11 |  * distributed under the License is distributed on an "AS IS" BASIS,
  12 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13 |  * See the License for the specific language governing permissions and
  14 |  * limitations under the License.
  15 |  */
  16 | 
  17 | /*
  18 |  * This code is heavily based on Arc90's readability.js (1.7.1) script
  19 |  * available at: http://code.google.com/p/arc90labs-readability
  20 |  */
  21 | 
  22 | /**
  23 |  * Public constructor.
  24 |  * @param {HTMLDocument} doc     The document to parse.
  25 |  * @param {Object}       options The options object.
  26 |  */
  27 | function Readability(doc, options) {
  28 |   // In some older versions, people passed a URI as the first argument. Cope:
  29 |   if (options && options.documentElement) {
  30 |     doc = options;
  31 |     options = arguments[2];
  32 |   } else if (!doc || !doc.documentElement) {
  33 |     throw new Error("First argument to Readability constructor should be a document object.");
  34 |   }
  35 |   options = options || {};
  36 | 
  37 |   this._doc = doc;
  38 |   this._docJSDOMParser = this._doc.firstChild.__JSDOMParser__;
  39 |   this._articleTitle = null;
  40 |   this._articleByline = null;
  41 |   this._articleDir = null;
  42 |   this._articleSiteName = null;
  43 |   this._attempts = [];
  44 | 
  45 |   // Configurable options
  46 |   this._debug = !!options.debug;
  47 |   this._maxElemsToParse = options.maxElemsToParse || this.DEFAULT_MAX_ELEMS_TO_PARSE;
  48 |   this._nbTopCandidates = options.nbTopCandidates || this.DEFAULT_N_TOP_CANDIDATES;
  49 |   this._charThreshold = options.charThreshold || this.DEFAULT_CHAR_THRESHOLD;
  50 |   this._classesToPreserve = this.CLASSES_TO_PRESERVE.concat(options.classesToPreserve || []);
  51 |   this._keepClasses = !!options.keepClasses;
  52 |   this._serializer = options.serializer || function(el) {
  53 |     return el.innerHTML;
  54 |   };
  55 |   this._disableJSONLD = !!options.disableJSONLD;
  56 |   this._allowedVideoRegex = options.allowedVideoRegex || this.REGEXPS.videos;
  57 | 
  58 |   // Start with all flags set
  59 |   this._flags = this.FLAG_STRIP_UNLIKELYS |
  60 |                 this.FLAG_WEIGHT_CLASSES |
  61 |                 this.FLAG_CLEAN_CONDITIONALLY;
  62 | 
  63 | 
  64 |   // Control whether log messages are sent to the console
  65 |   if (this._debug) {
  66 |     let logNode = function(node) {
  67 |       if (node.nodeType == node.TEXT_NODE) {
  68 |         return `${node.nodeName} ("${node.textContent}")`;
  69 |       }
  70 |       let attrPairs = Array.from(node.attributes || [], function(attr) {
  71 |         return `${attr.name}="${attr.value}"`;
  72 |       }).join(" ");
  73 |       return `<${node.localName} ${attrPairs}>`;
  74 |     };
  75 |     this.log = function () {
  76 |       if (typeof console !== "undefined") {
  77 |         let args = Array.from(arguments, arg => {
  78 |           if (arg && arg.nodeType == this.ELEMENT_NODE) {
  79 |             return logNode(arg);
  80 |           }
  81 |           return arg;
  82 |         });
  83 |         args.unshift("Reader: (Readability)");
  84 |         console.log.apply(console, args);
  85 |       } else if (typeof dump !== "undefined") {
  86 |         /* global dump */
  87 |         var msg = Array.prototype.map.call(arguments, function(x) {
  88 |           return (x && x.nodeName) ? logNode(x) : x;
  89 |         }).join(" ");
  90 |         dump("Reader: (Readability) " + msg + "\n");
  91 |       }
  92 |     };
  93 |   } else {
  94 |     this.log = function () {};
  95 |   }
  96 | }
  97 | 
  98 | Readability.prototype = {
  99 |   FLAG_STRIP_UNLIKELYS: 0x1, // bit flags; the constructor ORs all three into this._flags
 100 |   FLAG_WEIGHT_CLASSES: 0x2,
 101 |   FLAG_CLEAN_CONDITIONALLY: 0x4,
 102 | 
 103 |   // Node.nodeType values: https://developer.mozilla.org/en-US/docs/Web/API/Node/nodeType
 104 |   ELEMENT_NODE: 1,
 105 |   TEXT_NODE: 3,
 106 | 
 107 |   // Max number of nodes supported by this parser. Default: 0 (no limit)
 108 |   DEFAULT_MAX_ELEMS_TO_PARSE: 0,
 109 | 
 110 |   // The number of top candidates to consider when analysing how
 111 |   // tight the competition is among candidates.
 112 |   DEFAULT_N_TOP_CANDIDATES: 5,
 113 | 
 114 |   // Element tags to score by default (the list is stored upper-cased).
 115 |   DEFAULT_TAGS_TO_SCORE: "section,h2,h3,h4,h5,h6,p,td,pre".toUpperCase().split(","),
 116 | 
 117 |   // The default number of chars an article must have in order to return a result
 118 |   DEFAULT_CHAR_THRESHOLD: 500,
 119 | 
 120 |   // All of the regular expressions in use within readability.
 121 |   // Defined up here so we don't instantiate them repeatedly in loops.
 122 |   REGEXPS: {
 123 |     // NOTE: These two regular expressions are duplicated in
 124 |     // Readability-readerable.js. Please keep both copies in sync.
 125 |     unlikelyCandidates: /-ad-|ai2html|banner|breadcrumbs|combx|comment|community|cover-wrap|disqus|extra|footer|gdpr|header|legends|menu|related|remark|replies|rss|shoutbox|sidebar|skyscraper|social|sponsor|supplemental|ad-break|agegate|pagination|pager|popup|yom-remote/i,
 126 |     okMaybeItsACandidate: /and|article|body|column|content|main|shadow/i,
 127 | 
 128 |     positive: /article|body|content|entry|hentry|h-entry|main|page|pagination|post|text|blog|story/i,
 129 |     negative: /-ad-|hidden|^hid$| hid$| hid |^hid |banner|combx|comment|com-|contact|foot|footer|footnote|gdpr|masthead|media|meta|outbrain|promo|related|scroll|share|shoutbox|sidebar|skyscraper|sponsor|shopping|tags|tool|widget/i,
 130 |     extraneous: /print|archive|comment|discuss|e[\-]?mail|share|reply|all|login|sign|single|utility/i,
 131 |     byline: /byline|author|dateline|writtenby|p-author/i,
 132 |     replaceFonts: /<(\/?)font[^>]*>/gi,
 133 |     normalize: /\s{2,}/g,
 134 |     videos: /\/\/(www\.)?((dailymotion|youtube|youtube-nocookie|player\.vimeo|v\.qq)\.com|(archive|upload\.wikimedia)\.org|player\.twitch\.tv)/i,
 135 |     shareElements: /(\b|_)(share|sharedaddy)(\b|_)/i,
 136 |     nextLink: /(next|weiter|continue|>([^\|]|$)|»([^\|]|$))/i,
 137 |     prevLink: /(prev|earl|old|new|<|«)/i,
 138 |     tokenize: /\W+/g,
 139 |     whitespace: /^\s*$/,
 140 |     hasContent: /\S$/, // matches only when the string ends with a non-whitespace character
 141 |     hashUrl: /^#.+/,
 142 |     srcsetUrl: /(\S+)(\s+[\d.]+[xw])?(\s*(?:,|$))/g,
 143 |     b64DataUrl: /^data:\s*([^\s;,]+)\s*;\s*base64\s*,/i,
 144 |     // Commas as used in Latin, Sindhi, Chinese and various other scripts.
 145 |     // see: https://en.wikipedia.org/wiki/Comma#Comma_variants
 146 |     commas: /\u002C|\u060C|\uFE50|\uFE10|\uFE11|\u2E41|\u2E34|\u2E32|\uFF0C/g,
 147 |     // See: https://schema.org/Article. NOTE(review): ^ and $ bind only to the first and last alternative.
 148 |     jsonLdArticleTypes: /^Article|AdvertiserContentArticle|NewsArticle|AnalysisNewsArticle|AskPublicNewsArticle|BackgroundNewsArticle|OpinionNewsArticle|ReportageNewsArticle|ReviewNewsArticle|Report|SatiricalArticle|ScholarlyArticle|MedicalScholarlyArticle|SocialMediaPosting|BlogPosting|LiveBlogPosting|DiscussionForumPosting|TechArticle|APIReference$/
 149 |   },
 150 | 
 151 |   UNLIKELY_ROLES: [ "menu", "menubar", "complementary", "navigation", "alert", "alertdialog", "dialog" ],
 152 | 
 153 |   DIV_TO_P_ELEMS: new Set([ "BLOCKQUOTE", "DL", "DIV", "IMG", "OL", "P", "PRE", "TABLE", "UL" ]),
 154 | 
 155 |   ALTER_TO_DIV_EXCEPTIONS: ["DIV", "ARTICLE", "SECTION", "P"],
 156 | 
 157 |   PRESENTATIONAL_ATTRIBUTES: [ "align", "background", "bgcolor", "border", "cellpadding", "cellspacing", "frame", "hspace", "rules", "style", "valign", "vspace" ],
 158 | 
 159 |   DEPRECATED_SIZE_ATTRIBUTE_ELEMS: [ "TABLE", "TH", "TD", "HR", "PRE" ],
 160 | 
 161 |   // The commented out elements qualify as phrasing content but tend to be
 162 |   // removed by readability when put into paragraphs, so we ignore them here.
 163 |   PHRASING_ELEMS: [
 164 |     // "CANVAS", "IFRAME", "SVG", "VIDEO",
 165 |     "ABBR", "AUDIO", "B", "BDO", "BR", "BUTTON", "CITE", "CODE", "DATA",
 166 |     "DATALIST", "DFN", "EM", "EMBED", "I", "IMG", "INPUT", "KBD", "LABEL",
 167 |     "MARK", "MATH", "METER", "NOSCRIPT", "OBJECT", "OUTPUT", "PROGRESS", "Q",
 168 |     "RUBY", "SAMP", "SCRIPT", "SELECT", "SMALL", "SPAN", "STRONG", "SUB",
 169 |     "SUP", "TEXTAREA", "TIME", "VAR", "WBR"
 170 |   ],
 171 | 
 172 |   // These are the classes that readability sets itself.
 173 |   CLASSES_TO_PRESERVE: [ "page" ],
 174 | 
 175 |   // Maps HTML entity names to the literal characters they stand for.
 176 |   HTML_ESCAPE_MAP: {
 177 |     "lt": "<",
 178 |     "gt": ">",
 179 |     "amp": "&",
 180 |     "quot": '"',
 181 |     "apos": "'",
 182 |   },
 183 | 
 184 |   /**
 185 |    * Run any post-process modifications to article content as necessary.
 186 |    *
 187 |    * @param Element
 188 |    * @return void
 189 |   **/
 190 |   _postProcessContent: function(articleContent) {
 191 |     // Readability cannot open relative uris so we convert them to absolute uris.
 192 |     this._fixRelativeUris(articleContent);
 193 | 
 194 |     this._simplifyNestedElements(articleContent);
 195 | 
 196 |     if (!this._keepClasses) {
 197 |       // Remove classes.
 198 |       this._cleanClasses(articleContent);
 199 |     }
 200 |   },
 201 | 
 202 |   /**
 203 |    * Iterates over a NodeList, calls `filterFn` for each node and removes node
 204 |    * if function returned `true`.
 205 |    *
 206 |    * If function is not passed, removes all the nodes in node list.
 207 |    *
 208 |    * @param NodeList nodeList The nodes to operate on
 209 |    * @param Function filterFn the function to use as a filter
 210 |    * @return void
 211 |    */
 212 |   _removeNodes: function(nodeList, filterFn) {
 213 |     // Avoid ever operating on live node lists.
 214 |     if (this._docJSDOMParser && nodeList._isLiveNodeList) {
 215 |       throw new Error("Do not pass live node lists to _removeNodes");
 216 |     }
 217 |     for (var i = nodeList.length - 1; i >= 0; i--) {
 218 |       var node = nodeList[i];
 219 |       var parentNode = node.parentNode;
 220 |       if (parentNode) {
 221 |         if (!filterFn || filterFn.call(this, node, i, nodeList)) {
 222 |           parentNode.removeChild(node);
 223 |         }
 224 |       }
 225 |     }
 226 |   },
 227 | 
 228 |   /**
 229 |    * Iterates over a NodeList, and calls _setNodeTag for each node.
 230 |    *
 231 |    * @param NodeList nodeList The nodes to operate on
 232 |    * @param String newTagName the new tag name to use
 233 |    * @return void
 234 |    */
 235 |   _replaceNodeTags: function(nodeList, newTagName) {
 236 |     // Avoid ever operating on live node lists.
 237 |     if (this._docJSDOMParser && nodeList._isLiveNodeList) {
 238 |       throw new Error("Do not pass live node lists to _replaceNodeTags");
 239 |     }
 240 |     for (const node of nodeList) {
 241 |       this._setNodeTag(node, newTagName);
 242 |     }
 243 |   },
 244 | 
 245 |   /**
 246 |    * Iterate over a NodeList, which doesn't natively fully implement the Array
 247 |    * interface.
 248 |    *
 249 |    * For convenience, the current object context is applied to the provided
 250 |    * iterate function.
 251 |    *
 252 |    * @param  NodeList nodeList The NodeList.
 253 |    * @param  Function fn       The iterate function.
 254 |    * @return void
 255 |    */
 256 |   _forEachNode: function(nodeList, fn) {
 257 |     Array.prototype.forEach.call(nodeList, fn, this);
 258 |   },
 259 | 
 260 |   /**
 261 |    * Iterate over a NodeList, and return the first node that passes
 262 |    * the supplied test function
 263 |    *
 264 |    * For convenience, the current object context is applied to the provided
 265 |    * test function.
 266 |    *
 267 |    * @param  NodeList nodeList The NodeList.
 268 |    * @param  Function fn       The test function.
 269 |    * @return void
 270 |    */
 271 |   _findNode: function(nodeList, fn) {
 272 |     return Array.prototype.find.call(nodeList, fn, this);
 273 |   },
 274 | 
 275 |   /**
 276 |    * Iterate over a NodeList, return true if any of the provided iterate
 277 |    * function calls returns true, false otherwise.
 278 |    *
 279 |    * For convenience, the current object context is applied to the
 280 |    * provided iterate function.
 281 |    *
 282 |    * @param  NodeList nodeList The NodeList.
 283 |    * @param  Function fn       The iterate function.
 284 |    * @return Boolean
 285 |    */
 286 |   _someNode: function(nodeList, fn) {
 287 |     return Array.prototype.some.call(nodeList, fn, this);
 288 |   },
 289 | 
 290 |   /**
 291 |    * Iterate over a NodeList, return true if all of the provided iterate
 292 |    * function calls return true, false otherwise.
 293 |    *
 294 |    * For convenience, the current object context is applied to the
 295 |    * provided iterate function.
 296 |    *
 297 |    * @param  NodeList nodeList The NodeList.
 298 |    * @param  Function fn       The iterate function.
 299 |    * @return Boolean
 300 |    */
 301 |   _everyNode: function(nodeList, fn) {
 302 |     return Array.prototype.every.call(nodeList, fn, this);
 303 |   },
 304 | 
 305 |   /**
 306 |    * Concat all nodelists passed as arguments.
 307 |    *
 308 |    * @return ...NodeList
 309 |    * @return Array
 310 |    */
 311 |   _concatNodeLists: function() {
 312 |     var slice = Array.prototype.slice;
 313 |     var args = slice.call(arguments);
 314 |     var nodeLists = args.map(function(list) {
 315 |       return slice.call(list);
 316 |     });
 317 |     return Array.prototype.concat.apply([], nodeLists);
 318 |   },
 319 | 
 320 |   _getAllNodesWithTag: function(node, tagNames) {
 321 |     if (node.querySelectorAll) {
 322 |       return node.querySelectorAll(tagNames.join(","));
 323 |     }
 324 |     return [].concat.apply([], tagNames.map(function(tag) {
 325 |       var collection = node.getElementsByTagName(tag);
 326 |       return Array.isArray(collection) ? collection : Array.from(collection);
 327 |     }));
 328 |   },
 329 | 
 330 |   /**
 331 |    * Removes the class="" attribute from every element in the given
 332 |    * subtree, except those that match CLASSES_TO_PRESERVE and
 333 |    * the classesToPreserve array from the options object.
 334 |    *
 335 |    * @param Element
 336 |    * @return void
 337 |    */
 338 |   _cleanClasses: function(node) {
 339 |     var classesToPreserve = this._classesToPreserve;
 340 |     var className = (node.getAttribute("class") || "")
 341 |       .split(/\s+/)
 342 |       .filter(function(cls) {
 343 |         return classesToPreserve.indexOf(cls) != -1;
 344 |       })
 345 |       .join(" ");
 346 | 
 347 |     if (className) {
 348 |       node.setAttribute("class", className);
 349 |     } else {
 350 |       node.removeAttribute("class");
 351 |     }
 352 | 
 353 |     for (node = node.firstElementChild; node; node = node.nextElementSibling) {
 354 |       this._cleanClasses(node);
 355 |     }
 356 |   },
 357 | 
 358 |   /**
 359 |    * Converts each <a> and <img> uri in the given element to an absolute URI,
 360 |    * ignoring #ref URIs.
 361 |    *
 362 |    * @param Element
 363 |    * @return void
 364 |    */
 365 |   _fixRelativeUris: function(articleContent) {
 366 |     var baseURI = this._doc.baseURI;
 367 |     var documentURI = this._doc.documentURI;
 368 |     function toAbsoluteURI(uri) {
 369 |       // Leave hash links alone if the base URI matches the document URI:
 370 |       if (baseURI == documentURI && uri.charAt(0) == "#") {
 371 |         return uri;
 372 |       }
 373 | 
 374 |       // Otherwise, resolve against base URI:
 375 |       try {
 376 |         return new URL(uri, baseURI).href;
 377 |       } catch (ex) {
 378 |         // Something went wrong, just return the original:
 379 |       }
 380 |       return uri;
 381 |     }
 382 | 
 383 |     var links = this._getAllNodesWithTag(articleContent, ["a"]);
 384 |     this._forEachNode(links, function(link) {
 385 |       var href = link.getAttribute("href");
 386 |       if (href) {
 387 |         // Remove links with javascript: URIs, since
 388 |         // they won't work after scripts have been removed from the page.
 389 |         if (href.indexOf("javascript:") === 0) {
 390 |           // if the link only contains simple text content, it can be converted to a text node
 391 |           if (link.childNodes.length === 1 && link.childNodes[0].nodeType === this.TEXT_NODE) {
 392 |             var text = this._doc.createTextNode(link.textContent);
 393 |             link.parentNode.replaceChild(text, link);
 394 |           } else {
 395 |             // if the link has multiple children, they should all be preserved
 396 |             var container = this._doc.createElement("span");
 397 |             while (link.firstChild) {
 398 |               container.appendChild(link.firstChild);
 399 |             }
 400 |             link.parentNode.replaceChild(container, link);
 401 |           }
 402 |         } else {
 403 |           link.setAttribute("href", toAbsoluteURI(href));
 404 |         }
 405 |       }
 406 |     });
 407 | 
 408 |     var medias = this._getAllNodesWithTag(articleContent, [
 409 |       "img", "picture", "figure", "video", "audio", "source"
 410 |     ]);
 411 | 
 412 |     this._forEachNode(medias, function(media) {
 413 |       var src = media.getAttribute("src");
 414 |       var poster = media.getAttribute("poster");
 415 |       var srcset = media.getAttribute("srcset");
 416 | 
 417 |       if (src) {
 418 |         media.setAttribute("src", toAbsoluteURI(src));
 419 |       }
 420 | 
 421 |       if (poster) {
 422 |         media.setAttribute("poster", toAbsoluteURI(poster));
 423 |       }
 424 | 
 425 |       if (srcset) {
 426 |         var newSrcset = srcset.replace(this.REGEXPS.srcsetUrl, function(_, p1, p2, p3) {
 427 |           return toAbsoluteURI(p1) + (p2 || "") + p3;
 428 |         });
 429 | 
 430 |         media.setAttribute("srcset", newSrcset);
 431 |       }
 432 |     });
 433 |   },
 434 | 
 435 |   _simplifyNestedElements: function(articleContent) {
 436 |     var node = articleContent;
 437 | 
 438 |     while (node) {
 439 |       if (node.parentNode && ["DIV", "SECTION"].includes(node.tagName) && !(node.id && node.id.startsWith("readability"))) {
 440 |         if (this._isElementWithoutContent(node)) {
 441 |           node = this._removeAndGetNext(node);
 442 |           continue;
 443 |         } else if (this._hasSingleTagInsideElement(node, "DIV") || this._hasSingleTagInsideElement(node, "SECTION")) {
 444 |           var child = node.children[0];
 445 |           for (var i = 0; i < node.attributes.length; i++) {
 446 |             child.setAttribute(node.attributes[i].name, node.attributes[i].value);
 447 |           }
 448 |           node.parentNode.replaceChild(child, node);
 449 |           node = child;
 450 |           continue;
 451 |         }
 452 |       }
 453 | 
 454 |       node = this._getNextNode(node);
 455 |     }
 456 |   },
 457 | 
 458 |   /**
 459 |    * Get the article title as an H1.
 460 |    *
 461 |    * @return string
 462 |    **/
 463 |   _getArticleTitle: function() {
 464 |     var doc = this._doc;
 465 |     var curTitle = "";
 466 |     var origTitle = "";
 467 | 
 468 |     try {
 469 |       curTitle = origTitle = doc.title.trim();
 470 | 
 471 |       // If they had an element with id "title" in their HTML
 472 |       if (typeof curTitle !== "string")
 473 |         curTitle = origTitle = this._getInnerText(doc.getElementsByTagName("title")[0]);
 474 |     } catch (e) {/* ignore exceptions setting the title. */}
 475 | 
 476 |     var titleHadHierarchicalSeparators = false;
 477 |     function wordCount(str) {
 478 |       return str.split(/\s+/).length;
 479 |     }
 480 | 
 481 |     // If there's a separator in the title, first remove the final part
 482 |     if ((/ [\|\-\\\/>»] /).test(curTitle)) {
 483 |       titleHadHierarchicalSeparators = / [\\\/>»] /.test(curTitle);
 484 |       curTitle = origTitle.replace(/(.*)[\|\-\\\/>»] .*/gi, "$1");
 485 | 
 486 |       // If the resulting title is too short (3 words or fewer), remove
 487 |       // the first part instead:
 488 |       if (wordCount(curTitle) < 3)
 489 |         curTitle = origTitle.replace(/[^\|\-\\\/>»]*[\|\-\\\/>»](.*)/gi, "$1");
 490 |     } else if (curTitle.indexOf(": ") !== -1) {
 491 |       // Check if we have an heading containing this exact string, so we
 492 |       // could assume it's the full title.
 493 |       var headings = this._concatNodeLists(
 494 |         doc.getElementsByTagName("h1"),
 495 |         doc.getElementsByTagName("h2")
 496 |       );
 497 |       var trimmedTitle = curTitle.trim();
 498 |       var match = this._someNode(headings, function(heading) {
 499 |         return heading.textContent.trim() === trimmedTitle;
 500 |       });
 501 | 
 502 |       // If we don't, let's extract the title out of the original title string.
 503 |       if (!match) {
 504 |         curTitle = origTitle.substring(origTitle.lastIndexOf(":") + 1);
 505 | 
 506 |         // If the title is now too short, try the first colon instead:
 507 |         if (wordCount(curTitle) < 3) {
 508 |           curTitle = origTitle.substring(origTitle.indexOf(":") + 1);
 509 |           // But if we have too many words before the colon there's something weird
 510 |           // with the titles and the H tags so let's just use the original title instead
 511 |         } else if (wordCount(origTitle.substr(0, origTitle.indexOf(":"))) > 5) {
 512 |           curTitle = origTitle;
 513 |         }
 514 |       }
 515 |     } else if (curTitle.length > 150 || curTitle.length < 15) {
 516 |       var hOnes = doc.getElementsByTagName("h1");
 517 | 
 518 |       if (hOnes.length === 1)
 519 |         curTitle = this._getInnerText(hOnes[0]);
 520 |     }
 521 | 
 522 |     curTitle = curTitle.trim().replace(this.REGEXPS.normalize, " ");
 523 |     // If we now have 4 words or fewer as our title, and either no
 524 |     // 'hierarchical' separators (\, /, > or ») were found in the original
 525 |     // title or we decreased the number of words by more than 1 word, use
 526 |     // the original title.
 527 |     var curTitleWordCount = wordCount(curTitle);
 528 |     if (curTitleWordCount <= 4 &&
 529 |         (!titleHadHierarchicalSeparators ||
 530 |          curTitleWordCount != wordCount(origTitle.replace(/[\|\-\\\/>»]+/g, "")) - 1)) {
 531 |       curTitle = origTitle;
 532 |     }
 533 | 
 534 |     return curTitle;
 535 |   },
 536 | 
 537 |   /**
 538 |    * Prepare the HTML document for readability to scrape it.
 539 |    * This includes things like stripping javascript, CSS, and handling terrible markup.
 540 |    *
 541 |    * @return void
 542 |    **/
 543 |   _prepDocument: function() {
 544 |     var doc = this._doc;
 545 | 
 546 |     // Remove all style tags in head
 547 |     this._removeNodes(this._getAllNodesWithTag(doc, ["style"]));
 548 | 
 549 |     if (doc.body) {
 550 |       this._replaceBrs(doc.body);
 551 |     }
 552 | 
 553 |     this._replaceNodeTags(this._getAllNodesWithTag(doc, ["font"]), "SPAN");
 554 |   },
 555 | 
 556 |   /**
 557 |    * Finds the next node, starting from the given node, and ignoring
 558 |    * whitespace in between. If the given node is an element, the same node is
 559 |    * returned.
 560 |    */
 561 |   _nextNode: function (node) {
 562 |     var next = node;
 563 |     while (next
 564 |         && (next.nodeType != this.ELEMENT_NODE)
 565 |         && this.REGEXPS.whitespace.test(next.textContent)) {
 566 |       next = next.nextSibling;
 567 |     }
 568 |     return next;
 569 |   },
 570 | 
 571 |   /**
 572 |    * Replaces 2 or more successive <br> elements with a single <p>.
 573 |    * Whitespace between <br> elements is ignored. For example:
 574 |    *   <div>foo<br>bar<br> <br><br>abc</div>
 575 |    * will become:
 576 |    *   <div>foo<br>bar<p>abc</p></div>
 577 |    */
 578 |   _replaceBrs: function (elem) {
 579 |     this._forEachNode(this._getAllNodesWithTag(elem, ["br"]), function(br) {
 580 |       var next = br.nextSibling;
 581 | 
 582 |       // Whether 2 or more <br> elements have been found and replaced with a
 583 |       // <p> block.
 584 |       var replaced = false;
 585 | 
 586 |       // If we find a <br> chain, remove the <br>s until we hit another node
 587 |       // or non-whitespace. This leaves behind the first <br> in the chain
 588 |       // (which will be replaced with a <p> later).
 589 |       while ((next = this._nextNode(next)) && (next.tagName == "BR")) {
 590 |         replaced = true;
 591 |         var brSibling = next.nextSibling;
 592 |         next.parentNode.removeChild(next);
 593 |         next = brSibling;
 594 |       }
 595 | 
 596 |       // If we removed a <br> chain, replace the remaining <br> with a <p>. Add
 597 |       // all sibling nodes as children of the <p> until we hit another <br>
 598 |       // chain.
 599 |       if (replaced) {
 600 |         var p = this._doc.createElement("p");
 601 |         br.parentNode.replaceChild(p, br);
 602 | 
 603 |         next = p.nextSibling;
 604 |         while (next) {
 605 |           // If we've hit another <br><br>, we're done adding children to this <p>.
 606 |           if (next.tagName == "BR") {
 607 |             var nextElem = this._nextNode(next.nextSibling);
 608 |             if (nextElem && nextElem.tagName == "BR")
 609 |               break;
 610 |           }
 611 | 
 612 |           if (!this._isPhrasingContent(next)) // Stop at the first sibling that is not phrasing content.
 613 |             break;
 614 | 
 615 |           // Otherwise, make this node a child of the new <p>.
 616 |           var sibling = next.nextSibling;
 617 |           p.appendChild(next);
 618 |           next = sibling;
 619 |         }
 620 | 
 621 |         while (p.lastChild && this._isWhitespace(p.lastChild)) { // Trim trailing whitespace nodes from the new <p>.
 622 |           p.removeChild(p.lastChild);
 623 |         }
 624 | 
 625 |         if (p.parentNode.tagName === "P") // The new <p> sits directly inside a <p>; retag that parent as DIV.
 626 |           this._setNodeTag(p.parentNode, "DIV");
 627 |       }
 628 |     });
 629 |   },
 630 | 
 631 |   _setNodeTag: function (node, tag) { // Retags node as tag and returns the node that now carries that tag.
 632 |     this.log("_setNodeTag", node, tag);
 633 |     if (this._docJSDOMParser) { // With _docJSDOMParser set, rename in place by overwriting localName/tagName.
 634 |       node.localName = tag.toLowerCase();
 635 |       node.tagName = tag.toUpperCase();
 636 |       return node;
 637 |     }
 638 | 
 639 |     var replacement = node.ownerDocument.createElement(tag); // Otherwise build a new element with the wanted tag...
 640 |     while (node.firstChild) { // ...move every child of node into it...
 641 |       replacement.appendChild(node.firstChild);
 642 |     }
 643 |     node.parentNode.replaceChild(replacement, node); // ...and put it in node's place under the parent.
 644 |     if (node.readability) // Carry over the readability data, if any.
 645 |       replacement.readability = node.readability;
 646 | 
 647 |     for (var i = 0; i < node.attributes.length; i++) { // Copy the attributes one by one.
 648 |       try {
 649 |         replacement.setAttribute(node.attributes[i].name, node.attributes[i].value);
 650 |       } catch (ex) {
 651 |         /* it's possible for setAttribute() to throw if the attribute name
 652 |          * isn't a valid XML Name. Such attributes can however be parsed from
 653 |          * source in HTML docs, see https://github.com/whatwg/html/issues/4275,
 654 |          * so we can hit them here and then throw. We don't care about such
 655 |          * attributes so we ignore them.
 656 |          */
 657 |       }
 658 |     }
 659 |     return replacement;
 660 |   },
 661 | 
 662 |   /**
 663 |    * Prepare the article node for display. Clean out any inline styles,
 664 |    * iframes, forms, strip extraneous <p> tags, etc. The node is modified in place.
 665 |    *
 666 |    * @param Element articleContent
 667 |    * @return void
 668 |    **/
 669 |   _prepArticle: function(articleContent) {
 670 |     this._cleanStyles(articleContent);
 671 | 
 672 |     // Check for data tables before we continue, to avoid removing items in
 673 |     // those tables, which will often be isolated even though they're
 674 |     // visually linked to other content-ful elements (text, images, etc.).
 675 |     this._markDataTables(articleContent);
 676 | 
 677 |     this._fixLazyImages(articleContent);
 678 | 
 679 |     // Clean out junk from the article content
 680 |     this._cleanConditionally(articleContent, "form");
 681 |     this._cleanConditionally(articleContent, "fieldset");
 682 |     this._clean(articleContent, "object");
 683 |     this._clean(articleContent, "embed");
 684 |     this._clean(articleContent, "footer");
 685 |     this._clean(articleContent, "link");
 686 |     this._clean(articleContent, "aside");
 687 | 
 688 |     // Clean out elements with little content that have "share" in their id/class combinations from final top candidates,
 689 |     // which means we don't remove the top candidates themselves even if they have "share".
 690 | 
 691 |     var shareElementThreshold = this.DEFAULT_CHAR_THRESHOLD;
 692 | 
 693 |     this._forEachNode(articleContent.children, function (topCandidate) {
 694 |       this._cleanMatchedNodes(topCandidate, function (node, matchString) {
 695 |         return this.REGEXPS.shareElements.test(matchString) && node.textContent.length < shareElementThreshold;
 696 |       });
 697 |     });
 698 | 
 699 |     this._clean(articleContent, "iframe");
 700 |     this._clean(articleContent, "input");
 701 |     this._clean(articleContent, "textarea");
 702 |     this._clean(articleContent, "select");
 703 |     this._clean(articleContent, "button");
 704 |     this._cleanHeaders(articleContent);
 705 | 
 706 |     // Do these last as the previous stuff may have removed junk
 707 |     // that will affect these
 708 |     this._cleanConditionally(articleContent, "table");
 709 |     this._cleanConditionally(articleContent, "ul");
 710 |     this._cleanConditionally(articleContent, "div");
 711 | 
 712 |     // Replace H1 with H2, as the only H1 should be the title, which is displayed separately
 713 |     this._replaceNodeTags(this._getAllNodesWithTag(articleContent, ["h1"]), "h2");
 714 | 
 715 |     // Remove extra paragraphs: those with no img/embed/object/iframe inside and no inner text
 716 |     this._removeNodes(this._getAllNodesWithTag(articleContent, ["p"]), function (paragraph) {
 717 |       var imgCount = paragraph.getElementsByTagName("img").length;
 718 |       var embedCount = paragraph.getElementsByTagName("embed").length;
 719 |       var objectCount = paragraph.getElementsByTagName("object").length;
 720 |       // At this point, nasty iframes have been removed; only embedded video ones remain.
 721 |       var iframeCount = paragraph.getElementsByTagName("iframe").length;
 722 |       var totalCount = imgCount + embedCount + objectCount + iframeCount;
 723 | 
 724 |       return totalCount === 0 && !this._getInnerText(paragraph, false);
 725 |     });
 726 | 
 727 |     this._forEachNode(this._getAllNodesWithTag(articleContent, ["br"]), function(br) { // Drop each <br> that is directly followed by a <p>.
 728 |       var next = this._nextNode(br.nextSibling);
 729 |       if (next && next.tagName == "P")
 730 |         br.parentNode.removeChild(br);
 731 |     });
 732 | 
 733 |     // Remove single-cell tables: replace each with its only cell, retagged as P or DIV
 734 |     this._forEachNode(this._getAllNodesWithTag(articleContent, ["table"]), function(table) {
 735 |       var tbody = this._hasSingleTagInsideElement(table, "TBODY") ? table.firstElementChild : table;
 736 |       if (this._hasSingleTagInsideElement(tbody, "TR")) {
 737 |         var row = tbody.firstElementChild;
 738 |         if (this._hasSingleTagInsideElement(row, "TD")) {
 739 |           var cell = row.firstElementChild;
 740 |           cell = this._setNodeTag(cell, this._everyNode(cell.childNodes, this._isPhrasingContent) ? "P" : "DIV"); // P when every child is phrasing content, DIV otherwise.
 741 |           table.parentNode.replaceChild(cell, table);
 742 |         }
 743 |       }
 744 |     });
 745 |   },
 746 | 
 747 |   /**
 748 |    * Initialize a node with a readability object whose contentScore starts at 0, is adjusted by the
 749 |    * node's tag name, and then has the result of _getClassWeight(node) (the className/id check) added.
 750 |    *
 751 |    * @param Element node
 752 |    * @return void
 753 |   **/
 754 |   _initializeNode: function(node) {
 755 |     node.readability = {"contentScore": 0};
 756 | 
 757 |     switch (node.tagName) { // Tags not listed below leave the score at 0.
 758 |       case "DIV":
 759 |         node.readability.contentScore += 5;
 760 |         break;
 761 | 
 762 |       case "PRE":
 763 |       case "TD":
 764 |       case "BLOCKQUOTE":
 765 |         node.readability.contentScore += 3;
 766 |         break;
 767 | 
 768 |       case "ADDRESS":
 769 |       case "OL":
 770 |       case "UL":
 771 |       case "DL":
 772 |       case "DD":
 773 |       case "DT":
 774 |       case "LI":
 775 |       case "FORM":
 776 |         node.readability.contentScore -= 3;
 777 |         break;
 778 | 
 779 |       case "H1":
 780 |       case "H2":
 781 |       case "H3":
 782 |       case "H4":
 783 |       case "H5":
 784 |       case "H6":
 785 |       case "TH":
 786 |         node.readability.contentScore -= 5;
 787 |         break;
 788 |     }
 789 | 
 790 |     node.readability.contentScore += this._getClassWeight(node);
 791 |   },
 792 | 
 793 |   _removeAndGetNext: function(node) { // Detaches node from its parent and returns the node the traversal should visit next.
 794 |     var nextNode = this._getNextNode(node, true); // Look up the successor first, skipping node's own subtree.
 795 |     node.parentNode.removeChild(node);
 796 |     return nextNode;
 797 |   },
 798 | 
 799 |   /**
 800 |    * Traverse the DOM from element to element, starting at the node passed in.
 801 |    * Pass true for the second parameter to indicate that this node itself
 802 |    * (and its kids) are going away, and we want the next node over.
 803 |    *
 804 |    * Calling this in a loop will traverse the DOM depth-first; a falsy value is returned once no node is left.
 805 |    */
 806 |   _getNextNode: function(node, ignoreSelfAndKids) {
 807 |     // First check for kids if those aren't being ignored
 808 |     if (!ignoreSelfAndKids && node.firstElementChild) {
 809 |       return node.firstElementChild;
 810 |     }
 811 |     // Then for siblings...
 812 |     if (node.nextElementSibling) {
 813 |       return node.nextElementSibling;
 814 |     }
 815 |     // And finally, move up the parent chain *and* find a sibling
 816 |     // (because this is depth-first traversal, we will have already
 817 |     // seen the parent nodes themselves).
 818 |     do {
 819 |       node = node.parentNode;
 820 |     } while (node && !node.nextElementSibling);
 821 |     return node && node.nextElementSibling;
 822 |   },
 823 | 
 824 |   // Compares the second text to the first one and returns a similarity score:
 825 |   // 1 = every word of textB also occurs in textA, 0 = no word does (or either text has no words).
 826 |   // Both texts are lowercased and split into words on REGEXPS.tokenize; the words of textB missing from textA are collected,
 827 |   // and the result is 1 minus their space-joined length divided by the space-joined length of all words of textB.
 828 |   _textSimilarity: function(textA, textB) {
 829 |     var tokensA = textA.toLowerCase().split(this.REGEXPS.tokenize).filter(Boolean);
 830 |     var tokensB = textB.toLowerCase().split(this.REGEXPS.tokenize).filter(Boolean);
 831 |     if (!tokensA.length || !tokensB.length) {
 832 |       return 0;
 833 |     }
 834 |     var uniqTokensB = tokensB.filter(token => !tokensA.includes(token));
 835 |     var distanceB = uniqTokensB.join(" ").length / tokensB.join(" ").length;
 836 |     return 1 - distanceB;
 837 |   },
 838 | 
 839 |   _checkByline: function(node, matchString) { // Stores node's text as the article byline if it qualifies; returns true only when it did.
 840 |     if (this._articleByline) { // A byline was already recorded; keep the first one.
 841 |       return false;
 842 |     }
 843 | 
 844 |     if (node.getAttribute !== undefined) { // Without getAttribute, rel and itemprop stay undefined.
 845 |       var rel = node.getAttribute("rel");
 846 |       var itemprop = node.getAttribute("itemprop");
 847 |     }
 848 | 
 849 |     if ((rel === "author" || (itemprop && itemprop.indexOf("author") !== -1) || this.REGEXPS.byline.test(matchString)) && this._isValidByline(node.textContent)) {
 850 |       this._articleByline = node.textContent.trim(); // Record the trimmed text as the byline.
 851 |       return true;
 852 |     }
 853 | 
 854 |     return false;
 855 |   },
 856 | 
 857 |   _getNodeAncestors: function(node, maxDepth) { // Returns node's ancestors as an array, nearest parent first.
 858 |     maxDepth = maxDepth || 0; // A missing or falsy maxDepth means no limit.
 859 |     var i = 0, ancestors = [];
 860 |     while (node.parentNode) {
 861 |       ancestors.push(node.parentNode);
 862 |       if (maxDepth && ++i === maxDepth) // Stop once maxDepth ancestors have been collected.
 863 |         break;
 864 |       node = node.parentNode;
 865 |     }
 866 |     return ancestors;
 867 |   },
 868 | 
 869 |   /***
 870 |    * grabArticle - Using a variety of metrics (content score, classname, element types), find the content that is
 871 |    *         most likely to be the stuff a user wants to read. Then return it wrapped up in a div.
 872 |    *
 873 |    * @param page a document to run upon. Needs to be a full document, complete with body.
 874 |    * @return Element
 875 |   **/
 876 |   _grabArticle: function (page) {
 877 |     this.log("**** grabArticle ****");
 878 |     var doc = this._doc;
 879 |     var isPaging = page !== null;
 880 |     page = page ? page : this._doc.body;
 881 | 
 882 |     // We can't grab an article if we don't have a page!
 883 |     if (!page) {
 884 |       this.log("No body found in document. Abort.");
 885 |       return null;
 886 |     }
 887 | 
 888 |     var pageCacheHtml = page.innerHTML;
 889 | 
 890 |     while (true) {
 891 |       this.log("Starting grabArticle loop");
 892 |       var stripUnlikelyCandidates = this._flagIsActive(this.FLAG_STRIP_UNLIKELYS);
 893 | 
 894 |       // First, node prepping. Trash nodes that look cruddy (like ones with the
 895 |       // class name "comment", etc), and turn divs into P tags where they have been
 896 |       // used inappropriately (as in, where they contain no other block level elements.)
 897 |       var elementsToScore = [];
 898 |       var node = this._doc.documentElement;
 899 | 
 900 |       let shouldRemoveTitleHeader = true;
 901 | 
 902 |       while (node) {
 903 | 
 904 |         if (node.tagName === "HTML") {
 905 |           this._articleLang = node.getAttribute("lang");
 906 |         }
 907 | 
 908 |         var matchString = node.className + " " + node.id;
 909 | 
 910 |         if (!this._isProbablyVisible(node)) {
 911 |           this.log("Removing hidden node - " + matchString);
 912 |           node = this._removeAndGetNext(node);
 913 |           continue;
 914 |         }
 915 | 
 916 |         // User is not able to see elements applied with both "aria-modal = true" and "role = dialog"
 917 |         if (node.getAttribute("aria-modal") == "true" && node.getAttribute("role") == "dialog") {
 918 |           node = this._removeAndGetNext(node);
 919 |           continue;
 920 |         }
 921 | 
 922 |         // Check to see if this node is a byline, and remove it if it is.
 923 |         if (this._checkByline(node, matchString)) {
 924 |           node = this._removeAndGetNext(node);
 925 |           continue;
 926 |         }
 927 | 
 928 |         if (shouldRemoveTitleHeader && this._headerDuplicatesTitle(node)) {
 929 |           this.log("Removing header: ", node.textContent.trim(), this._articleTitle.trim());
 930 |           shouldRemoveTitleHeader = false;
 931 |           node = this._removeAndGetNext(node);
 932 |           continue;
 933 |         }
 934 | 
 935 |         // Remove unlikely candidates
 936 |         if (stripUnlikelyCandidates) {
 937 |           if (this.REGEXPS.unlikelyCandidates.test(matchString) &&
 938 |               !this.REGEXPS.okMaybeItsACandidate.test(matchString) &&
 939 |               !this._hasAncestorTag(node, "table") &&
 940 |               !this._hasAncestorTag(node, "code") &&
 941 |               node.tagName !== "BODY" &&
 942 |               node.tagName !== "A") {
 943 |             this.log("Removing unlikely candidate - " + matchString);
 944 |             node = this._removeAndGetNext(node);
 945 |             continue;
 946 |           }
 947 | 
 948 |           if (this.UNLIKELY_ROLES.includes(node.getAttribute("role"))) {
 949 |             this.log("Removing content with role " + node.getAttribute("role") + " - " + matchString);
 950 |             node = this._removeAndGetNext(node);
 951 |             continue;
 952 |           }
 953 |         }
 954 | 
 955 |         // Remove DIV, SECTION, HEADER, and H1-H6 nodes without any content (e.g. text, image, video, or iframe).
 956 |         if ((node.tagName === "DIV" || node.tagName === "SECTION" || node.tagName === "HEADER" ||
 957 |              node.tagName === "H1" || node.tagName === "H2" || node.tagName === "H3" ||
 958 |              node.tagName === "H4" || node.tagName === "H5" || node.tagName === "H6") &&
 959 |             this._isElementWithoutContent(node)) {
 960 |           node = this._removeAndGetNext(node);
 961 |           continue;
 962 |         }
 963 | 
 964 |         if (this.DEFAULT_TAGS_TO_SCORE.indexOf(node.tagName) !== -1) {
 965 |           elementsToScore.push(node);
 966 |         }
 967 | 
 968 |         // Turn all divs that don't have children block level elements into p's
 969 |         if (node.tagName === "DIV") {
 970 |           // Put phrasing content into paragraphs.
 971 |           var p = null;
 972 |           var childNode = node.firstChild;
 973 |           while (childNode) {
 974 |             var nextSibling = childNode.nextSibling;
 975 |             if (this._isPhrasingContent(childNode)) {
 976 |               if (p !== null) {
 977 |                 p.appendChild(childNode);
 978 |               } else if (!this._isWhitespace(childNode)) {
 979 |                 p = doc.createElement("p");
 980 |                 node.replaceChild(p, childNode);
 981 |                 p.appendChild(childNode);
 982 |               }
 983 |             } else if (p !== null) {
 984 |               while (p.lastChild && this._isWhitespace(p.lastChild)) {
 985 |                 p.removeChild(p.lastChild);
 986 |               }
 987 |               p = null;
 988 |             }
 989 |             childNode = nextSibling;
 990 |           }
 991 | 
 992 |           // Sites like http://mobile.slate.com enclose each paragraph in a DIV
 993 |           // element. DIVs with only a P element inside and no text content can be
 994 |           // safely converted into plain P elements to avoid confusing the scoring
 995 |           // algorithm with DIVs which are, in practice, paragraphs.
 996 |           if (this._hasSingleTagInsideElement(node, "P") && this._getLinkDensity(node) < 0.25) {
 997 |             var newNode = node.children[0];
 998 |             node.parentNode.replaceChild(newNode, node);
 999 |             node = newNode;
1000 |             elementsToScore.push(node);
1001 |           } else if (!this._hasChildBlockElement(node)) {
1002 |             node = this._setNodeTag(node, "P");
1003 |             elementsToScore.push(node);
1004 |           }
1005 |         }
1006 |         node = this._getNextNode(node);
1007 |       }
1008 | 
1009 |       /**
1010 |        * Loop through all paragraphs, and assign a score to them based on how content-y they look.
1011 |        * Then add their score to their parent node.
1012 |        *
1013 |        * A score is determined by things like number of commas, class names, etc. Maybe eventually link density.
1014 |       **/
1015 |       var candidates = [];
1016 |       this._forEachNode(elementsToScore, function(elementToScore) {
1017 |         if (!elementToScore.parentNode || typeof(elementToScore.parentNode.tagName) === "undefined")
1018 |           return;
1019 | 
1020 |         // If this paragraph is less than 25 characters, don't even count it.
1021 |         var innerText = this._getInnerText(elementToScore);
1022 |         if (innerText.length < 25)
1023 |           return;
1024 | 
1025 |         // Exclude nodes with no ancestor.
1026 |         var ancestors = this._getNodeAncestors(elementToScore, 5);
1027 |         if (ancestors.length === 0)
1028 |           return;
1029 | 
1030 |         var contentScore = 0;
1031 | 
1032 |         // Add a point for the paragraph itself as a base.
1033 |         contentScore += 1;
1034 | 
1035 |         // Add points for any commas within this paragraph.
1036 |         contentScore += innerText.split(this.REGEXPS.commas).length;
1037 | 
1038 |         // For every 100 characters in this paragraph, add another point. Up to 3 points.
1039 |         contentScore += Math.min(Math.floor(innerText.length / 100), 3);
1040 | 
1041 |         // Initialize and score ancestors.
1042 |         this._forEachNode(ancestors, function(ancestor, level) {
1043 |           if (!ancestor.tagName || !ancestor.parentNode || typeof(ancestor.parentNode.tagName) === "undefined")
1044 |             return;
1045 | 
1046 |           if (typeof(ancestor.readability) === "undefined") {
1047 |             this._initializeNode(ancestor);
1048 |             candidates.push(ancestor);
1049 |           }
1050 | 
1051 |           // Node score divider:
1052 |           // - parent:             1 (no division)
1053 |           // - grandparent:        2
1054 |           // - great grandparent+: ancestor level * 3
1055 |           if (level === 0)
1056 |             var scoreDivider = 1;
1057 |           else if (level === 1)
1058 |             scoreDivider = 2;
1059 |           else
1060 |             scoreDivider = level * 3;
1061 |           ancestor.readability.contentScore += contentScore / scoreDivider;
1062 |         });
1063 |       });
1064 | 
1065 |       // After we've calculated scores, loop through all of the possible
1066 |       // candidate nodes we found and find the one with the highest score.
1067 |       var topCandidates = [];
1068 |       for (var c = 0, cl = candidates.length; c < cl; c += 1) {
1069 |         var candidate = candidates[c];
1070 | 
1071 |         // Scale the final candidates score based on link density. Good content
1072 |         // should have a relatively small link density (5% or less) and be mostly
1073 |         // unaffected by this operation.
1074 |         var candidateScore = candidate.readability.contentScore * (1 - this._getLinkDensity(candidate));
1075 |         candidate.readability.contentScore = candidateScore;
1076 | 
1077 |         this.log("Candidate:", candidate, "with score " + candidateScore);
1078 | 
1079 |         for (var t = 0; t < this._nbTopCandidates; t++) {
1080 |           var aTopCandidate = topCandidates[t];
1081 | 
1082 |           if (!aTopCandidate || candidateScore > aTopCandidate.readability.contentScore) {
1083 |             topCandidates.splice(t, 0, candidate);
1084 |             if (topCandidates.length > this._nbTopCandidates)
1085 |               topCandidates.pop();
1086 |             break;
1087 |           }
1088 |         }
1089 |       }
1090 | 
1091 |       var topCandidate = topCandidates[0] || null;
1092 |       var neededToCreateTopCandidate = false;
1093 |       var parentOfTopCandidate;
1094 | 
1095 |       // If we still have no top candidate, just use the body as a last resort.
1096 |       // We also have to copy the body node so it is something we can modify.
1097 |       if (topCandidate === null || topCandidate.tagName === "BODY") {
1098 |         // Move all of the page's children into topCandidate
1099 |         topCandidate = doc.createElement("DIV");
1100 |         neededToCreateTopCandidate = true;
1101 |         // Move everything (not just elements, also text nodes etc.) into the container
1102 |         // so we even include text directly in the body:
1103 |         while (page.firstChild) {
1104 |           this.log("Moving child out:", page.firstChild);
1105 |           topCandidate.appendChild(page.firstChild);
1106 |         }
1107 | 
1108 |         page.appendChild(topCandidate);
1109 | 
1110 |         this._initializeNode(topCandidate);
1111 |       } else if (topCandidate) {
1112 |         // Find a better top candidate node if it contains (at least three) nodes which belong to the `topCandidates` array
1113 |         // and whose scores are quite close to that of the current `topCandidate` node.
1114 |         var alternativeCandidateAncestors = [];
1115 |         for (var i = 1; i < topCandidates.length; i++) {
1116 |           if (topCandidates[i].readability.contentScore / topCandidate.readability.contentScore >= 0.75) {
1117 |             alternativeCandidateAncestors.push(this._getNodeAncestors(topCandidates[i]));
1118 |           }
1119 |         }
1120 |         var MINIMUM_TOPCANDIDATES = 3;
1121 |         if (alternativeCandidateAncestors.length >= MINIMUM_TOPCANDIDATES) {
1122 |           parentOfTopCandidate = topCandidate.parentNode;
1123 |           while (parentOfTopCandidate.tagName !== "BODY") {
1124 |             var listsContainingThisAncestor = 0;
1125 |             for (var ancestorIndex = 0; ancestorIndex < alternativeCandidateAncestors.length && listsContainingThisAncestor < MINIMUM_TOPCANDIDATES; ancestorIndex++) {
1126 |               listsContainingThisAncestor += Number(alternativeCandidateAncestors[ancestorIndex].includes(parentOfTopCandidate));
1127 |             }
1128 |             if (listsContainingThisAncestor >= MINIMUM_TOPCANDIDATES) {
1129 |               topCandidate = parentOfTopCandidate;
1130 |               break;
1131 |             }
1132 |             parentOfTopCandidate = parentOfTopCandidate.parentNode;
1133 |           }
1134 |         }
1135 |         if (!topCandidate.readability) {
1136 |           this._initializeNode(topCandidate);
1137 |         }
1138 | 
1139 |         // Because of our bonus system, parents of candidates might have scores
1140 |         // themselves. They get half of the node. There won't be nodes with higher
1141 |         // scores than our topCandidate, but if we see the score going *up* in the first
1142 |         // few steps up the tree, that's a decent sign that there might be more content
1143 |         // lurking in other places that we want to unify in. The sibling stuff
1144 |         // below does some of that - but only if we've looked high enough up the DOM
1145 |         // tree.
1146 |         parentOfTopCandidate = topCandidate.parentNode;
1147 |         var lastScore = topCandidate.readability.contentScore;
1148 |         // The scores shouldn't get too low.
1149 |         var scoreThreshold = lastScore / 3;
1150 |         while (parentOfTopCandidate.tagName !== "BODY") {
1151 |           if (!parentOfTopCandidate.readability) {
1152 |             parentOfTopCandidate = parentOfTopCandidate.parentNode;
1153 |             continue;
1154 |           }
1155 |           var parentScore = parentOfTopCandidate.readability.contentScore;
1156 |           if (parentScore < scoreThreshold)
1157 |             break;
1158 |           if (parentScore > lastScore) {
1159 |             // Alright! We found a better parent to use.
1160 |             topCandidate = parentOfTopCandidate;
1161 |             break;
1162 |           }
1163 |           lastScore = parentOfTopCandidate.readability.contentScore;
1164 |           parentOfTopCandidate = parentOfTopCandidate.parentNode;
1165 |         }
1166 | 
1167 |         // If the top candidate is the only child, use parent instead. This will help sibling
1168 |         // joining logic when adjacent content is actually located in parent's sibling node.
1169 |         parentOfTopCandidate = topCandidate.parentNode;
1170 |         while (parentOfTopCandidate.tagName != "BODY" && parentOfTopCandidate.children.length == 1) {
1171 |           topCandidate = parentOfTopCandidate;
1172 |           parentOfTopCandidate = topCandidate.parentNode;
1173 |         }
1174 |         if (!topCandidate.readability) {
1175 |           this._initializeNode(topCandidate);
1176 |         }
1177 |       }
1178 | 
1179 |       // Now that we have the top candidate, look through its siblings for content
1180 |       // that might also be related. Things like preambles, content split by ads
1181 |       // that we removed, etc.
1182 |       var articleContent = doc.createElement("DIV");
1183 |       if (isPaging)
1184 |         articleContent.id = "readability-content";
1185 | 
1186 |       var siblingScoreThreshold = Math.max(10, topCandidate.readability.contentScore * 0.2);
1187 |       // Keep potential top candidate's parent node to try to get text direction of it later.
1188 |       parentOfTopCandidate = topCandidate.parentNode;
1189 |       var siblings = parentOfTopCandidate.children;
1190 | 
1191 |       for (var s = 0, sl = siblings.length; s < sl; s++) {
1192 |         var sibling = siblings[s];
1193 |         var append = false;
1194 | 
1195 |         this.log("Looking at sibling node:", sibling, sibling.readability ? ("with score " + sibling.readability.contentScore) : "");
1196 |         this.log("Sibling has score", sibling.readability ? sibling.readability.contentScore : "Unknown");
1197 | 
1198 |         if (sibling === topCandidate) {
1199 |           append = true;
1200 |         } else {
1201 |           var contentBonus = 0;
1202 | 
1203 |           // Give a bonus if sibling nodes and top candidates have the exact same classname
1204 |           if (sibling.className === topCandidate.className && topCandidate.className !== "")
1205 |             contentBonus += topCandidate.readability.contentScore * 0.2;
1206 | 
1207 |           if (sibling.readability &&
1208 |               ((sibling.readability.contentScore + contentBonus) >= siblingScoreThreshold)) {
1209 |             append = true;
1210 |           } else if (sibling.nodeName === "P") {
1211 |             var linkDensity = this._getLinkDensity(sibling);
1212 |             var nodeContent = this._getInnerText(sibling);
1213 |             var nodeLength = nodeContent.length;
1214 | 
1215 |             if (nodeLength > 80 && linkDensity < 0.25) {
1216 |               append = true;
1217 |             } else if (nodeLength < 80 && nodeLength > 0 && linkDensity === 0 &&
1218 |                        nodeContent.search(/\.( |$)/) !== -1) {
1219 |               append = true;
1220 |             }
1221 |           }
1222 |         }
1223 | 
1224 |         if (append) {
1225 |           this.log("Appending node:", sibling);
1226 | 
1227 |           if (this.ALTER_TO_DIV_EXCEPTIONS.indexOf(sibling.nodeName) === -1) {
1228 |             // We have a node that isn't a common block level element, like a form or td tag.
1229 |             // Turn it into a div so it doesn't get filtered out later by accident.
1230 |             this.log("Altering sibling:", sibling, "to div.");
1231 | 
1232 |             sibling = this._setNodeTag(sibling, "DIV");
1233 |           }
1234 | 
1235 |           articleContent.appendChild(sibling);
1236 |           // Fetch children again to make it compatible
1237 |           // with DOM parsers without live collection support.
1238 |           siblings = parentOfTopCandidate.children;
1239 |           // siblings is a reference to the children array, and
1240 |           // sibling is removed from the array when we call appendChild().
1241 |           // As a result, we must revisit this index since the nodes
1242 |           // have been shifted.
1243 |           s -= 1;
1244 |           sl -= 1;
1245 |         }
1246 |       }
1247 | 
1248 |       if (this._debug)
1249 |         this.log("Article content pre-prep: " + articleContent.innerHTML);
1250 |       // So we have all of the content that we need. Now we clean it up for presentation.
1251 |       this._prepArticle(articleContent);
1252 |       if (this._debug)
1253 |         this.log("Article content post-prep: " + articleContent.innerHTML);
1254 | 
1255 |       if (neededToCreateTopCandidate) {
1256 |         // We already created a fake div thing, and there wouldn't have been any siblings left
1257 |         // for the previous loop, so there's no point trying to create a new div, and then
1258 |         // move all the children over. Just assign IDs and class names here. No need to append
1259 |         // because that already happened anyway.
1260 |         topCandidate.id = "readability-page-1";
1261 |         topCandidate.className = "page";
1262 |       } else {
1263 |         var div = doc.createElement("DIV");
1264 |         div.id = "readability-page-1";
1265 |         div.className = "page";
1266 |         while (articleContent.firstChild) {
1267 |           div.appendChild(articleContent.firstChild);
1268 |         }
1269 |         articleContent.appendChild(div);
1270 |       }
1271 | 
1272 |       if (this._debug)
1273 |         this.log("Article content after paging: " + articleContent.innerHTML);
1274 | 
1275 |       var parseSuccessful = true;
1276 | 
1277 |       // Now that we've gone through the full algorithm, check to see if
1278 |       // we got any meaningful content. If we didn't, we may need to re-run
1279 |       // grabArticle with different flags set. This gives us a higher likelihood of
1280 |       // finding the content, and the sieve approach gives us a higher likelihood of
1281 |       // finding the -right- content.
1282 |       var textLength = this._getInnerText(articleContent, true).length;
1283 |       if (textLength < this._charThreshold) {
1284 |         parseSuccessful = false;
1285 |         page.innerHTML = pageCacheHtml;
1286 | 
1287 |         if (this._flagIsActive(this.FLAG_STRIP_UNLIKELYS)) {
1288 |           this._removeFlag(this.FLAG_STRIP_UNLIKELYS);
1289 |           this._attempts.push({articleContent: articleContent, textLength: textLength});
1290 |         } else if (this._flagIsActive(this.FLAG_WEIGHT_CLASSES)) {
1291 |           this._removeFlag(this.FLAG_WEIGHT_CLASSES);
1292 |           this._attempts.push({articleContent: articleContent, textLength: textLength});
1293 |         } else if (this._flagIsActive(this.FLAG_CLEAN_CONDITIONALLY)) {
1294 |           this._removeFlag(this.FLAG_CLEAN_CONDITIONALLY);
1295 |           this._attempts.push({articleContent: articleContent, textLength: textLength});
1296 |         } else {
1297 |           this._attempts.push({articleContent: articleContent, textLength: textLength});
1298 |           // No luck after removing flags, just return the longest text we found during the different loops
1299 |           this._attempts.sort(function (a, b) {
1300 |             return b.textLength - a.textLength;
1301 |           });
1302 | 
1303 |           // But first check if we actually have something
1304 |           if (!this._attempts[0].textLength) {
1305 |             return null;
1306 |           }
1307 | 
1308 |           articleContent = this._attempts[0].articleContent;
1309 |           parseSuccessful = true;
1310 |         }
1311 |       }
1312 | 
1313 |       if (parseSuccessful) {
1314 |         // Find out text direction from ancestors of final top candidate.
1315 |         var ancestors = [parentOfTopCandidate, topCandidate].concat(this._getNodeAncestors(parentOfTopCandidate));
1316 |         this._someNode(ancestors, function(ancestor) {
1317 |           if (!ancestor.tagName)
1318 |             return false;
1319 |           var articleDir = ancestor.getAttribute("dir");
1320 |           if (articleDir) {
1321 |             this._articleDir = articleDir;
1322 |             return true;
1323 |           }
1324 |           return false;
1325 |         });
1326 |         return articleContent;
1327 |       }
1328 |     }
1329 |   },
1330 | 
1331 |   /**
1332 |    * Check whether the input string could be a byline.
1333 |    * This verifies that the input is a string, and that the length
1334 |    * is less than 100 chars.
1335 |    *
   * @param byline {string} - a string to check whether it's a byline.
1337 |    * @return Boolean - whether the input string is a byline.
1338 |    */
1339 |   _isValidByline: function(byline) {
1340 |     if (typeof byline == "string" || byline instanceof String) {
1341 |       byline = byline.trim();
1342 |       return (byline.length > 0) && (byline.length < 100);
1343 |     }
1344 |     return false;
1345 |   },
1346 | 
1347 |   /**
1348 |    * Converts some of the common HTML entities in string to their corresponding characters.
1349 |    *
1350 |    * @param str {string} - a string to unescape.
1351 |    * @return string without HTML entity.
1352 |    */
1353 |   _unescapeHtmlEntities: function(str) {
1354 |     if (!str) {
1355 |       return str;
1356 |     }
1357 | 
1358 |     var htmlEscapeMap = this.HTML_ESCAPE_MAP;
1359 |     return str.replace(/&(quot|amp|apos|lt|gt);/g, function(_, tag) {
1360 |       return htmlEscapeMap[tag];
1361 |     }).replace(/&#(?:x([0-9a-z]{1,4})|([0-9]{1,4}));/gi, function(_, hex, numStr) {
1362 |       var num = parseInt(hex || numStr, hex ? 16 : 10);
1363 |       return String.fromCharCode(num);
1364 |     });
1365 |   },
1366 | 
1367 |   /**
1368 |    * Try to extract metadata from JSON-LD object.
1369 |    * For now, only Schema.org objects of type Article or its subtypes are supported.
1370 |    * @return Object with any metadata that could be extracted (possibly none)
1371 |    */
1372 |   _getJSONLD: function (doc) {
1373 |     var scripts = this._getAllNodesWithTag(doc, ["script"]);
1374 | 
1375 |     var metadata;
1376 | 
1377 |     this._forEachNode(scripts, function(jsonLdElement) {
1378 |       if (!metadata && jsonLdElement.getAttribute("type") === "application/ld+json") {
1379 |         try {
1380 |           // Strip CDATA markers if present
1381 |           var content = jsonLdElement.textContent.replace(/^\s*<!\[CDATA\[|\]\]>\s*$/g, "");
1382 |           var parsed = JSON.parse(content);
1383 |           if (
1384 |             !parsed["@context"] ||
1385 |             !parsed["@context"].match(/^https?\:\/\/schema\.org$/)
1386 |           ) {
1387 |             return;
1388 |           }
1389 | 
1390 |           if (!parsed["@type"] && Array.isArray(parsed["@graph"])) {
1391 |             parsed = parsed["@graph"].find(function(it) {
1392 |               return (it["@type"] || "").match(
1393 |                 this.REGEXPS.jsonLdArticleTypes
1394 |               );
1395 |             });
1396 |           }
1397 | 
1398 |           if (
1399 |             !parsed ||
1400 |             !parsed["@type"] ||
1401 |             !parsed["@type"].match(this.REGEXPS.jsonLdArticleTypes)
1402 |           ) {
1403 |             return;
1404 |           }
1405 | 
1406 |           metadata = {};
1407 | 
1408 |           if (typeof parsed.name === "string" && typeof parsed.headline === "string" && parsed.name !== parsed.headline) {
1409 |             // we have both name and headline element in the JSON-LD. They should both be the same but some websites like aktualne.cz
1410 |             // put their own name into "name" and the article title to "headline" which confuses Readability. So we try to check if either
1411 |             // "name" or "headline" closely matches the html title, and if so, use that one. If not, then we use "name" by default.
1412 | 
1413 |             var title = this._getArticleTitle();
1414 |             var nameMatches = this._textSimilarity(parsed.name, title) > 0.75;
1415 |             var headlineMatches = this._textSimilarity(parsed.headline, title) > 0.75;
1416 | 
1417 |             if (headlineMatches && !nameMatches) {
1418 |               metadata.title = parsed.headline;
1419 |             } else {
1420 |               metadata.title = parsed.name;
1421 |             }
1422 |           } else if (typeof parsed.name === "string") {
1423 |             metadata.title = parsed.name.trim();
1424 |           } else if (typeof parsed.headline === "string") {
1425 |             metadata.title = parsed.headline.trim();
1426 |           }
1427 |           if (parsed.author) {
1428 |             if (typeof parsed.author.name === "string") {
1429 |               metadata.byline = parsed.author.name.trim();
1430 |             } else if (Array.isArray(parsed.author) && parsed.author[0] && typeof parsed.author[0].name === "string") {
1431 |               metadata.byline = parsed.author
1432 |                 .filter(function(author) {
1433 |                   return author && typeof author.name === "string";
1434 |                 })
1435 |                 .map(function(author) {
1436 |                   return author.name.trim();
1437 |                 })
1438 |                 .join(", ");
1439 |             }
1440 |           }
1441 |           if (typeof parsed.description === "string") {
1442 |             metadata.excerpt = parsed.description.trim();
1443 |           }
1444 |           if (
1445 |             parsed.publisher &&
1446 |             typeof parsed.publisher.name === "string"
1447 |           ) {
1448 |             metadata.siteName = parsed.publisher.name.trim();
1449 |           }
1450 |           if (typeof parsed.datePublished === "string") {
1451 |             metadata.datePublished = parsed.datePublished.trim();
1452 |           }
1453 |           return;
1454 |         } catch (err) {
1455 |           this.log(err.message);
1456 |         }
1457 |       }
1458 |     });
1459 |     return metadata ? metadata : {};
1460 |   },
1461 | 
1462 |   /**
1463 |    * Attempts to get excerpt and byline metadata for the article.
1464 |    *
1465 |    * @param {Object} jsonld — object containing any metadata that
1466 |    * could be extracted from JSON-LD object.
1467 |    *
1468 |    * @return Object with optional "excerpt" and "byline" properties
1469 |    */
1470 |   _getArticleMetadata: function(jsonld) {
1471 |     var metadata = {};
1472 |     var values = {};
1473 |     var metaElements = this._doc.getElementsByTagName("meta");
1474 | 
1475 |     // property is a space-separated list of values
1476 |     var propertyPattern = /\s*(article|dc|dcterm|og|twitter)\s*:\s*(author|creator|description|published_time|title|site_name)\s*/gi;
1477 | 
1478 |     // name is a single value
1479 |     var namePattern = /^\s*(?:(dc|dcterm|og|twitter|weibo:(article|webpage))\s*[\.:]\s*)?(author|creator|description|title|site_name)\s*$/i;
1480 | 
1481 |     // Find description tags.
1482 |     this._forEachNode(metaElements, function(element) {
1483 |       var elementName = element.getAttribute("name");
1484 |       var elementProperty = element.getAttribute("property");
1485 |       var content = element.getAttribute("content");
1486 |       if (!content) {
1487 |         return;
1488 |       }
1489 |       var matches = null;
1490 |       var name = null;
1491 | 
1492 |       if (elementProperty) {
1493 |         matches = elementProperty.match(propertyPattern);
1494 |         if (matches) {
1495 |           // Convert to lowercase, and remove any whitespace
1496 |           // so we can match below.
1497 |           name = matches[0].toLowerCase().replace(/\s/g, "");
1498 |           // multiple authors
1499 |           values[name] = content.trim();
1500 |         }
1501 |       }
1502 |       if (!matches && elementName && namePattern.test(elementName)) {
1503 |         name = elementName;
1504 |         if (content) {
1505 |           // Convert to lowercase, remove any whitespace, and convert dots
1506 |           // to colons so we can match below.
1507 |           name = name.toLowerCase().replace(/\s/g, "").replace(/\./g, ":");
1508 |           values[name] = content.trim();
1509 |         }
1510 |       }
1511 |     });
1512 | 
1513 |     // get title
1514 |     metadata.title = jsonld.title ||
1515 |                      values["dc:title"] ||
1516 |                      values["dcterm:title"] ||
1517 |                      values["og:title"] ||
1518 |                      values["weibo:article:title"] ||
1519 |                      values["weibo:webpage:title"] ||
1520 |                      values["title"] ||
1521 |                      values["twitter:title"];
1522 | 
1523 |     if (!metadata.title) {
1524 |       metadata.title = this._getArticleTitle();
1525 |     }
1526 | 
1527 |     // get author
1528 |     metadata.byline = jsonld.byline ||
1529 |                       values["dc:creator"] ||
1530 |                       values["dcterm:creator"] ||
1531 |                       values["author"];
1532 | 
1533 |     // get description
1534 |     metadata.excerpt = jsonld.excerpt ||
1535 |                        values["dc:description"] ||
1536 |                        values["dcterm:description"] ||
1537 |                        values["og:description"] ||
1538 |                        values["weibo:article:description"] ||
1539 |                        values["weibo:webpage:description"] ||
1540 |                        values["description"] ||
1541 |                        values["twitter:description"];
1542 | 
1543 |     // get site name
1544 |     metadata.siteName = jsonld.siteName ||
1545 |                         values["og:site_name"];
1546 | 
1547 |     // get article published time
1548 |     metadata.publishedTime = jsonld.datePublished ||
1549 |       values["article:published_time"] || null;
1550 | 
1551 |     // in many sites the meta value is escaped with HTML entities,
1552 |     // so here we need to unescape it
1553 |     metadata.title = this._unescapeHtmlEntities(metadata.title);
1554 |     metadata.byline = this._unescapeHtmlEntities(metadata.byline);
1555 |     metadata.excerpt = this._unescapeHtmlEntities(metadata.excerpt);
1556 |     metadata.siteName = this._unescapeHtmlEntities(metadata.siteName);
1557 |     metadata.publishedTime = this._unescapeHtmlEntities(metadata.publishedTime);
1558 | 
1559 |     return metadata;
1560 |   },
1561 | 
1562 |   /**
   * Check whether node is an image, or contains exactly one image,
   * either as a direct child or as a deeper descendant.
1565 |    *
1566 |    * @param Element
1567 |   **/
1568 |   _isSingleImage: function(node) {
1569 |     if (node.tagName === "IMG") {
1570 |       return true;
1571 |     }
1572 | 
1573 |     if (node.children.length !== 1 || node.textContent.trim() !== "") {
1574 |       return false;
1575 |     }
1576 | 
1577 |     return this._isSingleImage(node.children[0]);
1578 |   },
1579 | 
1580 |   /**
1581 |    * Find all <noscript> that are located after <img> nodes, and which contain only one
1582 |    * <img> element. Replace the first image with the image from inside the <noscript> tag,
1583 |    * and remove the <noscript> tag. This improves the quality of the images we use on
1584 |    * some sites (e.g. Medium).
1585 |    *
1586 |    * @param Element
1587 |   **/
1588 |   _unwrapNoscriptImages: function(doc) {
1589 |     // Find img without source or attributes that might contains image, and remove it.
1590 |     // This is done to prevent a placeholder img is replaced by img from noscript in next step.
1591 |     var imgs = Array.from(doc.getElementsByTagName("img"));
1592 |     this._forEachNode(imgs, function(img) {
1593 |       for (var i = 0; i < img.attributes.length; i++) {
1594 |         var attr = img.attributes[i];
1595 |         switch (attr.name) {
1596 |           case "src":
1597 |           case "srcset":
1598 |           case "data-src":
1599 |           case "data-srcset":
1600 |             return;
1601 |         }
1602 | 
1603 |         if (/\.(jpg|jpeg|png|webp)/i.test(attr.value)) {
1604 |           return;
1605 |         }
1606 |       }
1607 | 
1608 |       img.parentNode.removeChild(img);
1609 |     });
1610 | 
1611 |     // Next find noscript and try to extract its image
1612 |     var noscripts = Array.from(doc.getElementsByTagName("noscript"));
1613 |     this._forEachNode(noscripts, function(noscript) {
1614 |       // Parse content of noscript and make sure it only contains image
1615 |       var tmp = doc.createElement("div");
1616 |       tmp.innerHTML = noscript.innerHTML;
1617 |       if (!this._isSingleImage(tmp)) {
1618 |         return;
1619 |       }
1620 | 
1621 |       // If noscript has previous sibling and it only contains image,
1622 |       // replace it with noscript content. However we also keep old
1623 |       // attributes that might contains image.
1624 |       var prevElement = noscript.previousElementSibling;
1625 |       if (prevElement && this._isSingleImage(prevElement)) {
1626 |         var prevImg = prevElement;
1627 |         if (prevImg.tagName !== "IMG") {
1628 |           prevImg = prevElement.getElementsByTagName("img")[0];
1629 |         }
1630 | 
1631 |         var newImg = tmp.getElementsByTagName("img")[0];
1632 |         for (var i = 0; i < prevImg.attributes.length; i++) {
1633 |           var attr = prevImg.attributes[i];
1634 |           if (attr.value === "") {
1635 |             continue;
1636 |           }
1637 | 
1638 |           if (attr.name === "src" || attr.name === "srcset" || /\.(jpg|jpeg|png|webp)/i.test(attr.value)) {
1639 |             if (newImg.getAttribute(attr.name) === attr.value) {
1640 |               continue;
1641 |             }
1642 | 
1643 |             var attrName = attr.name;
1644 |             if (newImg.hasAttribute(attrName)) {
1645 |               attrName = "data-old-" + attrName;
1646 |             }
1647 | 
1648 |             newImg.setAttribute(attrName, attr.value);
1649 |           }
1650 |         }
1651 | 
1652 |         noscript.parentNode.replaceChild(tmp.firstElementChild, prevElement);
1653 |       }
1654 |     });
1655 |   },
1656 | 
1657 |   /**
1658 |    * Removes script tags from the document.
1659 |    *
1660 |    * @param Element
1661 |   **/
1662 |   _removeScripts: function(doc) {
1663 |     this._removeNodes(this._getAllNodesWithTag(doc, ["script", "noscript"]));
1664 |   },
1665 | 
1666 |   /**
1667 |    * Check if this node has only whitespace and a single element with given tag
1668 |    * Returns false if the DIV node contains non-empty text nodes
1669 |    * or if it contains no element with given tag or more than 1 element.
1670 |    *
1671 |    * @param Element
1672 |    * @param string tag of child element
1673 |   **/
1674 |   _hasSingleTagInsideElement: function(element, tag) {
1675 |     // There should be exactly 1 element child with given tag
1676 |     if (element.children.length != 1 || element.children[0].tagName !== tag) {
1677 |       return false;
1678 |     }
1679 | 
1680 |     // And there should be no text nodes with real content
1681 |     return !this._someNode(element.childNodes, function(node) {
1682 |       return node.nodeType === this.TEXT_NODE &&
1683 |              this.REGEXPS.hasContent.test(node.textContent);
1684 |     });
1685 |   },
1686 | 
1687 |   _isElementWithoutContent: function(node) {
1688 |     return node.nodeType === this.ELEMENT_NODE &&
1689 |       node.textContent.trim().length == 0 &&
1690 |       (node.children.length == 0 ||
1691 |        node.children.length == node.getElementsByTagName("br").length + node.getElementsByTagName("hr").length);
1692 |   },
1693 | 
1694 |   /**
1695 |    * Determine whether element has any children block level elements.
1696 |    *
1697 |    * @param Element
1698 |    */
1699 |   _hasChildBlockElement: function (element) {
1700 |     return this._someNode(element.childNodes, function(node) {
1701 |       return this.DIV_TO_P_ELEMS.has(node.tagName) ||
1702 |              this._hasChildBlockElement(node);
1703 |     });
1704 |   },
1705 | 
1706 |   /***
1707 |    * Determine if a node qualifies as phrasing content.
1708 |    * https://developer.mozilla.org/en-US/docs/Web/Guide/HTML/Content_categories#Phrasing_content
1709 |   **/
1710 |   _isPhrasingContent: function(node) {
1711 |     return node.nodeType === this.TEXT_NODE || this.PHRASING_ELEMS.indexOf(node.tagName) !== -1 ||
1712 |       ((node.tagName === "A" || node.tagName === "DEL" || node.tagName === "INS") &&
1713 |         this._everyNode(node.childNodes, this._isPhrasingContent));
1714 |   },
1715 | 
1716 |   _isWhitespace: function(node) {
1717 |     return (node.nodeType === this.TEXT_NODE && node.textContent.trim().length === 0) ||
1718 |            (node.nodeType === this.ELEMENT_NODE && node.tagName === "BR");
1719 |   },
1720 | 
1721 |   /**
1722 |    * Get the inner text of a node - cross browser compatibly.
1723 |    * This also strips out any excess whitespace to be found.
1724 |    *
1725 |    * @param Element
1726 |    * @param Boolean normalizeSpaces (default: true)
1727 |    * @return string
1728 |   **/
1729 |   _getInnerText: function(e, normalizeSpaces) {
1730 |     normalizeSpaces = (typeof normalizeSpaces === "undefined") ? true : normalizeSpaces;
1731 |     var textContent = e.textContent.trim();
1732 | 
1733 |     if (normalizeSpaces) {
1734 |       return textContent.replace(this.REGEXPS.normalize, " ");
1735 |     }
1736 |     return textContent;
1737 |   },
1738 | 
1739 |   /**
1740 |    * Get the number of times a string s appears in the node e.
1741 |    *
1742 |    * @param Element
1743 |    * @param string - what to split on. Default is ","
1744 |    * @return number (integer)
1745 |   **/
1746 |   _getCharCount: function(e, s) {
1747 |     s = s || ",";
1748 |     return this._getInnerText(e).split(s).length - 1;
1749 |   },
1750 | 
1751 |   /**
1752 |    * Remove the style attribute on every e and under.
1753 |    * TODO: Test if getElementsByTagName(*) is faster.
1754 |    *
1755 |    * @param Element
1756 |    * @return void
1757 |   **/
1758 |   _cleanStyles: function(e) {
1759 |     if (!e || e.tagName.toLowerCase() === "svg")
1760 |       return;
1761 | 
1762 |     // Remove `style` and deprecated presentational attributes
1763 |     for (var i = 0; i < this.PRESENTATIONAL_ATTRIBUTES.length; i++) {
1764 |       e.removeAttribute(this.PRESENTATIONAL_ATTRIBUTES[i]);
1765 |     }
1766 | 
1767 |     if (this.DEPRECATED_SIZE_ATTRIBUTE_ELEMS.indexOf(e.tagName) !== -1) {
1768 |       e.removeAttribute("width");
1769 |       e.removeAttribute("height");
1770 |     }
1771 | 
1772 |     var cur = e.firstElementChild;
1773 |     while (cur !== null) {
1774 |       this._cleanStyles(cur);
1775 |       cur = cur.nextElementSibling;
1776 |     }
1777 |   },
1778 | 
1779 |   /**
1780 |    * Get the density of links as a percentage of the content
1781 |    * This is the amount of text that is inside a link divided by the total text in the node.
1782 |    *
1783 |    * @param Element
1784 |    * @return number (float)
1785 |   **/
1786 |   _getLinkDensity: function(element) {
1787 |     var textLength = this._getInnerText(element).length;
1788 |     if (textLength === 0)
1789 |       return 0;
1790 | 
1791 |     var linkLength = 0;
1792 | 
1793 |     // XXX implement _reduceNodeList?
1794 |     this._forEachNode(element.getElementsByTagName("a"), function(linkNode) {
1795 |       var href = linkNode.getAttribute("href");
1796 |       var coefficient = href && this.REGEXPS.hashUrl.test(href) ? 0.3 : 1;
1797 |       linkLength += this._getInnerText(linkNode).length * coefficient;
1798 |     });
1799 | 
1800 |     return linkLength / textLength;
1801 |   },
1802 | 
1803 |   /**
1804 |    * Get an elements class/id weight. Uses regular expressions to tell if this
1805 |    * element looks good or bad.
1806 |    *
1807 |    * @param Element
1808 |    * @return number (Integer)
1809 |   **/
1810 |   _getClassWeight: function(e) {
1811 |     if (!this._flagIsActive(this.FLAG_WEIGHT_CLASSES))
1812 |       return 0;
1813 | 
1814 |     var weight = 0;
1815 | 
1816 |     // Look for a special classname
1817 |     if (typeof(e.className) === "string" && e.className !== "") {
1818 |       if (this.REGEXPS.negative.test(e.className))
1819 |         weight -= 25;
1820 | 
1821 |       if (this.REGEXPS.positive.test(e.className))
1822 |         weight += 25;
1823 |     }
1824 | 
1825 |     // Look for a special ID
1826 |     if (typeof(e.id) === "string" && e.id !== "") {
1827 |       if (this.REGEXPS.negative.test(e.id))
1828 |         weight -= 25;
1829 | 
1830 |       if (this.REGEXPS.positive.test(e.id))
1831 |         weight += 25;
1832 |     }
1833 | 
1834 |     return weight;
1835 |   },
1836 | 
1837 |   /**
1838 |    * Clean a node of all elements of type "tag".
1839 |    * (Unless it's a youtube/vimeo video. People love movies.)
1840 |    *
1841 |    * @param Element
1842 |    * @param string tag to clean
1843 |    * @return void
1844 |    **/
1845 |   _clean: function(e, tag) {
1846 |     var isEmbed = ["object", "embed", "iframe"].indexOf(tag) !== -1;
1847 | 
1848 |     this._removeNodes(this._getAllNodesWithTag(e, [tag]), function(element) {
1849 |       // Allow youtube and vimeo videos through as people usually want to see those.
1850 |       if (isEmbed) {
1851 |         // First, check the elements attributes to see if any of them contain youtube or vimeo
1852 |         for (var i = 0; i < element.attributes.length; i++) {
1853 |           if (this._allowedVideoRegex.test(element.attributes[i].value)) {
1854 |             return false;
1855 |           }
1856 |         }
1857 | 
1858 |         // For embed with <object> tag, check inner HTML as well.
1859 |         if (element.tagName === "object" && this._allowedVideoRegex.test(element.innerHTML)) {
1860 |           return false;
1861 |         }
1862 |       }
1863 | 
1864 |       return true;
1865 |     });
1866 |   },
1867 | 
1868 |   /**
1869 |    * Check if a given node has one of its ancestor tag name matching the
1870 |    * provided one.
1871 |    * @param  HTMLElement node
1872 |    * @param  String      tagName
1873 |    * @param  Number      maxDepth
1874 |    * @param  Function    filterFn a filter to invoke to determine whether this node 'counts'
1875 |    * @return Boolean
1876 |    */
1877 |   _hasAncestorTag: function(node, tagName, maxDepth, filterFn) {
1878 |     maxDepth = maxDepth || 3;
1879 |     tagName = tagName.toUpperCase();
1880 |     var depth = 0;
1881 |     while (node.parentNode) {
1882 |       if (maxDepth > 0 && depth > maxDepth)
1883 |         return false;
1884 |       if (node.parentNode.tagName === tagName && (!filterFn || filterFn(node.parentNode)))
1885 |         return true;
1886 |       node = node.parentNode;
1887 |       depth++;
1888 |     }
1889 |     return false;
1890 |   },
1891 | 
1892 |   /**
1893 |    * Return an object indicating how many rows and columns this table has.
1894 |    */
1895 |   _getRowAndColumnCount: function(table) {
1896 |     var rows = 0;
1897 |     var columns = 0;
1898 |     var trs = table.getElementsByTagName("tr");
1899 |     for (var i = 0; i < trs.length; i++) {
1900 |       var rowspan = trs[i].getAttribute("rowspan") || 0;
1901 |       if (rowspan) {
1902 |         rowspan = parseInt(rowspan, 10);
1903 |       }
1904 |       rows += (rowspan || 1);
1905 | 
1906 |       // Now look for column-related info
1907 |       var columnsInThisRow = 0;
1908 |       var cells = trs[i].getElementsByTagName("td");
1909 |       for (var j = 0; j < cells.length; j++) {
1910 |         var colspan = cells[j].getAttribute("colspan") || 0;
1911 |         if (colspan) {
1912 |           colspan = parseInt(colspan, 10);
1913 |         }
1914 |         columnsInThisRow += (colspan || 1);
1915 |       }
1916 |       columns = Math.max(columns, columnsInThisRow);
1917 |     }
1918 |     return {rows: rows, columns: columns};
1919 |   },
1920 | 
1921 |   /**
1922 |    * Look for 'data' (as opposed to 'layout') tables, for which we use
1923 |    * similar checks as
1924 |    * https://searchfox.org/mozilla-central/rev/f82d5c549f046cb64ce5602bfd894b7ae807c8f8/accessible/generic/TableAccessible.cpp#19
1925 |    */
1926 |   _markDataTables: function(root) {
1927 |     var tables = root.getElementsByTagName("table");
1928 |     for (var i = 0; i < tables.length; i++) {
1929 |       var table = tables[i];
1930 |       var role = table.getAttribute("role");
1931 |       if (role == "presentation") {
1932 |         table._readabilityDataTable = false;
1933 |         continue;
1934 |       }
1935 |       var datatable = table.getAttribute("datatable");
1936 |       if (datatable == "0") {
1937 |         table._readabilityDataTable = false;
1938 |         continue;
1939 |       }
1940 |       var summary = table.getAttribute("summary");
1941 |       if (summary) {
1942 |         table._readabilityDataTable = true;
1943 |         continue;
1944 |       }
1945 | 
1946 |       var caption = table.getElementsByTagName("caption")[0];
1947 |       if (caption && caption.childNodes.length > 0) {
1948 |         table._readabilityDataTable = true;
1949 |         continue;
1950 |       }
1951 | 
1952 |       // If the table has a descendant with any of these tags, consider a data table:
1953 |       var dataTableDescendants = ["col", "colgroup", "tfoot", "thead", "th"];
1954 |       var descendantExists = function(tag) {
1955 |         return !!table.getElementsByTagName(tag)[0];
1956 |       };
1957 |       if (dataTableDescendants.some(descendantExists)) {
1958 |         this.log("Data table because found data-y descendant");
1959 |         table._readabilityDataTable = true;
1960 |         continue;
1961 |       }
1962 | 
1963 |       // Nested tables indicate a layout table:
1964 |       if (table.getElementsByTagName("table")[0]) {
1965 |         table._readabilityDataTable = false;
1966 |         continue;
1967 |       }
1968 | 
1969 |       var sizeInfo = this._getRowAndColumnCount(table);
1970 |       if (sizeInfo.rows >= 10 || sizeInfo.columns > 4) {
1971 |         table._readabilityDataTable = true;
1972 |         continue;
1973 |       }
1974 |       // Now just go by size entirely:
1975 |       table._readabilityDataTable = sizeInfo.rows * sizeInfo.columns > 10;
1976 |     }
1977 |   },
1978 | 
1979 |   /* convert images and figures that have properties like data-src into images that can be loaded without JS */
1980 |   _fixLazyImages: function (root) {
1981 |     this._forEachNode(this._getAllNodesWithTag(root, ["img", "picture", "figure"]), function (elem) {
1982 |       // In some sites (e.g. Kotaku), they put 1px square image as base64 data uri in the src attribute.
1983 |       // So, here we check if the data uri is too short, just might as well remove it.
1984 |       if (elem.src && this.REGEXPS.b64DataUrl.test(elem.src)) {
1985 |         // Make sure it's not SVG, because SVG can have a meaningful image in under 133 bytes.
1986 |         var parts = this.REGEXPS.b64DataUrl.exec(elem.src);
1987 |         if (parts[1] === "image/svg+xml") {
1988 |           return;
1989 |         }
1990 | 
1991 |         // Make sure this element has other attributes which contains image.
1992 |         // If it doesn't, then this src is important and shouldn't be removed.
1993 |         var srcCouldBeRemoved = false;
1994 |         for (var i = 0; i < elem.attributes.length; i++) {
1995 |           var attr = elem.attributes[i];
1996 |           if (attr.name === "src") {
1997 |             continue;
1998 |           }
1999 | 
2000 |           if (/\.(jpg|jpeg|png|webp)/i.test(attr.value)) {
2001 |             srcCouldBeRemoved = true;
2002 |             break;
2003 |           }
2004 |         }
2005 | 
2006 |         // Here we assume if image is less than 100 bytes (or 133B after encoded to base64)
2007 |         // it will be too small, therefore it might be placeholder image.
2008 |         if (srcCouldBeRemoved) {
2009 |           var b64starts = elem.src.search(/base64\s*/i) + 7;
2010 |           var b64length = elem.src.length - b64starts;
2011 |           if (b64length < 133) {
2012 |             elem.removeAttribute("src");
2013 |           }
2014 |         }
2015 |       }
2016 | 
2017 |       // also check for "null" to work around https://github.com/jsdom/jsdom/issues/2580
2018 |       if ((elem.src || (elem.srcset && elem.srcset != "null")) && elem.className.toLowerCase().indexOf("lazy") === -1) {
2019 |         return;
2020 |       }
2021 | 
2022 |       for (var j = 0; j < elem.attributes.length; j++) {
2023 |         attr = elem.attributes[j];
2024 |         if (attr.name === "src" || attr.name === "srcset" || attr.name === "alt") {
2025 |           continue;
2026 |         }
2027 |         var copyTo = null;
2028 |         if (/\.(jpg|jpeg|png|webp)\s+\d/.test(attr.value)) {
2029 |           copyTo = "srcset";
2030 |         } else if (/^\s*\S+\.(jpg|jpeg|png|webp)\S*\s*$/.test(attr.value)) {
2031 |           copyTo = "src";
2032 |         }
2033 |         if (copyTo) {
2034 |           //if this is an img or picture, set the attribute directly
2035 |           if (elem.tagName === "IMG" || elem.tagName === "PICTURE") {
2036 |             elem.setAttribute(copyTo, attr.value);
2037 |           } else if (elem.tagName === "FIGURE" && !this._getAllNodesWithTag(elem, ["img", "picture"]).length) {
2038 |             //if the item is a <figure> that does not contain an image or picture, create one and place it inside the figure
2039 |             //see the nytimes-3 testcase for an example
2040 |             var img = this._doc.createElement("img");
2041 |             img.setAttribute(copyTo, attr.value);
2042 |             elem.appendChild(img);
2043 |           }
2044 |         }
2045 |       }
2046 |     });
2047 |   },
2048 | 
2049 |   _getTextDensity: function(e, tags) {
2050 |     var textLength = this._getInnerText(e, true).length;
2051 |     if (textLength === 0) {
2052 |       return 0;
2053 |     }
2054 |     var childrenLength = 0;
2055 |     var children = this._getAllNodesWithTag(e, tags);
2056 |     this._forEachNode(children, (child) => childrenLength += this._getInnerText(child, true).length);
2057 |     return childrenLength / textLength;
2058 |   },
2059 | 
2060 |   /**
2061 |    * Clean an element of all tags of type "tag" if they look fishy.
2062 |    * "Fishy" is an algorithm based on content length, classnames, link density, number of images & embeds, etc.
2063 |    *
2064 |    * @return void
2065 |    **/
2066 |   _cleanConditionally: function(e, tag) {
2067 |     if (!this._flagIsActive(this.FLAG_CLEAN_CONDITIONALLY))
2068 |       return;
2069 | 
2070 |     // Gather counts for other typical elements embedded within.
2071 |     // Traverse backwards so we can remove nodes at the same time
2072 |     // without effecting the traversal.
2073 |     //
2074 |     // TODO: Consider taking into account original contentScore here.
2075 |     this._removeNodes(this._getAllNodesWithTag(e, [tag]), function(node) {
2076 |       // First check if this node IS data table, in which case don't remove it.
2077 |       var isDataTable = function(t) {
2078 |         return t._readabilityDataTable;
2079 |       };
2080 | 
2081 |       var isList = tag === "ul" || tag === "ol";
2082 |       if (!isList) {
2083 |         var listLength = 0;
2084 |         var listNodes = this._getAllNodesWithTag(node, ["ul", "ol"]);
2085 |         this._forEachNode(listNodes, (list) => listLength += this._getInnerText(list).length);
2086 |         isList = listLength / this._getInnerText(node).length > 0.9;
2087 |       }
2088 | 
2089 |       if (tag === "table" && isDataTable(node)) {
2090 |         return false;
2091 |       }
2092 | 
2093 |       // Next check if we're inside a data table, in which case don't remove it as well.
2094 |       if (this._hasAncestorTag(node, "table", -1, isDataTable)) {
2095 |         return false;
2096 |       }
2097 | 
2098 |       if (this._hasAncestorTag(node, "code")) {
2099 |         return false;
2100 |       }
2101 | 
2102 |       var weight = this._getClassWeight(node);
2103 | 
2104 |       this.log("Cleaning Conditionally", node);
2105 | 
2106 |       var contentScore = 0;
2107 | 
2108 |       if (weight + contentScore < 0) {
2109 |         return true;
2110 |       }
2111 | 
2112 |       if (this._getCharCount(node, ",") < 10) {
2113 |         // If there are not very many commas, and the number of
2114 |         // non-paragraph elements is more than paragraphs or other
2115 |         // ominous signs, remove the element.
2116 |         var p = node.getElementsByTagName("p").length;
2117 |         var img = node.getElementsByTagName("img").length;
2118 |         var li = node.getElementsByTagName("li").length - 100;
2119 |         var input = node.getElementsByTagName("input").length;
2120 |         var headingDensity = this._getTextDensity(node, ["h1", "h2", "h3", "h4", "h5", "h6"]);
2121 | 
2122 |         var embedCount = 0;
2123 |         var embeds = this._getAllNodesWithTag(node, ["object", "embed", "iframe"]);
2124 | 
2125 |         for (var i = 0; i < embeds.length; i++) {
2126 |           // If this embed has attribute that matches video regex, don't delete it.
2127 |           for (var j = 0; j < embeds[i].attributes.length; j++) {
2128 |             if (this._allowedVideoRegex.test(embeds[i].attributes[j].value)) {
2129 |               return false;
2130 |             }
2131 |           }
2132 | 
2133 |           // For embed with <object> tag, check inner HTML as well.
2134 |           if (embeds[i].tagName === "object" && this._allowedVideoRegex.test(embeds[i].innerHTML)) {
2135 |             return false;
2136 |           }
2137 | 
2138 |           embedCount++;
2139 |         }
2140 | 
2141 |         var linkDensity = this._getLinkDensity(node);
2142 |         var contentLength = this._getInnerText(node).length;
2143 | 
2144 |         var haveToRemove =
2145 |           (img > 1 && p / img < 0.5 && !this._hasAncestorTag(node, "figure")) ||
2146 |           (!isList && li > p) ||
2147 |           (input > Math.floor(p/3)) ||
2148 |           (!isList && headingDensity < 0.9 && contentLength < 25 && (img === 0 || img > 2) && !this._hasAncestorTag(node, "figure")) ||
2149 |           (!isList && weight < 25 && linkDensity > 0.2) ||
2150 |           (weight >= 25 && linkDensity > 0.5) ||
2151 |           ((embedCount === 1 && contentLength < 75) || embedCount > 1);
2152 |         // Allow simple lists of images to remain in pages
2153 |         if (isList && haveToRemove) {
2154 |           for (var x = 0; x < node.children.length; x++) {
2155 |             let child = node.children[x];
2156 |             // Don't filter in lists with li's that contain more than one child
2157 |             if (child.children.length > 1) {
2158 |               return haveToRemove;
2159 |             }
2160 |           }
2161 |           let li_count = node.getElementsByTagName("li").length;
2162 |           // Only allow the list to remain if every li contains an image
2163 |           if (img == li_count) {
2164 |             return false;
2165 |           }
2166 |         }
2167 |         return haveToRemove;
2168 |       }
2169 |       return false;
2170 |     });
2171 |   },
2172 | 
2173 |   /**
2174 |    * Clean out elements that match the specified conditions
2175 |    *
2176 |    * @param Element
2177 |    * @param Function determines whether a node should be removed
2178 |    * @return void
2179 |    **/
2180 |   _cleanMatchedNodes: function(e, filter) {
2181 |     var endOfSearchMarkerNode = this._getNextNode(e, true);
2182 |     var next = this._getNextNode(e);
2183 |     while (next && next != endOfSearchMarkerNode) {
2184 |       if (filter.call(this, next, next.className + " " + next.id)) {
2185 |         next = this._removeAndGetNext(next);
2186 |       } else {
2187 |         next = this._getNextNode(next);
2188 |       }
2189 |     }
2190 |   },
2191 | 
2192 |   /**
2193 |    * Clean out spurious headers from an Element.
2194 |    *
2195 |    * @param Element
2196 |    * @return void
2197 |   **/
2198 |   _cleanHeaders: function(e) {
2199 |     let headingNodes = this._getAllNodesWithTag(e, ["h1", "h2"]);
2200 |     this._removeNodes(headingNodes, function(node) {
2201 |       let shouldRemove = this._getClassWeight(node) < 0;
2202 |       if (shouldRemove) {
2203 |         this.log("Removing header with low class weight:", node);
2204 |       }
2205 |       return shouldRemove;
2206 |     });
2207 |   },
2208 | 
2209 |   /**
2210 |    * Check if this node is an H1 or H2 element whose content is mostly
2211 |    * the same as the article title.
2212 |    *
2213 |    * @param Element  the node to check.
2214 |    * @return boolean indicating whether this is a title-like header.
2215 |    */
2216 |   _headerDuplicatesTitle: function(node) {
2217 |     if (node.tagName != "H1" && node.tagName != "H2") {
2218 |       return false;
2219 |     }
2220 |     var heading = this._getInnerText(node, false);
2221 |     this.log("Evaluating similarity of header:", heading, this._articleTitle);
2222 |     return this._textSimilarity(this._articleTitle, heading) > 0.75;
2223 |   },
2224 | 
2225 |   _flagIsActive: function(flag) {
2226 |     return (this._flags & flag) > 0;
2227 |   },
2228 | 
2229 |   _removeFlag: function(flag) {
2230 |     this._flags = this._flags & ~flag;
2231 |   },
2232 | 
2233 |   _isProbablyVisible: function(node) {
2234 |     // Have to null-check node.style and node.className.indexOf to deal with SVG and MathML nodes.
2235 |     return (!node.style || node.style.display != "none")
2236 |       && (!node.style || node.style.visibility != "hidden")
2237 |       && !node.hasAttribute("hidden")
2238 |       //check for "fallback-image" so that wikimedia math images are displayed
2239 |       && (!node.hasAttribute("aria-hidden") || node.getAttribute("aria-hidden") != "true" || (node.className && node.className.indexOf && node.className.indexOf("fallback-image") !== -1));
2240 |   },
2241 | 
2242 |   /**
2243 |    * Runs readability.
2244 |    *
2245 |    * Workflow:
2246 |    *  1. Prep the document by removing script tags, css, etc.
2247 |    *  2. Build readability's DOM tree.
2248 |    *  3. Grab the article content from the current dom tree.
2249 |    *  4. Replace the current DOM tree with the new one.
2250 |    *  5. Read peacefully.
2251 |    *
2252 |    * @return Object holding the extracted article fields, or null when no article content could be grabbed
2253 |    **/
2254 |   parse: function () {
2255 |     // Avoid parsing too large documents, as per configuration option
2256 |     if (this._maxElemsToParse > 0) {
2257 |       var numTags = this._doc.getElementsByTagName("*").length;
2258 |       if (numTags > this._maxElemsToParse) {
2259 |         throw new Error("Aborting parsing document; " + numTags + " elements found");
2260 |       }
2261 |     }
2262 | 
2263 |     // Unwrap image from noscript
2264 |     this._unwrapNoscriptImages(this._doc);
2265 | 
2266 |     // Extract JSON-LD metadata before removing scripts
2267 |     var jsonLd = this._disableJSONLD ? {} : this._getJSONLD(this._doc);
2268 | 
2269 |     // Remove script tags from the document.
2270 |     this._removeScripts(this._doc);
2271 | 
2272 |     this._prepDocument();
2273 | 
2274 |     var metadata = this._getArticleMetadata(jsonLd);
2275 |     this._articleTitle = metadata.title;
2276 | 
2277 |     var articleContent = this._grabArticle();
2278 |     if (!articleContent)
2279 |       return null;
2280 | 
2281 |     this.log("Grabbed: " + articleContent.innerHTML);
2282 | 
2283 |     this._postProcessContent(articleContent);
2284 | 
2285 |     // If we haven't found an excerpt in the article's metadata, use the article's
2286 |     // first paragraph as the excerpt. This is used for displaying a preview of
2287 |     // the article's content.
2288 |     if (!metadata.excerpt) {
2289 |       var paragraphs = articleContent.getElementsByTagName("p");
2290 |       if (paragraphs.length > 0) {
2291 |         metadata.excerpt = paragraphs[0].textContent.trim();
2292 |       }
2293 |     }
2294 | 
2295 |     var textContent = articleContent.textContent;
2296 |     return {
2297 |       title: this._articleTitle,
2298 |       byline: metadata.byline || this._articleByline,
2299 |       dir: this._articleDir,
2300 |       lang: this._articleLang,
2301 |       content: this._serializer(articleContent),
2302 |       textContent: textContent,
2303 |       length: textContent.length,
2304 |       excerpt: metadata.excerpt,
2305 |       siteName: metadata.siteName || this._articleSiteName,
2306 |       publishedTime: metadata.publishedTime
2307 |     };
2308 |   }
2309 | };
2310 | 
2311 | if (typeof module === "object") {
2312 |   /* global module */
2313 |   module.exports = Readability;
2314 | }
2315 | 
```