hangwin/mcp-chrome # codebase.md

This is page 8 of 10. To view the full context, open http://codebase.md/hangwin/mcp-chrome?lines=true&page={x}, replacing {x} with the page number (1–10).

# Directory Structure

```
├── .gitattributes
├── .github
│   └── workflows
│       └── build-release.yml
├── .gitignore
├── .husky
│   ├── commit-msg
│   └── pre-commit
├── .prettierignore
├── .prettierrc.json
├── .vscode
│   └── extensions.json
├── app
│   ├── chrome-extension
│   │   ├── _locales
│   │   │   ├── de
│   │   │   │   └── messages.json
│   │   │   ├── en
│   │   │   │   └── messages.json
│   │   │   ├── ja
│   │   │   │   └── messages.json
│   │   │   ├── ko
│   │   │   │   └── messages.json
│   │   │   ├── zh_CN
│   │   │   │   └── messages.json
│   │   │   └── zh_TW
│   │   │       └── messages.json
│   │   ├── .env.example
│   │   ├── assets
│   │   │   └── vue.svg
│   │   ├── common
│   │   │   ├── constants.ts
│   │   │   ├── message-types.ts
│   │   │   └── tool-handler.ts
│   │   ├── entrypoints
│   │   │   ├── background
│   │   │   │   ├── index.ts
│   │   │   │   ├── native-host.ts
│   │   │   │   ├── semantic-similarity.ts
│   │   │   │   ├── storage-manager.ts
│   │   │   │   └── tools
│   │   │   │       ├── base-browser.ts
│   │   │   │       ├── browser
│   │   │   │       │   ├── bookmark.ts
│   │   │   │       │   ├── common.ts
│   │   │   │       │   ├── console.ts
│   │   │   │       │   ├── file-upload.ts
│   │   │   │       │   ├── history.ts
│   │   │   │       │   ├── index.ts
│   │   │   │       │   ├── inject-script.ts
│   │   │   │       │   ├── interaction.ts
│   │   │   │       │   ├── keyboard.ts
│   │   │   │       │   ├── network-capture-debugger.ts
│   │   │   │       │   ├── network-capture-web-request.ts
│   │   │   │       │   ├── network-request.ts
│   │   │   │       │   ├── screenshot.ts
│   │   │   │       │   ├── vector-search.ts
│   │   │   │       │   ├── web-fetcher.ts
│   │   │   │       │   └── window.ts
│   │   │   │       └── index.ts
│   │   │   ├── content.ts
│   │   │   ├── offscreen
│   │   │   │   ├── index.html
│   │   │   │   └── main.ts
│   │   │   └── popup
│   │   │       ├── App.vue
│   │   │       ├── components
│   │   │       │   ├── ConfirmDialog.vue
│   │   │       │   ├── icons
│   │   │       │   │   ├── BoltIcon.vue
│   │   │       │   │   ├── CheckIcon.vue
│   │   │       │   │   ├── DatabaseIcon.vue
│   │   │       │   │   ├── DocumentIcon.vue
│   │   │       │   │   ├── index.ts
│   │   │       │   │   ├── TabIcon.vue
│   │   │       │   │   ├── TrashIcon.vue
│   │   │       │   │   └── VectorIcon.vue
│   │   │       │   ├── ModelCacheManagement.vue
│   │   │       │   └── ProgressIndicator.vue
│   │   │       ├── index.html
│   │   │       ├── main.ts
│   │   │       └── style.css
│   │   ├── eslint.config.js
│   │   ├── inject-scripts
│   │   │   ├── click-helper.js
│   │   │   ├── fill-helper.js
│   │   │   ├── inject-bridge.js
│   │   │   ├── interactive-elements-helper.js
│   │   │   ├── keyboard-helper.js
│   │   │   ├── network-helper.js
│   │   │   ├── screenshot-helper.js
│   │   │   └── web-fetcher-helper.js
│   │   ├── LICENSE
│   │   ├── package.json
│   │   ├── public
│   │   │   ├── icon
│   │   │   │   ├── 128.png
│   │   │   │   ├── 16.png
│   │   │   │   ├── 32.png
│   │   │   │   ├── 48.png
│   │   │   │   └── 96.png
│   │   │   ├── libs
│   │   │   │   └── ort.min.js
│   │   │   └── wxt.svg
│   │   ├── README.md
│   │   ├── tsconfig.json
│   │   ├── utils
│   │   │   ├── content-indexer.ts
│   │   │   ├── i18n.ts
│   │   │   ├── image-utils.ts
│   │   │   ├── lru-cache.ts
│   │   │   ├── model-cache-manager.ts
│   │   │   ├── offscreen-manager.ts
│   │   │   ├── semantic-similarity-engine.ts
│   │   │   ├── simd-math-engine.ts
│   │   │   ├── text-chunker.ts
│   │   │   └── vector-database.ts
│   │   ├── workers
│   │   │   ├── ort-wasm-simd-threaded.jsep.mjs
│   │   │   ├── ort-wasm-simd-threaded.jsep.wasm
│   │   │   ├── ort-wasm-simd-threaded.mjs
│   │   │   ├── ort-wasm-simd-threaded.wasm
│   │   │   ├── simd_math_bg.wasm
│   │   │   ├── simd_math.js
│   │   │   └── similarity.worker.js
│   │   └── wxt.config.ts
│   └── native-server
│       ├── debug.sh
│       ├── install.md
│       ├── jest.config.js
│       ├── package.json
│       ├── README.md
│       ├── src
│       │   ├── cli.ts
│       │   ├── constant
│       │   │   └── index.ts
│       │   ├── file-handler.ts
│       │   ├── index.ts
│       │   ├── mcp
│       │   │   ├── mcp-server-stdio.ts
│       │   │   ├── mcp-server.ts
│       │   │   ├── register-tools.ts
│       │   │   └── stdio-config.json
│       │   ├── native-messaging-host.ts
│       │   ├── scripts
│       │   │   ├── browser-config.ts
│       │   │   ├── build.ts
│       │   │   ├── constant.ts
│       │   │   ├── postinstall.ts
│       │   │   ├── register-dev.ts
│       │   │   ├── register.ts
│       │   │   ├── run_host.bat
│       │   │   ├── run_host.sh
│       │   │   └── utils.ts
│       │   ├── server
│       │   │   ├── index.ts
│       │   │   └── server.test.ts
│       │   └── util
│       │       └── logger.ts
│       └── tsconfig.json
├── commitlint.config.cjs
├── docs
│   ├── ARCHITECTURE_zh.md
│   ├── ARCHITECTURE.md
│   ├── CHANGELOG.md
│   ├── CONTRIBUTING_zh.md
│   ├── CONTRIBUTING.md
│   ├── TOOLS_zh.md
│   ├── TOOLS.md
│   ├── TROUBLESHOOTING_zh.md
│   ├── TROUBLESHOOTING.md
│   └── WINDOWS_INSTALL_zh.md
├── eslint.config.js
├── LICENSE
├── package.json
├── packages
│   ├── shared
│   │   ├── package.json
│   │   ├── src
│   │   │   ├── constants.ts
│   │   │   ├── index.ts
│   │   │   ├── tools.ts
│   │   │   └── types.ts
│   │   └── tsconfig.json
│   └── wasm-simd
│       ├── .gitignore
│       ├── BUILD.md
│       ├── Cargo.toml
│       ├── package.json
│       ├── README.md
│       └── src
│           └── lib.rs
├── pnpm-lock.yaml
├── pnpm-workspace.yaml
├── prompt
│   ├── content-analize.md
│   ├── excalidraw-prompt.md
│   └── modify-web.md
├── README_zh.md
├── README.md
├── releases
│   ├── chrome-extension
│   │   └── latest
│   │       └── chrome-mcp-server-lastest.zip
│   └── README.md
└── test-inject-script.js
```

# Files

--------------------------------------------------------------------------------
/app/chrome-extension/inject-scripts/web-fetcher-helper.js:
--------------------------------------------------------------------------------

```javascript
   1 | /* eslint-disable */
   2 | 
   3 | if (window.__WEB_FETCHER_HELPER_INITIALIZED__) {
   4 |   // Already initialized, skip
   5 | } else {
   6 |   window.__WEB_FETCHER_HELPER_INITIALIZED__ = true;
   7 | 
   8 |   /*
   9 |    * Copyright (c) 2010 Arc90 Inc
  10 |    *
  11 |    * Licensed under the Apache License, Version 2.0 (the "License");
  12 |    * you may not use this file except in compliance with the License.
  13 |    * You may obtain a copy of the License at
  14 |    *
  15 |    *     http://www.apache.org/licenses/LICENSE-2.0
  16 |    *
  17 |    * Unless required by applicable law or agreed to in writing, software
  18 |    * distributed under the License is distributed on an "AS IS" BASIS,
  19 |    * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  20 |    * See the License for the specific language governing permissions and
  21 |    * limitations under the License.
  22 |    */
  23 | 
  24 |   /*
  25 |    * This code is heavily based on Arc90's readability.js (1.7.1) script
  26 |    * available at: http://code.google.com/p/arc90labs-readability
  27 |    */
  28 | 
  29 |   /**
  30 |    * Public constructor: validates its arguments, then stores parse state and options on `this`.
  31 |    * @param {HTMLDocument} doc     The document to parse (an Error is thrown if no document is given).
  32 |    * @param {Object}       options The options object; each missing or falsy option falls back to a default.
  33 |    */
  34 |   function Readability(doc, options) {
  35 |     // In some older versions, people passed a URI as the first argument. Cope:
  36 |     if (options && options.documentElement) {
  37 |       doc = options;
  38 |       options = arguments[2];
  39 |     } else if (!doc || !doc.documentElement) {
  40 |       throw new Error('First argument to Readability constructor should be a document object.');
  41 |     }
  42 |     options = options || {};
  43 | 
  44 |     this._doc = doc;
  45 |     this._docJSDOMParser = this._doc.firstChild.__JSDOMParser__; // marker read from the first child; consulted by the live-node-list guards in _removeNodes/_replaceNodeTags
  46 |     this._articleTitle = null;
  47 |     this._articleByline = null;
  48 |     this._articleDir = null;
  49 |     this._articleSiteName = null;
  50 |     this._attempts = [];
  51 |     this._metadata = {};
  52 | 
  53 |     // Configurable options (defaults are applied with `||`, so a falsy value such as 0 also selects the default)
  54 |     this._debug = !!options.debug;
  55 |     this._maxElemsToParse = options.maxElemsToParse || this.DEFAULT_MAX_ELEMS_TO_PARSE;
  56 |     this._nbTopCandidates = options.nbTopCandidates || this.DEFAULT_N_TOP_CANDIDATES;
  57 |     this._charThreshold = options.charThreshold || this.DEFAULT_CHAR_THRESHOLD;
  58 |     this._classesToPreserve = this.CLASSES_TO_PRESERVE.concat(options.classesToPreserve || []);
  59 |     this._keepClasses = !!options.keepClasses;
  60 |     this._serializer =
  61 |       options.serializer ||
  62 |       function (el) {
  63 |         return el.innerHTML;
  64 |       };
  65 |     this._disableJSONLD = !!options.disableJSONLD;
  66 |     this._allowedVideoRegex = options.allowedVideoRegex || this.REGEXPS.videos;
  67 |     this._linkDensityModifier = options.linkDensityModifier || 0;
  68 | 
  69 |     // Start with all flags set
  70 |     this._flags =
  71 |       this.FLAG_STRIP_UNLIKELYS | this.FLAG_WEIGHT_CLASSES | this.FLAG_CLEAN_CONDITIONALLY;
  72 | 
  73 |     // Install this.log: a formatting logger when options.debug was set, otherwise a no-op
  74 |     if (this._debug) {
  75 |       let logNode = function (node) { // formats a node as a short string: text nodes by content, others by tag and attributes
  76 |         if (node.nodeType == node.TEXT_NODE) {
  77 |           return `${node.nodeName} ("${node.textContent}")`;
  78 |         }
  79 |         let attrPairs = Array.from(node.attributes || [], function (attr) {
  80 |           return `${attr.name}="${attr.value}"`;
  81 |         }).join(' ');
  82 |         return `<${node.localName} ${attrPairs}>`;
  83 |       };
  84 |       this.log = function () {
  85 |         if (typeof console !== 'undefined') {
  86 |           let args = Array.from(arguments, (arg) => {
  87 |             if (arg && arg.nodeType == this.ELEMENT_NODE) {
  88 |               return logNode(arg);
  89 |             }
  90 |             return arg;
  91 |           });
  92 |           args.unshift('Reader: (Readability)');
  93 | 
  94 |           // Debug logging removed: `args` is built above but nothing is printed in this branch
  95 |         } else if (typeof dump !== 'undefined') {
  96 |           /* global dump */
  97 |           var msg = Array.prototype.map
  98 |             .call(arguments, function (x) {
  99 |               return x && x.nodeName ? logNode(x) : x;
 100 |             })
 101 |             .join(' ');
 102 |           dump('Reader: (Readability) ' + msg + '\n');
 103 |         }
 104 |       };
 105 |     } else {
 106 |       this.log = function () {};
 107 |     }
 108 |   }
 109 | 
 110 |   Readability.prototype = {
 111 |     FLAG_STRIP_UNLIKELYS: 0x1,
 112 |     FLAG_WEIGHT_CLASSES: 0x2,
 113 |     FLAG_CLEAN_CONDITIONALLY: 0x4,
 114 | 
 115 |     // https://developer.mozilla.org/en-US/docs/Web/API/Node/nodeType
 116 |     ELEMENT_NODE: 1,
 117 |     TEXT_NODE: 3,
 118 | 
 119 |     // Max number of nodes supported by this parser. Default: 0 (no limit)
 120 |     DEFAULT_MAX_ELEMS_TO_PARSE: 0,
 121 | 
 122 |     // The number of top candidates to consider when analysing how
 123 |     // tight the competition is among candidates.
 124 |     DEFAULT_N_TOP_CANDIDATES: 5,
 125 | 
 126 |     // Element tags to score by default.
 127 |     DEFAULT_TAGS_TO_SCORE: 'section,h2,h3,h4,h5,h6,p,td,pre'.toUpperCase().split(','),
 128 | 
 129 |     // The default number of chars an article must have in order to return a result
 130 |     DEFAULT_CHAR_THRESHOLD: 500,
 131 | 
 132 |     // All of the regular expressions in use within readability.
 133 |     // Defined up here so we don't instantiate them repeatedly in loops.
 134 |     REGEXPS: {
 135 |       // NOTE: These two regular expressions are duplicated in
 136 |       // Readability-readerable.js. Please keep both copies in sync.
 137 |       unlikelyCandidates:
 138 |         /-ad-|ai2html|banner|breadcrumbs|combx|comment|community|cover-wrap|disqus|extra|footer|gdpr|header|legends|menu|related|remark|replies|rss|shoutbox|sidebar|skyscraper|social|sponsor|supplemental|ad-break|agegate|pagination|pager|popup|yom-remote/i,
 139 |       okMaybeItsACandidate: /and|article|body|column|content|main|shadow/i,
 140 | 
 141 |       positive:
 142 |         /article|body|content|entry|hentry|h-entry|main|page|pagination|post|text|blog|story/i,
 143 |       negative:
 144 |         /-ad-|hidden|^hid$| hid$| hid |^hid |banner|combx|comment|com-|contact|footer|gdpr|masthead|media|meta|outbrain|promo|related|scroll|share|shoutbox|sidebar|skyscraper|sponsor|shopping|tags|widget/i,
 145 |       extraneous:
 146 |         /print|archive|comment|discuss|e[\-]?mail|share|reply|all|login|sign|single|utility/i,
 147 |       byline: /byline|author|dateline|writtenby|p-author/i,
 148 |       replaceFonts: /<(\/?)font[^>]*>/gi,
 149 |       normalize: /\s{2,}/g,
 150 |       videos:
 151 |         /\/\/(www\.)?((dailymotion|youtube|youtube-nocookie|player\.vimeo|v\.qq)\.com|(archive|upload\.wikimedia)\.org|player\.twitch\.tv)/i,
 152 |       shareElements: /(\b|_)(share|sharedaddy)(\b|_)/i,
 153 |       nextLink: /(next|weiter|continue|>([^\|]|$)|»([^\|]|$))/i,
 154 |       prevLink: /(prev|earl|old|new|<|«)/i,
 155 |       tokenize: /\W+/g,
 156 |       whitespace: /^\s*$/,
 157 |       hasContent: /\S$/,
 158 |       hashUrl: /^#.+/,
 159 |       srcsetUrl: /(\S+)(\s+[\d.]+[xw])?(\s*(?:,|$))/g,
 160 |       b64DataUrl: /^data:\s*([^\s;,]+)\s*;\s*base64\s*,/i,
 161 |       // Commas as used in Latin, Sindhi, Chinese and various other scripts.
 162 |       // see: https://en.wikipedia.org/wiki/Comma#Comma_variants
 163 |       commas: /\u002C|\u060C|\uFE50|\uFE10|\uFE11|\u2E41|\u2E34|\u2E32|\uFF0C/g,
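 164 |       // See: https://schema.org/Article. NOTE(review): `^` binds only to the first alternative and `$` only to the last, so the other names match as substrings - confirm this is intended.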
 165 |       jsonLdArticleTypes:
 166 |         /^Article|AdvertiserContentArticle|NewsArticle|AnalysisNewsArticle|AskPublicNewsArticle|BackgroundNewsArticle|OpinionNewsArticle|ReportageNewsArticle|ReviewNewsArticle|Report|SatiricalArticle|ScholarlyArticle|MedicalScholarlyArticle|SocialMediaPosting|BlogPosting|LiveBlogPosting|DiscussionForumPosting|TechArticle|APIReference$/,
 167 |       // used to see if a node's content matches words commonly used for ad blocks or loading indicators
 168 |       adWords: /^(ad(vertising|vertisement)?|pub(licité)?|werb(ung)?|广告|Реклама|Anuncio)$/iu,
 169 |       loadingWords: /^((loading|正在加载|Загрузка|chargement|cargando)(…|\.\.\.)?)$/iu,
 170 |     },
 171 | 
 172 |     UNLIKELY_ROLES: [
 173 |       'menu',
 174 |       'menubar',
 175 |       'complementary',
 176 |       'navigation',
 177 |       'alert',
 178 |       'alertdialog',
 179 |       'dialog',
 180 |     ],
 181 | 
 182 |     DIV_TO_P_ELEMS: new Set(['BLOCKQUOTE', 'DL', 'DIV', 'IMG', 'OL', 'P', 'PRE', 'TABLE', 'UL']),
 183 | 
 184 |     ALTER_TO_DIV_EXCEPTIONS: ['DIV', 'ARTICLE', 'SECTION', 'P', 'OL', 'UL'],
 185 | 
 186 |     PRESENTATIONAL_ATTRIBUTES: [
 187 |       'align',
 188 |       'background',
 189 |       'bgcolor',
 190 |       'border',
 191 |       'cellpadding',
 192 |       'cellspacing',
 193 |       'frame',
 194 |       'hspace',
 195 |       'rules',
 196 |       'style',
 197 |       'valign',
 198 |       'vspace',
 199 |     ],
 200 | 
 201 |     DEPRECATED_SIZE_ATTRIBUTE_ELEMS: ['TABLE', 'TH', 'TD', 'HR', 'PRE'],
 202 | 
 203 |     // The commented out elements qualify as phrasing content but tend to be
 204 |     // removed by readability when put into paragraphs, so we ignore them here.
 205 |     PHRASING_ELEMS: [
 206 |       // "CANVAS", "IFRAME", "SVG", "VIDEO",
 207 |       'ABBR',
 208 |       'AUDIO',
 209 |       'B',
 210 |       'BDO',
 211 |       'BR',
 212 |       'BUTTON',
 213 |       'CITE',
 214 |       'CODE',
 215 |       'DATA',
 216 |       'DATALIST',
 217 |       'DFN',
 218 |       'EM',
 219 |       'EMBED',
 220 |       'I',
 221 |       'IMG',
 222 |       'INPUT',
 223 |       'KBD',
 224 |       'LABEL',
 225 |       'MARK',
 226 |       'MATH',
 227 |       'METER',
 228 |       'NOSCRIPT',
 229 |       'OBJECT',
 230 |       'OUTPUT',
 231 |       'PROGRESS',
 232 |       'Q',
 233 |       'RUBY',
 234 |       'SAMP',
 235 |       'SCRIPT',
 236 |       'SELECT',
 237 |       'SMALL',
 238 |       'SPAN',
 239 |       'STRONG',
 240 |       'SUB',
 241 |       'SUP',
 242 |       'TEXTAREA',
 243 |       'TIME',
 244 |       'VAR',
 245 |       'WBR',
 246 |     ],
 247 | 
 248 |     // These are the classes that readability sets itself.
 249 |     CLASSES_TO_PRESERVE: ['page'],
 250 | 
 251 |     // This is the list of HTML entities that need to be escaped (entity name -> literal character).
 252 |     HTML_ESCAPE_MAP: {
 253 |       lt: '<',
 254 |       gt: '>',
 255 |       amp: '&',
 256 |       quot: '"',
 257 |       apos: "'",
 258 |     },
 259 | 
 260 |     /**
 261 |      * Run post-process modifications on the article content, in place: make URIs absolute,
 262 |      * simplify nested DIV/SECTION wrappers and, unless `keepClasses` was set, clean class attributes.
 263 |      * @param {Element} articleContent Root element of the article content to modify.
 264 |      * @return {void}
 265 |      **/
 266 |     _postProcessContent(articleContent) {
 267 |       // Readability cannot open relative uris so we convert them to absolute uris.
 268 |       this._fixRelativeUris(articleContent);
 269 | 
 270 |       this._simplifyNestedElements(articleContent);
 271 | 
 272 |       if (!this._keepClasses) {
 273 |         // Remove classes (except the ones listed in this._classesToPreserve).
 274 |         this._cleanClasses(articleContent);
 275 |       }
 276 |     },
 277 | 
 278 |     /**
 279 |      * Iterates over a NodeList from last to first, calls `filterFn` for each node that still
 280 |      * has a parent, and removes the node if the function returned a truthy value.
 281 |      *
 282 |      * If function is not passed, removes all the nodes in node list.
 283 |      * Throws an Error when `this._docJSDOMParser` is set and `nodeList._isLiveNodeList` is truthy.
 284 |      * @param {NodeList} nodeList The nodes to operate on
 285 |      * @param {Function} [filterFn] the function to use as a filter; called as (node, index, nodeList) with `this` set to the instance
 286 |      * @return {void}
 287 |      */
 288 |     _removeNodes(nodeList, filterFn) {
 289 |       // Avoid ever operating on live node lists.
 290 |       if (this._docJSDOMParser && nodeList._isLiveNodeList) {
 291 |         throw new Error('Do not pass live node lists to _removeNodes');
 292 |       }
 293 |       for (var i = nodeList.length - 1; i >= 0; i--) {
 294 |         var node = nodeList[i];
 295 |         var parentNode = node.parentNode;
 296 |         if (parentNode) { // nodes without a parent are skipped; filterFn is not called for them
 297 |           if (!filterFn || filterFn.call(this, node, i, nodeList)) {
 298 |             parentNode.removeChild(node);
 299 |           }
 300 |         }
 301 |       }
 302 |     },
 303 | 
 304 |     /**
 305 |      * Iterates over a NodeList, and calls _setNodeTag(node, newTagName) for each node.
 306 |      * Throws an Error when `this._docJSDOMParser` is set and `nodeList._isLiveNodeList` is truthy.
 307 |      * @param {NodeList} nodeList The nodes to operate on
 308 |      * @param {String} newTagName the new tag name to use (passed through to _setNodeTag, which is defined outside this excerpt)
 309 |      * @return {void}
 310 |      */
 311 |     _replaceNodeTags(nodeList, newTagName) {
 312 |       // Avoid ever operating on live node lists.
 313 |       if (this._docJSDOMParser && nodeList._isLiveNodeList) {
 314 |         throw new Error('Do not pass live node lists to _replaceNodeTags');
 315 |       }
 316 |       for (const node of nodeList) {
 317 |         this._setNodeTag(node, newTagName);
 318 |       }
 319 |     },
 320 | 
 321 |     /**
 322 |      * Iterate over a NodeList, which doesn't natively fully implement the Array
 323 |      * interface, by borrowing Array.prototype.forEach.
 324 |      *
 325 |      * For convenience, the current object context is applied to the provided
 326 |      * iterate function (i.e. `this` inside `fn` is the Readability instance).
 327 |      *
 328 |      * @param  {NodeList} nodeList The NodeList.
 329 |      * @param  {Function} fn       The iterate function.
 330 |      * @return {void}
 331 |      */
 332 |     _forEachNode(nodeList, fn) {
 333 |       Array.prototype.forEach.call(nodeList, fn, this);
 334 |     },
 335 | 
 336 |     /**
 337 |      * Iterate over a NodeList, and return the first node that passes
 338 |      * the supplied test function (borrows Array.prototype.find).
 339 |      *
 340 |      * For convenience, the current object context is applied to the provided
 341 |      * test function.
 342 |      *
 343 |      * @param  {NodeList} nodeList The NodeList.
 344 |      * @param  {Function} fn       The test function.
 345 |      * @return {Node|undefined} The first node for which `fn` returned a truthy value, or undefined if there is none.
 346 |      */
 347 |     _findNode(nodeList, fn) {
 348 |       return Array.prototype.find.call(nodeList, fn, this);
 349 |     },
 350 | 
 351 |     /**
 352 |      * Iterate over a NodeList, return true if any of the provided iterate
 353 |      * function calls returns a truthy value, false otherwise (borrows Array.prototype.some).
 354 |      *
 355 |      * For convenience, the current object context is applied to the
 356 |      * provided iterate function.
 357 |      *
 358 |      * @param  {NodeList} nodeList The NodeList.
 359 |      * @param  {Function} fn       The iterate function.
 360 |      * @return {Boolean}
 361 |      */
 362 |     _someNode(nodeList, fn) {
 363 |       return Array.prototype.some.call(nodeList, fn, this);
 364 |     },
 365 | 
 366 |     /**
 367 |      * Iterate over a NodeList, return true if all of the provided iterate
 368 |      * function calls return a truthy value, false otherwise (borrows Array.prototype.every).
 369 |      *
 370 |      * For convenience, the current object context is applied to the
 371 |      * provided iterate function.
 372 |      *
 373 |      * @param  {NodeList} nodeList The NodeList.
 374 |      * @param  {Function} fn       The iterate function.
 375 |      * @return {Boolean}
 376 |      */
 377 |     _everyNode(nodeList, fn) {
 378 |       return Array.prototype.every.call(nodeList, fn, this);
 379 |     },
 380 | 
 381 |     _getAllNodesWithTag(node, tagNames) { // Returns the descendants of `node` whose tag is listed in the `tagNames` array
 382 |       if (node.querySelectorAll) { // preferred path: a single selector query covering every tag name
 383 |         return node.querySelectorAll(tagNames.join(','));
 384 |       }
 385 |       return [].concat.apply( // fallback: flatten the per-tag collections into one plain array
 386 |         [],
 387 |         tagNames.map(function (tag) {
 388 |           var collection = node.getElementsByTagName(tag);
 389 |           return Array.isArray(collection) ? collection : Array.from(collection); // copy non-array collections into arrays before concatenating
 390 |         }),
 391 |       );
 392 |     },
 393 | 
 394 |     /**
 395 |      * Removes the class="" attribute from every element in the given
 396 |      * subtree, except those that match CLASSES_TO_PRESERVE and
 397 |      * the classesToPreserve array from the options object (kept class names are re-set, space-joined).
 398 |      *
 399 |      * @param {Element} node Root of the subtree to clean; processed recursively, in place.
 400 |      * @return {void}
 401 |      */
 402 |     _cleanClasses(node) {
 403 |       var classesToPreserve = this._classesToPreserve;
 404 |       var className = (node.getAttribute('class') || '')
 405 |         .split(/\s+/)
 406 |         .filter((cls) => classesToPreserve.includes(cls))
 407 |         .join(' ');
 408 | 
 409 |       if (className) {
 410 |         node.setAttribute('class', className);
 411 |       } else {
 412 |         node.removeAttribute('class');
 413 |       }
 414 | 
 415 |       for (node = node.firstElementChild; node; node = node.nextElementSibling) { // the `node` parameter is reused as the cursor over element children
 416 |         this._cleanClasses(node);
 417 |       }
 418 |     },
 419 | 
 420 |     /**
 421 |      * Tests whether a string is a URL or not, i.e. whether `new URL(str)` (no base) accepts it.
 422 |      *
 423 |      * @param {string} str The string to test
 424 |      * @return {boolean} true if str is a URL, false if the URL constructor threw
 425 |      */
 426 |     _isUrl(str) {
 427 |       try {
 428 |         new URL(str);
 429 |         return true;
 430 |       } catch {
 431 |         return false;
 432 |       }
 433 |     },
 434 |     /**
 435 |      * Converts each <a> href and each media (img, picture, figure, video, audio, source) src/poster/srcset
 436 |      * URI in the given element to an absolute URI; `javascript:` links are replaced by their contents.
 437 |      *
 438 |      * @param {Element} articleContent The element whose descendants are rewritten in place.
 439 |      * @return {void}
 440 |      */
 441 |     _fixRelativeUris(articleContent) {
 442 |       var baseURI = this._doc.baseURI;
 443 |       var documentURI = this._doc.documentURI;
 444 |       function toAbsoluteURI(uri) {
 445 |         // Leave hash links alone if the base URI matches the document URI:
 446 |         if (baseURI == documentURI && uri.charAt(0) == '#') {
 447 |           return uri;
 448 |         }
 449 | 
 450 |         // Otherwise, resolve against base URI:
 451 |         try {
 452 |           return new URL(uri, baseURI).href;
 453 |         } catch (ex) {
 454 |           // Something went wrong, just return the original:
 455 |         }
 456 |         return uri;
 457 |       }
 458 | 
 459 |       var links = this._getAllNodesWithTag(articleContent, ['a']);
 460 |       this._forEachNode(links, function (link) {
 461 |         var href = link.getAttribute('href');
 462 |         if (href) {
 463 |           // Remove links with javascript: URIs, since
 464 |           // they won't work after scripts have been removed from the page.
 465 |           if (href.indexOf('javascript:') === 0) {
 466 |             // if the link only contains simple text content, it can be converted to a text node
 467 |             if (link.childNodes.length === 1 && link.childNodes[0].nodeType === this.TEXT_NODE) {
 468 |               var text = this._doc.createTextNode(link.textContent);
 469 |               link.parentNode.replaceChild(text, link);
 470 |             } else {
 471 |               // otherwise all of the link's children are moved into a <span> that replaces the link
 472 |               var container = this._doc.createElement('span');
 473 |               while (link.firstChild) {
 474 |                 container.appendChild(link.firstChild);
 475 |               }
 476 |               link.parentNode.replaceChild(container, link);
 477 |             }
 478 |           } else {
 479 |             link.setAttribute('href', toAbsoluteURI(href));
 480 |           }
 481 |         }
 482 |       });
 483 | 
 484 |       var medias = this._getAllNodesWithTag(articleContent, [
 485 |         'img',
 486 |         'picture',
 487 |         'figure',
 488 |         'video',
 489 |         'audio',
 490 |         'source',
 491 |       ]);
 492 | 
 493 |       this._forEachNode(medias, function (media) {
 494 |         var src = media.getAttribute('src');
 495 |         var poster = media.getAttribute('poster');
 496 |         var srcset = media.getAttribute('srcset');
 497 | 
 498 |         if (src) {
 499 |           media.setAttribute('src', toAbsoluteURI(src));
 500 |         }
 501 | 
 502 |         if (poster) {
 503 |           media.setAttribute('poster', toAbsoluteURI(poster));
 504 |         }
 505 | 
 506 |         if (srcset) {
 507 |           var newSrcset = srcset.replace(this.REGEXPS.srcsetUrl, function (_, p1, p2, p3) { // p1: URL, p2: optional width/density descriptor, p3: trailing separator
 508 |             return toAbsoluteURI(p1) + (p2 || '') + p3;
 509 |           });
 510 | 
 511 |           media.setAttribute('srcset', newSrcset);
 512 |         }
 513 |       });
 514 |     },
 515 | 
 516 |     _simplifyNestedElements(articleContent) { // Walks the subtree and removes or unwraps redundant DIV/SECTION elements, in place
 517 |       var node = articleContent;
 518 | 
 519 |       while (node) {
 520 |         if (
 521 |           node.parentNode &&
 522 |           ['DIV', 'SECTION'].includes(node.tagName) &&
 523 |           !(node.id && node.id.startsWith('readability')) // elements whose id starts with "readability" are left alone
 524 |         ) {
 525 |           if (this._isElementWithoutContent(node)) {
 526 |             node = this._removeAndGetNext(node); // presumably removes `node` and returns the next node to visit - helper not shown in this excerpt
 527 |             continue;
 528 |           } else if (
 529 |             this._hasSingleTagInsideElement(node, 'DIV') ||
 530 |             this._hasSingleTagInsideElement(node, 'SECTION')
 531 |           ) {
 532 |             var child = node.children[0];
 533 |             for (var i = 0; i < node.attributes.length; i++) { // copy the wrapper's attributes onto the child that will replace it
 534 |               child.setAttributeNode(node.attributes[i].cloneNode());
 535 |             }
 536 |             node.parentNode.replaceChild(child, node);
 537 |             node = child; // the promoted child is examined again on the next iteration
 538 |             continue;
 539 |           }
 540 |         }
 541 | 
 542 |         node = this._getNextNode(node);
 543 |       }
 544 |     },
 545 | 
 546 |     /**
 547 |      * Get the article title as a string, derived from the document title: the title is cut at
 548 |      * separators or colons, and the original title is used again when the result is too short.
 549 |      * @return {string} The chosen title.
 550 |      **/
 551 |     _getArticleTitle() {
 552 |       var doc = this._doc;
 553 |       var curTitle = '';
 554 |       var origTitle = '';
 555 | 
 556 |       try {
 557 |         curTitle = origTitle = doc.title.trim();
 558 | 
 559 |         // If they had an element with id "title" in their HTML. NOTE(review): .trim() above runs first, so a non-string doc.title presumably throws into the catch before this check - confirm.
 560 |         if (typeof curTitle !== 'string') {
 561 |           curTitle = origTitle = this._getInnerText(doc.getElementsByTagName('title')[0]);
 562 |         }
 563 |       } catch (e) {
 564 |         /* ignore exceptions setting the title (both titles keep whatever value they had, '' initially). */
 565 |       }
 566 | 
 567 |       var titleHadHierarchicalSeparators = false;
 568 |       function wordCount(str) {
 569 |         return str.split(/\s+/).length;
 570 |       }
 571 | 
 572 |       // If there's a separator in the title, first remove the final part
 573 |       if (/ [\|\-\\\/>»] /.test(curTitle)) {
 574 |         titleHadHierarchicalSeparators = / [\\\/>»] /.test(curTitle);
 575 |         let allSeparators = Array.from(origTitle.matchAll(/ [\|\-\\\/>»] /gi));
 576 |         curTitle = origTitle.substring(0, allSeparators.pop().index);
 577 | 
 578 |         // If the resulting title is too short, remove the first part instead:
 579 |         if (wordCount(curTitle) < 3) {
 580 |           curTitle = origTitle.replace(/^[^\|\-\\\/>»]*[\|\-\\\/>»]/gi, '');
 581 |         }
 582 |       } else if (curTitle.includes(': ')) {
 583 |         // Check if we have an heading containing this exact string, so we
 584 |         // could assume it's the full title.
 585 |         var headings = this._getAllNodesWithTag(doc, ['h1', 'h2']);
 586 |         var trimmedTitle = curTitle.trim();
 587 |         var match = this._someNode(headings, function (heading) {
 588 |           return heading.textContent.trim() === trimmedTitle;
 589 |         });
 590 | 
 591 |         // If we don't, let's extract the title out of the original title string.
 592 |         if (!match) {
 593 |           curTitle = origTitle.substring(origTitle.lastIndexOf(':') + 1);
 594 | 
 595 |           // If the title is now too short, try the first colon instead:
 596 |           if (wordCount(curTitle) < 3) {
 597 |             curTitle = origTitle.substring(origTitle.indexOf(':') + 1);
 598 |             // But if we have too many words before the colon there's something weird
 599 |             // with the titles and the H tags so let's just use the original title instead
 600 |           } else if (wordCount(origTitle.substr(0, origTitle.indexOf(':'))) > 5) {
 601 |             curTitle = origTitle;
 602 |           }
 603 |         }
 604 |       } else if (curTitle.length > 150 || curTitle.length < 15) {
 605 |         var hOnes = doc.getElementsByTagName('h1');
 606 | 
 607 |         if (hOnes.length === 1) {
 608 |           curTitle = this._getInnerText(hOnes[0]);
 609 |         }
 610 |       }
 611 | 
 612 |       curTitle = curTitle.trim().replace(this.REGEXPS.normalize, ' ');
 613 |       // If we now have 4 words or fewer as our title, and either no
 614 |       // 'hierarchical' separators (\, /, > or ») were found in the original
 615 |       // title or we decreased the number of words by more than 1 word, use
 616 |       // the original title.
 617 |       var curTitleWordCount = wordCount(curTitle);
 618 |       if (
 619 |         curTitleWordCount <= 4 &&
 620 |         (!titleHadHierarchicalSeparators ||
 621 |           curTitleWordCount != wordCount(origTitle.replace(/[\|\-\\\/>»]+/g, '')) - 1)
 622 |       ) {
 623 |         curTitle = origTitle;
 624 |       }
 625 | 
 626 |       return curTitle;
 627 |     },
 628 | 
 629 |     /**
 630 |      * Prepare the HTML document for readability to scrape it.
 631 |      * This removes <style> elements, runs _replaceBrs on the body and turns <font> elements into SPANs.
 632 |      *
 633 |      * @return {void}
 634 |      **/
 635 |     _prepDocument() {
 636 |       var doc = this._doc;
 637 | 
 638 |       // Remove all style tags in the document (the query is not limited to head)
 639 |       this._removeNodes(this._getAllNodesWithTag(doc, ['style']));
 640 | 
 641 |       if (doc.body) {
 642 |         this._replaceBrs(doc.body);
 643 |       }
 644 | 
 645 |       this._replaceNodeTags(this._getAllNodesWithTag(doc, ['font']), 'SPAN');
 646 |     },
 647 | 
 648 |     /**
 649 |      * Finds the next node, starting from the given node, and ignoring
 650 |      * whitespace in between. If the given node is an element, the same node is
 651 |      * returned.
 652 |      */
 653 |     _nextNode(node) {
 654 |       var next = node;
 655 |       while (
 656 |         next &&
 657 |         next.nodeType != this.ELEMENT_NODE &&
 658 |         this.REGEXPS.whitespace.test(next.textContent)
 659 |       ) {
 660 |         next = next.nextSibling;
 661 |       }
 662 |       return next;
 663 |     },
 664 | 
 665 |     /**
 666 |      * Replaces 2 or more successive <br> elements with a single <p>.
 667 |      * Whitespace between <br> elements is ignored. For example:
 668 |      *   <div>foo<br>bar<br> <br><br>abc</div>
 669 |      * will become:
 670 |      *   <div>foo<br>bar<p>abc</p></div>
 671 |      */
 672 |     _replaceBrs(elem) {
 673 |       this._forEachNode(this._getAllNodesWithTag(elem, ['br']), function (br) {
 674 |         var next = br.nextSibling;
 675 | 
 676 |         // Whether 2 or more <br> elements have been found and replaced with a
 677 |         // <p> block.
 678 |         var replaced = false;
 679 | 
 680 |         // If we find a <br> chain, remove the <br>s until we hit another node
 681 |         // or non-whitespace. This leaves behind the first <br> in the chain
 682 |         // (which will be replaced with a <p> later).
 683 |         while ((next = this._nextNode(next)) && next.tagName == 'BR') {
 684 |           replaced = true;
 685 |           var brSibling = next.nextSibling;
 686 |           next.remove();
 687 |           next = brSibling;
 688 |         }
 689 | 
 690 |         // If we removed a <br> chain, replace the remaining <br> with a <p>. Add
 691 |         // all sibling nodes as children of the <p> until we hit another <br>
 692 |         // chain.
 693 |         if (replaced) {
 694 |           var p = this._doc.createElement('p');
 695 |           br.parentNode.replaceChild(p, br);
 696 | 
 697 |           next = p.nextSibling;
 698 |           while (next) {
 699 |             // If we've hit another <br><br>, we're done adding children to this <p>.
 700 |             if (next.tagName == 'BR') {
 701 |               var nextElem = this._nextNode(next.nextSibling);
 702 |               if (nextElem && nextElem.tagName == 'BR') {
 703 |                 break;
 704 |               }
 705 |             }
 706 | 
 707 |             if (!this._isPhrasingContent(next)) {
 708 |               break;
 709 |             }
 710 | 
 711 |             // Otherwise, make this node a child of the new <p>.
 712 |             var sibling = next.nextSibling;
 713 |             p.appendChild(next);
 714 |             next = sibling;
 715 |           }
 716 | 
 717 |           while (p.lastChild && this._isWhitespace(p.lastChild)) {
 718 |             p.lastChild.remove();
 719 |           }
 720 | 
 721 |           if (p.parentNode.tagName === 'P') {
 722 |             this._setNodeTag(p.parentNode, 'DIV');
 723 |           }
 724 |         }
 725 |       });
 726 |     },
 727 | 
 728 |     _setNodeTag(node, tag) {
 729 |       this.log('_setNodeTag', node, tag);
 730 |       if (this._docJSDOMParser) {
 731 |         node.localName = tag.toLowerCase();
 732 |         node.tagName = tag.toUpperCase();
 733 |         return node;
 734 |       }
 735 | 
 736 |       var replacement = node.ownerDocument.createElement(tag);
 737 |       while (node.firstChild) {
 738 |         replacement.appendChild(node.firstChild);
 739 |       }
 740 |       node.parentNode.replaceChild(replacement, node);
 741 |       if (node.readability) {
 742 |         replacement.readability = node.readability;
 743 |       }
 744 | 
 745 |       for (var i = 0; i < node.attributes.length; i++) {
 746 |         replacement.setAttributeNode(node.attributes[i].cloneNode());
 747 |       }
 748 |       return replacement;
 749 |     },
 750 | 
 751 |     /**
 752 |      * Prepare the article node for display. Clean out any inline styles,
 753 |      * iframes, forms, strip extraneous <p> tags, etc.
 754 |      *
 755 |      * @param Element
 756 |      * @return void
 757 |      **/
 758 |     _prepArticle(articleContent) {
 759 |       this._cleanStyles(articleContent);
 760 | 
 761 |       // Check for data tables before we continue, to avoid removing items in
 762 |       // those tables, which will often be isolated even though they're
 763 |       // visually linked to other content-ful elements (text, images, etc.).
 764 |       this._markDataTables(articleContent);
 765 | 
 766 |       this._fixLazyImages(articleContent);
 767 | 
 768 |       // Clean out junk from the article content
 769 |       this._cleanConditionally(articleContent, 'form');
 770 |       this._cleanConditionally(articleContent, 'fieldset');
 771 |       this._clean(articleContent, 'object');
 772 |       this._clean(articleContent, 'embed');
 773 |       this._clean(articleContent, 'footer');
 774 |       this._clean(articleContent, 'link');
 775 |       this._clean(articleContent, 'aside');
 776 | 
 777 |       // Clean out elements with little content that have "share" in their id/class combinations from final top candidates,
 778 |       // which means we don't remove the top candidates even they have "share".
 779 | 
 780 |       var shareElementThreshold = this.DEFAULT_CHAR_THRESHOLD;
 781 | 
 782 |       this._forEachNode(articleContent.children, function (topCandidate) {
 783 |         this._cleanMatchedNodes(topCandidate, function (node, matchString) {
 784 |           return (
 785 |             this.REGEXPS.shareElements.test(matchString) &&
 786 |             node.textContent.length < shareElementThreshold
 787 |           );
 788 |         });
 789 |       });
 790 | 
 791 |       this._clean(articleContent, 'iframe');
 792 |       this._clean(articleContent, 'input');
 793 |       this._clean(articleContent, 'textarea');
 794 |       this._clean(articleContent, 'select');
 795 |       this._clean(articleContent, 'button');
 796 |       this._cleanHeaders(articleContent);
 797 | 
 798 |       // Do these last as the previous stuff may have removed junk
 799 |       // that will affect these
 800 |       this._cleanConditionally(articleContent, 'table');
 801 |       this._cleanConditionally(articleContent, 'ul');
 802 |       this._cleanConditionally(articleContent, 'div');
 803 | 
 804 |       // replace H1 with H2 as H1 should be only title that is displayed separately
 805 |       this._replaceNodeTags(this._getAllNodesWithTag(articleContent, ['h1']), 'h2');
 806 | 
 807 |       // Remove extra paragraphs
 808 |       this._removeNodes(this._getAllNodesWithTag(articleContent, ['p']), function (paragraph) {
 809 |         // At this point, nasty iframes have been removed; only embedded video
 810 |         // ones remain.
 811 |         var contentElementCount = this._getAllNodesWithTag(paragraph, [
 812 |           'img',
 813 |           'embed',
 814 |           'object',
 815 |           'iframe',
 816 |         ]).length;
 817 |         return contentElementCount === 0 && !this._getInnerText(paragraph, false);
 818 |       });
 819 | 
 820 |       this._forEachNode(this._getAllNodesWithTag(articleContent, ['br']), function (br) {
 821 |         var next = this._nextNode(br.nextSibling);
 822 |         if (next && next.tagName == 'P') {
 823 |           br.remove();
 824 |         }
 825 |       });
 826 | 
 827 |       // Remove single-cell tables
 828 |       this._forEachNode(this._getAllNodesWithTag(articleContent, ['table']), function (table) {
 829 |         var tbody = this._hasSingleTagInsideElement(table, 'TBODY')
 830 |           ? table.firstElementChild
 831 |           : table;
 832 |         if (this._hasSingleTagInsideElement(tbody, 'TR')) {
 833 |           var row = tbody.firstElementChild;
 834 |           if (this._hasSingleTagInsideElement(row, 'TD')) {
 835 |             var cell = row.firstElementChild;
 836 |             cell = this._setNodeTag(
 837 |               cell,
 838 |               this._everyNode(cell.childNodes, this._isPhrasingContent) ? 'P' : 'DIV',
 839 |             );
 840 |             table.parentNode.replaceChild(cell, table);
 841 |           }
 842 |         }
 843 |       });
 844 |     },
 845 | 
 846 |     /**
 847 |      * Initialize a node with the readability object. Also checks the
 848 |      * className/id for special names to add to its score.
 849 |      *
 850 |      * @param Element
 851 |      * @return void
 852 |      **/
 853 |     _initializeNode(node) {
 854 |       node.readability = { contentScore: 0 };
 855 | 
 856 |       switch (node.tagName) {
 857 |         case 'DIV':
 858 |           node.readability.contentScore += 5;
 859 |           break;
 860 | 
 861 |         case 'PRE':
 862 |         case 'TD':
 863 |         case 'BLOCKQUOTE':
 864 |           node.readability.contentScore += 3;
 865 |           break;
 866 | 
 867 |         case 'ADDRESS':
 868 |         case 'OL':
 869 |         case 'UL':
 870 |         case 'DL':
 871 |         case 'DD':
 872 |         case 'DT':
 873 |         case 'LI':
 874 |         case 'FORM':
 875 |           node.readability.contentScore -= 3;
 876 |           break;
 877 | 
 878 |         case 'H1':
 879 |         case 'H2':
 880 |         case 'H3':
 881 |         case 'H4':
 882 |         case 'H5':
 883 |         case 'H6':
 884 |         case 'TH':
 885 |           node.readability.contentScore -= 5;
 886 |           break;
 887 |       }
 888 | 
 889 |       node.readability.contentScore += this._getClassWeight(node);
 890 |     },
 891 | 
 892 |     _removeAndGetNext(node) {
 893 |       var nextNode = this._getNextNode(node, true);
 894 |       node.remove();
 895 |       return nextNode;
 896 |     },
 897 | 
 898 |     /**
 899 |      * Traverse the DOM from node to node, starting at the node passed in.
 900 |      * Pass true for the second parameter to indicate this node itself
 901 |      * (and its kids) are going away, and we want the next node over.
 902 |      *
 903 |      * Calling this in a loop will traverse the DOM depth-first.
 904 |      *
 905 |      * @param {Element} node
 906 |      * @param {boolean} ignoreSelfAndKids
 907 |      * @return {Element}
 908 |      */
 909 |     _getNextNode(node, ignoreSelfAndKids) {
 910 |       // First check for kids if those aren't being ignored
 911 |       if (!ignoreSelfAndKids && node.firstElementChild) {
 912 |         return node.firstElementChild;
 913 |       }
 914 |       // Then for siblings...
 915 |       if (node.nextElementSibling) {
 916 |         return node.nextElementSibling;
 917 |       }
 918 |       // And finally, move up the parent chain *and* find a sibling
 919 |       // (because this is depth-first traversal, we will have already
 920 |       // seen the parent nodes themselves).
 921 |       do {
 922 |         node = node.parentNode;
 923 |       } while (node && !node.nextElementSibling);
 924 |       return node && node.nextElementSibling;
 925 |     },
 926 | 
 927 |     // Compares the second text to the first one.
 928 |     // 1 = same text, 0 = completely different text.
 929 |     // Both texts are split into words; the words of the second text that do not occur in the first are collected.
 930 |     // The result is 1 minus the share of the second text's length taken up by those unique words.
 931 |     _textSimilarity(textA, textB) {
 932 |       var tokensA = textA.toLowerCase().split(this.REGEXPS.tokenize).filter(Boolean);
 933 |       var tokensB = textB.toLowerCase().split(this.REGEXPS.tokenize).filter(Boolean);
 934 |       if (!tokensA.length || !tokensB.length) {
 935 |         return 0;
 936 |       }
 937 |       var uniqTokensB = tokensB.filter((token) => !tokensA.includes(token));
 938 |       var distanceB = uniqTokensB.join(' ').length / tokensB.join(' ').length;
 939 |       return 1 - distanceB;
 940 |     },
 941 | 
 942 |     /**
 943 |      * Checks whether an element node contains a valid byline
 944 |      *
 945 |      * @param node {Element}
 946 |      * @param matchString {string}
 947 |      * @return boolean
 948 |      */
 949 |     _isValidByline(node, matchString) {
 950 |       var rel = node.getAttribute('rel');
 951 |       var itemprop = node.getAttribute('itemprop');
 952 |       var bylineLength = node.textContent.trim().length;
 953 | 
 954 |       return (
 955 |         (rel === 'author' ||
 956 |           (itemprop && itemprop.includes('author')) ||
 957 |           this.REGEXPS.byline.test(matchString)) &&
 958 |         !!bylineLength &&
 959 |         bylineLength < 100
 960 |       );
 961 |     },
 962 | 
 963 |     _getNodeAncestors(node, maxDepth) {
 964 |       maxDepth = maxDepth || 0;
 965 |       var i = 0,
 966 |         ancestors = [];
 967 |       while (node.parentNode) {
 968 |         ancestors.push(node.parentNode);
 969 |         if (maxDepth && ++i === maxDepth) {
 970 |           break;
 971 |         }
 972 |         node = node.parentNode;
 973 |       }
 974 |       return ancestors;
 975 |     },
 976 | 
 977 |     /***
 978 |      * grabArticle - Using a variety of metrics (content score, classname, element types), find the content that is
 979 |      *         most likely to be the stuff a user wants to read. Then return it wrapped up in a div.
 980 |      *
 981 |      * @param page a document to run upon. Needs to be a full document, complete with body.
 982 |      * @return Element
 983 |      **/
 984 | 
 985 |     _grabArticle(page) {
 986 |       this.log('**** grabArticle ****');
 987 |       var doc = this._doc;
 988 |       var isPaging = page !== null;
 989 |       page = page ? page : this._doc.body;
 990 | 
 991 |       // We can't grab an article if we don't have a page!
 992 |       if (!page) {
 993 |         this.log('No body found in document. Abort.');
 994 |         return null;
 995 |       }
 996 | 
 997 |       var pageCacheHtml = page.innerHTML;
 998 | 
 999 |       while (true) {
1000 |         this.log('Starting grabArticle loop');
1001 |         var stripUnlikelyCandidates = this._flagIsActive(this.FLAG_STRIP_UNLIKELYS);
1002 | 
1003 |         // First, node prepping. Trash nodes that look cruddy (like ones with the
1004 |         // class name "comment", etc), and turn divs into P tags where they have been
1005 |         // used inappropriately (as in, where they contain no other block level elements.)
1006 |         var elementsToScore = [];
1007 |         var node = this._doc.documentElement;
1008 | 
1009 |         let shouldRemoveTitleHeader = true;
1010 | 
1011 |         while (node) {
1012 |           if (node.tagName === 'HTML') {
1013 |             this._articleLang = node.getAttribute('lang');
1014 |           }
1015 | 
1016 |           var matchString = node.className + ' ' + node.id;
1017 | 
1018 |           if (!this._isProbablyVisible(node)) {
1019 |             this.log('Removing hidden node - ' + matchString);
1020 |             node = this._removeAndGetNext(node);
1021 |             continue;
1022 |           }
1023 | 
1024 |           // User is not able to see elements applied with both "aria-modal = true" and "role = dialog"
1025 |           if (node.getAttribute('aria-modal') == 'true' && node.getAttribute('role') == 'dialog') {
1026 |             node = this._removeAndGetNext(node);
1027 |             continue;
1028 |           }
1029 | 
1030 |           // If we don't have a byline yet check to see if this node is a byline; if it is store the byline and remove the node.
1031 |           if (
1032 |             !this._articleByline &&
1033 |             !this._metadata.byline &&
1034 |             this._isValidByline(node, matchString)
1035 |           ) {
1036 |             // Find child node matching [itemprop="name"] and use that if it exists for a more accurate author name byline
1037 |             var endOfSearchMarkerNode = this._getNextNode(node, true);
1038 |             var next = this._getNextNode(node);
1039 |             var itemPropNameNode = null;
1040 |             while (next && next != endOfSearchMarkerNode) {
1041 |               var itemprop = next.getAttribute('itemprop');
1042 |               if (itemprop && itemprop.includes('name')) {
1043 |                 itemPropNameNode = next;
1044 |                 break;
1045 |               } else {
1046 |                 next = this._getNextNode(next);
1047 |               }
1048 |             }
1049 |             this._articleByline = (itemPropNameNode ?? node).textContent.trim();
1050 |             node = this._removeAndGetNext(node);
1051 |             continue;
1052 |           }
1053 | 
1054 |           if (shouldRemoveTitleHeader && this._headerDuplicatesTitle(node)) {
1055 |             this.log('Removing header: ', node.textContent.trim(), this._articleTitle.trim());
1056 |             shouldRemoveTitleHeader = false;
1057 |             node = this._removeAndGetNext(node);
1058 |             continue;
1059 |           }
1060 | 
1061 |           // Remove unlikely candidates
1062 |           if (stripUnlikelyCandidates) {
1063 |             if (
1064 |               this.REGEXPS.unlikelyCandidates.test(matchString) &&
1065 |               !this.REGEXPS.okMaybeItsACandidate.test(matchString) &&
1066 |               !this._hasAncestorTag(node, 'table') &&
1067 |               !this._hasAncestorTag(node, 'code') &&
1068 |               node.tagName !== 'BODY' &&
1069 |               node.tagName !== 'A'
1070 |             ) {
1071 |               this.log('Removing unlikely candidate - ' + matchString);
1072 |               node = this._removeAndGetNext(node);
1073 |               continue;
1074 |             }
1075 | 
1076 |             if (this.UNLIKELY_ROLES.includes(node.getAttribute('role'))) {
1077 |               this.log(
1078 |                 'Removing content with role ' + node.getAttribute('role') + ' - ' + matchString,
1079 |               );
1080 |               node = this._removeAndGetNext(node);
1081 |               continue;
1082 |             }
1083 |           }
1084 | 
1085 |           // Remove DIV, SECTION, HEADER, and H1-H6 nodes without any content (e.g. text, image, video, or iframe).
1086 |           if (
1087 |             (node.tagName === 'DIV' ||
1088 |               node.tagName === 'SECTION' ||
1089 |               node.tagName === 'HEADER' ||
1090 |               node.tagName === 'H1' ||
1091 |               node.tagName === 'H2' ||
1092 |               node.tagName === 'H3' ||
1093 |               node.tagName === 'H4' ||
1094 |               node.tagName === 'H5' ||
1095 |               node.tagName === 'H6') &&
1096 |             this._isElementWithoutContent(node)
1097 |           ) {
1098 |             node = this._removeAndGetNext(node);
1099 |             continue;
1100 |           }
1101 | 
1102 |           if (this.DEFAULT_TAGS_TO_SCORE.includes(node.tagName)) {
1103 |             elementsToScore.push(node);
1104 |           }
1105 | 
1106 |           // Turn all divs that don't have children block level elements into p's
1107 |           if (node.tagName === 'DIV') {
1108 |             // Put phrasing content into paragraphs.
1109 |             var p = null;
1110 |             var childNode = node.firstChild;
1111 |             while (childNode) {
1112 |               var nextSibling = childNode.nextSibling;
1113 |               if (this._isPhrasingContent(childNode)) {
1114 |                 if (p !== null) {
1115 |                   p.appendChild(childNode);
1116 |                 } else if (!this._isWhitespace(childNode)) {
1117 |                   p = doc.createElement('p');
1118 |                   node.replaceChild(p, childNode);
1119 |                   p.appendChild(childNode);
1120 |                 }
1121 |               } else if (p !== null) {
1122 |                 while (p.lastChild && this._isWhitespace(p.lastChild)) {
1123 |                   p.lastChild.remove();
1124 |                 }
1125 |                 p = null;
1126 |               }
1127 |               childNode = nextSibling;
1128 |             }
1129 | 
1130 |             // Sites like http://mobile.slate.com enclose each paragraph in a DIV
1131 |             // element. DIVs with only a P element inside and no text content can be
1132 |             // safely converted into plain P elements to avoid confusing the scoring
1133 |             // algorithm with DIVs which are, in practice, paragraphs.
1134 |             if (this._hasSingleTagInsideElement(node, 'P') && this._getLinkDensity(node) < 0.25) {
1135 |               var newNode = node.children[0];
1136 |               node.parentNode.replaceChild(newNode, node);
1137 |               node = newNode;
1138 |               elementsToScore.push(node);
1139 |             } else if (!this._hasChildBlockElement(node)) {
1140 |               node = this._setNodeTag(node, 'P');
1141 |               elementsToScore.push(node);
1142 |             }
1143 |           }
1144 |           node = this._getNextNode(node);
1145 |         }
1146 | 
1147 |         /**
1148 |          * Loop through all paragraphs, and assign a score to them based on how content-y they look.
1149 |          * Then add their score to their parent node.
1150 |          *
1151 |          * A score is determined by things like number of commas, class names, etc. Maybe eventually link density.
1152 |          **/
1153 |         var candidates = [];
1154 |         this._forEachNode(elementsToScore, function (elementToScore) {
1155 |           if (
1156 |             !elementToScore.parentNode ||
1157 |             typeof elementToScore.parentNode.tagName === 'undefined'
1158 |           ) {
1159 |             return;
1160 |           }
1161 | 
1162 |           // If this paragraph is less than 25 characters, don't even count it.
1163 |           var innerText = this._getInnerText(elementToScore);
1164 |           if (innerText.length < 25) {
1165 |             return;
1166 |           }
1167 | 
1168 |           // Exclude nodes with no ancestor.
1169 |           var ancestors = this._getNodeAncestors(elementToScore, 5);
1170 |           if (ancestors.length === 0) {
1171 |             return;
1172 |           }
1173 | 
1174 |           var contentScore = 0;
1175 | 
1176 |           // Add a point for the paragraph itself as a base.
1177 |           contentScore += 1;
1178 | 
1179 |           // Add points for any commas within this paragraph.
1180 |           contentScore += innerText.split(this.REGEXPS.commas).length;
1181 | 
1182 |           // For every 100 characters in this paragraph, add another point. Up to 3 points.
1183 |           contentScore += Math.min(Math.floor(innerText.length / 100), 3);
1184 | 
1185 |           // Initialize and score ancestors.
1186 |           this._forEachNode(ancestors, function (ancestor, level) {
1187 |             if (
1188 |               !ancestor.tagName ||
1189 |               !ancestor.parentNode ||
1190 |               typeof ancestor.parentNode.tagName === 'undefined'
1191 |             ) {
1192 |               return;
1193 |             }
1194 | 
1195 |             if (typeof ancestor.readability === 'undefined') {
1196 |               this._initializeNode(ancestor);
1197 |               candidates.push(ancestor);
1198 |             }
1199 | 
1200 |             // Node score divider:
1201 |             // - parent:             1 (no division)
1202 |             // - grandparent:        2
1203 |             // - great grandparent+: ancestor level * 3
1204 |             if (level === 0) {
1205 |               var scoreDivider = 1;
1206 |             } else if (level === 1) {
1207 |               scoreDivider = 2;
1208 |             } else {
1209 |               scoreDivider = level * 3;
1210 |             }
1211 |             ancestor.readability.contentScore += contentScore / scoreDivider;
1212 |           });
1213 |         });
1214 | 
1215 |         // After we've calculated scores, loop through all of the possible
1216 |         // candidate nodes we found and find the one with the highest score.
1217 |         var topCandidates = [];
1218 |         for (var c = 0, cl = candidates.length; c < cl; c += 1) {
1219 |           var candidate = candidates[c];
1220 | 
1221 |           // Scale the final candidates score based on link density. Good content
1222 |           // should have a relatively small link density (5% or less) and be mostly
1223 |           // unaffected by this operation.
1224 |           var candidateScore =
1225 |             candidate.readability.contentScore * (1 - this._getLinkDensity(candidate));
1226 |           candidate.readability.contentScore = candidateScore;
1227 | 
1228 |           this.log('Candidate:', candidate, 'with score ' + candidateScore);
1229 | 
1230 |           for (var t = 0; t < this._nbTopCandidates; t++) {
1231 |             var aTopCandidate = topCandidates[t];
1232 | 
1233 |             if (!aTopCandidate || candidateScore > aTopCandidate.readability.contentScore) {
1234 |               topCandidates.splice(t, 0, candidate);
1235 |               if (topCandidates.length > this._nbTopCandidates) {
1236 |                 topCandidates.pop();
1237 |               }
1238 |               break;
1239 |             }
1240 |           }
1241 |         }
1242 | 
1243 |         var topCandidate = topCandidates[0] || null;
1244 |         var neededToCreateTopCandidate = false;
1245 |         var parentOfTopCandidate;
1246 | 
1247 |         // If we still have no top candidate, just use the body as a last resort.
1248 |         // We also have to copy the body node so it is something we can modify.
1249 |         if (topCandidate === null || topCandidate.tagName === 'BODY') {
1250 |           // Move all of the page's children into topCandidate
1251 |           topCandidate = doc.createElement('DIV');
1252 |           neededToCreateTopCandidate = true;
1253 |           // Move everything (not just elements, also text nodes etc.) into the container
1254 |           // so we even include text directly in the body:
1255 |           while (page.firstChild) {
1256 |             this.log('Moving child out:', page.firstChild);
1257 |             topCandidate.appendChild(page.firstChild);
1258 |           }
1259 | 
1260 |           page.appendChild(topCandidate);
1261 | 
1262 |           this._initializeNode(topCandidate);
1263 |         } else if (topCandidate) {
1264 |           // Find a better top candidate node if it contains (at least three) nodes which belong to `topCandidates` array
1265 |           // and whose scores are quite close to the current `topCandidate` node's score.
1266 |           var alternativeCandidateAncestors = [];
1267 |           for (var i = 1; i < topCandidates.length; i++) {
1268 |             if (
1269 |               topCandidates[i].readability.contentScore / topCandidate.readability.contentScore >=
1270 |               0.75
1271 |             ) {
1272 |               alternativeCandidateAncestors.push(this._getNodeAncestors(topCandidates[i]));
1273 |             }
1274 |           }
1275 |           var MINIMUM_TOPCANDIDATES = 3;
1276 |           if (alternativeCandidateAncestors.length >= MINIMUM_TOPCANDIDATES) {
1277 |             parentOfTopCandidate = topCandidate.parentNode;
1278 |             while (parentOfTopCandidate && parentOfTopCandidate.tagName !== 'BODY') {
1279 |               var listsContainingThisAncestor = 0;
1280 |               for (
1281 |                 var ancestorIndex = 0;
1282 |                 ancestorIndex < alternativeCandidateAncestors.length &&
1283 |                 listsContainingThisAncestor < MINIMUM_TOPCANDIDATES;
1284 |                 ancestorIndex++
1285 |               ) {
1286 |                 listsContainingThisAncestor += Number(
1287 |                   alternativeCandidateAncestors[ancestorIndex].includes(parentOfTopCandidate),
1288 |                 );
1289 |               }
1290 |               if (listsContainingThisAncestor >= MINIMUM_TOPCANDIDATES) {
1291 |                 topCandidate = parentOfTopCandidate;
1292 |                 break;
1293 |               }
1294 |               parentOfTopCandidate = parentOfTopCandidate.parentNode;
1295 |             }
1296 |           }
1297 |           if (!topCandidate.readability) {
1298 |             this._initializeNode(topCandidate);
1299 |           }
1300 | 
1301 |           // Because of our bonus system, parents of candidates might have scores
1302 |           // themselves. They get half of the node. There won't be nodes with higher
1303 |           // scores than our topCandidate, but if we see the score going *up* in the first
1304 |           // few steps up the tree, that's a decent sign that there might be more content
1305 |           // lurking in other places that we want to unify in. The sibling stuff
1306 |           // below does some of that - but only if we've looked high enough up the DOM
1307 |           // tree.
1308 |           parentOfTopCandidate = topCandidate.parentNode;
1309 |           var lastScore = topCandidate.readability.contentScore;
1310 |           // The scores shouldn't get too low.
1311 |           var scoreThreshold = lastScore / 3;
1312 |           while (parentOfTopCandidate && parentOfTopCandidate.tagName !== 'BODY') {
1313 |             if (!parentOfTopCandidate.readability) {
1314 |               parentOfTopCandidate = parentOfTopCandidate.parentNode;
1315 |               continue;
1316 |             }
1317 |             var parentScore = parentOfTopCandidate.readability.contentScore;
1318 |             if (parentScore < scoreThreshold) {
1319 |               break;
1320 |             }
1321 |             if (parentScore > lastScore) {
1322 |               // Alright! We found a better parent to use.
1323 |               topCandidate = parentOfTopCandidate;
1324 |               break;
1325 |             }
1326 |             lastScore = parentOfTopCandidate.readability.contentScore;
1327 |             parentOfTopCandidate = parentOfTopCandidate.parentNode;
1328 |           }
1329 | 
1330 |           // If the top candidate is the only child, use parent instead. This will help sibling
1331 |           // joining logic when adjacent content is actually located in parent's sibling node.
1332 |           parentOfTopCandidate = topCandidate.parentNode;
1333 |           while (
1334 |             parentOfTopCandidate &&
1335 |             parentOfTopCandidate.tagName != 'BODY' &&
1336 |             parentOfTopCandidate.children.length == 1
1337 |           ) {
1338 |             topCandidate = parentOfTopCandidate;
1339 |             parentOfTopCandidate = topCandidate.parentNode;
1340 |           }
1341 |           if (!topCandidate.readability) {
1342 |             this._initializeNode(topCandidate);
1343 |           }
1344 |         }
1345 | 
1346 |         // Now that we have the top candidate, look through its siblings for content
1347 |         // that might also be related. Things like preambles, content split by ads
1348 |         // that we removed, etc.
1349 |         var articleContent = doc.createElement('DIV');
1350 |         if (isPaging) {
1351 |           articleContent.id = 'readability-content';
1352 |         }
1353 | 
1354 |         var siblingScoreThreshold = Math.max(10, topCandidate.readability.contentScore * 0.2);
1355 |         // Keep potential top candidate's parent node to try to get text direction of it later.
1356 |         parentOfTopCandidate = topCandidate.parentNode;
1357 |         var siblings = parentOfTopCandidate.children;
1358 | 
1359 |         for (var s = 0, sl = siblings.length; s < sl; s++) {
1360 |           var sibling = siblings[s];
1361 |           var append = false;
1362 | 
1363 |           this.log(
1364 |             'Looking at sibling node:',
1365 |             sibling,
1366 |             sibling.readability ? 'with score ' + sibling.readability.contentScore : '',
1367 |           );
1368 |           this.log(
1369 |             'Sibling has score',
1370 |             sibling.readability ? sibling.readability.contentScore : 'Unknown',
1371 |           );
1372 | 
1373 |           if (sibling === topCandidate) {
1374 |             append = true;
1375 |           } else {
1376 |             var contentBonus = 0;
1377 | 
1378 |             // Give a bonus if sibling nodes and top candidates have the exact same classname
1379 |             if (sibling.className === topCandidate.className && topCandidate.className !== '') {
1380 |               contentBonus += topCandidate.readability.contentScore * 0.2;
1381 |             }
1382 | 
1383 |             if (
1384 |               sibling.readability &&
1385 |               sibling.readability.contentScore + contentBonus >= siblingScoreThreshold
1386 |             ) {
1387 |               append = true;
1388 |             } else if (sibling.nodeName === 'P') {
1389 |               var linkDensity = this._getLinkDensity(sibling);
1390 |               var nodeContent = this._getInnerText(sibling);
1391 |               var nodeLength = nodeContent.length;
1392 | 
1393 |               if (nodeLength > 80 && linkDensity < 0.25) {
1394 |                 append = true;
1395 |               } else if (
1396 |                 nodeLength < 80 &&
1397 |                 nodeLength > 0 &&
1398 |                 linkDensity === 0 &&
1399 |                 nodeContent.search(/\.( |$)/) !== -1
1400 |               ) {
1401 |                 append = true;
1402 |               }
1403 |             }
1404 |           }
1405 | 
1406 |           if (append) {
1407 |             this.log('Appending node:', sibling);
1408 | 
1409 |             if (!this.ALTER_TO_DIV_EXCEPTIONS.includes(sibling.nodeName)) {
1410 |               // We have a node that isn't a common block level element, like a form or td tag.
1411 |               // Turn it into a div so it doesn't get filtered out later by accident.
1412 |               this.log('Altering sibling:', sibling, 'to div.');
1413 | 
1414 |               sibling = this._setNodeTag(sibling, 'DIV');
1415 |             }
1416 | 
1417 |             articleContent.appendChild(sibling);
1418 |             // Fetch children again to make it compatible
1419 |             // with DOM parsers without live collection support.
1420 |             siblings = parentOfTopCandidate.children;
1421 |             // siblings is a reference to the children array, and
1422 |             // sibling is removed from the array when we call appendChild().
1423 |             // As a result, we must revisit this index since the nodes
1424 |             // have been shifted.
1425 |             s -= 1;
1426 |             sl -= 1;
1427 |           }
1428 |         }
1429 | 
1430 |         if (this._debug) {
1431 |           this.log('Article content pre-prep: ' + articleContent.innerHTML);
1432 |         }
1433 |         // So we have all of the content that we need. Now we clean it up for presentation.
1434 |         this._prepArticle(articleContent);
1435 |         if (this._debug) {
1436 |           this.log('Article content post-prep: ' + articleContent.innerHTML);
1437 |         }
1438 | 
1439 |         if (neededToCreateTopCandidate) {
1440 |           // We already created a fake div thing, and there wouldn't have been any siblings left
1441 |           // for the previous loop, so there's no point trying to create a new div, and then
1442 |           // move all the children over. Just assign IDs and class names here. No need to append
1443 |           // because that already happened anyway.
1444 |           topCandidate.id = 'readability-page-1';
1445 |           topCandidate.className = 'page';
1446 |         } else {
1447 |           var div = doc.createElement('DIV');
1448 |           div.id = 'readability-page-1';
1449 |           div.className = 'page';
1450 |           while (articleContent.firstChild) {
1451 |             div.appendChild(articleContent.firstChild);
1452 |           }
1453 |           articleContent.appendChild(div);
1454 |         }
1455 | 
1456 |         if (this._debug) {
1457 |           this.log('Article content after paging: ' + articleContent.innerHTML);
1458 |         }
1459 | 
1460 |         var parseSuccessful = true;
1461 | 
1462 |         // Now that we've gone through the full algorithm, check to see if
1463 |         // we got any meaningful content. If we didn't, we may need to re-run
1464 |         // grabArticle with different flags set. This gives us a higher likelihood of
1465 |         // finding the content, and the sieve approach gives us a higher likelihood of
1466 |         // finding the -right- content.
1467 |         var textLength = this._getInnerText(articleContent, true).length;
1468 |         if (textLength < this._charThreshold) {
1469 |           parseSuccessful = false;
1470 |           // eslint-disable-next-line no-unsanitized/property
1471 |           page.innerHTML = pageCacheHtml;
1472 | 
1473 |           this._attempts.push({
1474 |             articleContent,
1475 |             textLength,
1476 |           });
1477 | 
1478 |           if (this._flagIsActive(this.FLAG_STRIP_UNLIKELYS)) {
1479 |             this._removeFlag(this.FLAG_STRIP_UNLIKELYS);
1480 |           } else if (this._flagIsActive(this.FLAG_WEIGHT_CLASSES)) {
1481 |             this._removeFlag(this.FLAG_WEIGHT_CLASSES);
1482 |           } else if (this._flagIsActive(this.FLAG_CLEAN_CONDITIONALLY)) {
1483 |             this._removeFlag(this.FLAG_CLEAN_CONDITIONALLY);
1484 |           } else {
1485 |             // No luck after removing flags, just return the longest text we found during the different loops
1486 |             this._attempts.sort(function (a, b) {
1487 |               return b.textLength - a.textLength;
1488 |             });
1489 | 
1490 |             // But first check if we actually have something
1491 |             if (!this._attempts[0].textLength) {
1492 |               return null;
1493 |             }
1494 | 
1495 |             articleContent = this._attempts[0].articleContent;
1496 |             parseSuccessful = true;
1497 |           }
1498 |         }
1499 | 
1500 |         if (parseSuccessful) {
1501 |           // Find out text direction from ancestors of final top candidate.
1502 |           var ancestors = [parentOfTopCandidate, topCandidate].concat(
1503 |             this._getNodeAncestors(parentOfTopCandidate),
1504 |           );
1505 |           this._someNode(ancestors, function (ancestor) {
1506 |             if (!ancestor.tagName) {
1507 |               return false;
1508 |             }
1509 |             var articleDir = ancestor.getAttribute('dir');
1510 |             if (articleDir) {
1511 |               this._articleDir = articleDir;
1512 |               return true;
1513 |             }
1514 |             return false;
1515 |           });
1516 |           return articleContent;
1517 |         }
1518 |       }
1519 |     },
1520 | 
1521 |     /**
1522 |      * Converts some of the common HTML entities in string to their corresponding characters.
1523 |      *
1524 |      * @param str {string} - a string to unescape.
1525 |      * @return string without HTML entity.
1526 |      */
1527 |     _unescapeHtmlEntities(str) {
1528 |       if (!str) {
1529 |         return str;
1530 |       }
1531 | 
1532 |       var htmlEscapeMap = this.HTML_ESCAPE_MAP;
1533 |       return str
1534 |         .replace(/&(quot|amp|apos|lt|gt);/g, function (_, tag) {
1535 |           return htmlEscapeMap[tag];
1536 |         })
1537 |         .replace(/&#(?:x([0-9a-f]+)|([0-9]+));/gi, function (_, hex, numStr) {
1538 |           var num = parseInt(hex || numStr, hex ? 16 : 10);
1539 | 
1540 |           // these character references are replaced by a conforming HTML parser
1541 |           if (num == 0 || num > 0x10ffff || (num >= 0xd800 && num <= 0xdfff)) {
1542 |             num = 0xfffd;
1543 |           }
1544 | 
1545 |           return String.fromCodePoint(num);
1546 |         });
1547 |     },
1548 | 
1549 |     /**
1550 |      * Try to extract metadata from JSON-LD object.
1551 |      * For now, only Schema.org objects of type Article or its subtypes are supported.
1552 |      * @return Object with any metadata that could be extracted (possibly none)
1553 |      */
1554 |     _getJSONLD(doc) {
1555 |       var scripts = this._getAllNodesWithTag(doc, ['script']);
1556 | 
1557 |       var metadata;
1558 | 
1559 |       this._forEachNode(scripts, function (jsonLdElement) {
1560 |         if (!metadata && jsonLdElement.getAttribute('type') === 'application/ld+json') {
1561 |           try {
1562 |             // Strip CDATA markers if present
1563 |             var content = jsonLdElement.textContent.replace(/^\s*<!\[CDATA\[|\]\]>\s*$/g, '');
1564 |             var parsed = JSON.parse(content);
1565 | 
1566 |             if (Array.isArray(parsed)) {
1567 |               parsed = parsed.find((it) => {
1568 |                 return it['@type'] && it['@type'].match(this.REGEXPS.jsonLdArticleTypes);
1569 |               });
1570 |               if (!parsed) {
1571 |                 return;
1572 |               }
1573 |             }
1574 | 
1575 |             var schemaDotOrgRegex = /^https?\:\/\/schema\.org\/?$/;
1576 |             var matches =
1577 |               (typeof parsed['@context'] === 'string' &&
1578 |                 parsed['@context'].match(schemaDotOrgRegex)) ||
1579 |               (typeof parsed['@context'] === 'object' &&
1580 |                 typeof parsed['@context']['@vocab'] == 'string' &&
1581 |                 parsed['@context']['@vocab'].match(schemaDotOrgRegex));
1582 | 
1583 |             if (!matches) {
1584 |               return;
1585 |             }
1586 | 
1587 |             if (!parsed['@type'] && Array.isArray(parsed['@graph'])) {
1588 |               parsed = parsed['@graph'].find((it) => {
1589 |                 return (it['@type'] || '').match(this.REGEXPS.jsonLdArticleTypes);
1590 |               });
1591 |             }
1592 | 
1593 |             if (
1594 |               !parsed ||
1595 |               !parsed['@type'] ||
1596 |               !parsed['@type'].match(this.REGEXPS.jsonLdArticleTypes)
1597 |             ) {
1598 |               return;
1599 |             }
1600 | 
1601 |             metadata = {};
1602 | 
1603 |             if (
1604 |               typeof parsed.name === 'string' &&
1605 |               typeof parsed.headline === 'string' &&
1606 |               parsed.name !== parsed.headline
1607 |             ) {
1608 |               // we have both name and headline element in the JSON-LD. They should both be the same but some websites like aktualne.cz
1609 |               // put their own name into "name" and the article title to "headline" which confuses Readability. So we try to check if either
1610 |               // "name" or "headline" closely matches the html title, and if so, use that one. If not, then we use "name" by default.
1611 | 
1612 |               var title = this._getArticleTitle();
1613 |               var nameMatches = this._textSimilarity(parsed.name, title) > 0.75;
1614 |               var headlineMatches = this._textSimilarity(parsed.headline, title) > 0.75;
1615 | 
1616 |               if (headlineMatches && !nameMatches) {
1617 |                 metadata.title = parsed.headline;
1618 |               } else {
1619 |                 metadata.title = parsed.name;
1620 |               }
1621 |             } else if (typeof parsed.name === 'string') {
1622 |               metadata.title = parsed.name.trim();
1623 |             } else if (typeof parsed.headline === 'string') {
1624 |               metadata.title = parsed.headline.trim();
1625 |             }
1626 |             if (parsed.author) {
1627 |               if (typeof parsed.author.name === 'string') {
1628 |                 metadata.byline = parsed.author.name.trim();
1629 |               } else if (
1630 |                 Array.isArray(parsed.author) &&
1631 |                 parsed.author[0] &&
1632 |                 typeof parsed.author[0].name === 'string'
1633 |               ) {
1634 |                 metadata.byline = parsed.author
1635 |                   .filter(function (author) {
1636 |                     return author && typeof author.name === 'string';
1637 |                   })
1638 |                   .map(function (author) {
1639 |                     return author.name.trim();
1640 |                   })
1641 |                   .join(', ');
1642 |               }
1643 |             }
1644 |             if (typeof parsed.description === 'string') {
1645 |               metadata.excerpt = parsed.description.trim();
1646 |             }
1647 |             if (parsed.publisher && typeof parsed.publisher.name === 'string') {
1648 |               metadata.siteName = parsed.publisher.name.trim();
1649 |             }
1650 |             if (typeof parsed.datePublished === 'string') {
1651 |               metadata.datePublished = parsed.datePublished.trim();
1652 |             }
1653 |           } catch (err) {
1654 |             this.log(err.message);
1655 |           }
1656 |         }
1657 |       });
1658 |       return metadata ? metadata : {};
1659 |     },
1660 | 
1661 |     /**
1662 |      * Attempts to get excerpt and byline metadata for the article.
1663 |      *
1664 |      * @param {Object} jsonld — object containing any metadata that
1665 |      * could be extracted from JSON-LD object.
1666 |      *
1667 |      * @return Object with optional "excerpt" and "byline" properties
1668 |      */
1669 |     _getArticleMetadata(jsonld) {
1670 |       var metadata = {};
1671 |       var values = {};
1672 |       var metaElements = this._doc.getElementsByTagName('meta');
1673 | 
1674 |       // property is a space-separated list of values
1675 |       var propertyPattern =
1676 |         /\s*(article|dc|dcterm|og|twitter)\s*:\s*(author|creator|description|published_time|title|site_name)\s*/gi;
1677 | 
1678 |       // name is a single value
1679 |       var namePattern =
1680 |         /^\s*(?:(dc|dcterm|og|twitter|parsely|weibo:(article|webpage))\s*[-\.:]\s*)?(author|creator|pub-date|description|title|site_name)\s*$/i;
1681 | 
1682 |       // Find description tags.
1683 |       this._forEachNode(metaElements, function (element) {
1684 |         var elementName = element.getAttribute('name');
1685 |         var elementProperty = element.getAttribute('property');
1686 |         var content = element.getAttribute('content');
1687 |         if (!content) {
1688 |           return;
1689 |         }
1690 |         var matches = null;
1691 |         var name = null;
1692 | 
1693 |         if (elementProperty) {
1694 |           matches = elementProperty.match(propertyPattern);
1695 |           if (matches) {
1696 |             // Convert to lowercase, and remove any whitespace
1697 |             // so we can match below.
1698 |             name = matches[0].toLowerCase().replace(/\s/g, '');
1699 |             // multiple authors
1700 |             values[name] = content.trim();
1701 |           }
1702 |         }
1703 |         if (!matches && elementName && namePattern.test(elementName)) {
1704 |           name = elementName;
1705 |           if (content) {
1706 |             // Convert to lowercase, remove any whitespace, and convert dots
1707 |             // to colons so we can match below.
1708 |             name = name.toLowerCase().replace(/\s/g, '').replace(/\./g, ':');
1709 |             values[name] = content.trim();
1710 |           }
1711 |         }
1712 |       });
1713 | 
1714 |       // get title
1715 |       metadata.title =
1716 |         jsonld.title ||
1717 |         values['dc:title'] ||
1718 |         values['dcterm:title'] ||
1719 |         values['og:title'] ||
1720 |         values['weibo:article:title'] ||
1721 |         values['weibo:webpage:title'] ||
1722 |         values.title ||
1723 |         values['twitter:title'] ||
1724 |         values['parsely-title'];
1725 | 
1726 |       if (!metadata.title) {
1727 |         metadata.title = this._getArticleTitle();
1728 |       }
1729 | 
1730 |       const articleAuthor =
1731 |         typeof values['article:author'] === 'string' && !this._isUrl(values['article:author'])
1732 |           ? values['article:author']
1733 |           : undefined;
1734 | 
1735 |       // get author
1736 |       metadata.byline =
1737 |         jsonld.byline ||
1738 |         values['dc:creator'] ||
1739 |         values['dcterm:creator'] ||
1740 |         values.author ||
1741 |         values['parsely-author'] ||
1742 |         articleAuthor;
1743 | 
1744 |       // get description
1745 |       metadata.excerpt =
1746 |         jsonld.excerpt ||
1747 |         values['dc:description'] ||
1748 |         values['dcterm:description'] ||
1749 |         values['og:description'] ||
1750 |         values['weibo:article:description'] ||
1751 |         values['weibo:webpage:description'] ||
1752 |         values.description ||
1753 |         values['twitter:description'];
1754 | 
1755 |       // get site name
1756 |       metadata.siteName = jsonld.siteName || values['og:site_name'];
1757 | 
1758 |       // get article published time
1759 |       metadata.publishedTime =
1760 |         jsonld.datePublished ||
1761 |         values['article:published_time'] ||
1762 |         values['parsely-pub-date'] ||
1763 |         null;
1764 | 
1765 |       // in many sites the meta value is escaped with HTML entities,
1766 |       // so here we need to unescape it
1767 |       metadata.title = this._unescapeHtmlEntities(metadata.title);
1768 |       metadata.byline = this._unescapeHtmlEntities(metadata.byline);
1769 |       metadata.excerpt = this._unescapeHtmlEntities(metadata.excerpt);
1770 |       metadata.siteName = this._unescapeHtmlEntities(metadata.siteName);
1771 |       metadata.publishedTime = this._unescapeHtmlEntities(metadata.publishedTime);
1772 | 
1773 |       return metadata;
1774 |     },
1775 | 
1776 |     /**
1777 |      * Check if node is image, or if node contains exactly only one image
1778 |      * whether as a direct child or as its descendants.
1779 |      *
1780 |      * @param Element
1781 |      **/
1782 |     _isSingleImage(node) {
1783 |       while (node) {
1784 |         if (node.tagName === 'IMG') {
1785 |           return true;
1786 |         }
1787 |         if (node.children.length !== 1 || node.textContent.trim() !== '') {
1788 |           return false;
1789 |         }
1790 |         node = node.children[0];
1791 |       }
1792 |       return false;
1793 |     },
1794 | 
1795 |     /**
1796 |      * Find all <noscript> that are located after <img> nodes, and which contain only one
1797 |      * <img> element. Replace the first image with the image from inside the <noscript> tag,
1798 |      * and remove the <noscript> tag. This improves the quality of the images we use on
1799 |      * some sites (e.g. Medium).
1800 |      *
1801 |      * @param Element
1802 |      **/
1803 |     _unwrapNoscriptImages(doc) {
1804 |       // Find img without source or attributes that might contains image, and remove it.
1805 |       // This is done to prevent a placeholder img is replaced by img from noscript in next step.
1806 |       var imgs = Array.from(doc.getElementsByTagName('img'));
1807 |       this._forEachNode(imgs, function (img) {
1808 |         for (var i = 0; i < img.attributes.length; i++) {
1809 |           var attr = img.attributes[i];
1810 |           switch (attr.name) {
1811 |             case 'src':
1812 |             case 'srcset':
1813 |             case 'data-src':
1814 |             case 'data-srcset':
1815 |               return;
1816 |           }
1817 | 
1818 |           if (/\.(jpg|jpeg|png|webp)/i.test(attr.value)) {
1819 |             return;
1820 |           }
1821 |         }
1822 | 
1823 |         img.remove();
1824 |       });
1825 | 
1826 |       // Next find noscript and try to extract its image
1827 |       var noscripts = Array.from(doc.getElementsByTagName('noscript'));
1828 |       this._forEachNode(noscripts, function (noscript) {
1829 |         // Parse content of noscript and make sure it only contains image
1830 |         if (!this._isSingleImage(noscript)) {
1831 |           return;
1832 |         }
1833 |         var tmp = doc.createElement('div');
1834 |         // We're running in the document context, and using unmodified
1835 |         // document contents, so doing this should be safe.
1836 |         // (Also we heavily discourage people from allowing script to
1837 |         // run at all in this document...)
1838 |         // eslint-disable-next-line no-unsanitized/property
1839 |         tmp.innerHTML = noscript.innerHTML;
1840 | 
1841 |         // If noscript has previous sibling and it only contains image,
1842 |         // replace it with noscript content. However we also keep old
1843 |         // attributes that might contains image.
1844 |         var prevElement = noscript.previousElementSibling;
1845 |         if (prevElement && this._isSingleImage(prevElement)) {
1846 |           var prevImg = prevElement;
1847 |           if (prevImg.tagName !== 'IMG') {
1848 |             prevImg = prevElement.getElementsByTagName('img')[0];
1849 |           }
1850 | 
1851 |           var newImg = tmp.getElementsByTagName('img')[0];
1852 |           for (var i = 0; i < prevImg.attributes.length; i++) {
1853 |             var attr = prevImg.attributes[i];
1854 |             if (attr.value === '') {
1855 |               continue;
1856 |             }
1857 | 
1858 |             if (
1859 |               attr.name === 'src' ||
1860 |               attr.name === 'srcset' ||
1861 |               /\.(jpg|jpeg|png|webp)/i.test(attr.value)
1862 |             ) {
1863 |               if (newImg.getAttribute(attr.name) === attr.value) {
1864 |                 continue;
1865 |               }
1866 | 
1867 |               var attrName = attr.name;
1868 |               if (newImg.hasAttribute(attrName)) {
1869 |                 attrName = 'data-old-' + attrName;
1870 |               }
1871 | 
1872 |               newImg.setAttribute(attrName, attr.value);
1873 |             }
1874 |           }
1875 | 
1876 |           noscript.parentNode.replaceChild(tmp.firstElementChild, prevElement);
1877 |         }
1878 |       });
1879 |     },
1880 | 
1881 |     /**
1882 |      * Removes script tags from the document.
1883 |      *
1884 |      * @param Element
1885 |      **/
1886 |     _removeScripts(doc) {
1887 |       this._removeNodes(this._getAllNodesWithTag(doc, ['script', 'noscript']));
1888 |     },
1889 | 
1890 |     /**
1891 |      * Check if this node has only whitespace and a single element with given tag
1892 |      * Returns false if the DIV node contains non-empty text nodes
1893 |      * or if it contains no element with given tag or more than 1 element.
1894 |      *
1895 |      * @param Element
1896 |      * @param string tag of child element
1897 |      **/
1898 |     _hasSingleTagInsideElement(element, tag) {
1899 |       // There should be exactly 1 element child with given tag
1900 |       if (element.children.length != 1 || element.children[0].tagName !== tag) {
1901 |         return false;
1902 |       }
1903 | 
1904 |       // And there should be no text nodes with real content
1905 |       return !this._someNode(element.childNodes, function (node) {
1906 |         return node.nodeType === this.TEXT_NODE && this.REGEXPS.hasContent.test(node.textContent);
1907 |       });
1908 |     },
1909 | 
1910 |     _isElementWithoutContent(node) {
1911 |       return (
1912 |         node.nodeType === this.ELEMENT_NODE &&
1913 |         !node.textContent.trim().length &&
1914 |         (!node.children.length ||
1915 |           node.children.length ==
1916 |             node.getElementsByTagName('br').length + node.getElementsByTagName('hr').length)
1917 |       );
1918 |     },
1919 | 
1920 |     /**
1921 |      * Determine whether element has any children block level elements.
1922 |      *
1923 |      * @param Element
1924 |      */
1925 |     _hasChildBlockElement(element) {
1926 |       return this._someNode(element.childNodes, function (node) {
1927 |         return this.DIV_TO_P_ELEMS.has(node.tagName) || this._hasChildBlockElement(node);
1928 |       });
1929 |     },
1930 | 
1931 |     /***
1932 |      * Determine if a node qualifies as phrasing content.
1933 |      * https://developer.mozilla.org/en-US/docs/Web/Guide/HTML/Content_categories#Phrasing_content
1934 |      **/
1935 |     _isPhrasingContent(node) {
1936 |       return (
1937 |         node.nodeType === this.TEXT_NODE ||
1938 |         this.PHRASING_ELEMS.includes(node.tagName) ||
1939 |         ((node.tagName === 'A' || node.tagName === 'DEL' || node.tagName === 'INS') &&
1940 |           this._everyNode(node.childNodes, this._isPhrasingContent))
1941 |       );
1942 |     },
1943 | 
1944 |     _isWhitespace(node) {
1945 |       return (
1946 |         (node.nodeType === this.TEXT_NODE && node.textContent.trim().length === 0) ||
1947 |         (node.nodeType === this.ELEMENT_NODE && node.tagName === 'BR')
1948 |       );
1949 |     },
1950 | 
1951 |     /**
1952 |      * Get the inner text of a node - cross browser compatibly.
1953 |      * This also strips out any excess whitespace to be found.
1954 |      *
1955 |      * @param Element
1956 |      * @param Boolean normalizeSpaces (default: true)
1957 |      * @return string
1958 |      **/
1959 |     _getInnerText(e, normalizeSpaces) {
1960 |       normalizeSpaces = typeof normalizeSpaces === 'undefined' ? true : normalizeSpaces;
1961 |       var textContent = e.textContent.trim();
1962 | 
1963 |       if (normalizeSpaces) {
1964 |         return textContent.replace(this.REGEXPS.normalize, ' ');
1965 |       }
1966 |       return textContent;
1967 |     },
1968 | 
1969 |     /**
1970 |      * Get the number of times a string s appears in the node e.
1971 |      *
1972 |      * @param Element
1973 |      * @param string - what to split on. Default is ","
1974 |      * @return number (integer)
1975 |      **/
1976 |     _getCharCount(e, s) {
1977 |       s = s || ',';
1978 |       return this._getInnerText(e).split(s).length - 1;
1979 |     },
1980 | 
1981 |     /**
1982 |      * Remove the style attribute on every e and under.
1983 |      * TODO: Test if getElementsByTagName(*) is faster.
1984 |      *
1985 |      * @param Element
1986 |      * @return void
1987 |      **/
1988 |     _cleanStyles(e) {
1989 |       if (!e || e.tagName.toLowerCase() === 'svg') {
1990 |         return;
1991 |       }
1992 | 
1993 |       // Remove `style` and deprecated presentational attributes
1994 |       for (var i = 0; i < this.PRESENTATIONAL_ATTRIBUTES.length; i++) {
1995 |         e.removeAttribute(this.PRESENTATIONAL_ATTRIBUTES[i]);
1996 |       }
1997 | 
1998 |       if (this.DEPRECATED_SIZE_ATTRIBUTE_ELEMS.includes(e.tagName)) {
1999 |         e.removeAttribute('width');
2000 |         e.removeAttribute('height');
2001 |       }
2002 | 
2003 |       var cur = e.firstElementChild;
2004 |       while (cur !== null) {
2005 |         this._cleanStyles(cur);
2006 |         cur = cur.nextElementSibling;
2007 |       }
2008 |     },
2009 | 
2010 |     /**
2011 |      * Get the density of links as a percentage of the content
2012 |      * This is the amount of text that is inside a link divided by the total text in the node.
2013 |      *
2014 |      * @param Element
2015 |      * @return number (float)
2016 |      **/
2017 |     _getLinkDensity(element) {
2018 |       var textLength = this._getInnerText(element).length;
2019 |       if (textLength === 0) {
2020 |         return 0;
2021 |       }
2022 | 
2023 |       var linkLength = 0;
2024 | 
2025 |       // XXX implement _reduceNodeList?
2026 |       this._forEachNode(element.getElementsByTagName('a'), function (linkNode) {
2027 |         var href = linkNode.getAttribute('href');
2028 |         var coefficient = href && this.REGEXPS.hashUrl.test(href) ? 0.3 : 1;
2029 |         linkLength += this._getInnerText(linkNode).length * coefficient;
2030 |       });
2031 | 
2032 |       return linkLength / textLength;
2033 |     },
2034 | 
2035 |     /**
2036 |      * Get an elements class/id weight. Uses regular expressions to tell if this
2037 |      * element looks good or bad.
2038 |      *
2039 |      * @param Element
2040 |      * @return number (Integer)
2041 |      **/
2042 |     _getClassWeight(e) {
2043 |       if (!this._flagIsActive(this.FLAG_WEIGHT_CLASSES)) {
2044 |         return 0;
2045 |       }
2046 | 
2047 |       var weight = 0;
2048 | 
2049 |       // Look for a special classname
2050 |       if (typeof e.className === 'string' && e.className !== '') {
2051 |         if (this.REGEXPS.negative.test(e.className)) {
2052 |           weight -= 25;
2053 |         }
2054 | 
2055 |         if (this.REGEXPS.positive.test(e.className)) {
2056 |           weight += 25;
2057 |         }
2058 |       }
2059 | 
2060 |       // Look for a special ID
2061 |       if (typeof e.id === 'string' && e.id !== '') {
2062 |         if (this.REGEXPS.negative.test(e.id)) {
2063 |           weight -= 25;
2064 |         }
2065 | 
2066 |         if (this.REGEXPS.positive.test(e.id)) {
2067 |           weight += 25;
2068 |         }
2069 |       }
2070 | 
2071 |       return weight;
2072 |     },
2073 | 
2074 |     /**
2075 |      * Clean a node of all elements of type "tag".
2076 |      * (Unless it's a youtube/vimeo video. People love movies.)
2077 |      *
2078 |      * @param Element
2079 |      * @param string tag to clean
2080 |      * @return void
2081 |      **/
2082 |     _clean(e, tag) {
2083 |       var isEmbed = ['object', 'embed', 'iframe'].includes(tag);
2084 | 
2085 |       this._removeNodes(this._getAllNodesWithTag(e, [tag]), function (element) {
2086 |         // Allow youtube and vimeo videos through as people usually want to see those.
2087 |         if (isEmbed) {
2088 |           // First, check the elements attributes to see if any of them contain youtube or vimeo
2089 |           for (var i = 0; i < element.attributes.length; i++) {
2090 |             if (this._allowedVideoRegex.test(element.attributes[i].value)) {
2091 |               return false;
2092 |             }
2093 |           }
2094 | 
2095 |           // For embed with <object> tag, check inner HTML as well.
2096 |           if (element.tagName === 'object' && this._allowedVideoRegex.test(element.innerHTML)) {
2097 |             return false;
2098 |           }
2099 |         }
2100 | 
2101 |         return true;
2102 |       });
2103 |     },
2104 | 
2105 |     /**
2106 |      * Check if a given node has one of its ancestor tag name matching the
2107 |      * provided one.
2108 |      * @param  HTMLElement node
2109 |      * @param  String      tagName
2110 |      * @param  Number      maxDepth
2111 |      * @param  Function    filterFn a filter to invoke to determine whether this node 'counts'
2112 |      * @return Boolean
2113 |      */
2114 |     _hasAncestorTag(node, tagName, maxDepth, filterFn) {
2115 |       maxDepth = maxDepth || 3;
2116 |       tagName = tagName.toUpperCase();
2117 |       var depth = 0;
2118 |       while (node.parentNode) {
2119 |         if (maxDepth > 0 && depth > maxDepth) {
2120 |           return false;
2121 |         }
2122 |         if (node.parentNode.tagName === tagName && (!filterFn || filterFn(node.parentNode))) {
2123 |           return true;
2124 |         }
2125 |         node = node.parentNode;
2126 |         depth++;
2127 |       }
2128 |       return false;
2129 |     },
2130 | 
2131 |     /**
2132 |      * Return an object indicating how many rows and columns this table has.
2133 |      */
2134 |     _getRowAndColumnCount(table) {
2135 |       var rows = 0;
2136 |       var columns = 0;
2137 |       var trs = table.getElementsByTagName('tr');
2138 |       for (var i = 0; i < trs.length; i++) {
2139 |         var rowspan = trs[i].getAttribute('rowspan') || 0;
2140 |         if (rowspan) {
2141 |           rowspan = parseInt(rowspan, 10);
2142 |         }
2143 |         rows += rowspan || 1;
2144 | 
2145 |         // Now look for column-related info
2146 |         var columnsInThisRow = 0;
2147 |         var cells = trs[i].getElementsByTagName('td');
2148 |         for (var j = 0; j < cells.length; j++) {
2149 |           var colspan = cells[j].getAttribute('colspan') || 0;
2150 |           if (colspan) {
2151 |             colspan = parseInt(colspan, 10);
2152 |           }
2153 |           columnsInThisRow += colspan || 1;
2154 |         }
2155 |         columns = Math.max(columns, columnsInThisRow);
2156 |       }
2157 |       return { rows, columns };
2158 |     },
2159 | 
2160 |     /**
2161 |      * Look for 'data' (as opposed to 'layout') tables, for which we use
2162 |      * similar checks as
2163 |      * https://searchfox.org/mozilla-central/rev/f82d5c549f046cb64ce5602bfd894b7ae807c8f8/accessible/generic/TableAccessible.cpp#19
2164 |      */
2165 |     _markDataTables(root) {
2166 |       var tables = root.getElementsByTagName('table');
2167 |       for (var i = 0; i < tables.length; i++) {
2168 |         var table = tables[i];
2169 |         var role = table.getAttribute('role');
2170 |         if (role == 'presentation') {
2171 |           table._readabilityDataTable = false;
2172 |           continue;
2173 |         }
2174 |         var datatable = table.getAttribute('datatable');
2175 |         if (datatable == '0') {
2176 |           table._readabilityDataTable = false;
2177 |           continue;
2178 |         }
2179 |         var summary = table.getAttribute('summary');
2180 |         if (summary) {
2181 |           table._readabilityDataTable = true;
2182 |           continue;
2183 |         }
2184 | 
2185 |         var caption = table.getElementsByTagName('caption')[0];
2186 |         if (caption && caption.childNodes.length) {
2187 |           table._readabilityDataTable = true;
2188 |           continue;
2189 |         }
2190 | 
2191 |         // If the table has a descendant with any of these tags, consider a data table:
2192 |         var dataTableDescendants = ['col', 'colgroup', 'tfoot', 'thead', 'th'];
2193 |         var descendantExists = function (tag) {
2194 |           return !!table.getElementsByTagName(tag)[0];
2195 |         };
2196 |         if (dataTableDescendants.some(descendantExists)) {
2197 |           this.log('Data table because found data-y descendant');
2198 |           table._readabilityDataTable = true;
2199 |           continue;
2200 |         }
2201 | 
2202 |         // Nested tables indicate a layout table:
2203 |         if (table.getElementsByTagName('table')[0]) {
2204 |           table._readabilityDataTable = false;
2205 |           continue;
2206 |         }
2207 | 
2208 |         var sizeInfo = this._getRowAndColumnCount(table);
2209 | 
2210 |         if (sizeInfo.columns == 1 || sizeInfo.rows == 1) {
2211 |           // single colum/row tables are commonly used for page layout purposes.
2212 |           table._readabilityDataTable = false;
2213 |           continue;
2214 |         }
2215 | 
2216 |         if (sizeInfo.rows >= 10 || sizeInfo.columns > 4) {
2217 |           table._readabilityDataTable = true;
2218 |           continue;
2219 |         }
2220 |         // Now just go by size entirely:
2221 |         table._readabilityDataTable = sizeInfo.rows * sizeInfo.columns > 10;
2222 |       }
2223 |     },
2224 | 
2225 |     /* convert images and figures that have properties like data-src into images that can be loaded without JS */
    _fixLazyImages(root) {
      // Visits every <img>, <picture> and <figure> under root. For each one it
      // (1) drops a tiny base64 placeholder `src` when another attribute looks
      // like a real image URL, and (2) copies image-looking attribute values
      // into `src` / `srcset` when the element has no usable source or is
      // marked as lazy. Mutates the elements in place; returns nothing.
      this._forEachNode(
        this._getAllNodesWithTag(root, ['img', 'picture', 'figure']),
        function (elem) {
          // In some sites (e.g. Kotaku), they put 1px square image as base64 data uri in the src attribute.
          // So, here we check if the data uri is too short, just might as well remove it.
          if (elem.src && this.REGEXPS.b64DataUrl.test(elem.src)) {
            // Make sure it's not SVG, because SVG can have a meaningful image in under 133 bytes.
            var parts = this.REGEXPS.b64DataUrl.exec(elem.src);
            if (parts[1] === 'image/svg+xml') {
              return;
            }

            // Make sure this element has other attributes which contains image.
            // If it doesn't, then this src is important and shouldn't be removed.
            var srcCouldBeRemoved = false;
            for (var i = 0; i < elem.attributes.length; i++) {
              // `attr` is a function-scoped var; the second loop further down reuses it.
              var attr = elem.attributes[i];
              if (attr.name === 'src') {
                continue;
              }

              if (/\.(jpg|jpeg|png|webp)/i.test(attr.value)) {
                srcCouldBeRemoved = true;
                break;
              }
            }

            // Here we assume if image is less than 100 bytes (or 133 after encoded to base64)
            // it will be too small, therefore it might be placeholder image.
            if (srcCouldBeRemoved) {
              // parts[0] is the text matched by b64DataUrl; whatever follows it in src is the payload.
              var b64starts = parts[0].length;
              var b64length = elem.src.length - b64starts;
              if (b64length < 133) {
                elem.removeAttribute('src');
              }
            }
          }

          // Nothing more to do when the element already has a source and is not flagged as lazy.
          // also check for "null" to work around https://github.com/jsdom/jsdom/issues/2580
          if (
            (elem.src || (elem.srcset && elem.srcset != 'null')) &&
            !elem.className.toLowerCase().includes('lazy')
          ) {
            return;
          }

          // Look through the remaining attributes for values that look like image URLs.
          for (var j = 0; j < elem.attributes.length; j++) {
            attr = elem.attributes[j];
            if (attr.name === 'src' || attr.name === 'srcset' || attr.name === 'alt') {
              continue;
            }
            var copyTo = null;
            // An image extension followed by whitespace and a digit looks like a srcset
            // entry; a single whitespace-free token containing an image extension
            // looks like a plain src.
            if (/\.(jpg|jpeg|png|webp)\s+\d/.test(attr.value)) {
              copyTo = 'srcset';
            } else if (/^\s*\S+\.(jpg|jpeg|png|webp)\S*\s*$/.test(attr.value)) {
              copyTo = 'src';
            }
            if (copyTo) {
              //if this is an img or picture, set the attribute directly
              if (elem.tagName === 'IMG' || elem.tagName === 'PICTURE') {
                elem.setAttribute(copyTo, attr.value);
              } else if (
                elem.tagName === 'FIGURE' &&
                !this._getAllNodesWithTag(elem, ['img', 'picture']).length
              ) {
                //if the item is a <figure> that does not contain an image or picture, create one and place it inside the figure
                //see the nytimes-3 testcase for an example
                var img = this._doc.createElement('img');
                img.setAttribute(copyTo, attr.value);
                elem.appendChild(img);
              }
            }
          }
        },
      );
    },
2303 | 
2304 |     _getTextDensity(e, tags) {
2305 |       var textLength = this._getInnerText(e, true).length;
2306 |       if (textLength === 0) {
2307 |         return 0;
2308 |       }
2309 |       var childrenLength = 0;
2310 |       var children = this._getAllNodesWithTag(e, tags);
2311 |       this._forEachNode(
2312 |         children,
2313 |         (child) => (childrenLength += this._getInnerText(child, true).length),
2314 |       );
2315 |       return childrenLength / textLength;
2316 |     },
2317 | 
2318 |     /**
2319 |      * Clean an element of all tags of type "tag" if they look fishy.
2320 |      * "Fishy" is an algorithm based on content length, classnames, link density, number of images & embeds, etc.
2321 |      *
2322 |      * @return void
2323 |      **/
2324 |     _cleanConditionally(e, tag) {
2325 |       if (!this._flagIsActive(this.FLAG_CLEAN_CONDITIONALLY)) {
2326 |         return;
2327 |       }
2328 | 
2329 |       // Gather counts for other typical elements embedded within.
2330 |       // Traverse backwards so we can remove nodes at the same time
2331 |       // without effecting the traversal.
2332 |       //
2333 |       // TODO: Consider taking into account original contentScore here.
2334 |       this._removeNodes(this._getAllNodesWithTag(e, [tag]), function (node) {
2335 |         // First check if this node IS data table, in which case don't remove it.
2336 |         var isDataTable = function (t) {
2337 |           return t._readabilityDataTable;
2338 |         };
2339 | 
2340 |         var isList = tag === 'ul' || tag === 'ol';
2341 |         if (!isList) {
2342 |           var listLength = 0;
2343 |           var listNodes = this._getAllNodesWithTag(node, ['ul', 'ol']);
2344 |           this._forEachNode(listNodes, (list) => (listLength += this._getInnerText(list).length));
2345 |           isList = listLength / this._getInnerText(node).length > 0.9;
2346 |         }
2347 | 
2348 |         if (tag === 'table' && isDataTable(node)) {
2349 |           return false;
2350 |         }
2351 | 
2352 |         // Next check if we're inside a data table, in which case don't remove it as well.
2353 |         if (this._hasAncestorTag(node, 'table', -1, isDataTable)) {
2354 |           return false;
2355 |         }
2356 | 
2357 |         if (this._hasAncestorTag(node, 'code')) {
2358 |           return false;
2359 |         }
2360 | 
2361 |         // keep element if it has a data tables
2362 |         if ([...node.getElementsByTagName('table')].some((tbl) => tbl._readabilityDataTable)) {
2363 |           return false;
2364 |         }
2365 | 
2366 |         var weight = this._getClassWeight(node);
2367 | 
2368 |         this.log('Cleaning Conditionally', node);
2369 | 
2370 |         var contentScore = 0;
2371 | 
2372 |         if (weight + contentScore < 0) {
2373 |           return true;
2374 |         }
2375 | 
2376 |         if (this._getCharCount(node, ',') < 10) {
2377 |           // If there are not very many commas, and the number of
2378 |           // non-paragraph elements is more than paragraphs or other
2379 |           // ominous signs, remove the element.
2380 |           var p = node.getElementsByTagName('p').length;
2381 |           var img = node.getElementsByTagName('img').length;
2382 |           var li = node.getElementsByTagName('li').length - 100;
2383 |           var input = node.getElementsByTagName('input').length;
2384 |           var headingDensity = this._getTextDensity(node, ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']);
2385 | 
2386 |           var embedCount = 0;
2387 |           var embeds = this._getAllNodesWithTag(node, ['object', 'embed', 'iframe']);
2388 | 
2389 |           for (var i = 0; i < embeds.length; i++) {
2390 |             // If this embed has attribute that matches video regex, don't delete it.
2391 |             for (var j = 0; j < embeds[i].attributes.length; j++) {
2392 |               if (this._allowedVideoRegex.test(embeds[i].attributes[j].value)) {
2393 |                 return false;
2394 |               }
2395 |             }
2396 | 
2397 |             // For embed with <object> tag, check inner HTML as well.
2398 |             if (
2399 |               embeds[i].tagName === 'object' &&
2400 |               this._allowedVideoRegex.test(embeds[i].innerHTML)
2401 |             ) {
2402 |               return false;
2403 |             }
2404 | 
2405 |             embedCount++;
2406 |           }
2407 | 
2408 |           var innerText = this._getInnerText(node);
2409 | 
2410 |           // toss any node whose inner text contains nothing but suspicious words
2411 |           if (this.REGEXPS.adWords.test(innerText) || this.REGEXPS.loadingWords.test(innerText)) {
2412 |             return true;
2413 |           }
2414 | 
2415 |           var contentLength = innerText.length;
2416 |           var linkDensity = this._getLinkDensity(node);
2417 |           var textishTags = ['SPAN', 'LI', 'TD'].concat(Array.from(this.DIV_TO_P_ELEMS));
2418 |           var textDensity = this._getTextDensity(node, textishTags);
2419 |           var isFigureChild = this._hasAncestorTag(node, 'figure');
2420 | 
2421 |           // apply shadiness checks, then check for exceptions
2422 |           const shouldRemoveNode = () => {
2423 |             const errs = [];
2424 |             if (!isFigureChild && img > 1 && p / img < 0.5) {
2425 |               errs.push(`Bad p to img ratio (img=${img}, p=${p})`);
2426 |             }
2427 |             if (!isList && li > p) {
2428 |               errs.push(`Too many li's outside of a list. (li=${li} > p=${p})`);
2429 |             }
2430 |             if (input > Math.floor(p / 3)) {
2431 |               errs.push(`Too many inputs per p. (input=${input}, p=${p})`);
2432 |             }
2433 |             if (
2434 |               !isList &&
2435 |               !isFigureChild &&
2436 |               headingDensity < 0.9 &&
2437 |               contentLength < 25 &&
2438 |               (img === 0 || img > 2) &&
2439 |               linkDensity > 0
2440 |             ) {
2441 |               errs.push(
2442 |                 `Suspiciously short. (headingDensity=${headingDensity}, img=${img}, linkDensity=${linkDensity})`,
2443 |               );
2444 |             }
2445 |             if (!isList && weight < 25 && linkDensity > 0.2 + this._linkDensityModifier) {
2446 |               errs.push(`Low weight and a little linky. (linkDensity=${linkDensity})`);
2447 |             }
2448 |             if (weight >= 25 && linkDensity > 0.5 + this._linkDensityModifier) {
2449 |               errs.push(`High weight and mostly links. (linkDensity=${linkDensity})`);
2450 |             }
2451 |             if ((embedCount === 1 && contentLength < 75) || embedCount > 1) {
2452 |               errs.push(
2453 |                 `Suspicious embed. (embedCount=${embedCount}, contentLength=${contentLength})`,
2454 |               );
2455 |             }
2456 |             if (img === 0 && textDensity === 0) {
2457 |               errs.push(`No useful content. (img=${img}, textDensity=${textDensity})`);
2458 |             }
2459 | 
2460 |             if (errs.length) {
2461 |               this.log('Checks failed', errs);
2462 |               return true;
2463 |             }
2464 | 
2465 |             return false;
2466 |           };
2467 | 
2468 |           var haveToRemove = shouldRemoveNode();
2469 | 
2470 |           // Allow simple lists of images to remain in pages
2471 |           if (isList && haveToRemove) {
2472 |             for (var x = 0; x < node.children.length; x++) {
2473 |               let child = node.children[x];
2474 |               // Don't filter in lists with li's that contain more than one child
2475 |               if (child.children.length > 1) {
2476 |                 return haveToRemove;
2477 |               }
2478 |             }
2479 |             let li_count = node.getElementsByTagName('li').length;
2480 |             // Only allow the list to remain if every li contains an image
2481 |             if (img == li_count) {
2482 |               return false;
2483 |             }
2484 |           }
2485 |           return haveToRemove;
2486 |         }
2487 |         return false;
2488 |       });
2489 |     },
2490 | 
2491 |     /**
2492 |      * Clean out elements that match the specified conditions
2493 |      *
2494 |      * @param Element
2495 |      * @param Function determines whether a node should be removed
2496 |      * @return void
2497 |      **/
2498 |     _cleanMatchedNodes(e, filter) {
2499 |       var endOfSearchMarkerNode = this._getNextNode(e, true);
2500 |       var next = this._getNextNode(e);
2501 |       while (next && next != endOfSearchMarkerNode) {
2502 |         if (filter.call(this, next, next.className + ' ' + next.id)) {
2503 |           next = this._removeAndGetNext(next);
2504 |         } else {
2505 |           next = this._getNextNode(next);
2506 |         }
2507 |       }
2508 |     },
2509 | 
2510 |     /**
2511 |      * Clean out spurious headers from an Element.
2512 |      *
2513 |      * @param Element
2514 |      * @return void
2515 |      **/
2516 |     _cleanHeaders(e) {
2517 |       let headingNodes = this._getAllNodesWithTag(e, ['h1', 'h2']);
2518 |       this._removeNodes(headingNodes, function (node) {
2519 |         let shouldRemove = this._getClassWeight(node) < 0;
2520 |         if (shouldRemove) {
2521 |           this.log('Removing header with low class weight:', node);
2522 |         }
2523 |         return shouldRemove;
2524 |       });
2525 |     },
2526 | 
2527 |     /**
2528 |      * Check if this node is an H1 or H2 element whose content is mostly
2529 |      * the same as the article title.
2530 |      *
2531 |      * @param Element  the node to check.
2532 |      * @return boolean indicating whether this is a title-like header.
2533 |      */
2534 |     _headerDuplicatesTitle(node) {
2535 |       if (node.tagName != 'H1' && node.tagName != 'H2') {
2536 |         return false;
2537 |       }
2538 |       var heading = this._getInnerText(node, false);
2539 |       this.log('Evaluating similarity of header:', heading, this._articleTitle);
2540 |       return this._textSimilarity(this._articleTitle, heading) > 0.75;
2541 |     },
2542 | 
2543 |     _flagIsActive(flag) {
2544 |       return (this._flags & flag) > 0;
2545 |     },
2546 | 
2547 |     _removeFlag(flag) {
2548 |       this._flags = this._flags & ~flag;
2549 |     },
2550 | 
2551 |     _isProbablyVisible(node) {
2552 |       // Have to null-check node.style and node.className.includes to deal with SVG and MathML nodes.
2553 |       return (
2554 |         (!node.style || node.style.display != 'none') &&
2555 |         (!node.style || node.style.visibility != 'hidden') &&
2556 |         !node.hasAttribute('hidden') &&
2557 |         //check for "fallback-image" so that wikimedia math images are displayed
2558 |         (!node.hasAttribute('aria-hidden') ||
2559 |           node.getAttribute('aria-hidden') != 'true' ||
2560 |           (node.className && node.className.includes && node.className.includes('fallback-image')))
2561 |       );
2562 |     },
2563 | 
2564 |     /**
2565 |      * Runs readability.
2566 |      *
2567 |      * Workflow:
2568 |      *  1. Prep the document by removing script tags, css, etc.
2569 |      *  2. Build readability's DOM tree.
2570 |      *  3. Grab the article content from the current dom tree.
2571 |      *  4. Replace the current DOM tree with the new one.
2572 |      *  5. Read peacefully.
2573 |      *
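2574 |      * @return Object|null the extracted article (title, byline, dir, lang, content, textContent, length, excerpt, siteName, publishedTime), or null when no article content was grabbed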
2575 |      **/
    parse() {
      // Entry point: runs the extraction pipeline on this._doc and returns the
      // article as a plain object, or null when _grabArticle() yields nothing.
      // Throws when the document has more elements than _maxElemsToParse allows.
      // The steps below mutate this._doc and must run in this order.

      // Avoid parsing too large documents, as per configuration option
      if (this._maxElemsToParse > 0) {
        var numTags = this._doc.getElementsByTagName('*').length;
        if (numTags > this._maxElemsToParse) {
          throw new Error('Aborting parsing document; ' + numTags + ' elements found');
        }
      }

      // Unwrap image from noscript
      this._unwrapNoscriptImages(this._doc);

      // Extract JSON-LD metadata before removing scripts
      var jsonLd = this._disableJSONLD ? {} : this._getJSONLD(this._doc);

      // Remove script tags from the document.
      this._removeScripts(this._doc);

      this._prepDocument();

      // Metadata is cached on the instance; the title is also stored separately
      // because _headerDuplicatesTitle() compares headings against it.
      var metadata = this._getArticleMetadata(jsonLd);
      this._metadata = metadata;
      this._articleTitle = metadata.title;

      var articleContent = this._grabArticle();
      if (!articleContent) {
        return null;
      }

      this.log('Grabbed: ' + articleContent.innerHTML);

      this._postProcessContent(articleContent);

      // If we haven't found an excerpt in the article's metadata, use the article's
      // first paragraph as the excerpt. This is used for displaying a preview of
      // the article's content.
      if (!metadata.excerpt) {
        var paragraphs = articleContent.getElementsByTagName('p');
        if (paragraphs.length) {
          metadata.excerpt = paragraphs[0].textContent.trim();
        }
      }

      // `content` is the serialized article element, `textContent` its plain text,
      // and `length` the character count of that plain text. Metadata values win
      // over the instance's _articleByline / _articleSiteName fallbacks.
      var textContent = articleContent.textContent;
      return {
        title: this._articleTitle,
        byline: metadata.byline || this._articleByline,
        dir: this._articleDir,
        lang: this._articleLang,
        content: this._serializer(articleContent),
        textContent,
        length: textContent.length,
        excerpt: metadata.excerpt,
        siteName: metadata.siteName || this._articleSiteName,
        publishedTime: metadata.publishedTime,
      };
    },
2633 |   };
2634 | 
  // Export Readability CommonJS-style, but only when a `module` object exists
  // in this scope; otherwise the class simply stays available to the code below.
  if (typeof module === 'object') {
    /* global module */
    module.exports = Readability;
  }
2639 | 
2640 |   /**
2641 |    * Web Fetcher Helper Content Script
2642 |    * Handles fetching HTML content, text content, and interactive elements from the current page
2643 |    * Supports Readability for better content extraction
2644 |    */
2645 | 
2646 |   // Configuration
  const config = {
    // Elements that should be ignored when extracting content (used for iframe content and fallback extraction)
    // Each entry is a CSS selector; <header>/<footer> nested inside an <article> are exempt.
    ignoreElements: [
      'nav',
      'header:not(article header)',
      'footer:not(article footer)',
      'aside',
      'script',
      'style',
      'noscript',
      'iframe[src*="ads"]',
      '.cookie-notice',
      '.ad',
      '.ads',
      '.advertisement',
      '.banner',
      '.popup',
      '.modal',
      '.overlay',
      '.social-share',
      '.social-links',
      '.related-articles',
      '.comments',
    ],
    // Minimum trimmed length (in characters) for iframe text to be included in the result
    minTextLength: 20,
    // Upper bound (in characters) applied by cleanContent() to the returned text
    maxTotalLength: 100000,
    // NOTE(review): presumably a minimum paragraph length for fallback extraction; confirm against its users
    minParagraphLength: 2,
  };
2675 | 
2676 |   // Listen for messages from the extension
2677 |   chrome.runtime.onMessage.addListener((request, _sender, sendResponse) => {
2678 |     const pingActions = ['search_tabs_content_ping', 'chrome_web_fetcher_ping'];
2679 |     // Respond to ping message
2680 |     if (pingActions.includes(request.action)) {
2681 |       sendResponse({ status: 'pong' });
2682 |       return false; // Synchronous response
2683 |     }
2684 | 
2685 |     // Get HTML content
2686 |     else if (request.action === 'getHtmlContent') {
2687 |       try {
2688 |         let rawHtml;
2689 | 
2690 |         // If selector is specified, only get content from the matching element
2691 |         if (request.selector) {
2692 |           const element = document.querySelector(request.selector);
2693 |           if (element) {
2694 |             rawHtml = element.outerHTML;
2695 |           } else {
2696 |             throw new Error(`No element found matching selector: ${request.selector}`);
2697 |           }
2698 |         } else {
2699 |           // Otherwise get the entire page content
2700 |           rawHtml = document.documentElement.outerHTML;
2701 |         }
2702 | 
2703 |         const cleanedHtml = cleanHtmlContent(rawHtml);
2704 | 
2705 |         sendResponse({
2706 |           success: true,
2707 |           htmlContent: cleanedHtml,
2708 |           selector: request.selector,
2709 |         });
2710 |       } catch (error) {
2711 |         sendResponse({
2712 |           success: false,
2713 |           error: `Failed to get HTML content: ${error.message}`,
2714 |         });
2715 |       }
2716 |     }
2717 | 
2718 |     // Get text content
2719 |     else if (request.action === 'getTextContent') {
2720 |       try {
2721 |         // If selector is specified, only get content from the matching element
2722 |         if (request.selector) {
2723 |           const element = document.querySelector(request.selector);
2724 |           if (element) {
2725 |             // Directly get the text content of the element
2726 |             const textContent = element.innerText;
2727 | 
2728 |             sendResponse({
2729 |               success: true,
2730 |               textContent: textContent,
2731 |               selector: request.selector,
2732 |             });
2733 |           } else {
2734 |             throw new Error(`No element found matching selector: ${request.selector}`);
2735 |           }
2736 |         } else {
2737 |           // Otherwise use Readability to extract the main content
2738 |           const documentClone = document.cloneNode(true);
2739 | 
2740 |           const reader = new Readability(documentClone);
2741 |           const article = reader.parse();
2742 | 
2743 |           if (article && article.textContent) {
2744 |             // Get metadata
2745 |             const metadata = extractPageMetadata();
2746 | 
2747 |             // Get iframe content if available
2748 |             const iframeContent = extractIframeContent();
2749 | 
2750 |             // Combine content
2751 |             let fullContent = article.textContent;
2752 |             if (iframeContent && iframeContent.trim().length > config.minTextLength) {
2753 |               fullContent += '\n\n--- Embedded Content ---\n\n' + iframeContent;
2754 |             }
2755 | 
2756 |             // Clean content
2757 |             fullContent = cleanContent(fullContent);
2758 | 
2759 |             sendResponse({
2760 |               success: true,
2761 |               textContent: fullContent,
2762 |               article: {
2763 |                 title: article.title,
2764 |                 byline: article.byline,
2765 |                 siteName: article.siteName,
2766 |                 excerpt: article.excerpt,
2767 |                 lang: article.lang,
2768 |                 content: article.content, // HTML content
2769 |               },
2770 |               metadata: metadata,
2771 |             });
2772 |           } else {
2773 |             // Fallback to basic extraction
2774 |             const textContent = document.body.innerText;
2775 |             sendResponse({
2776 |               success: true,
2777 |               textContent: textContent,
2778 |               fallback: true,
2779 |             });
2780 |           }
2781 |         }
2782 |       } catch (error) {
2783 |         console.error('Error extracting text content:', error);
2784 |         sendResponse({
2785 |           success: false,
2786 |           error: `Failed to extract text content: ${error.message}`,
2787 |         });
2788 |       }
2789 | 
2790 |       return true; // Async response
2791 |     }
2792 | 
2793 |     // Interactive elements feature has been removed
2794 | 
2795 |     return true; // Async response
2796 |   });
2797 | 
2798 |   /**
2799 |    * Extract metadata from the page
2800 |    * @returns {Object} - Page metadata
2801 |    */
2802 |   function extractPageMetadata() {
2803 |     const metadata = {
2804 |       title: document.title,
2805 |       description: '',
2806 |       author: '',
2807 |       keywords: '',
2808 |       published: '',
2809 |       siteName: '',
2810 |     };
2811 | 
2812 |     // Extract description
2813 |     const descriptionElement = document.querySelector(
2814 |       'meta[name="description"], meta[property="og:description"]',
2815 |     );
2816 |     if (descriptionElement) {
2817 |       metadata.description = descriptionElement.getAttribute('content') || '';
2818 |     }
2819 | 
2820 |     // Extract author
2821 |     const authorElement = document.querySelector(
2822 |       'meta[name="author"], meta[property="article:author"]',
2823 |     );
2824 |     if (authorElement) {
2825 |       metadata.author = authorElement.getAttribute('content') || '';
2826 |     }
2827 | 
2828 |     // Extract keywords
2829 |     const keywordsElement = document.querySelector('meta[name="keywords"]');
2830 |     if (keywordsElement) {
2831 |       metadata.keywords = keywordsElement.getAttribute('content') || '';
2832 |     }
2833 | 
2834 |     // Extract published date
2835 |     const publishedElement = document.querySelector(
2836 |       'meta[property="article:published_time"], time[datetime]',
2837 |     );
2838 |     if (publishedElement) {
2839 |       metadata.published =
2840 |         publishedElement.getAttribute('content') || publishedElement.getAttribute('datetime') || '';
2841 |     }
2842 | 
2843 |     // Extract site name
2844 |     const siteNameElement = document.querySelector('meta[property="og:site_name"]');
2845 |     if (siteNameElement) {
2846 |       metadata.siteName = siteNameElement.getAttribute('content') || '';
2847 |     }
2848 | 
2849 |     return metadata;
2850 |   }
2851 | 
2852 |   /**
2853 |    * Extract content from iframes
2854 |    * @returns {string} - Combined iframe content
2855 |    */
2856 |   function extractIframeContent() {
2857 |     let allIframeText = '';
2858 |     const iframes = document.querySelectorAll('iframe');
2859 | 
2860 |     for (const iframe of iframes) {
2861 |       try {
2862 |         if (isSameOrigin(iframe) && isElementVisible(iframe)) {
2863 |           const doc = iframe.contentDocument || iframe.contentWindow?.document;
2864 |           if (doc) {
2865 |             const iframeText = doc.body.innerText;
2866 |             if (iframeText && iframeText.trim().length >= config.minTextLength) {
2867 |               allIframeText += iframeText.trim() + '\n\n';
2868 |             }
2869 |           }
2870 |         }
2871 |       } catch (error) {
2872 |         console.warn(
2873 |           `Cannot access iframe content (possible cross-origin restriction): ${error.message}`,
2874 |         );
2875 |       }
2876 |     }
2877 | 
2878 |     return allIframeText.trim();
2879 |   }
2880 | 
2881 |   /**
2882 |    * Check if iframe is same origin
2883 |    * @param {HTMLIFrameElement} iframe - The iframe to check
2884 |    * @returns {boolean} - Whether the iframe is same origin
2885 |    */
2886 |   function isSameOrigin(iframe) {
2887 |     try {
2888 |       return Boolean(iframe.contentDocument || iframe.contentWindow?.document);
2889 |     } catch (e) {
2890 |       return false;
2891 |     }
2892 |   }
2893 | 
2894 |   /**
2895 |    * Clean content text
2896 |    * @param {string} text - The text to clean
2897 |    * @returns {string} - Cleaned text
2898 |    */
2899 |   function cleanContent(text) {
2900 |     return text
2901 |       .replace(/\s+/g, ' ')
2902 |       .replace(/\n\s*\n/g, '\n\n')
2903 |       .trim()
2904 |       .substring(0, config.maxTotalLength);
2905 |   }
2906 | 
2907 |   /**
2908 |    * Clean HTML content by removing style tags and their content
2909 |    * @param {string} html - The HTML content to clean
2910 |    * @returns {string} - Cleaned HTML content
2911 |    */
2912 |   function cleanHtmlContent(html) {
2913 |     // Create a new document parser
2914 |     const parser = new DOMParser();
2915 |     const doc = parser.parseFromString(html, 'text/html');
2916 | 
2917 |     // Remove all style tags
2918 |     const styleElements = doc.querySelectorAll('style');
2919 |     styleElements.forEach((element) => {
2920 |       if (element.parentNode) {
2921 |         element.parentNode.removeChild(element);
2922 |       }
2923 |     });
2924 | 
2925 |     // Remove all inline style attributes
2926 |     const allElementsWithStyle = doc.querySelectorAll('*');
2927 |     allElementsWithStyle.forEach((element) => {
2928 |       element.removeAttribute('style');
2929 |     });
2930 | 
2931 |     // Remove all link tags
2932 |     const linkElements = doc.querySelectorAll('link');
2933 |     linkElements.forEach((element) => {
2934 |       if (element.parentNode) {
2935 |         element.parentNode.removeChild(element);
2936 |       }
2937 |     });
2938 | 
2939 |     // Remove all script tags
2940 |     const scriptElements = doc.querySelectorAll('script');
2941 |     scriptElements.forEach((element) => {
2942 |       if (element.parentNode) {
2943 |         element.parentNode.removeChild(element);
2944 |       }
2945 |     });
2946 | 
2947 |     // Replace all SVG elements with placeholders
2948 |     const svgElements = doc.querySelectorAll('svg');
2949 |     svgElements.forEach((element) => {
2950 |       if (element.parentNode) {
2951 |         // Create a placeholder element
2952 |         const placeholder = doc.createElement('span');
2953 |         placeholder.textContent = '[SVG Icon]';
2954 |         placeholder.setAttribute('data-placeholder', 'svg-icon');
2955 | 
2956 |         // Replace SVG element
2957 |         element.parentNode.replaceChild(placeholder, element);
2958 |       }
2959 |     });
2960 | 
2961 |     // Replace all SVG images and objects
2962 |     const svgImages = doc.querySelectorAll(
2963 |       'img[src$=".svg"], object[data$=".svg"], embed[src$=".svg"]',
2964 |     );
2965 |     svgImages.forEach((element) => {
2966 |       if (element.parentNode) {
2967 |         // Create a placeholder element
2968 |         const placeholder = doc.createElement('span');
2969 |         placeholder.textContent = '[SVG Image]';
2970 |         placeholder.setAttribute('data-placeholder', 'svg-image');
2971 |         if (element.alt) {
2972 |           placeholder.textContent = `[SVG Image: ${element.alt}]`;
2973 |         }
2974 | 
2975 |         // Replace SVG image element
2976 |         element.parentNode.replaceChild(placeholder, element);
2977 |       }
2978 |     });
2979 | 
2980 |     // Remove elements with only data-* attributes, no children, and no class or style
2981 |     const allElements = Array.from(doc.querySelectorAll('*'));
2982 |     allElements.forEach((element) => {
2983 |       // Check if element has only data-* attributes
2984 |       let hasOnlyDataAttributes = true;
2985 |       let hasDataAttribute = false;
2986 | 
2987 |       // Check all attributes
2988 |       for (let i = 0; i < element.attributes.length; i++) {
2989 |         const attr = element.attributes[i];
2990 |         if (attr.name.startsWith('data-')) {
2991 |           hasDataAttribute = true;
2992 |         } else if (attr.name !== 'id') {
2993 |           // Allow id attribute
2994 |           hasOnlyDataAttributes = false;
2995 |           break;
2996 |         }
2997 |       }
2998 | 
2999 |       // If element has only data-* attributes, no children, and no text content
3000 |       if (
3001 |         hasOnlyDataAttributes &&
3002 |         hasDataAttribute &&
3003 |         element.children.length === 0 &&
3004 |         element.textContent.trim() === ''
3005 |       ) {
3006 |         // Remove the element
3007 |         if (element.parentNode) {
3008 |           element.parentNode.removeChild(element);
3009 |         }
3010 |       }
3011 |     });
3012 | 
3013 |     // Remove all HTML comments
3014 |     const removeComments = (node) => {
3015 |       const childNodes = node.childNodes;
3016 |       for (let i = childNodes.length - 1; i >= 0; i--) {
3017 |         const child = childNodes[i];
3018 |         if (child.nodeType === 8) {
3019 |           // Comment node
3020 |           node.removeChild(child);
3021 |         } else if (child.nodeType === 1) {
3022 |           // Element node
3023 |           removeComments(child);
3024 |         }
3025 |       }
3026 |     };
3027 |     removeComments(doc);
3028 | 
3029 |     // Return cleaned HTML
3030 |     return new XMLSerializer().serializeToString(doc);
3031 |   }
3032 | 
3033 |   // Interactive elements feature has been removed
3034 | 
3035 |   // Selector generation feature has been removed
3036 | }
3037 | 
```