This is page 8 of 10. Use http://codebase.md/hangwin/mcp-chrome?lines=true&page={x} to view the full context.
# Directory Structure
```
├── .gitattributes
├── .github
│ └── workflows
│ └── build-release.yml
├── .gitignore
├── .husky
│ ├── commit-msg
│ └── pre-commit
├── .prettierignore
├── .prettierrc.json
├── .vscode
│ └── extensions.json
├── app
│ ├── chrome-extension
│ │ ├── _locales
│ │ │ ├── de
│ │ │ │ └── messages.json
│ │ │ ├── en
│ │ │ │ └── messages.json
│ │ │ ├── ja
│ │ │ │ └── messages.json
│ │ │ ├── ko
│ │ │ │ └── messages.json
│ │ │ ├── zh_CN
│ │ │ │ └── messages.json
│ │ │ └── zh_TW
│ │ │ └── messages.json
│ │ ├── .env.example
│ │ ├── assets
│ │ │ └── vue.svg
│ │ ├── common
│ │ │ ├── constants.ts
│ │ │ ├── message-types.ts
│ │ │ └── tool-handler.ts
│ │ ├── entrypoints
│ │ │ ├── background
│ │ │ │ ├── index.ts
│ │ │ │ ├── native-host.ts
│ │ │ │ ├── semantic-similarity.ts
│ │ │ │ ├── storage-manager.ts
│ │ │ │ └── tools
│ │ │ │ ├── base-browser.ts
│ │ │ │ ├── browser
│ │ │ │ │ ├── bookmark.ts
│ │ │ │ │ ├── common.ts
│ │ │ │ │ ├── console.ts
│ │ │ │ │ ├── file-upload.ts
│ │ │ │ │ ├── history.ts
│ │ │ │ │ ├── index.ts
│ │ │ │ │ ├── inject-script.ts
│ │ │ │ │ ├── interaction.ts
│ │ │ │ │ ├── keyboard.ts
│ │ │ │ │ ├── network-capture-debugger.ts
│ │ │ │ │ ├── network-capture-web-request.ts
│ │ │ │ │ ├── network-request.ts
│ │ │ │ │ ├── screenshot.ts
│ │ │ │ │ ├── vector-search.ts
│ │ │ │ │ ├── web-fetcher.ts
│ │ │ │ │ └── window.ts
│ │ │ │ └── index.ts
│ │ │ ├── content.ts
│ │ │ ├── offscreen
│ │ │ │ ├── index.html
│ │ │ │ └── main.ts
│ │ │ └── popup
│ │ │ ├── App.vue
│ │ │ ├── components
│ │ │ │ ├── ConfirmDialog.vue
│ │ │ │ ├── icons
│ │ │ │ │ ├── BoltIcon.vue
│ │ │ │ │ ├── CheckIcon.vue
│ │ │ │ │ ├── DatabaseIcon.vue
│ │ │ │ │ ├── DocumentIcon.vue
│ │ │ │ │ ├── index.ts
│ │ │ │ │ ├── TabIcon.vue
│ │ │ │ │ ├── TrashIcon.vue
│ │ │ │ │ └── VectorIcon.vue
│ │ │ │ ├── ModelCacheManagement.vue
│ │ │ │ └── ProgressIndicator.vue
│ │ │ ├── index.html
│ │ │ ├── main.ts
│ │ │ └── style.css
│ │ ├── eslint.config.js
│ │ ├── inject-scripts
│ │ │ ├── click-helper.js
│ │ │ ├── fill-helper.js
│ │ │ ├── inject-bridge.js
│ │ │ ├── interactive-elements-helper.js
│ │ │ ├── keyboard-helper.js
│ │ │ ├── network-helper.js
│ │ │ ├── screenshot-helper.js
│ │ │ └── web-fetcher-helper.js
│ │ ├── LICENSE
│ │ ├── package.json
│ │ ├── public
│ │ │ ├── icon
│ │ │ │ ├── 128.png
│ │ │ │ ├── 16.png
│ │ │ │ ├── 32.png
│ │ │ │ ├── 48.png
│ │ │ │ └── 96.png
│ │ │ ├── libs
│ │ │ │ └── ort.min.js
│ │ │ └── wxt.svg
│ │ ├── README.md
│ │ ├── tsconfig.json
│ │ ├── utils
│ │ │ ├── content-indexer.ts
│ │ │ ├── i18n.ts
│ │ │ ├── image-utils.ts
│ │ │ ├── lru-cache.ts
│ │ │ ├── model-cache-manager.ts
│ │ │ ├── offscreen-manager.ts
│ │ │ ├── semantic-similarity-engine.ts
│ │ │ ├── simd-math-engine.ts
│ │ │ ├── text-chunker.ts
│ │ │ └── vector-database.ts
│ │ ├── workers
│ │ │ ├── ort-wasm-simd-threaded.jsep.mjs
│ │ │ ├── ort-wasm-simd-threaded.jsep.wasm
│ │ │ ├── ort-wasm-simd-threaded.mjs
│ │ │ ├── ort-wasm-simd-threaded.wasm
│ │ │ ├── simd_math_bg.wasm
│ │ │ ├── simd_math.js
│ │ │ └── similarity.worker.js
│ │ └── wxt.config.ts
│ └── native-server
│ ├── debug.sh
│ ├── install.md
│ ├── jest.config.js
│ ├── package.json
│ ├── README.md
│ ├── src
│ │ ├── cli.ts
│ │ ├── constant
│ │ │ └── index.ts
│ │ ├── file-handler.ts
│ │ ├── index.ts
│ │ ├── mcp
│ │ │ ├── mcp-server-stdio.ts
│ │ │ ├── mcp-server.ts
│ │ │ ├── register-tools.ts
│ │ │ └── stdio-config.json
│ │ ├── native-messaging-host.ts
│ │ ├── scripts
│ │ │ ├── browser-config.ts
│ │ │ ├── build.ts
│ │ │ ├── constant.ts
│ │ │ ├── postinstall.ts
│ │ │ ├── register-dev.ts
│ │ │ ├── register.ts
│ │ │ ├── run_host.bat
│ │ │ ├── run_host.sh
│ │ │ └── utils.ts
│ │ ├── server
│ │ │ ├── index.ts
│ │ │ └── server.test.ts
│ │ └── util
│ │ └── logger.ts
│ └── tsconfig.json
├── commitlint.config.cjs
├── docs
│ ├── ARCHITECTURE_zh.md
│ ├── ARCHITECTURE.md
│ ├── CHANGELOG.md
│ ├── CONTRIBUTING_zh.md
│ ├── CONTRIBUTING.md
│ ├── TOOLS_zh.md
│ ├── TOOLS.md
│ ├── TROUBLESHOOTING_zh.md
│ ├── TROUBLESHOOTING.md
│ └── WINDOWS_INSTALL_zh.md
├── eslint.config.js
├── LICENSE
├── package.json
├── packages
│ ├── shared
│ │ ├── package.json
│ │ ├── src
│ │ │ ├── constants.ts
│ │ │ ├── index.ts
│ │ │ ├── tools.ts
│ │ │ └── types.ts
│ │ └── tsconfig.json
│ └── wasm-simd
│ ├── .gitignore
│ ├── BUILD.md
│ ├── Cargo.toml
│ ├── package.json
│ ├── README.md
│ └── src
│ └── lib.rs
├── pnpm-lock.yaml
├── pnpm-workspace.yaml
├── prompt
│ ├── content-analize.md
│ ├── excalidraw-prompt.md
│ └── modify-web.md
├── README_zh.md
├── README.md
├── releases
│ ├── chrome-extension
│ │ └── latest
│ │ └── chrome-mcp-server-lastest.zip
│ └── README.md
└── test-inject-script.js
```
# Files
--------------------------------------------------------------------------------
/app/chrome-extension/inject-scripts/web-fetcher-helper.js:
--------------------------------------------------------------------------------
```javascript
1 | /* eslint-disable */
2 |
3 | if (window.__WEB_FETCHER_HELPER_INITIALIZED__) {
4 | // Already initialized, skip
5 | } else {
6 | window.__WEB_FETCHER_HELPER_INITIALIZED__ = true;
7 |
8 | /*
9 | * Copyright (c) 2010 Arc90 Inc
10 | *
11 | * Licensed under the Apache License, Version 2.0 (the "License");
12 | * you may not use this file except in compliance with the License.
13 | * You may obtain a copy of the License at
14 | *
15 | * http://www.apache.org/licenses/LICENSE-2.0
16 | *
17 | * Unless required by applicable law or agreed to in writing, software
18 | * distributed under the License is distributed on an "AS IS" BASIS,
19 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
20 | * See the License for the specific language governing permissions and
21 | * limitations under the License.
22 | */
23 |
24 | /*
25 | * This code is heavily based on Arc90's readability.js (1.7.1) script
26 | * available at: http://code.google.com/p/arc90labs-readability
27 | */
28 |
/**
 * Public constructor.
 *
 * Stores the document to parse and copies the recognised options onto
 * private fields, falling back to the prototype defaults for anything that
 * is missing or falsy.
 *
 * Recognised option keys: `debug`, `maxElemsToParse`, `nbTopCandidates`,
 * `charThreshold`, `classesToPreserve`, `keepClasses`, `serializer`,
 * `disableJSONLD`, `allowedVideoRegex`, `linkDensityModifier`.
 *
 * @param {HTMLDocument} doc The document to parse.
 * @param {Object} [options] The options object.
 * @throws {Error} If no document object (something with a
 *   `documentElement`) was supplied.
 */
function Readability(doc, options) {
  // In some older versions, people passed a URI as the first argument. Cope:
  if (options && options.documentElement) {
    doc = options;
    options = arguments[2];
  } else if (!doc || !doc.documentElement) {
    throw new Error('First argument to Readability constructor should be a document object.');
  }
  options = options || {};

  this._doc = doc;
  this._docJSDOMParser = this._doc.firstChild.__JSDOMParser__;
  this._articleTitle = null;
  this._articleByline = null;
  this._articleDir = null;
  this._articleSiteName = null;
  this._attempts = [];
  this._metadata = {};

  // Configurable options
  this._debug = !!options.debug;
  this._maxElemsToParse = options.maxElemsToParse || this.DEFAULT_MAX_ELEMS_TO_PARSE;
  this._nbTopCandidates = options.nbTopCandidates || this.DEFAULT_N_TOP_CANDIDATES;
  this._charThreshold = options.charThreshold || this.DEFAULT_CHAR_THRESHOLD;
  this._classesToPreserve = this.CLASSES_TO_PRESERVE.concat(options.classesToPreserve || []);
  this._keepClasses = !!options.keepClasses;
  this._serializer =
    options.serializer ||
    function (el) {
      return el.innerHTML;
    };
  this._disableJSONLD = !!options.disableJSONLD;
  this._allowedVideoRegex = options.allowedVideoRegex || this.REGEXPS.videos;
  this._linkDensityModifier = options.linkDensityModifier || 0;

  // Start with all flags set
  this._flags =
    this.FLAG_STRIP_UNLIKELYS | this.FLAG_WEIGHT_CLASSES | this.FLAG_CLEAN_CONDITIONALLY;

  // Control whether log messages are produced at all.
  if (this._debug) {
    // Renders a node as a short human-readable string for log output.
    const logNode = function (node) {
      if (node.nodeType == node.TEXT_NODE) {
        return `${node.nodeName} ("${node.textContent}")`;
      }
      const attrPairs = Array.from(node.attributes || [], function (attr) {
        return `${attr.name}="${attr.value}"`;
      }).join(' ');
      return `<${node.localName} ${attrPairs}>`;
    };

    this.log = function () {
      if (typeof console !== 'undefined') {
        // Console output was deliberately removed from this build, so there
        // is nothing to format here; the former argument-array construction
        // was dead work and has been dropped.
        return;
      }
      if (typeof dump !== 'undefined') {
        /* global dump */
        const msg = Array.prototype.map
          .call(arguments, function (x) {
            return x && x.nodeName ? logNode(x) : x;
          })
          .join(' ');
        dump('Reader: (Readability) ' + msg + '\n');
      }
    };
  } else {
    this.log = function () {};
  }
}
109 |
110 | Readability.prototype = {
111 | FLAG_STRIP_UNLIKELYS: 0x1,
112 | FLAG_WEIGHT_CLASSES: 0x2,
113 | FLAG_CLEAN_CONDITIONALLY: 0x4,
114 |
115 | // https://developer.mozilla.org/en-US/docs/Web/API/Node/nodeType
116 | ELEMENT_NODE: 1,
117 | TEXT_NODE: 3,
118 |
119 | // Max number of nodes supported by this parser. Default: 0 (no limit)
120 | DEFAULT_MAX_ELEMS_TO_PARSE: 0,
121 |
122 | // The number of top candidates to consider when analysing how
123 | // tight the competition is among candidates.
124 | DEFAULT_N_TOP_CANDIDATES: 5,
125 |
126 | // Element tags to score by default.
127 | DEFAULT_TAGS_TO_SCORE: 'section,h2,h3,h4,h5,h6,p,td,pre'.toUpperCase().split(','),
128 |
129 | // The default number of chars an article must have in order to return a result
130 | DEFAULT_CHAR_THRESHOLD: 500,
131 |
132 | // All of the regular expressions in use within readability.
133 | // Defined up here so we don't instantiate them repeatedly in loops.
134 | REGEXPS: {
135 | // NOTE: These two regular expressions are duplicated in
136 | // Readability-readerable.js. Please keep both copies in sync.
137 | unlikelyCandidates:
138 | /-ad-|ai2html|banner|breadcrumbs|combx|comment|community|cover-wrap|disqus|extra|footer|gdpr|header|legends|menu|related|remark|replies|rss|shoutbox|sidebar|skyscraper|social|sponsor|supplemental|ad-break|agegate|pagination|pager|popup|yom-remote/i,
139 | okMaybeItsACandidate: /and|article|body|column|content|main|shadow/i,
140 |
141 | positive:
142 | /article|body|content|entry|hentry|h-entry|main|page|pagination|post|text|blog|story/i,
143 | negative:
144 | /-ad-|hidden|^hid$| hid$| hid |^hid |banner|combx|comment|com-|contact|footer|gdpr|masthead|media|meta|outbrain|promo|related|scroll|share|shoutbox|sidebar|skyscraper|sponsor|shopping|tags|widget/i,
145 | extraneous:
146 | /print|archive|comment|discuss|e[\-]?mail|share|reply|all|login|sign|single|utility/i,
147 | byline: /byline|author|dateline|writtenby|p-author/i,
148 | replaceFonts: /<(\/?)font[^>]*>/gi,
149 | normalize: /\s{2,}/g,
150 | videos:
151 | /\/\/(www\.)?((dailymotion|youtube|youtube-nocookie|player\.vimeo|v\.qq)\.com|(archive|upload\.wikimedia)\.org|player\.twitch\.tv)/i,
152 | shareElements: /(\b|_)(share|sharedaddy)(\b|_)/i,
153 | nextLink: /(next|weiter|continue|>([^\|]|$)|»([^\|]|$))/i,
154 | prevLink: /(prev|earl|old|new|<|«)/i,
155 | tokenize: /\W+/g,
156 | whitespace: /^\s*$/,
157 | hasContent: /\S$/,
158 | hashUrl: /^#.+/,
159 | srcsetUrl: /(\S+)(\s+[\d.]+[xw])?(\s*(?:,|$))/g,
160 | b64DataUrl: /^data:\s*([^\s;,]+)\s*;\s*base64\s*,/i,
161 | // Commas as used in Latin, Sindhi, Chinese and various other scripts.
162 | // see: https://en.wikipedia.org/wiki/Comma#Comma_variants
163 | commas: /\u002C|\u060C|\uFE50|\uFE10|\uFE11|\u2E41|\u2E34|\u2E32|\uFF0C/g,
164 | // See: https://schema.org/Article
165 | jsonLdArticleTypes:
166 | /^Article|AdvertiserContentArticle|NewsArticle|AnalysisNewsArticle|AskPublicNewsArticle|BackgroundNewsArticle|OpinionNewsArticle|ReportageNewsArticle|ReviewNewsArticle|Report|SatiricalArticle|ScholarlyArticle|MedicalScholarlyArticle|SocialMediaPosting|BlogPosting|LiveBlogPosting|DiscussionForumPosting|TechArticle|APIReference$/,
167 | // used to see if a node's content matches words commonly used for ad blocks or loading indicators
168 | adWords: /^(ad(vertising|vertisement)?|pub(licité)?|werb(ung)?|广告|Реклама|Anuncio)$/iu,
169 | loadingWords: /^((loading|正在加载|Загрузка|chargement|cargando)(…|\.\.\.)?)$/iu,
170 | },
171 |
172 | UNLIKELY_ROLES: [
173 | 'menu',
174 | 'menubar',
175 | 'complementary',
176 | 'navigation',
177 | 'alert',
178 | 'alertdialog',
179 | 'dialog',
180 | ],
181 |
182 | DIV_TO_P_ELEMS: new Set(['BLOCKQUOTE', 'DL', 'DIV', 'IMG', 'OL', 'P', 'PRE', 'TABLE', 'UL']),
183 |
184 | ALTER_TO_DIV_EXCEPTIONS: ['DIV', 'ARTICLE', 'SECTION', 'P', 'OL', 'UL'],
185 |
186 | PRESENTATIONAL_ATTRIBUTES: [
187 | 'align',
188 | 'background',
189 | 'bgcolor',
190 | 'border',
191 | 'cellpadding',
192 | 'cellspacing',
193 | 'frame',
194 | 'hspace',
195 | 'rules',
196 | 'style',
197 | 'valign',
198 | 'vspace',
199 | ],
200 |
201 | DEPRECATED_SIZE_ATTRIBUTE_ELEMS: ['TABLE', 'TH', 'TD', 'HR', 'PRE'],
202 |
203 | // The commented out elements qualify as phrasing content but tend to be
204 | // removed by readability when put into paragraphs, so we ignore them here.
205 | PHRASING_ELEMS: [
206 | // "CANVAS", "IFRAME", "SVG", "VIDEO",
207 | 'ABBR',
208 | 'AUDIO',
209 | 'B',
210 | 'BDO',
211 | 'BR',
212 | 'BUTTON',
213 | 'CITE',
214 | 'CODE',
215 | 'DATA',
216 | 'DATALIST',
217 | 'DFN',
218 | 'EM',
219 | 'EMBED',
220 | 'I',
221 | 'IMG',
222 | 'INPUT',
223 | 'KBD',
224 | 'LABEL',
225 | 'MARK',
226 | 'MATH',
227 | 'METER',
228 | 'NOSCRIPT',
229 | 'OBJECT',
230 | 'OUTPUT',
231 | 'PROGRESS',
232 | 'Q',
233 | 'RUBY',
234 | 'SAMP',
235 | 'SCRIPT',
236 | 'SELECT',
237 | 'SMALL',
238 | 'SPAN',
239 | 'STRONG',
240 | 'SUB',
241 | 'SUP',
242 | 'TEXTAREA',
243 | 'TIME',
244 | 'VAR',
245 | 'WBR',
246 | ],
247 |
248 | // These are the classes that readability sets itself.
249 | CLASSES_TO_PRESERVE: ['page'],
250 |
251 | // These are the list of HTML entities that need to be escaped.
252 | HTML_ESCAPE_MAP: {
253 | lt: '<',
254 | gt: '>',
255 | amp: '&',
256 | quot: '"',
257 | apos: "'",
258 | },
259 |
260 | /**
261 | * Run any post-process modifications to article content as necessary.
262 | *
263 | * @param Element
264 | * @return void
265 | **/
266 | _postProcessContent(articleContent) {
267 | // Readability cannot open relative uris so we convert them to absolute uris.
268 | this._fixRelativeUris(articleContent);
269 |
270 | this._simplifyNestedElements(articleContent);
271 |
272 | if (!this._keepClasses) {
273 | // Remove classes.
274 | this._cleanClasses(articleContent);
275 | }
276 | },
277 |
278 | /**
279 | * Iterates over a NodeList, calls `filterFn` for each node and removes node
280 | * if function returned `true`.
281 | *
282 | * If function is not passed, removes all the nodes in node list.
283 | *
284 | * @param NodeList nodeList The nodes to operate on
285 | * @param Function filterFn the function to use as a filter
286 | * @return void
287 | */
288 | _removeNodes(nodeList, filterFn) {
289 | // Avoid ever operating on live node lists.
290 | if (this._docJSDOMParser && nodeList._isLiveNodeList) {
291 | throw new Error('Do not pass live node lists to _removeNodes');
292 | }
293 | for (var i = nodeList.length - 1; i >= 0; i--) {
294 | var node = nodeList[i];
295 | var parentNode = node.parentNode;
296 | if (parentNode) {
297 | if (!filterFn || filterFn.call(this, node, i, nodeList)) {
298 | parentNode.removeChild(node);
299 | }
300 | }
301 | }
302 | },
303 |
304 | /**
305 | * Iterates over a NodeList, and calls _setNodeTag for each node.
306 | *
307 | * @param NodeList nodeList The nodes to operate on
308 | * @param String newTagName the new tag name to use
309 | * @return void
310 | */
311 | _replaceNodeTags(nodeList, newTagName) {
312 | // Avoid ever operating on live node lists.
313 | if (this._docJSDOMParser && nodeList._isLiveNodeList) {
314 | throw new Error('Do not pass live node lists to _replaceNodeTags');
315 | }
316 | for (const node of nodeList) {
317 | this._setNodeTag(node, newTagName);
318 | }
319 | },
320 |
321 | /**
322 | * Iterate over a NodeList, which doesn't natively fully implement the Array
323 | * interface.
324 | *
325 | * For convenience, the current object context is applied to the provided
326 | * iterate function.
327 | *
328 | * @param NodeList nodeList The NodeList.
329 | * @param Function fn The iterate function.
330 | * @return void
331 | */
332 | _forEachNode(nodeList, fn) {
333 | Array.prototype.forEach.call(nodeList, fn, this);
334 | },
335 |
336 | /**
337 | * Iterate over a NodeList, and return the first node that passes
338 | * the supplied test function
339 | *
340 | * For convenience, the current object context is applied to the provided
341 | * test function.
342 | *
343 | * @param NodeList nodeList The NodeList.
344 | * @param Function fn The test function.
345 | * @return void
346 | */
347 | _findNode(nodeList, fn) {
348 | return Array.prototype.find.call(nodeList, fn, this);
349 | },
350 |
351 | /**
352 | * Iterate over a NodeList, return true if any of the provided iterate
353 | * function calls returns true, false otherwise.
354 | *
355 | * For convenience, the current object context is applied to the
356 | * provided iterate function.
357 | *
358 | * @param NodeList nodeList The NodeList.
359 | * @param Function fn The iterate function.
360 | * @return Boolean
361 | */
362 | _someNode(nodeList, fn) {
363 | return Array.prototype.some.call(nodeList, fn, this);
364 | },
365 |
366 | /**
367 | * Iterate over a NodeList, return true if all of the provided iterate
368 | * function calls return true, false otherwise.
369 | *
370 | * For convenience, the current object context is applied to the
371 | * provided iterate function.
372 | *
373 | * @param NodeList nodeList The NodeList.
374 | * @param Function fn The iterate function.
375 | * @return Boolean
376 | */
377 | _everyNode(nodeList, fn) {
378 | return Array.prototype.every.call(nodeList, fn, this);
379 | },
380 |
381 | _getAllNodesWithTag(node, tagNames) {
382 | if (node.querySelectorAll) {
383 | return node.querySelectorAll(tagNames.join(','));
384 | }
385 | return [].concat.apply(
386 | [],
387 | tagNames.map(function (tag) {
388 | var collection = node.getElementsByTagName(tag);
389 | return Array.isArray(collection) ? collection : Array.from(collection);
390 | }),
391 | );
392 | },
393 |
394 | /**
395 | * Removes the class="" attribute from every element in the given
396 | * subtree, except those that match CLASSES_TO_PRESERVE and
397 | * the classesToPreserve array from the options object.
398 | *
399 | * @param Element
400 | * @return void
401 | */
402 | _cleanClasses(node) {
403 | var classesToPreserve = this._classesToPreserve;
404 | var className = (node.getAttribute('class') || '')
405 | .split(/\s+/)
406 | .filter((cls) => classesToPreserve.includes(cls))
407 | .join(' ');
408 |
409 | if (className) {
410 | node.setAttribute('class', className);
411 | } else {
412 | node.removeAttribute('class');
413 | }
414 |
415 | for (node = node.firstElementChild; node; node = node.nextElementSibling) {
416 | this._cleanClasses(node);
417 | }
418 | },
419 |
420 | /**
421 | * Tests whether a string is a URL or not.
422 | *
423 | * @param {string} str The string to test
424 | * @return {boolean} true if str is a URL, false if not
425 | */
426 | _isUrl(str) {
427 | try {
428 | new URL(str);
429 | return true;
430 | } catch {
431 | return false;
432 | }
433 | },
434 | /**
435 | * Converts each <a> and <img> uri in the given element to an absolute URI,
436 | * ignoring #ref URIs.
437 | *
438 | * @param Element
439 | * @return void
440 | */
441 | _fixRelativeUris(articleContent) {
442 | var baseURI = this._doc.baseURI;
443 | var documentURI = this._doc.documentURI;
444 | function toAbsoluteURI(uri) {
445 | // Leave hash links alone if the base URI matches the document URI:
446 | if (baseURI == documentURI && uri.charAt(0) == '#') {
447 | return uri;
448 | }
449 |
450 | // Otherwise, resolve against base URI:
451 | try {
452 | return new URL(uri, baseURI).href;
453 | } catch (ex) {
454 | // Something went wrong, just return the original:
455 | }
456 | return uri;
457 | }
458 |
459 | var links = this._getAllNodesWithTag(articleContent, ['a']);
460 | this._forEachNode(links, function (link) {
461 | var href = link.getAttribute('href');
462 | if (href) {
463 | // Remove links with javascript: URIs, since
464 | // they won't work after scripts have been removed from the page.
465 | if (href.indexOf('javascript:') === 0) {
466 | // if the link only contains simple text content, it can be converted to a text node
467 | if (link.childNodes.length === 1 && link.childNodes[0].nodeType === this.TEXT_NODE) {
468 | var text = this._doc.createTextNode(link.textContent);
469 | link.parentNode.replaceChild(text, link);
470 | } else {
471 | // if the link has multiple children, they should all be preserved
472 | var container = this._doc.createElement('span');
473 | while (link.firstChild) {
474 | container.appendChild(link.firstChild);
475 | }
476 | link.parentNode.replaceChild(container, link);
477 | }
478 | } else {
479 | link.setAttribute('href', toAbsoluteURI(href));
480 | }
481 | }
482 | });
483 |
484 | var medias = this._getAllNodesWithTag(articleContent, [
485 | 'img',
486 | 'picture',
487 | 'figure',
488 | 'video',
489 | 'audio',
490 | 'source',
491 | ]);
492 |
493 | this._forEachNode(medias, function (media) {
494 | var src = media.getAttribute('src');
495 | var poster = media.getAttribute('poster');
496 | var srcset = media.getAttribute('srcset');
497 |
498 | if (src) {
499 | media.setAttribute('src', toAbsoluteURI(src));
500 | }
501 |
502 | if (poster) {
503 | media.setAttribute('poster', toAbsoluteURI(poster));
504 | }
505 |
506 | if (srcset) {
507 | var newSrcset = srcset.replace(this.REGEXPS.srcsetUrl, function (_, p1, p2, p3) {
508 | return toAbsoluteURI(p1) + (p2 || '') + p3;
509 | });
510 |
511 | media.setAttribute('srcset', newSrcset);
512 | }
513 | });
514 | },
515 |
516 | _simplifyNestedElements(articleContent) {
517 | var node = articleContent;
518 |
519 | while (node) {
520 | if (
521 | node.parentNode &&
522 | ['DIV', 'SECTION'].includes(node.tagName) &&
523 | !(node.id && node.id.startsWith('readability'))
524 | ) {
525 | if (this._isElementWithoutContent(node)) {
526 | node = this._removeAndGetNext(node);
527 | continue;
528 | } else if (
529 | this._hasSingleTagInsideElement(node, 'DIV') ||
530 | this._hasSingleTagInsideElement(node, 'SECTION')
531 | ) {
532 | var child = node.children[0];
533 | for (var i = 0; i < node.attributes.length; i++) {
534 | child.setAttributeNode(node.attributes[i].cloneNode());
535 | }
536 | node.parentNode.replaceChild(child, node);
537 | node = child;
538 | continue;
539 | }
540 | }
541 |
542 | node = this._getNextNode(node);
543 | }
544 | },
545 |
546 | /**
547 | * Get the article title as an H1.
548 | *
549 | * @return string
550 | **/
551 | _getArticleTitle() {
552 | var doc = this._doc;
553 | var curTitle = '';
554 | var origTitle = '';
555 |
556 | try {
557 | curTitle = origTitle = doc.title.trim();
558 |
559 | // If they had an element with id "title" in their HTML
560 | if (typeof curTitle !== 'string') {
561 | curTitle = origTitle = this._getInnerText(doc.getElementsByTagName('title')[0]);
562 | }
563 | } catch (e) {
564 | /* ignore exceptions setting the title. */
565 | }
566 |
567 | var titleHadHierarchicalSeparators = false;
568 | function wordCount(str) {
569 | return str.split(/\s+/).length;
570 | }
571 |
572 | // If there's a separator in the title, first remove the final part
573 | if (/ [\|\-\\\/>»] /.test(curTitle)) {
574 | titleHadHierarchicalSeparators = / [\\\/>»] /.test(curTitle);
575 | let allSeparators = Array.from(origTitle.matchAll(/ [\|\-\\\/>»] /gi));
576 | curTitle = origTitle.substring(0, allSeparators.pop().index);
577 |
578 | // If the resulting title is too short, remove the first part instead:
579 | if (wordCount(curTitle) < 3) {
580 | curTitle = origTitle.replace(/^[^\|\-\\\/>»]*[\|\-\\\/>»]/gi, '');
581 | }
582 | } else if (curTitle.includes(': ')) {
583 | // Check if we have an heading containing this exact string, so we
584 | // could assume it's the full title.
585 | var headings = this._getAllNodesWithTag(doc, ['h1', 'h2']);
586 | var trimmedTitle = curTitle.trim();
587 | var match = this._someNode(headings, function (heading) {
588 | return heading.textContent.trim() === trimmedTitle;
589 | });
590 |
591 | // If we don't, let's extract the title out of the original title string.
592 | if (!match) {
593 | curTitle = origTitle.substring(origTitle.lastIndexOf(':') + 1);
594 |
595 | // If the title is now too short, try the first colon instead:
596 | if (wordCount(curTitle) < 3) {
597 | curTitle = origTitle.substring(origTitle.indexOf(':') + 1);
598 | // But if we have too many words before the colon there's something weird
599 | // with the titles and the H tags so let's just use the original title instead
600 | } else if (wordCount(origTitle.substr(0, origTitle.indexOf(':'))) > 5) {
601 | curTitle = origTitle;
602 | }
603 | }
604 | } else if (curTitle.length > 150 || curTitle.length < 15) {
605 | var hOnes = doc.getElementsByTagName('h1');
606 |
607 | if (hOnes.length === 1) {
608 | curTitle = this._getInnerText(hOnes[0]);
609 | }
610 | }
611 |
612 | curTitle = curTitle.trim().replace(this.REGEXPS.normalize, ' ');
613 | // If we now have 4 words or fewer as our title, and either no
614 | // 'hierarchical' separators (\, /, > or ») were found in the original
615 | // title or we decreased the number of words by more than 1 word, use
616 | // the original title.
617 | var curTitleWordCount = wordCount(curTitle);
618 | if (
619 | curTitleWordCount <= 4 &&
620 | (!titleHadHierarchicalSeparators ||
621 | curTitleWordCount != wordCount(origTitle.replace(/[\|\-\\\/>»]+/g, '')) - 1)
622 | ) {
623 | curTitle = origTitle;
624 | }
625 |
626 | return curTitle;
627 | },
628 |
629 | /**
630 | * Prepare the HTML document for readability to scrape it.
631 | * This includes things like stripping javascript, CSS, and handling terrible markup.
632 | *
633 | * @return void
634 | **/
635 | _prepDocument() {
636 | var doc = this._doc;
637 |
638 | // Remove all style tags in head
639 | this._removeNodes(this._getAllNodesWithTag(doc, ['style']));
640 |
641 | if (doc.body) {
642 | this._replaceBrs(doc.body);
643 | }
644 |
645 | this._replaceNodeTags(this._getAllNodesWithTag(doc, ['font']), 'SPAN');
646 | },
647 |
648 | /**
649 | * Finds the next node, starting from the given node, and ignoring
650 | * whitespace in between. If the given node is an element, the same node is
651 | * returned.
652 | */
653 | _nextNode(node) {
654 | var next = node;
655 | while (
656 | next &&
657 | next.nodeType != this.ELEMENT_NODE &&
658 | this.REGEXPS.whitespace.test(next.textContent)
659 | ) {
660 | next = next.nextSibling;
661 | }
662 | return next;
663 | },
664 |
665 | /**
666 | * Replaces 2 or more successive <br> elements with a single <p>.
667 | * Whitespace between <br> elements are ignored. For example:
668 | * <div>foo<br>bar<br> <br><br>abc</div>
669 | * will become:
670 | * <div>foo<br>bar<p>abc</p></div>
671 | */
672 | _replaceBrs(elem) {
673 | this._forEachNode(this._getAllNodesWithTag(elem, ['br']), function (br) {
674 | var next = br.nextSibling;
675 |
676 | // Whether 2 or more <br> elements have been found and replaced with a
677 | // <p> block.
678 | var replaced = false;
679 |
680 | // If we find a <br> chain, remove the <br>s until we hit another node
681 | // or non-whitespace. This leaves behind the first <br> in the chain
682 | // (which will be replaced with a <p> later).
683 | while ((next = this._nextNode(next)) && next.tagName == 'BR') {
684 | replaced = true;
685 | var brSibling = next.nextSibling;
686 | next.remove();
687 | next = brSibling;
688 | }
689 |
690 | // If we removed a <br> chain, replace the remaining <br> with a <p>. Add
691 | // all sibling nodes as children of the <p> until we hit another <br>
692 | // chain.
693 | if (replaced) {
694 | var p = this._doc.createElement('p');
695 | br.parentNode.replaceChild(p, br);
696 |
697 | next = p.nextSibling;
698 | while (next) {
699 | // If we've hit another <br><br>, we're done adding children to this <p>.
700 | if (next.tagName == 'BR') {
701 | var nextElem = this._nextNode(next.nextSibling);
702 | if (nextElem && nextElem.tagName == 'BR') {
703 | break;
704 | }
705 | }
706 |
707 | if (!this._isPhrasingContent(next)) {
708 | break;
709 | }
710 |
711 | // Otherwise, make this node a child of the new <p>.
712 | var sibling = next.nextSibling;
713 | p.appendChild(next);
714 | next = sibling;
715 | }
716 |
717 | while (p.lastChild && this._isWhitespace(p.lastChild)) {
718 | p.lastChild.remove();
719 | }
720 |
721 | if (p.parentNode.tagName === 'P') {
722 | this._setNodeTag(p.parentNode, 'DIV');
723 | }
724 | }
725 | });
726 | },
727 |
728 | _setNodeTag(node, tag) {
729 | this.log('_setNodeTag', node, tag);
730 | if (this._docJSDOMParser) {
731 | node.localName = tag.toLowerCase();
732 | node.tagName = tag.toUpperCase();
733 | return node;
734 | }
735 |
736 | var replacement = node.ownerDocument.createElement(tag);
737 | while (node.firstChild) {
738 | replacement.appendChild(node.firstChild);
739 | }
740 | node.parentNode.replaceChild(replacement, node);
741 | if (node.readability) {
742 | replacement.readability = node.readability;
743 | }
744 |
745 | for (var i = 0; i < node.attributes.length; i++) {
746 | replacement.setAttributeNode(node.attributes[i].cloneNode());
747 | }
748 | return replacement;
749 | },
750 |
751 | /**
752 | * Prepare the article node for display. Clean out any inline styles,
753 | * iframes, forms, strip extraneous <p> tags, etc.
754 | *
755 | * @param Element
756 | * @return void
757 | **/
758 | _prepArticle(articleContent) {
759 | this._cleanStyles(articleContent);
760 |
761 | // Check for data tables before we continue, to avoid removing items in
762 | // those tables, which will often be isolated even though they're
763 | // visually linked to other content-ful elements (text, images, etc.).
764 | this._markDataTables(articleContent);
765 |
766 | this._fixLazyImages(articleContent);
767 |
768 | // Clean out junk from the article content
769 | this._cleanConditionally(articleContent, 'form');
770 | this._cleanConditionally(articleContent, 'fieldset');
771 | this._clean(articleContent, 'object');
772 | this._clean(articleContent, 'embed');
773 | this._clean(articleContent, 'footer');
774 | this._clean(articleContent, 'link');
775 | this._clean(articleContent, 'aside');
776 |
777 | // Clean out elements with little content that have "share" in their id/class combinations from final top candidates,
778 | // which means we don't remove the top candidates even they have "share".
779 |
780 | var shareElementThreshold = this.DEFAULT_CHAR_THRESHOLD;
781 |
782 | this._forEachNode(articleContent.children, function (topCandidate) {
783 | this._cleanMatchedNodes(topCandidate, function (node, matchString) {
784 | return (
785 | this.REGEXPS.shareElements.test(matchString) &&
786 | node.textContent.length < shareElementThreshold
787 | );
788 | });
789 | });
790 |
791 | this._clean(articleContent, 'iframe');
792 | this._clean(articleContent, 'input');
793 | this._clean(articleContent, 'textarea');
794 | this._clean(articleContent, 'select');
795 | this._clean(articleContent, 'button');
796 | this._cleanHeaders(articleContent);
797 |
798 | // Do these last as the previous stuff may have removed junk
799 | // that will affect these
800 | this._cleanConditionally(articleContent, 'table');
801 | this._cleanConditionally(articleContent, 'ul');
802 | this._cleanConditionally(articleContent, 'div');
803 |
804 | // replace H1 with H2 as H1 should be only title that is displayed separately
805 | this._replaceNodeTags(this._getAllNodesWithTag(articleContent, ['h1']), 'h2');
806 |
807 | // Remove extra paragraphs
808 | this._removeNodes(this._getAllNodesWithTag(articleContent, ['p']), function (paragraph) {
809 | // At this point, nasty iframes have been removed; only embedded video
810 | // ones remain.
811 | var contentElementCount = this._getAllNodesWithTag(paragraph, [
812 | 'img',
813 | 'embed',
814 | 'object',
815 | 'iframe',
816 | ]).length;
817 | return contentElementCount === 0 && !this._getInnerText(paragraph, false);
818 | });
819 |
820 | this._forEachNode(this._getAllNodesWithTag(articleContent, ['br']), function (br) {
821 | var next = this._nextNode(br.nextSibling);
822 | if (next && next.tagName == 'P') {
823 | br.remove();
824 | }
825 | });
826 |
827 | // Remove single-cell tables
828 | this._forEachNode(this._getAllNodesWithTag(articleContent, ['table']), function (table) {
829 | var tbody = this._hasSingleTagInsideElement(table, 'TBODY')
830 | ? table.firstElementChild
831 | : table;
832 | if (this._hasSingleTagInsideElement(tbody, 'TR')) {
833 | var row = tbody.firstElementChild;
834 | if (this._hasSingleTagInsideElement(row, 'TD')) {
835 | var cell = row.firstElementChild;
836 | cell = this._setNodeTag(
837 | cell,
838 | this._everyNode(cell.childNodes, this._isPhrasingContent) ? 'P' : 'DIV',
839 | );
840 | table.parentNode.replaceChild(cell, table);
841 | }
842 | }
843 | });
844 | },
845 |
846 | /**
847 | * Initialize a node with the readability object. Also checks the
848 | * className/id for special names to add to its score.
849 | *
850 | * @param Element
851 | * @return void
852 | **/
853 | _initializeNode(node) {
854 | node.readability = { contentScore: 0 };
855 |
856 | switch (node.tagName) {
857 | case 'DIV':
858 | node.readability.contentScore += 5;
859 | break;
860 |
861 | case 'PRE':
862 | case 'TD':
863 | case 'BLOCKQUOTE':
864 | node.readability.contentScore += 3;
865 | break;
866 |
867 | case 'ADDRESS':
868 | case 'OL':
869 | case 'UL':
870 | case 'DL':
871 | case 'DD':
872 | case 'DT':
873 | case 'LI':
874 | case 'FORM':
875 | node.readability.contentScore -= 3;
876 | break;
877 |
878 | case 'H1':
879 | case 'H2':
880 | case 'H3':
881 | case 'H4':
882 | case 'H5':
883 | case 'H6':
884 | case 'TH':
885 | node.readability.contentScore -= 5;
886 | break;
887 | }
888 |
889 | node.readability.contentScore += this._getClassWeight(node);
890 | },
891 |
892 | _removeAndGetNext(node) {
893 | var nextNode = this._getNextNode(node, true);
894 | node.remove();
895 | return nextNode;
896 | },
897 |
898 | /**
899 | * Traverse the DOM from node to node, starting at the node passed in.
900 | * Pass true for the second parameter to indicate this node itself
901 | * (and its kids) are going away, and we want the next node over.
902 | *
903 | * Calling this in a loop will traverse the DOM depth-first.
904 | *
905 | * @param {Element} node
906 | * @param {boolean} ignoreSelfAndKids
907 | * @return {Element}
908 | */
909 | _getNextNode(node, ignoreSelfAndKids) {
910 | // First check for kids if those aren't being ignored
911 | if (!ignoreSelfAndKids && node.firstElementChild) {
912 | return node.firstElementChild;
913 | }
914 | // Then for siblings...
915 | if (node.nextElementSibling) {
916 | return node.nextElementSibling;
917 | }
918 | // And finally, move up the parent chain *and* find a sibling
919 | // (because this is depth-first traversal, we will have already
920 | // seen the parent nodes themselves).
921 | do {
922 | node = node.parentNode;
923 | } while (node && !node.nextElementSibling);
924 | return node && node.nextElementSibling;
925 | },
926 |
927 | // compares second text to first one
928 | // 1 = same text, 0 = completely different text
929 | // works the way that it splits both texts into words and then finds words that are unique in second text
930 | // the result is given by the lower length of unique parts
931 | _textSimilarity(textA, textB) {
932 | var tokensA = textA.toLowerCase().split(this.REGEXPS.tokenize).filter(Boolean);
933 | var tokensB = textB.toLowerCase().split(this.REGEXPS.tokenize).filter(Boolean);
934 | if (!tokensA.length || !tokensB.length) {
935 | return 0;
936 | }
937 | var uniqTokensB = tokensB.filter((token) => !tokensA.includes(token));
938 | var distanceB = uniqTokensB.join(' ').length / tokensB.join(' ').length;
939 | return 1 - distanceB;
940 | },
941 |
942 | /**
943 | * Checks whether an element node contains a valid byline
944 | *
945 | * @param node {Element}
946 | * @param matchString {string}
947 | * @return boolean
948 | */
949 | _isValidByline(node, matchString) {
950 | var rel = node.getAttribute('rel');
951 | var itemprop = node.getAttribute('itemprop');
952 | var bylineLength = node.textContent.trim().length;
953 |
954 | return (
955 | (rel === 'author' ||
956 | (itemprop && itemprop.includes('author')) ||
957 | this.REGEXPS.byline.test(matchString)) &&
958 | !!bylineLength &&
959 | bylineLength < 100
960 | );
961 | },
962 |
963 | _getNodeAncestors(node, maxDepth) {
964 | maxDepth = maxDepth || 0;
965 | var i = 0,
966 | ancestors = [];
967 | while (node.parentNode) {
968 | ancestors.push(node.parentNode);
969 | if (maxDepth && ++i === maxDepth) {
970 | break;
971 | }
972 | node = node.parentNode;
973 | }
974 | return ancestors;
975 | },
976 |
/***
 * grabArticle - Using a variety of metrics (content score, classname, element types), find the content that is
 * most likely to be the stuff a user wants to read. Then return it wrapped up in a div.
 *
 * The whole extraction runs inside a retry loop. One pass:
 *   1. walks the document, removing hidden/unlikely/empty nodes, capturing a byline, and turning
 *      DIVs that only hold phrasing content into P elements;
 *   2. scores the collected elements and propagates the score to up to 5 ancestors;
 *   3. picks a top candidate (or wraps the whole page in a new DIV when there is none);
 *   4. moves the top candidate and its qualifying siblings into a new container and runs
 *      `_prepArticle` on it.
 * If the resulting text is shorter than `this._charThreshold`, `page.innerHTML` is restored from a
 * cached copy, one of FLAG_STRIP_UNLIKELYS, FLAG_WEIGHT_CLASSES, FLAG_CLEAN_CONDITIONALLY is removed
 * (in that order) and the pass is repeated. When no flag is left, the attempt with the longest text wins.
 *
 * Side effects: mutates the document, and may set `this._articleLang`, `this._articleByline`,
 * `this._articleDir` and append to `this._attempts`.
 *
 * @param page the element to extract from; `this._doc.body` is used when this is falsy.
 * @return Element the article container, or null when there is no body or no attempt produced any text.
 **/
_grabArticle(page) {
  this.log('**** grabArticle ****');
  var doc = this._doc;
  // NOTE(review): only an explicit `null` disables paging here; calling with no argument
  // (undefined) leaves isPaging true. Confirm this is intended.
  var isPaging = page !== null;
  page = page ? page : this._doc.body;

  // We can't grab an article if we don't have a page!
  if (!page) {
    this.log('No body found in document. Abort.');
    return null;
  }

  // Snapshot of the untouched markup, used to roll the page back before a retry.
  var pageCacheHtml = page.innerHTML;

  while (true) {
    this.log('Starting grabArticle loop');
    var stripUnlikelyCandidates = this._flagIsActive(this.FLAG_STRIP_UNLIKELYS);

    // First, node prepping. Trash nodes that look cruddy (like ones with the
    // class name "comment", etc), and turn divs into P tags where they have been
    // used inappropriately (as in, where they contain no other block level elements.)
    var elementsToScore = [];
    var node = this._doc.documentElement;

    // Only the first header that duplicates the title is removed in each pass.
    let shouldRemoveTitleHeader = true;

    // Every branch below that removes `node` advances via _removeAndGetNext and
    // `continue`s; otherwise the walk advances at the bottom of the loop.
    while (node) {
      if (node.tagName === 'HTML') {
        this._articleLang = node.getAttribute('lang');
      }

      var matchString = node.className + ' ' + node.id;

      if (!this._isProbablyVisible(node)) {
        this.log('Removing hidden node - ' + matchString);
        node = this._removeAndGetNext(node);
        continue;
      }

      // User is not able to see elements applied with both "aria-modal = true" and "role = dialog"
      if (node.getAttribute('aria-modal') == 'true' && node.getAttribute('role') == 'dialog') {
        node = this._removeAndGetNext(node);
        continue;
      }

      // If we don't have a byline yet check to see if this node is a byline; if it is store the byline and remove the node.
      if (
        !this._articleByline &&
        !this._metadata.byline &&
        this._isValidByline(node, matchString)
      ) {
        // Find child node matching [itemprop="name"] and use that if it exists for a more accurate author name byline
        // The search is bounded by the first node that lies outside this node's subtree.
        var endOfSearchMarkerNode = this._getNextNode(node, true);
        var next = this._getNextNode(node);
        var itemPropNameNode = null;
        while (next && next != endOfSearchMarkerNode) {
          var itemprop = next.getAttribute('itemprop');
          if (itemprop && itemprop.includes('name')) {
            itemPropNameNode = next;
            break;
          } else {
            next = this._getNextNode(next);
          }
        }
        this._articleByline = (itemPropNameNode ?? node).textContent.trim();
        node = this._removeAndGetNext(node);
        continue;
      }

      if (shouldRemoveTitleHeader && this._headerDuplicatesTitle(node)) {
        this.log('Removing header: ', node.textContent.trim(), this._articleTitle.trim());
        shouldRemoveTitleHeader = false;
        node = this._removeAndGetNext(node);
        continue;
      }

      // Remove unlikely candidates
      if (stripUnlikelyCandidates) {
        if (
          this.REGEXPS.unlikelyCandidates.test(matchString) &&
          !this.REGEXPS.okMaybeItsACandidate.test(matchString) &&
          !this._hasAncestorTag(node, 'table') &&
          !this._hasAncestorTag(node, 'code') &&
          node.tagName !== 'BODY' &&
          node.tagName !== 'A'
        ) {
          this.log('Removing unlikely candidate - ' + matchString);
          node = this._removeAndGetNext(node);
          continue;
        }

        if (this.UNLIKELY_ROLES.includes(node.getAttribute('role'))) {
          this.log(
            'Removing content with role ' + node.getAttribute('role') + ' - ' + matchString,
          );
          node = this._removeAndGetNext(node);
          continue;
        }
      }

      // Remove DIV, SECTION, HEADER and H1-H6 nodes without any content (e.g. text, image, video, or iframe).
      if (
        (node.tagName === 'DIV' ||
          node.tagName === 'SECTION' ||
          node.tagName === 'HEADER' ||
          node.tagName === 'H1' ||
          node.tagName === 'H2' ||
          node.tagName === 'H3' ||
          node.tagName === 'H4' ||
          node.tagName === 'H5' ||
          node.tagName === 'H6') &&
        this._isElementWithoutContent(node)
      ) {
        node = this._removeAndGetNext(node);
        continue;
      }

      if (this.DEFAULT_TAGS_TO_SCORE.includes(node.tagName)) {
        elementsToScore.push(node);
      }

      // Turn all divs that don't have children block level elements into p's
      if (node.tagName === 'DIV') {
        // Put phrasing content into paragraphs.
        // `p` is the paragraph currently being filled; a non-phrasing child closes it.
        var p = null;
        var childNode = node.firstChild;
        while (childNode) {
          // Captured first because childNode may be moved into `p` below.
          var nextSibling = childNode.nextSibling;
          if (this._isPhrasingContent(childNode)) {
            if (p !== null) {
              p.appendChild(childNode);
            } else if (!this._isWhitespace(childNode)) {
              p = doc.createElement('p');
              node.replaceChild(p, childNode);
              p.appendChild(childNode);
            }
          } else if (p !== null) {
            // Trim trailing whitespace from the paragraph that just ended.
            while (p.lastChild && this._isWhitespace(p.lastChild)) {
              p.lastChild.remove();
            }
            p = null;
          }
          childNode = nextSibling;
        }

        // Sites like http://mobile.slate.com enclose each paragraph with a DIV
        // element. DIVs with only a P element inside and no text content can be
        // safely converted into plain P elements to avoid confusing the scoring
        // algorithm with DIVs which are, in practice, paragraphs.
        if (this._hasSingleTagInsideElement(node, 'P') && this._getLinkDensity(node) < 0.25) {
          var newNode = node.children[0];
          node.parentNode.replaceChild(newNode, node);
          node = newNode;
          elementsToScore.push(node);
        } else if (!this._hasChildBlockElement(node)) {
          node = this._setNodeTag(node, 'P');
          elementsToScore.push(node);
        }
      }
      node = this._getNextNode(node);
    }

    /**
     * Loop through all paragraphs, and assign a score to them based on how content-y they look.
     * Then add their score to their parent node.
     *
     * A score is determined by things like number of commas, class names, etc. Maybe eventually link density.
     **/
    var candidates = [];
    this._forEachNode(elementsToScore, function (elementToScore) {
      // Skip elements that were detached (or hang off a non-element parent) during prepping.
      if (
        !elementToScore.parentNode ||
        typeof elementToScore.parentNode.tagName === 'undefined'
      ) {
        return;
      }

      // If this paragraph is less than 25 characters, don't even count it.
      var innerText = this._getInnerText(elementToScore);
      if (innerText.length < 25) {
        return;
      }

      // Exclude nodes with no ancestor.
      var ancestors = this._getNodeAncestors(elementToScore, 5);
      if (ancestors.length === 0) {
        return;
      }

      var contentScore = 0;

      // Add a point for the paragraph itself as a base.
      contentScore += 1;

      // Add points for any commas within this paragraph.
      // (split() yields one more piece than there are separators.)
      contentScore += innerText.split(this.REGEXPS.commas).length;

      // For every 100 characters in this paragraph, add another point. Up to 3 points.
      contentScore += Math.min(Math.floor(innerText.length / 100), 3);

      // Initialize and score ancestors.
      this._forEachNode(ancestors, function (ancestor, level) {
        if (
          !ancestor.tagName ||
          !ancestor.parentNode ||
          typeof ancestor.parentNode.tagName === 'undefined'
        ) {
          return;
        }

        // First time this ancestor is seen: give it a base score and register it as a candidate.
        if (typeof ancestor.readability === 'undefined') {
          this._initializeNode(ancestor);
          candidates.push(ancestor);
        }

        // Node score divider:
        // - parent:             1 (no division)
        // - grandparent:        2
        // - great grandparent+: ancestor level * 3
        if (level === 0) {
          var scoreDivider = 1;
        } else if (level === 1) {
          scoreDivider = 2;
        } else {
          scoreDivider = level * 3;
        }
        ancestor.readability.contentScore += contentScore / scoreDivider;
      });
    });

    // After we've calculated scores, loop through all of the possible
    // candidate nodes we found and find the one with the highest score.
    // topCandidates is kept sorted by descending score and capped at _nbTopCandidates entries.
    var topCandidates = [];
    for (var c = 0, cl = candidates.length; c < cl; c += 1) {
      var candidate = candidates[c];

      // Scale the final candidates score based on link density. Good content
      // should have a relatively small link density (5% or less) and be mostly
      // unaffected by this operation.
      var candidateScore =
        candidate.readability.contentScore * (1 - this._getLinkDensity(candidate));
      candidate.readability.contentScore = candidateScore;

      this.log('Candidate:', candidate, 'with score ' + candidateScore);

      for (var t = 0; t < this._nbTopCandidates; t++) {
        var aTopCandidate = topCandidates[t];

        if (!aTopCandidate || candidateScore > aTopCandidate.readability.contentScore) {
          topCandidates.splice(t, 0, candidate);
          if (topCandidates.length > this._nbTopCandidates) {
            topCandidates.pop();
          }
          break;
        }
      }
    }

    var topCandidate = topCandidates[0] || null;
    var neededToCreateTopCandidate = false;
    var parentOfTopCandidate;

    // If we still have no top candidate, just use the body as a last resort.
    // We also have to copy the body node so it is something we can modify.
    if (topCandidate === null || topCandidate.tagName === 'BODY') {
      // Move all of the page's children into topCandidate
      topCandidate = doc.createElement('DIV');
      neededToCreateTopCandidate = true;
      // Move everything (not just elements, also text nodes etc.) into the container
      // so we even include text directly in the body:
      while (page.firstChild) {
        this.log('Moving child out:', page.firstChild);
        topCandidate.appendChild(page.firstChild);
      }

      page.appendChild(topCandidate);

      this._initializeNode(topCandidate);
    } else if (topCandidate) {
      // Find a better top candidate node if it contains (at least three) nodes which belong to `topCandidates` array
      // and whose scores are quite close to the current `topCandidate` node.
      var alternativeCandidateAncestors = [];
      for (var i = 1; i < topCandidates.length; i++) {
        if (
          topCandidates[i].readability.contentScore / topCandidate.readability.contentScore >=
          0.75
        ) {
          alternativeCandidateAncestors.push(this._getNodeAncestors(topCandidates[i]));
        }
      }
      var MINIMUM_TOPCANDIDATES = 3;
      if (alternativeCandidateAncestors.length >= MINIMUM_TOPCANDIDATES) {
        // Climb from the top candidate and stop at the first ancestor shared by
        // at least MINIMUM_TOPCANDIDATES of the close-scoring alternatives.
        parentOfTopCandidate = topCandidate.parentNode;
        while (parentOfTopCandidate && parentOfTopCandidate.tagName !== 'BODY') {
          var listsContainingThisAncestor = 0;
          for (
            var ancestorIndex = 0;
            ancestorIndex < alternativeCandidateAncestors.length &&
            listsContainingThisAncestor < MINIMUM_TOPCANDIDATES;
            ancestorIndex++
          ) {
            listsContainingThisAncestor += Number(
              alternativeCandidateAncestors[ancestorIndex].includes(parentOfTopCandidate),
            );
          }
          if (listsContainingThisAncestor >= MINIMUM_TOPCANDIDATES) {
            topCandidate = parentOfTopCandidate;
            break;
          }
          parentOfTopCandidate = parentOfTopCandidate.parentNode;
        }
      }
      if (!topCandidate.readability) {
        this._initializeNode(topCandidate);
      }

      // Because of our bonus system, parents of candidates might have scores
      // themselves. They get half of the node. There won't be nodes with higher
      // scores than our topCandidate, but if we see the score going *up* in the first
      // few steps up the tree, that's a decent sign that there might be more content
      // lurking in other places that we want to unify in. The sibling stuff
      // below does some of that - but only if we've looked high enough up the DOM
      // tree.
      parentOfTopCandidate = topCandidate.parentNode;
      var lastScore = topCandidate.readability.contentScore;
      // The scores shouldn't get too low.
      var scoreThreshold = lastScore / 3;
      while (parentOfTopCandidate && parentOfTopCandidate.tagName !== 'BODY') {
        // Unscored ancestors are skipped without affecting lastScore.
        if (!parentOfTopCandidate.readability) {
          parentOfTopCandidate = parentOfTopCandidate.parentNode;
          continue;
        }
        var parentScore = parentOfTopCandidate.readability.contentScore;
        if (parentScore < scoreThreshold) {
          break;
        }
        if (parentScore > lastScore) {
          // Alright! We found a better parent to use.
          topCandidate = parentOfTopCandidate;
          break;
        }
        lastScore = parentOfTopCandidate.readability.contentScore;
        parentOfTopCandidate = parentOfTopCandidate.parentNode;
      }

      // If the top candidate is the only child, use parent instead. This will help sibling
      // joining logic when adjacent content is actually located in parent's sibling node.
      parentOfTopCandidate = topCandidate.parentNode;
      while (
        parentOfTopCandidate &&
        parentOfTopCandidate.tagName != 'BODY' &&
        parentOfTopCandidate.children.length == 1
      ) {
        topCandidate = parentOfTopCandidate;
        parentOfTopCandidate = topCandidate.parentNode;
      }
      if (!topCandidate.readability) {
        this._initializeNode(topCandidate);
      }
    }

    // Now that we have the top candidate, look through its siblings for content
    // that might also be related. Things like preambles, content split by ads
    // that we removed, etc.
    var articleContent = doc.createElement('DIV');
    if (isPaging) {
      articleContent.id = 'readability-content';
    }

    var siblingScoreThreshold = Math.max(10, topCandidate.readability.contentScore * 0.2);
    // Keep potential top candidate's parent node to try to get text direction of it later.
    parentOfTopCandidate = topCandidate.parentNode;
    var siblings = parentOfTopCandidate.children;

    for (var s = 0, sl = siblings.length; s < sl; s++) {
      var sibling = siblings[s];
      var append = false;

      this.log(
        'Looking at sibling node:',
        sibling,
        sibling.readability ? 'with score ' + sibling.readability.contentScore : '',
      );
      this.log(
        'Sibling has score',
        sibling.readability ? sibling.readability.contentScore : 'Unknown',
      );

      if (sibling === topCandidate) {
        append = true;
      } else {
        var contentBonus = 0;

        // Give a bonus if sibling nodes and top candidates have the exact same classname
        if (sibling.className === topCandidate.className && topCandidate.className !== '') {
          contentBonus += topCandidate.readability.contentScore * 0.2;
        }

        if (
          sibling.readability &&
          sibling.readability.contentScore + contentBonus >= siblingScoreThreshold
        ) {
          append = true;
        } else if (sibling.nodeName === 'P') {
          // Unscored (or low-scoring) paragraphs are kept when they are long with few
          // links, or short, link-free and contain a sentence-ending period.
          var linkDensity = this._getLinkDensity(sibling);
          var nodeContent = this._getInnerText(sibling);
          var nodeLength = nodeContent.length;

          if (nodeLength > 80 && linkDensity < 0.25) {
            append = true;
          } else if (
            nodeLength < 80 &&
            nodeLength > 0 &&
            linkDensity === 0 &&
            nodeContent.search(/\.( |$)/) !== -1
          ) {
            append = true;
          }
        }
      }

      if (append) {
        this.log('Appending node:', sibling);

        if (!this.ALTER_TO_DIV_EXCEPTIONS.includes(sibling.nodeName)) {
          // We have a node that isn't a common block level element, like a form or td tag.
          // Turn it into a div so it doesn't get filtered out later by accident.
          this.log('Altering sibling:', sibling, 'to div.');

          sibling = this._setNodeTag(sibling, 'DIV');
        }

        articleContent.appendChild(sibling);
        // Fetch children again to make it compatible
        // with DOM parsers without live collection support.
        siblings = parentOfTopCandidate.children;
        // siblings is a reference to the children array, and
        // sibling is removed from the array when we call appendChild().
        // As a result, we must revisit this index since the nodes
        // have been shifted.
        s -= 1;
        sl -= 1;
      }
    }

    if (this._debug) {
      this.log('Article content pre-prep: ' + articleContent.innerHTML);
    }
    // So we have all of the content that we need. Now we clean it up for presentation.
    this._prepArticle(articleContent);
    if (this._debug) {
      this.log('Article content post-prep: ' + articleContent.innerHTML);
    }

    if (neededToCreateTopCandidate) {
      // We already created a fake div thing, and there wouldn't have been any siblings left
      // for the previous loop, so there's no point trying to create a new div, and then
      // move all the children over. Just assign IDs and class names here. No need to append
      // because that already happened anyway.
      topCandidate.id = 'readability-page-1';
      topCandidate.className = 'page';
    } else {
      // Wrap everything collected so far in a single page container.
      var div = doc.createElement('DIV');
      div.id = 'readability-page-1';
      div.className = 'page';
      while (articleContent.firstChild) {
        div.appendChild(articleContent.firstChild);
      }
      articleContent.appendChild(div);
    }

    if (this._debug) {
      this.log('Article content after paging: ' + articleContent.innerHTML);
    }

    var parseSuccessful = true;

    // Now that we've gone through the full algorithm, check to see if
    // we got any meaningful content. If we didn't, we may need to re-run
    // grabArticle with different flags set. This gives us a higher likelihood of
    // finding the content, and the sieve approach gives us a higher likelihood of
    // finding the -right- content.
    var textLength = this._getInnerText(articleContent, true).length;
    if (textLength < this._charThreshold) {
      parseSuccessful = false;
      // Roll the page back to its original markup before the next pass.
      // eslint-disable-next-line no-unsanitized/property
      page.innerHTML = pageCacheHtml;

      // Remember this attempt so the best one can be returned if every pass falls short.
      this._attempts.push({
        articleContent,
        textLength,
      });

      // Relax exactly one flag per retry, in a fixed order.
      if (this._flagIsActive(this.FLAG_STRIP_UNLIKELYS)) {
        this._removeFlag(this.FLAG_STRIP_UNLIKELYS);
      } else if (this._flagIsActive(this.FLAG_WEIGHT_CLASSES)) {
        this._removeFlag(this.FLAG_WEIGHT_CLASSES);
      } else if (this._flagIsActive(this.FLAG_CLEAN_CONDITIONALLY)) {
        this._removeFlag(this.FLAG_CLEAN_CONDITIONALLY);
      } else {
        // No luck after removing flags, just return the longest text we found during the different loops
        this._attempts.sort(function (a, b) {
          return b.textLength - a.textLength;
        });

        // But first check if we actually have something
        if (!this._attempts[0].textLength) {
          return null;
        }

        // NOTE(review): topCandidate / parentOfTopCandidate used for the text-direction
        // lookup below still belong to the most recent pass, which is not necessarily
        // the attempt selected here.
        articleContent = this._attempts[0].articleContent;
        parseSuccessful = true;
      }
    }

    if (parseSuccessful) {
      // Find out text direction from ancestors of final top candidate.
      // The first node in this list that carries a `dir` attribute wins.
      var ancestors = [parentOfTopCandidate, topCandidate].concat(
        this._getNodeAncestors(parentOfTopCandidate),
      );
      this._someNode(ancestors, function (ancestor) {
        if (!ancestor.tagName) {
          return false;
        }
        var articleDir = ancestor.getAttribute('dir');
        if (articleDir) {
          this._articleDir = articleDir;
          return true;
        }
        return false;
      });
      return articleContent;
    }
  }
},
1520 |
1521 | /**
1522 | * Converts some of the common HTML entities in string to their corresponding characters.
1523 | *
1524 | * @param str {string} - a string to unescape.
1525 | * @return string without HTML entity.
1526 | */
1527 | _unescapeHtmlEntities(str) {
1528 | if (!str) {
1529 | return str;
1530 | }
1531 |
1532 | var htmlEscapeMap = this.HTML_ESCAPE_MAP;
1533 | return str
1534 | .replace(/&(quot|amp|apos|lt|gt);/g, function (_, tag) {
1535 | return htmlEscapeMap[tag];
1536 | })
1537 | .replace(/&#(?:x([0-9a-f]+)|([0-9]+));/gi, function (_, hex, numStr) {
1538 | var num = parseInt(hex || numStr, hex ? 16 : 10);
1539 |
1540 | // these character references are replaced by a conforming HTML parser
1541 | if (num == 0 || num > 0x10ffff || (num >= 0xd800 && num <= 0xdfff)) {
1542 | num = 0xfffd;
1543 | }
1544 |
1545 | return String.fromCodePoint(num);
1546 | });
1547 | },
1548 |
1549 | /**
1550 | * Try to extract metadata from JSON-LD object.
1551 | * For now, only Schema.org objects of type Article or its subtypes are supported.
1552 | * @return Object with any metadata that could be extracted (possibly none)
1553 | */
1554 | _getJSONLD(doc) {
1555 | var scripts = this._getAllNodesWithTag(doc, ['script']);
1556 |
1557 | var metadata;
1558 |
1559 | this._forEachNode(scripts, function (jsonLdElement) {
1560 | if (!metadata && jsonLdElement.getAttribute('type') === 'application/ld+json') {
1561 | try {
1562 | // Strip CDATA markers if present
1563 | var content = jsonLdElement.textContent.replace(/^\s*<!\[CDATA\[|\]\]>\s*$/g, '');
1564 | var parsed = JSON.parse(content);
1565 |
1566 | if (Array.isArray(parsed)) {
1567 | parsed = parsed.find((it) => {
1568 | return it['@type'] && it['@type'].match(this.REGEXPS.jsonLdArticleTypes);
1569 | });
1570 | if (!parsed) {
1571 | return;
1572 | }
1573 | }
1574 |
1575 | var schemaDotOrgRegex = /^https?\:\/\/schema\.org\/?$/;
1576 | var matches =
1577 | (typeof parsed['@context'] === 'string' &&
1578 | parsed['@context'].match(schemaDotOrgRegex)) ||
1579 | (typeof parsed['@context'] === 'object' &&
1580 | typeof parsed['@context']['@vocab'] == 'string' &&
1581 | parsed['@context']['@vocab'].match(schemaDotOrgRegex));
1582 |
1583 | if (!matches) {
1584 | return;
1585 | }
1586 |
1587 | if (!parsed['@type'] && Array.isArray(parsed['@graph'])) {
1588 | parsed = parsed['@graph'].find((it) => {
1589 | return (it['@type'] || '').match(this.REGEXPS.jsonLdArticleTypes);
1590 | });
1591 | }
1592 |
1593 | if (
1594 | !parsed ||
1595 | !parsed['@type'] ||
1596 | !parsed['@type'].match(this.REGEXPS.jsonLdArticleTypes)
1597 | ) {
1598 | return;
1599 | }
1600 |
1601 | metadata = {};
1602 |
1603 | if (
1604 | typeof parsed.name === 'string' &&
1605 | typeof parsed.headline === 'string' &&
1606 | parsed.name !== parsed.headline
1607 | ) {
1608 | // we have both name and headline element in the JSON-LD. They should both be the same but some websites like aktualne.cz
1609 | // put their own name into "name" and the article title to "headline" which confuses Readability. So we try to check if either
1610 | // "name" or "headline" closely matches the html title, and if so, use that one. If not, then we use "name" by default.
1611 |
1612 | var title = this._getArticleTitle();
1613 | var nameMatches = this._textSimilarity(parsed.name, title) > 0.75;
1614 | var headlineMatches = this._textSimilarity(parsed.headline, title) > 0.75;
1615 |
1616 | if (headlineMatches && !nameMatches) {
1617 | metadata.title = parsed.headline;
1618 | } else {
1619 | metadata.title = parsed.name;
1620 | }
1621 | } else if (typeof parsed.name === 'string') {
1622 | metadata.title = parsed.name.trim();
1623 | } else if (typeof parsed.headline === 'string') {
1624 | metadata.title = parsed.headline.trim();
1625 | }
1626 | if (parsed.author) {
1627 | if (typeof parsed.author.name === 'string') {
1628 | metadata.byline = parsed.author.name.trim();
1629 | } else if (
1630 | Array.isArray(parsed.author) &&
1631 | parsed.author[0] &&
1632 | typeof parsed.author[0].name === 'string'
1633 | ) {
1634 | metadata.byline = parsed.author
1635 | .filter(function (author) {
1636 | return author && typeof author.name === 'string';
1637 | })
1638 | .map(function (author) {
1639 | return author.name.trim();
1640 | })
1641 | .join(', ');
1642 | }
1643 | }
1644 | if (typeof parsed.description === 'string') {
1645 | metadata.excerpt = parsed.description.trim();
1646 | }
1647 | if (parsed.publisher && typeof parsed.publisher.name === 'string') {
1648 | metadata.siteName = parsed.publisher.name.trim();
1649 | }
1650 | if (typeof parsed.datePublished === 'string') {
1651 | metadata.datePublished = parsed.datePublished.trim();
1652 | }
1653 | } catch (err) {
1654 | this.log(err.message);
1655 | }
1656 | }
1657 | });
1658 | return metadata ? metadata : {};
1659 | },
1660 |
1661 | /**
1662 | * Attempts to get excerpt and byline metadata for the article.
1663 | *
1664 | * @param {Object} jsonld — object containing any metadata that
1665 | * could be extracted from JSON-LD object.
1666 | *
1667 | * @return Object with optional "excerpt" and "byline" properties
1668 | */
1669 | _getArticleMetadata(jsonld) {
1670 | var metadata = {};
1671 | var values = {};
1672 | var metaElements = this._doc.getElementsByTagName('meta');
1673 |
1674 | // property is a space-separated list of values
1675 | var propertyPattern =
1676 | /\s*(article|dc|dcterm|og|twitter)\s*:\s*(author|creator|description|published_time|title|site_name)\s*/gi;
1677 |
1678 | // name is a single value
1679 | var namePattern =
1680 | /^\s*(?:(dc|dcterm|og|twitter|parsely|weibo:(article|webpage))\s*[-\.:]\s*)?(author|creator|pub-date|description|title|site_name)\s*$/i;
1681 |
1682 | // Find description tags.
1683 | this._forEachNode(metaElements, function (element) {
1684 | var elementName = element.getAttribute('name');
1685 | var elementProperty = element.getAttribute('property');
1686 | var content = element.getAttribute('content');
1687 | if (!content) {
1688 | return;
1689 | }
1690 | var matches = null;
1691 | var name = null;
1692 |
1693 | if (elementProperty) {
1694 | matches = elementProperty.match(propertyPattern);
1695 | if (matches) {
1696 | // Convert to lowercase, and remove any whitespace
1697 | // so we can match below.
1698 | name = matches[0].toLowerCase().replace(/\s/g, '');
1699 | // multiple authors
1700 | values[name] = content.trim();
1701 | }
1702 | }
1703 | if (!matches && elementName && namePattern.test(elementName)) {
1704 | name = elementName;
1705 | if (content) {
1706 | // Convert to lowercase, remove any whitespace, and convert dots
1707 | // to colons so we can match below.
1708 | name = name.toLowerCase().replace(/\s/g, '').replace(/\./g, ':');
1709 | values[name] = content.trim();
1710 | }
1711 | }
1712 | });
1713 |
1714 | // get title
1715 | metadata.title =
1716 | jsonld.title ||
1717 | values['dc:title'] ||
1718 | values['dcterm:title'] ||
1719 | values['og:title'] ||
1720 | values['weibo:article:title'] ||
1721 | values['weibo:webpage:title'] ||
1722 | values.title ||
1723 | values['twitter:title'] ||
1724 | values['parsely-title'];
1725 |
1726 | if (!metadata.title) {
1727 | metadata.title = this._getArticleTitle();
1728 | }
1729 |
1730 | const articleAuthor =
1731 | typeof values['article:author'] === 'string' && !this._isUrl(values['article:author'])
1732 | ? values['article:author']
1733 | : undefined;
1734 |
1735 | // get author
1736 | metadata.byline =
1737 | jsonld.byline ||
1738 | values['dc:creator'] ||
1739 | values['dcterm:creator'] ||
1740 | values.author ||
1741 | values['parsely-author'] ||
1742 | articleAuthor;
1743 |
1744 | // get description
1745 | metadata.excerpt =
1746 | jsonld.excerpt ||
1747 | values['dc:description'] ||
1748 | values['dcterm:description'] ||
1749 | values['og:description'] ||
1750 | values['weibo:article:description'] ||
1751 | values['weibo:webpage:description'] ||
1752 | values.description ||
1753 | values['twitter:description'];
1754 |
1755 | // get site name
1756 | metadata.siteName = jsonld.siteName || values['og:site_name'];
1757 |
1758 | // get article published time
1759 | metadata.publishedTime =
1760 | jsonld.datePublished ||
1761 | values['article:published_time'] ||
1762 | values['parsely-pub-date'] ||
1763 | null;
1764 |
1765 | // in many sites the meta value is escaped with HTML entities,
1766 | // so here we need to unescape it
1767 | metadata.title = this._unescapeHtmlEntities(metadata.title);
1768 | metadata.byline = this._unescapeHtmlEntities(metadata.byline);
1769 | metadata.excerpt = this._unescapeHtmlEntities(metadata.excerpt);
1770 | metadata.siteName = this._unescapeHtmlEntities(metadata.siteName);
1771 | metadata.publishedTime = this._unescapeHtmlEntities(metadata.publishedTime);
1772 |
1773 | return metadata;
1774 | },
1775 |
1776 | /**
1777 | * Check if node is image, or if node contains exactly only one image
1778 | * whether as a direct child or as its descendants.
1779 | *
1780 | * @param Element
1781 | **/
1782 | _isSingleImage(node) {
1783 | while (node) {
1784 | if (node.tagName === 'IMG') {
1785 | return true;
1786 | }
1787 | if (node.children.length !== 1 || node.textContent.trim() !== '') {
1788 | return false;
1789 | }
1790 | node = node.children[0];
1791 | }
1792 | return false;
1793 | },
1794 |
1795 | /**
1796 | * Find all <noscript> that are located after <img> nodes, and which contain only one
1797 | * <img> element. Replace the first image with the image from inside the <noscript> tag,
1798 | * and remove the <noscript> tag. This improves the quality of the images we use on
1799 | * some sites (e.g. Medium).
1800 | *
1801 | * @param Element
1802 | **/
1803 | _unwrapNoscriptImages(doc) {
1804 | // Find img without source or attributes that might contains image, and remove it.
1805 | // This is done to prevent a placeholder img is replaced by img from noscript in next step.
1806 | var imgs = Array.from(doc.getElementsByTagName('img'));
1807 | this._forEachNode(imgs, function (img) {
1808 | for (var i = 0; i < img.attributes.length; i++) {
1809 | var attr = img.attributes[i];
1810 | switch (attr.name) {
1811 | case 'src':
1812 | case 'srcset':
1813 | case 'data-src':
1814 | case 'data-srcset':
1815 | return;
1816 | }
1817 |
1818 | if (/\.(jpg|jpeg|png|webp)/i.test(attr.value)) {
1819 | return;
1820 | }
1821 | }
1822 |
1823 | img.remove();
1824 | });
1825 |
1826 | // Next find noscript and try to extract its image
1827 | var noscripts = Array.from(doc.getElementsByTagName('noscript'));
1828 | this._forEachNode(noscripts, function (noscript) {
1829 | // Parse content of noscript and make sure it only contains image
1830 | if (!this._isSingleImage(noscript)) {
1831 | return;
1832 | }
1833 | var tmp = doc.createElement('div');
1834 | // We're running in the document context, and using unmodified
1835 | // document contents, so doing this should be safe.
1836 | // (Also we heavily discourage people from allowing script to
1837 | // run at all in this document...)
1838 | // eslint-disable-next-line no-unsanitized/property
1839 | tmp.innerHTML = noscript.innerHTML;
1840 |
1841 | // If noscript has previous sibling and it only contains image,
1842 | // replace it with noscript content. However we also keep old
1843 | // attributes that might contains image.
1844 | var prevElement = noscript.previousElementSibling;
1845 | if (prevElement && this._isSingleImage(prevElement)) {
1846 | var prevImg = prevElement;
1847 | if (prevImg.tagName !== 'IMG') {
1848 | prevImg = prevElement.getElementsByTagName('img')[0];
1849 | }
1850 |
1851 | var newImg = tmp.getElementsByTagName('img')[0];
1852 | for (var i = 0; i < prevImg.attributes.length; i++) {
1853 | var attr = prevImg.attributes[i];
1854 | if (attr.value === '') {
1855 | continue;
1856 | }
1857 |
1858 | if (
1859 | attr.name === 'src' ||
1860 | attr.name === 'srcset' ||
1861 | /\.(jpg|jpeg|png|webp)/i.test(attr.value)
1862 | ) {
1863 | if (newImg.getAttribute(attr.name) === attr.value) {
1864 | continue;
1865 | }
1866 |
1867 | var attrName = attr.name;
1868 | if (newImg.hasAttribute(attrName)) {
1869 | attrName = 'data-old-' + attrName;
1870 | }
1871 |
1872 | newImg.setAttribute(attrName, attr.value);
1873 | }
1874 | }
1875 |
1876 | noscript.parentNode.replaceChild(tmp.firstElementChild, prevElement);
1877 | }
1878 | });
1879 | },
1880 |
1881 | /**
1882 | * Removes script tags from the document.
1883 | *
1884 | * @param Element
1885 | **/
1886 | _removeScripts(doc) {
1887 | this._removeNodes(this._getAllNodesWithTag(doc, ['script', 'noscript']));
1888 | },
1889 |
1890 | /**
1891 | * Check if this node has only whitespace and a single element with given tag
1892 | * Returns false if the DIV node contains non-empty text nodes
1893 | * or if it contains no element with given tag or more than 1 element.
1894 | *
1895 | * @param Element
1896 | * @param string tag of child element
1897 | **/
1898 | _hasSingleTagInsideElement(element, tag) {
1899 | // There should be exactly 1 element child with given tag
1900 | if (element.children.length != 1 || element.children[0].tagName !== tag) {
1901 | return false;
1902 | }
1903 |
1904 | // And there should be no text nodes with real content
1905 | return !this._someNode(element.childNodes, function (node) {
1906 | return node.nodeType === this.TEXT_NODE && this.REGEXPS.hasContent.test(node.textContent);
1907 | });
1908 | },
1909 |
1910 | _isElementWithoutContent(node) {
1911 | return (
1912 | node.nodeType === this.ELEMENT_NODE &&
1913 | !node.textContent.trim().length &&
1914 | (!node.children.length ||
1915 | node.children.length ==
1916 | node.getElementsByTagName('br').length + node.getElementsByTagName('hr').length)
1917 | );
1918 | },
1919 |
1920 | /**
1921 | * Determine whether element has any children block level elements.
1922 | *
1923 | * @param Element
1924 | */
1925 | _hasChildBlockElement(element) {
1926 | return this._someNode(element.childNodes, function (node) {
1927 | return this.DIV_TO_P_ELEMS.has(node.tagName) || this._hasChildBlockElement(node);
1928 | });
1929 | },
1930 |
1931 | /***
1932 | * Determine if a node qualifies as phrasing content.
1933 | * https://developer.mozilla.org/en-US/docs/Web/Guide/HTML/Content_categories#Phrasing_content
1934 | **/
1935 | _isPhrasingContent(node) {
1936 | return (
1937 | node.nodeType === this.TEXT_NODE ||
1938 | this.PHRASING_ELEMS.includes(node.tagName) ||
1939 | ((node.tagName === 'A' || node.tagName === 'DEL' || node.tagName === 'INS') &&
1940 | this._everyNode(node.childNodes, this._isPhrasingContent))
1941 | );
1942 | },
1943 |
1944 | _isWhitespace(node) {
1945 | return (
1946 | (node.nodeType === this.TEXT_NODE && node.textContent.trim().length === 0) ||
1947 | (node.nodeType === this.ELEMENT_NODE && node.tagName === 'BR')
1948 | );
1949 | },
1950 |
1951 | /**
1952 | * Get the inner text of a node - cross browser compatibly.
1953 | * This also strips out any excess whitespace to be found.
1954 | *
1955 | * @param Element
1956 | * @param Boolean normalizeSpaces (default: true)
1957 | * @return string
1958 | **/
1959 | _getInnerText(e, normalizeSpaces) {
1960 | normalizeSpaces = typeof normalizeSpaces === 'undefined' ? true : normalizeSpaces;
1961 | var textContent = e.textContent.trim();
1962 |
1963 | if (normalizeSpaces) {
1964 | return textContent.replace(this.REGEXPS.normalize, ' ');
1965 | }
1966 | return textContent;
1967 | },
1968 |
1969 | /**
1970 | * Get the number of times a string s appears in the node e.
1971 | *
1972 | * @param Element
1973 | * @param string - what to split on. Default is ","
1974 | * @return number (integer)
1975 | **/
1976 | _getCharCount(e, s) {
1977 | s = s || ',';
1978 | return this._getInnerText(e).split(s).length - 1;
1979 | },
1980 |
1981 | /**
1982 | * Remove the style attribute on every e and under.
1983 | * TODO: Test if getElementsByTagName(*) is faster.
1984 | *
1985 | * @param Element
1986 | * @return void
1987 | **/
1988 | _cleanStyles(e) {
1989 | if (!e || e.tagName.toLowerCase() === 'svg') {
1990 | return;
1991 | }
1992 |
1993 | // Remove `style` and deprecated presentational attributes
1994 | for (var i = 0; i < this.PRESENTATIONAL_ATTRIBUTES.length; i++) {
1995 | e.removeAttribute(this.PRESENTATIONAL_ATTRIBUTES[i]);
1996 | }
1997 |
1998 | if (this.DEPRECATED_SIZE_ATTRIBUTE_ELEMS.includes(e.tagName)) {
1999 | e.removeAttribute('width');
2000 | e.removeAttribute('height');
2001 | }
2002 |
2003 | var cur = e.firstElementChild;
2004 | while (cur !== null) {
2005 | this._cleanStyles(cur);
2006 | cur = cur.nextElementSibling;
2007 | }
2008 | },
2009 |
2010 | /**
2011 | * Get the density of links as a percentage of the content
2012 | * This is the amount of text that is inside a link divided by the total text in the node.
2013 | *
2014 | * @param Element
2015 | * @return number (float)
2016 | **/
2017 | _getLinkDensity(element) {
2018 | var textLength = this._getInnerText(element).length;
2019 | if (textLength === 0) {
2020 | return 0;
2021 | }
2022 |
2023 | var linkLength = 0;
2024 |
2025 | // XXX implement _reduceNodeList?
2026 | this._forEachNode(element.getElementsByTagName('a'), function (linkNode) {
2027 | var href = linkNode.getAttribute('href');
2028 | var coefficient = href && this.REGEXPS.hashUrl.test(href) ? 0.3 : 1;
2029 | linkLength += this._getInnerText(linkNode).length * coefficient;
2030 | });
2031 |
2032 | return linkLength / textLength;
2033 | },
2034 |
2035 | /**
2036 | * Get an elements class/id weight. Uses regular expressions to tell if this
2037 | * element looks good or bad.
2038 | *
2039 | * @param Element
2040 | * @return number (Integer)
2041 | **/
2042 | _getClassWeight(e) {
2043 | if (!this._flagIsActive(this.FLAG_WEIGHT_CLASSES)) {
2044 | return 0;
2045 | }
2046 |
2047 | var weight = 0;
2048 |
2049 | // Look for a special classname
2050 | if (typeof e.className === 'string' && e.className !== '') {
2051 | if (this.REGEXPS.negative.test(e.className)) {
2052 | weight -= 25;
2053 | }
2054 |
2055 | if (this.REGEXPS.positive.test(e.className)) {
2056 | weight += 25;
2057 | }
2058 | }
2059 |
2060 | // Look for a special ID
2061 | if (typeof e.id === 'string' && e.id !== '') {
2062 | if (this.REGEXPS.negative.test(e.id)) {
2063 | weight -= 25;
2064 | }
2065 |
2066 | if (this.REGEXPS.positive.test(e.id)) {
2067 | weight += 25;
2068 | }
2069 | }
2070 |
2071 | return weight;
2072 | },
2073 |
2074 | /**
2075 | * Clean a node of all elements of type "tag".
2076 | * (Unless it's a youtube/vimeo video. People love movies.)
2077 | *
2078 | * @param Element
2079 | * @param string tag to clean
2080 | * @return void
2081 | **/
2082 | _clean(e, tag) {
2083 | var isEmbed = ['object', 'embed', 'iframe'].includes(tag);
2084 |
2085 | this._removeNodes(this._getAllNodesWithTag(e, [tag]), function (element) {
2086 | // Allow youtube and vimeo videos through as people usually want to see those.
2087 | if (isEmbed) {
2088 | // First, check the elements attributes to see if any of them contain youtube or vimeo
2089 | for (var i = 0; i < element.attributes.length; i++) {
2090 | if (this._allowedVideoRegex.test(element.attributes[i].value)) {
2091 | return false;
2092 | }
2093 | }
2094 |
2095 | // For embed with <object> tag, check inner HTML as well.
2096 | if (element.tagName === 'object' && this._allowedVideoRegex.test(element.innerHTML)) {
2097 | return false;
2098 | }
2099 | }
2100 |
2101 | return true;
2102 | });
2103 | },
2104 |
2105 | /**
2106 | * Check if a given node has one of its ancestor tag name matching the
2107 | * provided one.
2108 | * @param HTMLElement node
2109 | * @param String tagName
2110 | * @param Number maxDepth
2111 | * @param Function filterFn a filter to invoke to determine whether this node 'counts'
2112 | * @return Boolean
2113 | */
2114 | _hasAncestorTag(node, tagName, maxDepth, filterFn) {
2115 | maxDepth = maxDepth || 3;
2116 | tagName = tagName.toUpperCase();
2117 | var depth = 0;
2118 | while (node.parentNode) {
2119 | if (maxDepth > 0 && depth > maxDepth) {
2120 | return false;
2121 | }
2122 | if (node.parentNode.tagName === tagName && (!filterFn || filterFn(node.parentNode))) {
2123 | return true;
2124 | }
2125 | node = node.parentNode;
2126 | depth++;
2127 | }
2128 | return false;
2129 | },
2130 |
2131 | /**
2132 | * Return an object indicating how many rows and columns this table has.
2133 | */
2134 | _getRowAndColumnCount(table) {
2135 | var rows = 0;
2136 | var columns = 0;
2137 | var trs = table.getElementsByTagName('tr');
2138 | for (var i = 0; i < trs.length; i++) {
2139 | var rowspan = trs[i].getAttribute('rowspan') || 0;
2140 | if (rowspan) {
2141 | rowspan = parseInt(rowspan, 10);
2142 | }
2143 | rows += rowspan || 1;
2144 |
2145 | // Now look for column-related info
2146 | var columnsInThisRow = 0;
2147 | var cells = trs[i].getElementsByTagName('td');
2148 | for (var j = 0; j < cells.length; j++) {
2149 | var colspan = cells[j].getAttribute('colspan') || 0;
2150 | if (colspan) {
2151 | colspan = parseInt(colspan, 10);
2152 | }
2153 | columnsInThisRow += colspan || 1;
2154 | }
2155 | columns = Math.max(columns, columnsInThisRow);
2156 | }
2157 | return { rows, columns };
2158 | },
2159 |
2160 | /**
2161 | * Look for 'data' (as opposed to 'layout') tables, for which we use
2162 | * similar checks as
2163 | * https://searchfox.org/mozilla-central/rev/f82d5c549f046cb64ce5602bfd894b7ae807c8f8/accessible/generic/TableAccessible.cpp#19
2164 | */
2165 | _markDataTables(root) {
2166 | var tables = root.getElementsByTagName('table');
2167 | for (var i = 0; i < tables.length; i++) {
2168 | var table = tables[i];
2169 | var role = table.getAttribute('role');
2170 | if (role == 'presentation') {
2171 | table._readabilityDataTable = false;
2172 | continue;
2173 | }
2174 | var datatable = table.getAttribute('datatable');
2175 | if (datatable == '0') {
2176 | table._readabilityDataTable = false;
2177 | continue;
2178 | }
2179 | var summary = table.getAttribute('summary');
2180 | if (summary) {
2181 | table._readabilityDataTable = true;
2182 | continue;
2183 | }
2184 |
2185 | var caption = table.getElementsByTagName('caption')[0];
2186 | if (caption && caption.childNodes.length) {
2187 | table._readabilityDataTable = true;
2188 | continue;
2189 | }
2190 |
2191 | // If the table has a descendant with any of these tags, consider a data table:
2192 | var dataTableDescendants = ['col', 'colgroup', 'tfoot', 'thead', 'th'];
2193 | var descendantExists = function (tag) {
2194 | return !!table.getElementsByTagName(tag)[0];
2195 | };
2196 | if (dataTableDescendants.some(descendantExists)) {
2197 | this.log('Data table because found data-y descendant');
2198 | table._readabilityDataTable = true;
2199 | continue;
2200 | }
2201 |
2202 | // Nested tables indicate a layout table:
2203 | if (table.getElementsByTagName('table')[0]) {
2204 | table._readabilityDataTable = false;
2205 | continue;
2206 | }
2207 |
2208 | var sizeInfo = this._getRowAndColumnCount(table);
2209 |
2210 | if (sizeInfo.columns == 1 || sizeInfo.rows == 1) {
2211 | // single colum/row tables are commonly used for page layout purposes.
2212 | table._readabilityDataTable = false;
2213 | continue;
2214 | }
2215 |
2216 | if (sizeInfo.rows >= 10 || sizeInfo.columns > 4) {
2217 | table._readabilityDataTable = true;
2218 | continue;
2219 | }
2220 | // Now just go by size entirely:
2221 | table._readabilityDataTable = sizeInfo.rows * sizeInfo.columns > 10;
2222 | }
2223 | },
2224 |
2225 | /* convert images and figures that have properties like data-src into images that can be loaded without JS */
2226 | _fixLazyImages(root) {
2227 | this._forEachNode(
2228 | this._getAllNodesWithTag(root, ['img', 'picture', 'figure']),
2229 | function (elem) {
2230 | // In some sites (e.g. Kotaku), they put 1px square image as base64 data uri in the src attribute.
2231 | // So, here we check if the data uri is too short, just might as well remove it.
2232 | if (elem.src && this.REGEXPS.b64DataUrl.test(elem.src)) {
2233 | // Make sure it's not SVG, because SVG can have a meaningful image in under 133 bytes.
2234 | var parts = this.REGEXPS.b64DataUrl.exec(elem.src);
2235 | if (parts[1] === 'image/svg+xml') {
2236 | return;
2237 | }
2238 |
2239 | // Make sure this element has other attributes which contains image.
2240 | // If it doesn't, then this src is important and shouldn't be removed.
2241 | var srcCouldBeRemoved = false;
2242 | for (var i = 0; i < elem.attributes.length; i++) {
2243 | var attr = elem.attributes[i];
2244 | if (attr.name === 'src') {
2245 | continue;
2246 | }
2247 |
2248 | if (/\.(jpg|jpeg|png|webp)/i.test(attr.value)) {
2249 | srcCouldBeRemoved = true;
2250 | break;
2251 | }
2252 | }
2253 |
2254 | // Here we assume if image is less than 100 bytes (or 133 after encoded to base64)
2255 | // it will be too small, therefore it might be placeholder image.
2256 | if (srcCouldBeRemoved) {
2257 | var b64starts = parts[0].length;
2258 | var b64length = elem.src.length - b64starts;
2259 | if (b64length < 133) {
2260 | elem.removeAttribute('src');
2261 | }
2262 | }
2263 | }
2264 |
2265 | // also check for "null" to work around https://github.com/jsdom/jsdom/issues/2580
2266 | if (
2267 | (elem.src || (elem.srcset && elem.srcset != 'null')) &&
2268 | !elem.className.toLowerCase().includes('lazy')
2269 | ) {
2270 | return;
2271 | }
2272 |
2273 | for (var j = 0; j < elem.attributes.length; j++) {
2274 | attr = elem.attributes[j];
2275 | if (attr.name === 'src' || attr.name === 'srcset' || attr.name === 'alt') {
2276 | continue;
2277 | }
2278 | var copyTo = null;
2279 | if (/\.(jpg|jpeg|png|webp)\s+\d/.test(attr.value)) {
2280 | copyTo = 'srcset';
2281 | } else if (/^\s*\S+\.(jpg|jpeg|png|webp)\S*\s*$/.test(attr.value)) {
2282 | copyTo = 'src';
2283 | }
2284 | if (copyTo) {
2285 | //if this is an img or picture, set the attribute directly
2286 | if (elem.tagName === 'IMG' || elem.tagName === 'PICTURE') {
2287 | elem.setAttribute(copyTo, attr.value);
2288 | } else if (
2289 | elem.tagName === 'FIGURE' &&
2290 | !this._getAllNodesWithTag(elem, ['img', 'picture']).length
2291 | ) {
2292 | //if the item is a <figure> that does not contain an image or picture, create one and place it inside the figure
2293 | //see the nytimes-3 testcase for an example
2294 | var img = this._doc.createElement('img');
2295 | img.setAttribute(copyTo, attr.value);
2296 | elem.appendChild(img);
2297 | }
2298 | }
2299 | }
2300 | },
2301 | );
2302 | },
2303 |
2304 | _getTextDensity(e, tags) {
2305 | var textLength = this._getInnerText(e, true).length;
2306 | if (textLength === 0) {
2307 | return 0;
2308 | }
2309 | var childrenLength = 0;
2310 | var children = this._getAllNodesWithTag(e, tags);
2311 | this._forEachNode(
2312 | children,
2313 | (child) => (childrenLength += this._getInnerText(child, true).length),
2314 | );
2315 | return childrenLength / textLength;
2316 | },
2317 |
2318 | /**
2319 | * Clean an element of all tags of type "tag" if they look fishy.
2320 | * "Fishy" is an algorithm based on content length, classnames, link density, number of images & embeds, etc.
2321 | *
2322 | * @return void
2323 | **/
2324 | _cleanConditionally(e, tag) {
2325 | if (!this._flagIsActive(this.FLAG_CLEAN_CONDITIONALLY)) {
2326 | return;
2327 | }
2328 |
2329 | // Gather counts for other typical elements embedded within.
2330 | // Traverse backwards so we can remove nodes at the same time
2331 | // without effecting the traversal.
2332 | //
2333 | // TODO: Consider taking into account original contentScore here.
2334 | this._removeNodes(this._getAllNodesWithTag(e, [tag]), function (node) {
2335 | // First check if this node IS data table, in which case don't remove it.
2336 | var isDataTable = function (t) {
2337 | return t._readabilityDataTable;
2338 | };
2339 |
2340 | var isList = tag === 'ul' || tag === 'ol';
2341 | if (!isList) {
2342 | var listLength = 0;
2343 | var listNodes = this._getAllNodesWithTag(node, ['ul', 'ol']);
2344 | this._forEachNode(listNodes, (list) => (listLength += this._getInnerText(list).length));
2345 | isList = listLength / this._getInnerText(node).length > 0.9;
2346 | }
2347 |
2348 | if (tag === 'table' && isDataTable(node)) {
2349 | return false;
2350 | }
2351 |
2352 | // Next check if we're inside a data table, in which case don't remove it as well.
2353 | if (this._hasAncestorTag(node, 'table', -1, isDataTable)) {
2354 | return false;
2355 | }
2356 |
2357 | if (this._hasAncestorTag(node, 'code')) {
2358 | return false;
2359 | }
2360 |
2361 | // keep element if it has a data tables
2362 | if ([...node.getElementsByTagName('table')].some((tbl) => tbl._readabilityDataTable)) {
2363 | return false;
2364 | }
2365 |
2366 | var weight = this._getClassWeight(node);
2367 |
2368 | this.log('Cleaning Conditionally', node);
2369 |
2370 | var contentScore = 0;
2371 |
2372 | if (weight + contentScore < 0) {
2373 | return true;
2374 | }
2375 |
2376 | if (this._getCharCount(node, ',') < 10) {
2377 | // If there are not very many commas, and the number of
2378 | // non-paragraph elements is more than paragraphs or other
2379 | // ominous signs, remove the element.
2380 | var p = node.getElementsByTagName('p').length;
2381 | var img = node.getElementsByTagName('img').length;
2382 | var li = node.getElementsByTagName('li').length - 100;
2383 | var input = node.getElementsByTagName('input').length;
2384 | var headingDensity = this._getTextDensity(node, ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']);
2385 |
2386 | var embedCount = 0;
2387 | var embeds = this._getAllNodesWithTag(node, ['object', 'embed', 'iframe']);
2388 |
2389 | for (var i = 0; i < embeds.length; i++) {
2390 | // If this embed has attribute that matches video regex, don't delete it.
2391 | for (var j = 0; j < embeds[i].attributes.length; j++) {
2392 | if (this._allowedVideoRegex.test(embeds[i].attributes[j].value)) {
2393 | return false;
2394 | }
2395 | }
2396 |
2397 | // For embed with <object> tag, check inner HTML as well.
2398 | if (
2399 | embeds[i].tagName === 'object' &&
2400 | this._allowedVideoRegex.test(embeds[i].innerHTML)
2401 | ) {
2402 | return false;
2403 | }
2404 |
2405 | embedCount++;
2406 | }
2407 |
2408 | var innerText = this._getInnerText(node);
2409 |
2410 | // toss any node whose inner text contains nothing but suspicious words
2411 | if (this.REGEXPS.adWords.test(innerText) || this.REGEXPS.loadingWords.test(innerText)) {
2412 | return true;
2413 | }
2414 |
2415 | var contentLength = innerText.length;
2416 | var linkDensity = this._getLinkDensity(node);
2417 | var textishTags = ['SPAN', 'LI', 'TD'].concat(Array.from(this.DIV_TO_P_ELEMS));
2418 | var textDensity = this._getTextDensity(node, textishTags);
2419 | var isFigureChild = this._hasAncestorTag(node, 'figure');
2420 |
2421 | // apply shadiness checks, then check for exceptions
2422 | const shouldRemoveNode = () => {
2423 | const errs = [];
2424 | if (!isFigureChild && img > 1 && p / img < 0.5) {
2425 | errs.push(`Bad p to img ratio (img=${img}, p=${p})`);
2426 | }
2427 | if (!isList && li > p) {
2428 | errs.push(`Too many li's outside of a list. (li=${li} > p=${p})`);
2429 | }
2430 | if (input > Math.floor(p / 3)) {
2431 | errs.push(`Too many inputs per p. (input=${input}, p=${p})`);
2432 | }
2433 | if (
2434 | !isList &&
2435 | !isFigureChild &&
2436 | headingDensity < 0.9 &&
2437 | contentLength < 25 &&
2438 | (img === 0 || img > 2) &&
2439 | linkDensity > 0
2440 | ) {
2441 | errs.push(
2442 | `Suspiciously short. (headingDensity=${headingDensity}, img=${img}, linkDensity=${linkDensity})`,
2443 | );
2444 | }
2445 | if (!isList && weight < 25 && linkDensity > 0.2 + this._linkDensityModifier) {
2446 | errs.push(`Low weight and a little linky. (linkDensity=${linkDensity})`);
2447 | }
2448 | if (weight >= 25 && linkDensity > 0.5 + this._linkDensityModifier) {
2449 | errs.push(`High weight and mostly links. (linkDensity=${linkDensity})`);
2450 | }
2451 | if ((embedCount === 1 && contentLength < 75) || embedCount > 1) {
2452 | errs.push(
2453 | `Suspicious embed. (embedCount=${embedCount}, contentLength=${contentLength})`,
2454 | );
2455 | }
2456 | if (img === 0 && textDensity === 0) {
2457 | errs.push(`No useful content. (img=${img}, textDensity=${textDensity})`);
2458 | }
2459 |
2460 | if (errs.length) {
2461 | this.log('Checks failed', errs);
2462 | return true;
2463 | }
2464 |
2465 | return false;
2466 | };
2467 |
2468 | var haveToRemove = shouldRemoveNode();
2469 |
2470 | // Allow simple lists of images to remain in pages
2471 | if (isList && haveToRemove) {
2472 | for (var x = 0; x < node.children.length; x++) {
2473 | let child = node.children[x];
2474 | // Don't filter in lists with li's that contain more than one child
2475 | if (child.children.length > 1) {
2476 | return haveToRemove;
2477 | }
2478 | }
2479 | let li_count = node.getElementsByTagName('li').length;
2480 | // Only allow the list to remain if every li contains an image
2481 | if (img == li_count) {
2482 | return false;
2483 | }
2484 | }
2485 | return haveToRemove;
2486 | }
2487 | return false;
2488 | });
2489 | },
2490 |
2491 | /**
2492 | * Clean out elements that match the specified conditions
2493 | *
2494 | * @param Element
2495 | * @param Function determines whether a node should be removed
2496 | * @return void
2497 | **/
2498 | _cleanMatchedNodes(e, filter) {
2499 | var endOfSearchMarkerNode = this._getNextNode(e, true);
2500 | var next = this._getNextNode(e);
2501 | while (next && next != endOfSearchMarkerNode) {
2502 | if (filter.call(this, next, next.className + ' ' + next.id)) {
2503 | next = this._removeAndGetNext(next);
2504 | } else {
2505 | next = this._getNextNode(next);
2506 | }
2507 | }
2508 | },
2509 |
2510 | /**
2511 | * Clean out spurious headers from an Element.
2512 | *
2513 | * @param Element
2514 | * @return void
2515 | **/
2516 | _cleanHeaders(e) {
2517 | let headingNodes = this._getAllNodesWithTag(e, ['h1', 'h2']);
2518 | this._removeNodes(headingNodes, function (node) {
2519 | let shouldRemove = this._getClassWeight(node) < 0;
2520 | if (shouldRemove) {
2521 | this.log('Removing header with low class weight:', node);
2522 | }
2523 | return shouldRemove;
2524 | });
2525 | },
2526 |
2527 | /**
2528 | * Check if this node is an H1 or H2 element whose content is mostly
2529 | * the same as the article title.
2530 | *
2531 | * @param Element the node to check.
2532 | * @return boolean indicating whether this is a title-like header.
2533 | */
2534 | _headerDuplicatesTitle(node) {
2535 | if (node.tagName != 'H1' && node.tagName != 'H2') {
2536 | return false;
2537 | }
2538 | var heading = this._getInnerText(node, false);
2539 | this.log('Evaluating similarity of header:', heading, this._articleTitle);
2540 | return this._textSimilarity(this._articleTitle, heading) > 0.75;
2541 | },
2542 |
2543 | _flagIsActive(flag) {
2544 | return (this._flags & flag) > 0;
2545 | },
2546 |
2547 | _removeFlag(flag) {
2548 | this._flags = this._flags & ~flag;
2549 | },
2550 |
2551 | _isProbablyVisible(node) {
2552 | // Have to null-check node.style and node.className.includes to deal with SVG and MathML nodes.
2553 | return (
2554 | (!node.style || node.style.display != 'none') &&
2555 | (!node.style || node.style.visibility != 'hidden') &&
2556 | !node.hasAttribute('hidden') &&
2557 | //check for "fallback-image" so that wikimedia math images are displayed
2558 | (!node.hasAttribute('aria-hidden') ||
2559 | node.getAttribute('aria-hidden') != 'true' ||
2560 | (node.className && node.className.includes && node.className.includes('fallback-image')))
2561 | );
2562 | },
2563 |
2564 | /**
2565 | * Runs readability.
2566 | *
2567 | * Workflow:
2568 | * 1. Prep the document by removing script tags, css, etc.
2569 | * 2. Build readability's DOM tree.
2570 | * 3. Grab the article content from the current dom tree.
2571 | * 4. Replace the current DOM tree with the new one.
2572 | * 5. Read peacefully.
2573 | *
2574 | * @return void
2575 | **/
2576 | parse() {
2577 | // Avoid parsing too large documents, as per configuration option
2578 | if (this._maxElemsToParse > 0) {
2579 | var numTags = this._doc.getElementsByTagName('*').length;
2580 | if (numTags > this._maxElemsToParse) {
2581 | throw new Error('Aborting parsing document; ' + numTags + ' elements found');
2582 | }
2583 | }
2584 |
2585 | // Unwrap image from noscript
2586 | this._unwrapNoscriptImages(this._doc);
2587 |
2588 | // Extract JSON-LD metadata before removing scripts
2589 | var jsonLd = this._disableJSONLD ? {} : this._getJSONLD(this._doc);
2590 |
2591 | // Remove script tags from the document.
2592 | this._removeScripts(this._doc);
2593 |
2594 | this._prepDocument();
2595 |
2596 | var metadata = this._getArticleMetadata(jsonLd);
2597 | this._metadata = metadata;
2598 | this._articleTitle = metadata.title;
2599 |
2600 | var articleContent = this._grabArticle();
2601 | if (!articleContent) {
2602 | return null;
2603 | }
2604 |
2605 | this.log('Grabbed: ' + articleContent.innerHTML);
2606 |
2607 | this._postProcessContent(articleContent);
2608 |
2609 | // If we haven't found an excerpt in the article's metadata, use the article's
2610 | // first paragraph as the excerpt. This is used for displaying a preview of
2611 | // the article's content.
2612 | if (!metadata.excerpt) {
2613 | var paragraphs = articleContent.getElementsByTagName('p');
2614 | if (paragraphs.length) {
2615 | metadata.excerpt = paragraphs[0].textContent.trim();
2616 | }
2617 | }
2618 |
2619 | var textContent = articleContent.textContent;
2620 | return {
2621 | title: this._articleTitle,
2622 | byline: metadata.byline || this._articleByline,
2623 | dir: this._articleDir,
2624 | lang: this._articleLang,
2625 | content: this._serializer(articleContent),
2626 | textContent,
2627 | length: textContent.length,
2628 | excerpt: metadata.excerpt,
2629 | siteName: metadata.siteName || this._articleSiteName,
2630 | publishedTime: metadata.publishedTime,
2631 | };
2632 | },
2633 | };
2634 |
// Export Readability when a `module` object is present (e.g. when this file
// is loaded by a CommonJS-style loader); otherwise do nothing.
/* global module */
if (typeof module === 'object') {
  module.exports = Readability;
}
2639 |
2640 | /**
2641 | * Web Fetcher Helper Content Script
2642 | * Handles fetching HTML content, text content, and interactive elements from the current page
2643 | * Supports Readability for better content extraction
2644 | */
2645 |
// Settings for the web fetcher helper content script
const config = {
  // Elements that should be ignored when extracting content
  // (used for iframe content and fallback extraction)
  ignoreElements: [
    // page chrome
    'nav',
    'header:not(article header)',
    'footer:not(article footer)',
    'aside',
    // non-content markup
    'script',
    'style',
    'noscript',
    // ads, notices and overlays
    'iframe[src*="ads"]',
    '.cookie-notice',
    '.ad',
    '.ads',
    '.advertisement',
    '.banner',
    '.popup',
    '.modal',
    '.overlay',
    // social widgets and secondary content
    '.social-share',
    '.social-links',
    '.related-articles',
    '.comments',
  ],
  // NOTE(review): the three limits below are consumed outside this view;
  // presumably character counts — confirm against their usage.
  minTextLength: 20,
  maxTotalLength: 100000,
  minParagraphLength: 2,
};
2675 |
/**
 * Message entry point for the web fetcher helper.
 *
 * Handled `request.action` values:
 *  - 'search_tabs_content_ping' / 'chrome_web_fetcher_ping': replies `{ status: 'pong' }`.
 *  - 'getHtmlContent': replies with the cleaned HTML of the whole page, or of the
 *    first element matching `request.selector` when a selector is given.
 *  - 'getTextContent': replies with the `innerText` of the first element matching
 *    `request.selector`, or otherwise with the text extracted by Readability
 *    (falling back to `document.body.innerText` when Readability yields nothing).
 *
 * Failures are reported as `{ success: false, error }` rather than thrown.
 *
 * Every reply is produced synchronously, so the listener always returns false.
 * Returning true for an action this script does not handle would keep the message
 * channel open although no response is ever sent, leaving the sender waiting.
 */
chrome.runtime.onMessage.addListener((request, _sender, sendResponse) => {
  const pingActions = ['search_tabs_content_ping', 'chrome_web_fetcher_ping'];

  // Respond to ping message
  if (pingActions.includes(request.action)) {
    sendResponse({ status: 'pong' });
    return false; // Synchronous response
  }

  // Get HTML content
  if (request.action === 'getHtmlContent') {
    try {
      let rawHtml;

      if (request.selector) {
        // Only serialize the first element matching the selector
        const element = document.querySelector(request.selector);
        if (!element) {
          throw new Error(`No element found matching selector: ${request.selector}`);
        }
        rawHtml = element.outerHTML;
      } else {
        // Otherwise serialize the entire page
        rawHtml = document.documentElement.outerHTML;
      }

      sendResponse({
        success: true,
        htmlContent: cleanHtmlContent(rawHtml),
        selector: request.selector,
      });
    } catch (error) {
      sendResponse({
        success: false,
        error: `Failed to get HTML content: ${error.message}`,
      });
    }

    return false; // Response already sent
  }

  // Get text content
  if (request.action === 'getTextContent') {
    try {
      if (request.selector) {
        // Only read the text of the first element matching the selector
        const element = document.querySelector(request.selector);
        if (!element) {
          throw new Error(`No element found matching selector: ${request.selector}`);
        }

        sendResponse({
          success: true,
          textContent: element.innerText,
          selector: request.selector,
        });
        return false;
      }

      // No selector: let Readability extract the main content from a clone of the document
      const documentClone = document.cloneNode(true);
      const article = new Readability(documentClone).parse();

      if (!article || !article.textContent) {
        // Fallback to basic extraction
        sendResponse({
          success: true,
          textContent: document.body.innerText,
          fallback: true,
        });
        return false;
      }

      const metadata = extractPageMetadata();

      // Append same-origin iframe text when there is enough of it to matter
      const iframeContent = extractIframeContent();
      let fullContent = article.textContent;
      if (iframeContent && iframeContent.trim().length > config.minTextLength) {
        fullContent += '\n\n--- Embedded Content ---\n\n' + iframeContent;
      }

      sendResponse({
        success: true,
        textContent: cleanContent(fullContent),
        article: {
          title: article.title,
          byline: article.byline,
          siteName: article.siteName,
          excerpt: article.excerpt,
          lang: article.lang,
          content: article.content, // HTML content
        },
        metadata: metadata,
      });
    } catch (error) {
      console.error('Error extracting text content:', error);
      sendResponse({
        success: false,
        error: `Failed to extract text content: ${error.message}`,
      });
    }

    return false; // Response already sent
  }

  // Interactive elements feature has been removed

  // Not an action this script handles: do not claim an asynchronous response
  return false;
});
2797 |
/**
 * Collect descriptive metadata about the current page.
 *
 * `title` comes from `document.title`; every other field is read from the first
 * element matching its selector and defaults to an empty string when no element
 * or no non-empty attribute value is found.
 *
 * @returns {Object} - Page metadata with `title`, `description`, `author`,
 *   `keywords`, `published` and `siteName` fields
 */
function extractPageMetadata() {
  // First non-empty attribute (tried in the given order) of the first element
  // matching `selector`, or '' when nothing usable exists.
  const firstAttribute = (selector, attributeNames) => {
    const node = document.querySelector(selector);
    if (!node) {
      return '';
    }
    for (const attributeName of attributeNames) {
      const value = node.getAttribute(attributeName);
      if (value) {
        return value;
      }
    }
    return '';
  };

  return {
    title: document.title,
    description: firstAttribute('meta[name="description"], meta[property="og:description"]', [
      'content',
    ]),
    author: firstAttribute('meta[name="author"], meta[property="article:author"]', ['content']),
    keywords: firstAttribute('meta[name="keywords"]', ['content']),
    // A <time datetime="..."> element carries its value in `datetime`, not `content`
    published: firstAttribute('meta[property="article:published_time"], time[datetime]', [
      'content',
      'datetime',
    ]),
    siteName: firstAttribute('meta[property="og:site_name"]', ['content']),
  };
}
2851 |
/**
 * Extract the text of visible, same-origin iframes on the current page.
 *
 * Frames whose trimmed text is shorter than `config.minTextLength` are skipped.
 * Frames that cannot be read are skipped with a console warning.
 *
 * @returns {string} - Trimmed texts of the qualifying iframes joined by a blank
 *   line, or '' when there are none
 */
function extractIframeContent() {
  // NOTE(review): `isElementVisible` is not defined in the visible part of this
  // script. It is used when some other script provides it; otherwise an iframe
  // counts as visible when it has at least one layout box. Without this guard a
  // missing helper throws inside the try block below and every iframe is skipped
  // under a misleading "cross-origin" warning.
  const isVisible = (frame) =>
    typeof isElementVisible === 'function'
      ? isElementVisible(frame)
      : frame.getClientRects().length > 0;

  const collectedTexts = [];

  for (const iframe of document.querySelectorAll('iframe')) {
    try {
      if (!isSameOrigin(iframe) || !isVisible(iframe)) {
        continue;
      }

      const doc = iframe.contentDocument || iframe.contentWindow?.document;
      // `body` can be null (e.g. a frame document without a body element); treat as empty
      const iframeText = doc?.body?.innerText?.trim();
      if (iframeText && iframeText.length >= config.minTextLength) {
        collectedTexts.push(iframeText);
      }
    } catch (error) {
      console.warn(
        `Cannot access iframe content (possible cross-origin restriction): ${error.message}`,
      );
    }
  }

  return collectedTexts.join('\n\n');
}
2880 |
/**
 * Report whether the iframe's document can be reached from this script.
 *
 * Any exception raised while probing the frame is treated as "not accessible".
 *
 * @param {HTMLIFrameElement} iframe - The iframe to check
 * @returns {boolean} - true when a document object could be obtained, false otherwise
 */
function isSameOrigin(iframe) {
  let reachableDocument;
  try {
    reachableDocument = iframe.contentDocument || iframe.contentWindow?.document;
  } catch (_probeError) {
    return false;
  }
  return Boolean(reachableDocument);
}
2893 |
/**
 * Normalize whitespace in extracted text and cap its length.
 *
 * Runs of horizontal whitespace collapse to a single space, spaces adjacent to a
 * line break are dropped, and three or more consecutive line breaks are reduced
 * to one blank line. Line breaks themselves are preserved: collapsing every
 * whitespace run (newlines included) to a space would flatten the text onto one
 * line and make the blank-line normalization unreachable.
 *
 * @param {string} text - The text to clean
 * @returns {string} - Cleaned text, at most `config.maxTotalLength` characters long
 */
function cleanContent(text) {
  return text
    .replace(/[^\S\n]+/g, ' ') // any whitespace except "\n" (this includes "\r" and tabs)
    .replace(/ ?\n ?/g, '\n') // strip the spaces hugging each line break
    .replace(/\n{3,}/g, '\n\n') // keep at most one blank line between paragraphs
    .trim()
    .substring(0, config.maxTotalLength);
}
2906 |
/**
 * Strip presentation and noise from an HTML string.
 *
 * The markup is parsed into a detached document, then:
 *  - <style>, <link> and <script> elements are removed;
 *  - inline `style` attributes are dropped from every element;
 *  - inline <svg> elements and SVG-backed <img>/<object>/<embed> elements are
 *    replaced by short text placeholders;
 *  - empty leaf elements whose only attributes are `data-*` (optionally plus
 *    `id`) are removed;
 *  - HTML comments are removed.
 *
 * @param {string} html - The HTML content to clean
 * @returns {string} - The cleaned document serialized with XMLSerializer
 */
function cleanHtmlContent(html) {
  const doc = new DOMParser().parseFromString(html, 'text/html');

  // Detach a node from its parent, if it still has one
  const detach = (node) => {
    if (node.parentNode) {
      node.parentNode.removeChild(node);
    }
  };

  // Swap a node for <span data-placeholder="kind">label</span>
  const swapForPlaceholder = (node, kind, label) => {
    if (!node.parentNode) {
      return;
    }
    const placeholder = doc.createElement('span');
    placeholder.textContent = label;
    placeholder.setAttribute('data-placeholder', kind);
    node.parentNode.replaceChild(placeholder, node);
  };

  // Style sheets first, then inline style attributes
  for (const styleElement of doc.querySelectorAll('style')) {
    detach(styleElement);
  }
  for (const element of doc.querySelectorAll('*')) {
    element.removeAttribute('style');
  }

  // External resources and scripts
  for (const linkElement of doc.querySelectorAll('link')) {
    detach(linkElement);
  }
  for (const scriptElement of doc.querySelectorAll('script')) {
    detach(scriptElement);
  }

  // Inline SVG markup becomes a generic icon placeholder
  for (const svgElement of doc.querySelectorAll('svg')) {
    swapForPlaceholder(svgElement, 'svg-icon', '[SVG Icon]');
  }

  // SVG files referenced by img/object/embed keep their alt text when present
  const svgReferenceSelector = 'img[src$=".svg"], object[data$=".svg"], embed[src$=".svg"]';
  for (const svgReference of doc.querySelectorAll(svgReferenceSelector)) {
    const label = svgReference.alt ? `[SVG Image: ${svgReference.alt}]` : '[SVG Image]';
    swapForPlaceholder(svgReference, 'svg-image', label);
  }

  // Drop empty leaf elements that carry nothing but data-* attributes (an id is tolerated).
  // The list is snapshotted up front so removals do not affect the iteration.
  for (const element of Array.from(doc.querySelectorAll('*'))) {
    const attributeNames = Array.from(element.attributes, (attribute) => attribute.name);
    const isDataOnlyShell =
      attributeNames.some((name) => name.startsWith('data-')) &&
      attributeNames.every((name) => name.startsWith('data-') || name === 'id') &&
      element.children.length === 0 &&
      element.textContent.trim() === '';
    if (isDataOnlyShell) {
      detach(element);
    }
  }

  // Remove comment nodes (nodeType 8), descending only through element nodes (nodeType 1)
  const pendingParents = [doc];
  while (pendingParents.length > 0) {
    const parent = pendingParents.pop();
    for (const child of Array.from(parent.childNodes)) {
      if (child.nodeType === 8) {
        parent.removeChild(child);
      } else if (child.nodeType === 1) {
        pendingParents.push(child);
      }
    }
  }

  return new XMLSerializer().serializeToString(doc);
}
3032 |
3033 | // Interactive elements feature has been removed
3034 |
3035 | // Selector generation feature has been removed
3036 | }
3037 |
```