This is page 3 of 35. Use http://codebase.md/pragmar/mcp_server_webcrawl/crawlers/warc/tests.html?lines=true&page={x} to view the full context.
# Directory Structure
```
├── .gitignore
├── CONTRIBUTING.md
├── docs
│ ├── _images
│ │ ├── interactive.document.webp
│ │ ├── interactive.search.webp
│ │ └── mcpswc.svg
│ ├── _modules
│ │ ├── index.html
│ │ ├── mcp_server_webcrawl
│ │ │ ├── crawlers
│ │ │ │ ├── archivebox
│ │ │ │ │ ├── adapter.html
│ │ │ │ │ ├── crawler.html
│ │ │ │ │ └── tests.html
│ │ │ │ ├── base
│ │ │ │ │ ├── adapter.html
│ │ │ │ │ ├── api.html
│ │ │ │ │ ├── crawler.html
│ │ │ │ │ ├── indexed.html
│ │ │ │ │ └── tests.html
│ │ │ │ ├── httrack
│ │ │ │ │ ├── adapter.html
│ │ │ │ │ ├── crawler.html
│ │ │ │ │ └── tests.html
│ │ │ │ ├── interrobot
│ │ │ │ │ ├── adapter.html
│ │ │ │ │ ├── crawler.html
│ │ │ │ │ └── tests.html
│ │ │ │ ├── katana
│ │ │ │ │ ├── adapter.html
│ │ │ │ │ ├── crawler.html
│ │ │ │ │ └── tests.html
│ │ │ │ ├── siteone
│ │ │ │ │ ├── adapter.html
│ │ │ │ │ ├── crawler.html
│ │ │ │ │ └── tests.html
│ │ │ │ ├── warc
│ │ │ │ │ ├── adapter.html
│ │ │ │ │ ├── crawler.html
│ │ │ │ │ └── tests.html
│ │ │ │ └── wget
│ │ │ │ ├── adapter.html
│ │ │ │ ├── crawler.html
│ │ │ │ └── tests.html
│ │ │ ├── crawlers.html
│ │ │ ├── extras
│ │ │ │ ├── markdown.html
│ │ │ │ ├── regex.html
│ │ │ │ ├── snippets.html
│ │ │ │ ├── thumbnails.html
│ │ │ │ └── xpath.html
│ │ │ ├── interactive
│ │ │ │ ├── highlights.html
│ │ │ │ ├── search.html
│ │ │ │ ├── session.html
│ │ │ │ └── ui.html
│ │ │ ├── main.html
│ │ │ ├── models
│ │ │ │ ├── resources.html
│ │ │ │ └── sites.html
│ │ │ ├── templates
│ │ │ │ └── tests.html
│ │ │ ├── utils
│ │ │ │ ├── blobs.html
│ │ │ │ ├── cli.html
│ │ │ │ ├── logger.html
│ │ │ │ ├── querycache.html
│ │ │ │ ├── server.html
│ │ │ │ └── tools.html
│ │ │ └── utils.html
│ │ └── re.html
│ ├── _sources
│ │ ├── guides
│ │ │ ├── archivebox.rst.txt
│ │ │ ├── httrack.rst.txt
│ │ │ ├── interrobot.rst.txt
│ │ │ ├── katana.rst.txt
│ │ │ ├── siteone.rst.txt
│ │ │ ├── warc.rst.txt
│ │ │ └── wget.rst.txt
│ │ ├── guides.rst.txt
│ │ ├── index.rst.txt
│ │ ├── installation.rst.txt
│ │ ├── interactive.rst.txt
│ │ ├── mcp_server_webcrawl.crawlers.archivebox.rst.txt
│ │ ├── mcp_server_webcrawl.crawlers.base.rst.txt
│ │ ├── mcp_server_webcrawl.crawlers.httrack.rst.txt
│ │ ├── mcp_server_webcrawl.crawlers.interrobot.rst.txt
│ │ ├── mcp_server_webcrawl.crawlers.katana.rst.txt
│ │ ├── mcp_server_webcrawl.crawlers.rst.txt
│ │ ├── mcp_server_webcrawl.crawlers.siteone.rst.txt
│ │ ├── mcp_server_webcrawl.crawlers.warc.rst.txt
│ │ ├── mcp_server_webcrawl.crawlers.wget.rst.txt
│ │ ├── mcp_server_webcrawl.extras.rst.txt
│ │ ├── mcp_server_webcrawl.interactive.rst.txt
│ │ ├── mcp_server_webcrawl.models.rst.txt
│ │ ├── mcp_server_webcrawl.rst.txt
│ │ ├── mcp_server_webcrawl.templates.rst.txt
│ │ ├── mcp_server_webcrawl.utils.rst.txt
│ │ ├── modules.rst.txt
│ │ ├── prompts.rst.txt
│ │ └── usage.rst.txt
│ ├── _static
│ │ ├── _sphinx_javascript_frameworks_compat.js
│ │ ├── basic.css
│ │ ├── css
│ │ │ ├── badge_only.css
│ │ │ ├── fonts
│ │ │ │ ├── fontawesome-webfont.eot
│ │ │ │ ├── fontawesome-webfont.svg
│ │ │ │ ├── fontawesome-webfont.ttf
│ │ │ │ ├── fontawesome-webfont.woff
│ │ │ │ ├── fontawesome-webfont.woff2
│ │ │ │ ├── lato-bold-italic.woff
│ │ │ │ ├── lato-bold-italic.woff2
│ │ │ │ ├── lato-bold.woff
│ │ │ │ ├── lato-bold.woff2
│ │ │ │ ├── lato-normal-italic.woff
│ │ │ │ ├── lato-normal-italic.woff2
│ │ │ │ ├── lato-normal.woff
│ │ │ │ ├── lato-normal.woff2
│ │ │ │ ├── Roboto-Slab-Bold.woff
│ │ │ │ ├── Roboto-Slab-Bold.woff2
│ │ │ │ ├── Roboto-Slab-Regular.woff
│ │ │ │ └── Roboto-Slab-Regular.woff2
│ │ │ └── theme.css
│ │ ├── doctools.js
│ │ ├── documentation_options.js
│ │ ├── file.png
│ │ ├── fonts
│ │ │ ├── Lato
│ │ │ │ ├── lato-bold.eot
│ │ │ │ ├── lato-bold.ttf
│ │ │ │ ├── lato-bold.woff
│ │ │ │ ├── lato-bold.woff2
│ │ │ │ ├── lato-bolditalic.eot
│ │ │ │ ├── lato-bolditalic.ttf
│ │ │ │ ├── lato-bolditalic.woff
│ │ │ │ ├── lato-bolditalic.woff2
│ │ │ │ ├── lato-italic.eot
│ │ │ │ ├── lato-italic.ttf
│ │ │ │ ├── lato-italic.woff
│ │ │ │ ├── lato-italic.woff2
│ │ │ │ ├── lato-regular.eot
│ │ │ │ ├── lato-regular.ttf
│ │ │ │ ├── lato-regular.woff
│ │ │ │ └── lato-regular.woff2
│ │ │ └── RobotoSlab
│ │ │ ├── roboto-slab-v7-bold.eot
│ │ │ ├── roboto-slab-v7-bold.ttf
│ │ │ ├── roboto-slab-v7-bold.woff
│ │ │ ├── roboto-slab-v7-bold.woff2
│ │ │ ├── roboto-slab-v7-regular.eot
│ │ │ ├── roboto-slab-v7-regular.ttf
│ │ │ ├── roboto-slab-v7-regular.woff
│ │ │ └── roboto-slab-v7-regular.woff2
│ │ ├── images
│ │ │ ├── interactive.document.png
│ │ │ ├── interactive.document.webp
│ │ │ ├── interactive.search.png
│ │ │ ├── interactive.search.webp
│ │ │ └── mcpswc.svg
│ │ ├── jquery.js
│ │ ├── js
│ │ │ ├── badge_only.js
│ │ │ ├── theme.js
│ │ │ └── versions.js
│ │ ├── language_data.js
│ │ ├── minus.png
│ │ ├── plus.png
│ │ ├── pygments.css
│ │ ├── searchtools.js
│ │ └── sphinx_highlight.js
│ ├── .buildinfo
│ ├── .nojekyll
│ ├── genindex.html
│ ├── guides
│ │ ├── archivebox.html
│ │ ├── httrack.html
│ │ ├── interrobot.html
│ │ ├── katana.html
│ │ ├── siteone.html
│ │ ├── warc.html
│ │ └── wget.html
│ ├── guides.html
│ ├── index.html
│ ├── installation.html
│ ├── interactive.html
│ ├── mcp_server_webcrawl.crawlers.archivebox.html
│ ├── mcp_server_webcrawl.crawlers.base.html
│ ├── mcp_server_webcrawl.crawlers.html
│ ├── mcp_server_webcrawl.crawlers.httrack.html
│ ├── mcp_server_webcrawl.crawlers.interrobot.html
│ ├── mcp_server_webcrawl.crawlers.katana.html
│ ├── mcp_server_webcrawl.crawlers.siteone.html
│ ├── mcp_server_webcrawl.crawlers.warc.html
│ ├── mcp_server_webcrawl.crawlers.wget.html
│ ├── mcp_server_webcrawl.extras.html
│ ├── mcp_server_webcrawl.html
│ ├── mcp_server_webcrawl.interactive.html
│ ├── mcp_server_webcrawl.models.html
│ ├── mcp_server_webcrawl.templates.html
│ ├── mcp_server_webcrawl.utils.html
│ ├── modules.html
│ ├── objects.inv
│ ├── prompts.html
│ ├── py-modindex.html
│ ├── search.html
│ ├── searchindex.js
│ └── usage.html
├── LICENSE
├── MANIFEST.in
├── prompts
│ ├── audit404.md
│ ├── auditfiles.md
│ ├── auditperf.md
│ ├── auditseo.md
│ ├── gopher.md
│ ├── README.md
│ └── testsearch.md
├── pyproject.toml
├── README.md
├── setup.py
├── sphinx
│ ├── _static
│ │ └── images
│ │ ├── interactive.document.png
│ │ ├── interactive.document.webp
│ │ ├── interactive.search.png
│ │ ├── interactive.search.webp
│ │ └── mcpswc.svg
│ ├── _templates
│ │ └── layout.html
│ ├── conf.py
│ ├── guides
│ │ ├── archivebox.rst
│ │ ├── httrack.rst
│ │ ├── interrobot.rst
│ │ ├── katana.rst
│ │ ├── siteone.rst
│ │ ├── warc.rst
│ │ └── wget.rst
│ ├── guides.rst
│ ├── index.rst
│ ├── installation.rst
│ ├── interactive.rst
│ ├── make.bat
│ ├── Makefile
│ ├── mcp_server_webcrawl.crawlers.archivebox.rst
│ ├── mcp_server_webcrawl.crawlers.base.rst
│ ├── mcp_server_webcrawl.crawlers.httrack.rst
│ ├── mcp_server_webcrawl.crawlers.interrobot.rst
│ ├── mcp_server_webcrawl.crawlers.katana.rst
│ ├── mcp_server_webcrawl.crawlers.rst
│ ├── mcp_server_webcrawl.crawlers.siteone.rst
│ ├── mcp_server_webcrawl.crawlers.warc.rst
│ ├── mcp_server_webcrawl.crawlers.wget.rst
│ ├── mcp_server_webcrawl.extras.rst
│ ├── mcp_server_webcrawl.interactive.rst
│ ├── mcp_server_webcrawl.models.rst
│ ├── mcp_server_webcrawl.rst
│ ├── mcp_server_webcrawl.templates.rst
│ ├── mcp_server_webcrawl.utils.rst
│ ├── modules.rst
│ ├── prompts.rst
│ ├── readme.txt
│ └── usage.rst
└── src
└── mcp_server_webcrawl
├── __init__.py
├── crawlers
│ ├── __init__.py
│ ├── archivebox
│ │ ├── __init__.py
│ │ ├── adapter.py
│ │ ├── crawler.py
│ │ └── tests.py
│ ├── base
│ │ ├── __init__.py
│ │ ├── adapter.py
│ │ ├── api.py
│ │ ├── crawler.py
│ │ ├── indexed.py
│ │ └── tests.py
│ ├── httrack
│ │ ├── __init__.py
│ │ ├── adapter.py
│ │ ├── crawler.py
│ │ └── tests.py
│ ├── interrobot
│ │ ├── __init__.py
│ │ ├── adapter.py
│ │ ├── crawler.py
│ │ └── tests.py
│ ├── katana
│ │ ├── __init__.py
│ │ ├── adapter.py
│ │ ├── crawler.py
│ │ └── tests.py
│ ├── siteone
│ │ ├── __init__.py
│ │ ├── adapter.py
│ │ ├── crawler.py
│ │ └── tests.py
│ ├── warc
│ │ ├── __init__.py
│ │ ├── adapter.py
│ │ ├── crawler.py
│ │ └── tests.py
│ └── wget
│ ├── __init__.py
│ ├── adapter.py
│ ├── crawler.py
│ └── tests.py
├── extras
│ ├── __init__.py
│ ├── markdown.py
│ ├── regex.py
│ ├── snippets.py
│ ├── thumbnails.py
│ └── xpath.py
├── interactive
│ ├── __init__.py
│ ├── highlights.py
│ ├── search.py
│ ├── session.py
│ ├── ui.py
│ └── views
│ ├── base.py
│ ├── document.py
│ ├── help.py
│ ├── requirements.py
│ ├── results.py
│ └── searchform.py
├── main.py
├── models
│ ├── __init__.py
│ ├── base.py
│ ├── resources.py
│ └── sites.py
├── settings.py
├── templates
│ ├── __init__.py
│ ├── markdown.xslt
│ ├── tests_core.html
│ └── tests.py
└── utils
├── __init__.py
├── cli.py
├── logger.py
├── parser.py
├── parsetab.py
├── search.py
├── server.py
├── tests.py
└── tools.py
```
# Files
--------------------------------------------------------------------------------
/docs/_modules/mcp_server_webcrawl/crawlers/siteone/crawler.html:
--------------------------------------------------------------------------------
```html
1 |
2 |
3 | <!DOCTYPE html>
4 | <html class="writer-html5" lang="en" data-content_root="../../../../">
5 | <head>
6 | <meta charset="utf-8" />
7 | <meta name="viewport" content="width=device-width, initial-scale=1.0" />
8 | <title>mcp_server_webcrawl.crawlers.siteone.crawler — mcp-server-webcrawl documentation</title>
9 | <link rel="stylesheet" type="text/css" href="../../../../_static/pygments.css?v=80d5e7a1" />
10 | <link rel="stylesheet" type="text/css" href="../../../../_static/css/theme.css?v=e59714d7" />
11 |
12 |
13 | <script src="../../../../_static/jquery.js?v=5d32c60e"></script>
14 | <script src="../../../../_static/_sphinx_javascript_frameworks_compat.js?v=2cd50e6c"></script>
15 | <script src="../../../../_static/documentation_options.js?v=5929fcd5"></script>
16 | <script src="../../../../_static/doctools.js?v=888ff710"></script>
17 | <script src="../../../../_static/sphinx_highlight.js?v=dc90522c"></script>
18 | <script src="../../../../_static/js/theme.js"></script>
19 | <link rel="index" title="Index" href="../../../../genindex.html" />
20 | <link rel="search" title="Search" href="../../../../search.html" />
21 | </head>
22 |
23 | <body class="wy-body-for-nav">
24 | <div class="wy-grid-for-nav">
25 | <nav data-toggle="wy-nav-shift" class="wy-nav-side">
26 | <div class="wy-side-scroll">
27 | <div class="wy-side-nav-search" >
28 |
29 |
30 |
31 | <a href="../../../../index.html" class="icon icon-home">
32 | mcp-server-webcrawl
33 | </a>
34 | <div role="search">
35 | <form id="rtd-search-form" class="wy-form" action="../../../../search.html" method="get">
36 | <input type="text" name="q" placeholder="Search docs" aria-label="Search docs" />
37 | <input type="hidden" name="check_keywords" value="yes" />
38 | <input type="hidden" name="area" value="default" />
39 | </form>
40 | </div>
41 | </div><div class="wy-menu wy-menu-vertical" data-spy="affix" role="navigation" aria-label="Navigation menu">
42 | <p class="caption" role="heading"><span class="caption-text">Contents:</span></p>
43 | <ul>
44 | <li class="toctree-l1"><a class="reference internal" href="../../../../installation.html">Installation</a></li>
45 | <li class="toctree-l1"><a class="reference internal" href="../../../../guides.html">Setup Guides</a></li>
46 | <li class="toctree-l1"><a class="reference internal" href="../../../../usage.html">Usage</a></li>
47 | <li class="toctree-l1"><a class="reference internal" href="../../../../modules.html">mcp_server_webcrawl</a></li>
48 | </ul>
49 |
50 | </div>
51 | </div>
52 | </nav>
53 |
54 | <section data-toggle="wy-nav-shift" class="wy-nav-content-wrap"><nav class="wy-nav-top" aria-label="Mobile navigation menu" >
55 | <i data-toggle="wy-nav-top" class="fa fa-bars"></i>
56 | <a href="../../../../index.html">mcp-server-webcrawl</a>
57 | </nav>
58 |
59 | <div class="wy-nav-content">
60 | <div class="rst-content">
61 | <div role="navigation" aria-label="Page navigation">
62 | <ul class="wy-breadcrumbs">
63 | <li><a href="../../../../index.html" class="icon icon-home" aria-label="Home"></a></li>
64 | <li class="breadcrumb-item"><a href="../../../index.html">Module code</a></li>
65 | <li class="breadcrumb-item"><a href="../../crawlers.html">mcp_server_webcrawl.crawlers</a></li>
66 | <li class="breadcrumb-item active">mcp_server_webcrawl.crawlers.siteone.crawler</li>
67 | <li class="wy-breadcrumbs-aside">
68 | </li>
69 | </ul>
70 | <hr/>
71 | </div>
72 | <div role="main" class="document" itemscope="itemscope" itemtype="http://schema.org/Article">
73 | <div itemprop="articleBody">
74 |
75 | <h1>Source code for mcp_server_webcrawl.crawlers.siteone.crawler</h1><div class="highlight"><pre>
76 | <span></span><span class="kn">from</span> <span class="nn">pathlib</span> <span class="kn">import</span> <span class="n">Path</span>
77 |
78 | <span class="kn">from</span> <span class="nn">mcp_server_webcrawl.crawlers.base.indexed</span> <span class="kn">import</span> <span class="n">IndexedCrawler</span>
79 | <span class="kn">from</span> <span class="nn">mcp_server_webcrawl.crawlers.siteone.adapter</span> <span class="kn">import</span> <span class="n">get_sites</span><span class="p">,</span> <span class="n">get_resources</span>
80 | <span class="kn">from</span> <span class="nn">mcp_server_webcrawl.utils.logger</span> <span class="kn">import</span> <span class="n">get_logger</span>
81 |
82 | <span class="n">logger</span> <span class="o">=</span> <span class="n">get_logger</span><span class="p">()</span>
83 |
84 | <div class="viewcode-block" id="SiteOneCrawler">
85 | <a class="viewcode-back" href="../../../../mcp_server_webcrawl.crawlers.siteone.html#mcp_server_webcrawl.crawlers.siteone.crawler.SiteOneCrawler">[docs]</a>
86 | <span class="k">class</span> <span class="nc">SiteOneCrawler</span><span class="p">(</span><span class="n">IndexedCrawler</span><span class="p">):</span>
87 | <span class="w"> </span><span class="sd">"""</span>
88 | <span class="sd"> A crawler implementation for SiteOne captured sites.</span>
89 | <span class="sd"> Provides functionality for accessing and searching web content from SiteOne captures.</span>
90 | <span class="sd"> SiteOne merges a wget archive with a custom SiteOne-generated log to acquire more</span>
91 | <span class="sd"> fields than wget can alone.</span>
92 | <span class="sd"> """</span>
93 |
94 | <div class="viewcode-block" id="SiteOneCrawler.__init__">
95 | <a class="viewcode-back" href="../../../../mcp_server_webcrawl.crawlers.siteone.html#mcp_server_webcrawl.crawlers.siteone.crawler.SiteOneCrawler.__init__">[docs]</a>
96 | <span class="k">def</span> <span class="fm">__init__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">datasrc</span><span class="p">:</span> <span class="n">Path</span><span class="p">):</span>
97 | <span class="w"> </span><span class="sd">"""</span>
98 | <span class="sd"> Initialize the SiteOne crawler with a data source directory.</span>
99 |
100 | <span class="sd"> Args:</span>
101 | <span class="sd"> datasrc: The input argument as Path, it must be a directory containing</span>
102 | <span class="sd"> SiteOne captures organized as subdirectories</span>
103 |
104 | <span class="sd"> Raises:</span>
105 | <span class="sd"> AssertionError: If datasrc is None or not a directory</span>
106 | <span class="sd"> """</span>
107 | <span class="k">assert</span> <span class="n">datasrc</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span><span class="p">,</span> <span class="sa">f</span><span class="s2">"SiteOneCrawler needs a datasrc, regardless of action"</span>
108 | <span class="k">assert</span> <span class="n">datasrc</span><span class="o">.</span><span class="n">is_dir</span><span class="p">(),</span> <span class="s2">"SiteOneCrawler datasrc must be a directory"</span>
109 |
110 | <span class="nb">super</span><span class="p">()</span><span class="o">.</span><span class="fm">__init__</span><span class="p">(</span><span class="n">datasrc</span><span class="p">,</span> <span class="n">get_sites</span><span class="p">,</span> <span class="n">get_resources</span><span class="p">)</span></div>
111 | </div>
112 |
113 | </pre></div>
114 |
115 | </div>
116 | </div>
117 | <footer>
118 |
119 | <hr/>
120 |
121 | <div role="contentinfo">
122 | <p>© Copyright 2025, pragmar.</p>
123 | </div>
124 |
125 | Built with <a href="https://www.sphinx-doc.org/">Sphinx</a> using a
126 | <a href="https://github.com/readthedocs/sphinx_rtd_theme">theme</a>
127 | provided by <a href="https://readthedocs.org">Read the Docs</a>.
128 |
129 |
130 | </footer>
131 | </div>
132 | </div>
133 | </section>
134 | </div>
135 | <script>
136 | jQuery(function () {
137 | SphinxRtdTheme.Navigation.enable(true);
138 | });
139 | </script>
140 |
141 | </body>
142 | </html>
```
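For orientation, a minimal instantiation sketch of the `SiteOneCrawler` rendered above, written as plain Python rather than generated HTML; the capture path is hypothetical and assumes the layout described in the docstring (one subdirectory per SiteOne capture).

```python
# Hypothetical instantiation of SiteOneCrawler; the path is an example only.
from pathlib import Path

from mcp_server_webcrawl.crawlers.siteone.crawler import SiteOneCrawler

datasrc = Path("/path/to/siteone/captures")  # assumed: one subdirectory per captured site
crawler = SiteOneCrawler(datasrc)            # raises AssertionError if datasrc is None or not a directory
```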
--------------------------------------------------------------------------------
/src/mcp_server_webcrawl/crawlers/wget/adapter.py:
--------------------------------------------------------------------------------
```python
1 | import os
2 | import sqlite3
3 | import traceback
4 | import re
5 |
6 | from datetime import timezone
7 | from contextlib import closing
8 | from datetime import datetime
9 | from pathlib import Path
10 |
11 | from mcp_server_webcrawl.crawlers.base.adapter import (
12 | BaseManager,
13 | IndexState,
14 | IndexStatus,
15 | SitesGroup,
16 | INDEXED_BATCH_SIZE,
17 | INDEXED_RESOURCE_DEFAULT_PROTOCOL,
18 | INDEXED_TYPE_MAPPING,
19 | INDEXED_IGNORE_DIRECTORIES,
20 | )
21 | from mcp_server_webcrawl.crawlers.base.indexed import IndexedManager
22 | from mcp_server_webcrawl.models.resources import (
23 | ResourceResult,
24 | ResourceResultType,
25 | RESOURCES_LIMIT_DEFAULT,
26 | )
27 | from mcp_server_webcrawl.models.sites import (
28 | SiteResult,
29 | )
30 | from mcp_server_webcrawl.utils.logger import get_logger
31 |
32 | logger = get_logger()
33 |
34 |
35 | class WgetManager(IndexedManager):
36 | """
37 | Manages wget directory data in in-memory SQLite databases.
38 | Provides connection pooling and caching for efficient access.
39 | """
40 |
41 | def __init__(self) -> None:
42 | """Initialize the wget manager with empty cache and statistics."""
43 | super().__init__()
44 |
45 | def _load_site_data(self, connection: sqlite3.Connection, directory: Path,
46 | site_id: int, index_state: IndexState = None) -> None:
47 | """
48 | Load a wget directory into the database with parallel processing and batch SQL insertions.
49 |
50 | Args:
51 | connection: SQLite connection
52 | directory: path to the wget directory
53 | site_id: id for the site
54 | index_state: IndexState object for tracking progress
55 | """
56 | if not directory.exists() or not directory.is_dir():
57 | logger.error(f"Directory not found or not a directory: {directory}")
58 | return
59 |
60 | if index_state is not None:
61 | index_state.set_status(IndexStatus.INDEXING)
62 |
63 | # collect files to process
64 | file_paths = []
65 | for root, _, files in os.walk(directory):
66 | for filename in files:
67 | if filename == "robots.txt":
68 | continue
69 |
70 | rel_path = Path(root).relative_to(directory)
71 | ignore_file = False
72 | for ignore_dir in INDEXED_IGNORE_DIRECTORIES:
73 | if ignore_dir in str(rel_path):
74 | ignore_file = True
75 | break
76 |
77 | if not ignore_file:
78 | file_paths.append(Path(root) / filename)
79 |
80 | # each crawler is a little different
81 | with closing(connection.cursor()) as cursor:
82 | for i in range(0, len(file_paths), INDEXED_BATCH_SIZE):
83 | if index_state is not None and index_state.is_timeout():
84 | index_state.set_status(IndexStatus.PARTIAL)
85 | return
86 |
87 | batch_file_paths: list[Path] = file_paths[i:i+INDEXED_BATCH_SIZE]
88 | batch_file_contents = BaseManager.read_files(batch_file_paths)
89 | batch_insert_resource_results: list[ResourceResult] = []
90 | for file_path, content in batch_file_contents.items():
91 | try:
92 | result: ResourceResult = self._prepare_wget_record(file_path, site_id, directory, content)
93 | if result:
94 | batch_insert_resource_results.append(result)
95 | if index_state is not None:
96 | index_state.increment_processed()
97 | except Exception as ex:
98 | logger.error(f"Error processing file {file_path}: {ex}\n{traceback.format_exc()}")
99 |
100 | self._execute_batch_insert(connection, cursor, batch_insert_resource_results)
101 |
102 | if index_state is not None and index_state.status == IndexStatus.INDEXING:
103 | index_state.set_status(IndexStatus.COMPLETE)
104 |
105 | def _prepare_wget_record(self, file_path: Path, site_id: int, base_dir: Path, content: str = None) -> ResourceResult | None:
106 | """
107 | Prepare a record for batch insertion from a wget file.
108 |
109 | Args:
110 | file_path: path to the wget file
111 | site_id: id for the site
112 | base_dir: base directory for the wget capture
113 | content: optional pre-loaded file content
114 |
115 | Returns:
116 | ResourceResult ready for insertion, or None if processing fails
117 | """
118 | try:
119 | relative_path = file_path.relative_to(base_dir)
120 | url = f"{INDEXED_RESOURCE_DEFAULT_PROTOCOL}{base_dir.name}/{str(relative_path).replace(os.sep, '/')}"
121 |
122 | # wget is creating ./index.html from ./ in most cases. eliminate it to preserve homepage sort
123 | # which is way more important than the (wget manufactured) filename reference
124 | url = re.sub(r"/index\.html($|\?)", r"/\1", url)
125 |
126 | decruftified_path = BaseManager.decruft_path(str(file_path))
127 | extension = Path(decruftified_path).suffix.lower()
128 | resource_type = INDEXED_TYPE_MAPPING.get(extension, ResourceResultType.OTHER)
129 | file_stat = file_path.stat()
130 | file_size = file_stat.st_size
131 | file_created = datetime.fromtimestamp(file_stat.st_ctime, tz=timezone.utc)
132 | file_modified = datetime.fromtimestamp(file_stat.st_mtime, tz=timezone.utc)
133 |
134 | # use pre-loaded content if available, otherwise rely on read_file_contents
135 | file_content = content
136 | if file_content is None:
137 | file_content = BaseManager.read_file_contents(file_path, resource_type)
138 |
139 | return ResourceResult(
140 | id=BaseManager.string_to_id(url),
141 | site=site_id,
142 | created=file_created,
143 | modified=file_modified,
144 | url=url,
145 | type=resource_type,
146 | status=200,
147 | headers=BaseManager.get_basic_headers(file_size, resource_type, file_path),
148 | content=file_content,
149 | size=file_size,
150 | time=0,
151 | )
152 | except Exception as ex:
153 | logger.error(f"Error preparing record for file {file_path}: {ex}")
154 | return None
155 |
156 |
157 | manager: WgetManager = WgetManager()
158 |
159 | def get_sites(
160 | datasrc: Path,
161 | ids: list[int] | None = None,
162 | fields: list[str] | None = None
163 | ) -> list[SiteResult]:
164 | """
165 | List site directories in the datasrc directory as sites.
166 |
167 | Args:
168 | datasrc: path to the directory containing site subdirectories
169 | ids: optional list of site IDs to filter by
170 | fields: optional list of fields to include in the response
171 |
172 | Returns:
173 | List of SiteResult objects, one for each site directory
174 |
175 | Notes:
176 | Returns an empty list if the datasrc directory doesn't exist.
177 | """
178 | return manager.get_sites_for_directories(datasrc, ids, fields)
179 |
180 | def get_resources(
181 | datasrc: Path,
182 | sites: list[int] | None = None,
183 | query: str = "",
184 | fields: list[str] | None = None,
185 | sort: str | None = None,
186 | limit: int = RESOURCES_LIMIT_DEFAULT,
187 | offset: int = 0,
188 |
189 | ) -> tuple[list[ResourceResult], int, IndexState]:
190 | """
191 | Get resources from wget directories using in-memory SQLite.
192 |
193 | Args:
194 | datasrc: path to the directory containing wget captures
195 | sites: optional list of site IDs to filter by
196 | query: search query string
197 | fields: optional list of fields to include in response
198 | sort: sort order for results
199 | limit: maximum number of results to return
200 | offset: number of results to skip for pagination
201 |
202 | Returns:
203 | Tuple of (list of ResourceResult objects, total count, index state)
204 | """
205 | sites_results: list[SiteResult] = get_sites(datasrc=datasrc, ids=sites)
206 | assert sites_results, "At least one site is required to search"
207 | site_paths = [site.path for site in sites_results]
208 | sites_group = SitesGroup(datasrc, sites, site_paths)
209 | return manager.get_resources_for_sites_group(sites_group, query, fields, sort, limit, offset)
210 |
```
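A brief usage sketch of the module-level adapter API defined above; the capture directory and query string are illustrative, and attribute access on the returned objects is assumed to mirror their constructor fields. Note that `get_resources` returns the results together with the total count and the `IndexState`.

```python
# Usage sketch for the wget adapter; the path and query are hypothetical examples.
from pathlib import Path

from mcp_server_webcrawl.crawlers.wget.adapter import get_sites, get_resources

datasrc = Path("/path/to/wget/captures")   # assumed: one wget capture subdirectory per site

sites = get_sites(datasrc)                 # one SiteResult per site subdirectory
results, total, index_state = get_resources(
    datasrc,
    query="contact",                       # example search query
    limit=20,
)
for resource in results:
    print(resource.url, resource.status, resource.size)
```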
--------------------------------------------------------------------------------
/docs/_modules/mcp_server_webcrawl/utils.html:
--------------------------------------------------------------------------------
```html
1 |
2 |
3 | <!DOCTYPE html>
4 | <html class="writer-html5" lang="en" data-content_root="../../">
5 | <head>
6 | <meta charset="utf-8" />
7 | <meta name="viewport" content="width=device-width, initial-scale=1.0" />
8 | <title>mcp_server_webcrawl.utils — mcp-server-webcrawl documentation</title>
9 | <link rel="stylesheet" type="text/css" href="../../_static/pygments.css?v=80d5e7a1" />
10 | <link rel="stylesheet" type="text/css" href="../../_static/css/theme.css?v=e59714d7" />
11 |
12 |
13 | <script src="../../_static/jquery.js?v=5d32c60e"></script>
14 | <script src="../../_static/_sphinx_javascript_frameworks_compat.js?v=2cd50e6c"></script>
15 | <script src="../../_static/documentation_options.js?v=5929fcd5"></script>
16 | <script src="../../_static/doctools.js?v=888ff710"></script>
17 | <script src="../../_static/sphinx_highlight.js?v=dc90522c"></script>
18 | <script src="../../_static/js/theme.js"></script>
19 | <link rel="index" title="Index" href="../../genindex.html" />
20 | <link rel="search" title="Search" href="../../search.html" />
21 | </head>
22 |
23 | <body class="wy-body-for-nav">
24 | <div class="wy-grid-for-nav">
25 | <nav data-toggle="wy-nav-shift" class="wy-nav-side">
26 | <div class="wy-side-scroll">
27 | <div class="wy-side-nav-search" >
28 |
29 |
30 |
31 | <a href="../../index.html" class="icon icon-home">
32 | mcp-server-webcrawl
33 | </a>
34 | <div role="search">
35 | <form id="rtd-search-form" class="wy-form" action="../../search.html" method="get">
36 | <input type="text" name="q" placeholder="Search docs" aria-label="Search docs" />
37 | <input type="hidden" name="check_keywords" value="yes" />
38 | <input type="hidden" name="area" value="default" />
39 | </form>
40 | </div>
41 | </div><div class="wy-menu wy-menu-vertical" data-spy="affix" role="navigation" aria-label="Navigation menu">
42 | <p class="caption" role="heading"><span class="caption-text">Contents:</span></p>
43 | <ul>
44 | <li class="toctree-l1"><a class="reference internal" href="../../installation.html">Installation</a></li>
45 | <li class="toctree-l1"><a class="reference internal" href="../../guides.html">Setup Guides</a></li>
46 | <li class="toctree-l1"><a class="reference internal" href="../../usage.html">Usage</a></li>
47 | <li class="toctree-l1"><a class="reference internal" href="../../modules.html">mcp_server_webcrawl</a></li>
48 | </ul>
49 |
50 | </div>
51 | </div>
52 | </nav>
53 |
54 | <section data-toggle="wy-nav-shift" class="wy-nav-content-wrap"><nav class="wy-nav-top" aria-label="Mobile navigation menu" >
55 | <i data-toggle="wy-nav-top" class="fa fa-bars"></i>
56 | <a href="../../index.html">mcp-server-webcrawl</a>
57 | </nav>
58 |
59 | <div class="wy-nav-content">
60 | <div class="rst-content">
61 | <div role="navigation" aria-label="Page navigation">
62 | <ul class="wy-breadcrumbs">
63 | <li><a href="../../index.html" class="icon icon-home" aria-label="Home"></a></li>
64 | <li class="breadcrumb-item"><a href="../index.html">Module code</a></li>
65 | <li class="breadcrumb-item active">mcp_server_webcrawl.utils</li>
66 | <li class="wy-breadcrumbs-aside">
67 | </li>
68 | </ul>
69 | <hr/>
70 | </div>
71 | <div role="main" class="document" itemscope="itemscope" itemtype="http://schema.org/Article">
72 | <div itemprop="articleBody">
73 |
74 | <h1>Source code for mcp_server_webcrawl.utils</h1><div class="highlight"><pre>
75 | <span></span><span class="kn">import</span> <span class="nn">re</span>
76 |
77 | <span class="kn">from</span> <span class="nn">datetime</span> <span class="kn">import</span> <span class="n">datetime</span>
78 |
79 | <div class="viewcode-block" id="to_isoformat_zulu">
80 | <a class="viewcode-back" href="../../mcp_server_webcrawl.utils.html#mcp_server_webcrawl.utils.to_isoformat_zulu">[docs]</a>
81 | <span class="k">def</span> <span class="nf">to_isoformat_zulu</span><span class="p">(</span><span class="n">dt</span><span class="p">:</span> <span class="n">datetime</span><span class="p">):</span>
82 | <span class="w"> </span><span class="sd">"""</span>
83 | <span class="sd"> Convert datetime to iso Z.</span>
84 |
85 | <span class="sd"> python<=3.10 struggles with Z and fractions of seconds, will</span>
86 | <span class="sd"> throw. smooth out the iso string, second precision isn't key here</span>
87 | <span class="sd"> """</span>
88 | <span class="k">return</span> <span class="n">dt</span><span class="o">.</span><span class="n">isoformat</span><span class="p">()</span><span class="o">.</span><span class="n">replace</span><span class="p">(</span><span class="s2">"+00:00"</span><span class="p">,</span> <span class="s2">"Z"</span><span class="p">)</span></div>
89 |
90 |
91 | <div class="viewcode-block" id="from_isoformat_zulu">
92 | <a class="viewcode-back" href="../../mcp_server_webcrawl.utils.html#mcp_server_webcrawl.utils.from_isoformat_zulu">[docs]</a>
93 | <span class="k">def</span> <span class="nf">from_isoformat_zulu</span><span class="p">(</span><span class="n">dt_string</span><span class="p">:</span> <span class="nb">str</span> <span class="o">|</span> <span class="kc">None</span><span class="p">)</span> <span class="o">-></span> <span class="n">datetime</span><span class="p">:</span>
94 | <span class="w"> </span><span class="sd">"""</span>
95 | <span class="sd"> Convert ISO string to datetime.</span>
96 |
97 | <span class="sd"> python<=3.10 struggles with Z and fractions of seconds, will</span>
98 | <span class="sd"> throw. smooth out the iso string, second precision isn't key here</span>
99 | <span class="sd"> """</span>
100 |
101 | <span class="k">if</span> <span class="ow">not</span> <span class="n">dt_string</span><span class="p">:</span>
102 | <span class="k">return</span> <span class="kc">None</span>
103 | <span class="n">dt_string</span> <span class="o">=</span> <span class="n">dt_string</span><span class="o">.</span><span class="n">replace</span><span class="p">(</span><span class="s2">"Z"</span><span class="p">,</span> <span class="s2">"+00:00"</span><span class="p">)</span>
104 | <span class="n">match</span> <span class="o">=</span> <span class="n">re</span><span class="o">.</span><span class="n">match</span><span class="p">(</span><span class="sa">r</span><span class="s2">"(.*\.\d</span><span class="si">{6}</span><span class="s2">)\d*([-+]\d</span><span class="si">{2}</span><span class="s2">:\d</span><span class="si">{2}</span><span class="s2">|$)"</span><span class="p">,</span> <span class="n">dt_string</span><span class="p">)</span>
105 | <span class="k">if</span> <span class="n">match</span><span class="p">:</span>
106 | <span class="n">dt_string</span> <span class="o">=</span> <span class="n">match</span><span class="o">.</span><span class="n">group</span><span class="p">(</span><span class="mi">1</span><span class="p">)</span> <span class="o">+</span> <span class="p">(</span><span class="n">match</span><span class="o">.</span><span class="n">group</span><span class="p">(</span><span class="mi">2</span><span class="p">)</span> <span class="ow">or</span> <span class="s2">""</span><span class="p">)</span>
107 | <span class="k">return</span> <span class="n">datetime</span><span class="o">.</span><span class="n">fromisoformat</span><span class="p">(</span><span class="n">dt_string</span><span class="p">)</span></div>
108 |
109 | </pre></div>
110 |
111 | </div>
112 | </div>
113 | <footer>
114 |
115 | <hr/>
116 |
117 | <div role="contentinfo">
118 | <p>© Copyright 2025, pragmar.</p>
119 | </div>
120 |
121 | Built with <a href="https://www.sphinx-doc.org/">Sphinx</a> using a
122 | <a href="https://github.com/readthedocs/sphinx_rtd_theme">theme</a>
123 | provided by <a href="https://readthedocs.org">Read the Docs</a>.
124 |
125 |
126 | </footer>
127 | </div>
128 | </div>
129 | </section>
130 | </div>
131 | <script>
132 | jQuery(function () {
133 | SphinxRtdTheme.Navigation.enable(true);
134 | });
135 | </script>
136 |
137 | </body>
138 | </html>
```
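The two helpers rendered above exist to smooth over ISO 8601 parsing differences in Python <= 3.10 (the "Z" suffix and sub-microsecond fractions). A small round-trip sketch, with illustrative timestamps:

```python
# Round-trip sketch for to_isoformat_zulu / from_isoformat_zulu.
from datetime import datetime, timezone

from mcp_server_webcrawl.utils import to_isoformat_zulu, from_isoformat_zulu

dt = datetime(2025, 1, 15, 12, 30, 45, 123456, tzinfo=timezone.utc)

zulu = to_isoformat_zulu(dt)               # "2025-01-15T12:30:45.123456Z"
assert from_isoformat_zulu(zulu) == dt

# fractional digits beyond microseconds are trimmed before parsing,
# which is the case older datetime.fromisoformat() cannot handle
assert from_isoformat_zulu("2025-01-15T12:30:45.1234567890Z") == dt
```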
--------------------------------------------------------------------------------
/docs/_modules/mcp_server_webcrawl/crawlers/httrack/crawler.html:
--------------------------------------------------------------------------------
```html
1 |
2 |
3 | <!DOCTYPE html>
4 | <html class="writer-html5" lang="en" data-content_root="../../../../">
5 | <head>
6 | <meta charset="utf-8" />
7 | <meta name="viewport" content="width=device-width, initial-scale=1.0" />
8 | <title>mcp_server_webcrawl.crawlers.httrack.crawler — mcp-server-webcrawl documentation</title>
9 | <link rel="stylesheet" type="text/css" href="../../../../_static/pygments.css?v=80d5e7a1" />
10 | <link rel="stylesheet" type="text/css" href="../../../../_static/css/theme.css?v=e59714d7" />
11 |
12 |
13 | <script src="../../../../_static/jquery.js?v=5d32c60e"></script>
14 | <script src="../../../../_static/_sphinx_javascript_frameworks_compat.js?v=2cd50e6c"></script>
15 | <script src="../../../../_static/documentation_options.js?v=5929fcd5"></script>
16 | <script src="../../../../_static/doctools.js?v=888ff710"></script>
17 | <script src="../../../../_static/sphinx_highlight.js?v=dc90522c"></script>
18 | <script src="../../../../_static/js/theme.js"></script>
19 | <link rel="index" title="Index" href="../../../../genindex.html" />
20 | <link rel="search" title="Search" href="../../../../search.html" />
21 | </head>
22 |
23 | <body class="wy-body-for-nav">
24 | <div class="wy-grid-for-nav">
25 | <nav data-toggle="wy-nav-shift" class="wy-nav-side">
26 | <div class="wy-side-scroll">
27 | <div class="wy-side-nav-search" >
28 |
29 |
30 |
31 | <a href="../../../../index.html" class="icon icon-home">
32 | mcp-server-webcrawl
33 | </a>
34 | <div role="search">
35 | <form id="rtd-search-form" class="wy-form" action="../../../../search.html" method="get">
36 | <input type="text" name="q" placeholder="Search docs" aria-label="Search docs" />
37 | <input type="hidden" name="check_keywords" value="yes" />
38 | <input type="hidden" name="area" value="default" />
39 | </form>
40 | </div>
41 | </div><div class="wy-menu wy-menu-vertical" data-spy="affix" role="navigation" aria-label="Navigation menu">
42 | <p class="caption" role="heading"><span class="caption-text">Contents:</span></p>
43 | <ul>
44 | <li class="toctree-l1"><a class="reference internal" href="../../../../installation.html">Installation</a></li>
45 | <li class="toctree-l1"><a class="reference internal" href="../../../../guides.html">Setup Guides</a></li>
46 | <li class="toctree-l1"><a class="reference internal" href="../../../../usage.html">Usage</a></li>
47 | <li class="toctree-l1"><a class="reference internal" href="../../../../prompts.html">Prompt Routines</a></li>
48 | <li class="toctree-l1"><a class="reference internal" href="../../../../modules.html">mcp_server_webcrawl</a></li>
49 | </ul>
50 |
51 | </div>
52 | </div>
53 | </nav>
54 |
55 | <section data-toggle="wy-nav-shift" class="wy-nav-content-wrap"><nav class="wy-nav-top" aria-label="Mobile navigation menu" >
56 | <i data-toggle="wy-nav-top" class="fa fa-bars"></i>
57 | <a href="../../../../index.html">mcp-server-webcrawl</a>
58 | </nav>
59 |
60 | <div class="wy-nav-content">
61 | <div class="rst-content">
62 | <div role="navigation" aria-label="Page navigation">
63 | <ul class="wy-breadcrumbs">
64 | <li><a href="../../../../index.html" class="icon icon-home" aria-label="Home"></a></li>
65 | <li class="breadcrumb-item"><a href="../../../index.html">Module code</a></li>
66 | <li class="breadcrumb-item"><a href="../../crawlers.html">mcp_server_webcrawl.crawlers</a></li>
67 | <li class="breadcrumb-item active">mcp_server_webcrawl.crawlers.httrack.crawler</li>
68 | <li class="wy-breadcrumbs-aside">
69 | </li>
70 | </ul>
71 | <hr/>
72 | </div>
73 | <div role="main" class="document" itemscope="itemscope" itemtype="http://schema.org/Article">
74 | <div itemprop="articleBody">
75 |
76 | <h1>Source code for mcp_server_webcrawl.crawlers.httrack.crawler</h1><div class="highlight"><pre>
77 | <span></span><span class="kn">from</span> <span class="nn">pathlib</span> <span class="kn">import</span> <span class="n">Path</span>
78 |
79 | <span class="kn">from</span> <span class="nn">mcp_server_webcrawl.crawlers.base.indexed</span> <span class="kn">import</span> <span class="n">IndexedCrawler</span>
80 | <span class="kn">from</span> <span class="nn">mcp_server_webcrawl.crawlers.httrack.adapter</span> <span class="kn">import</span> <span class="n">get_sites</span><span class="p">,</span> <span class="n">get_resources</span>
81 | <span class="kn">from</span> <span class="nn">mcp_server_webcrawl.utils.logger</span> <span class="kn">import</span> <span class="n">get_logger</span>
82 |
83 | <span class="n">logger</span> <span class="o">=</span> <span class="n">get_logger</span><span class="p">()</span>
84 |
85 | <div class="viewcode-block" id="HtTrackCrawler">
86 | <a class="viewcode-back" href="../../../../mcp_server_webcrawl.crawlers.httrack.html#mcp_server_webcrawl.crawlers.httrack.crawler.HtTrackCrawler">[docs]</a>
87 | <span class="k">class</span> <span class="nc">HtTrackCrawler</span><span class="p">(</span><span class="n">IndexedCrawler</span><span class="p">):</span>
88 | <span class="w"> </span><span class="sd">"""</span>
89 | <span class="sd"> A crawler implementation for HTTrack captured sites.</span>
90 | <span class="sd"> Provides functionality for accessing and searching web content from HTTrack projects.</span>
91 | <span class="sd"> HTTrack creates offline mirrors of websites with preserved directory structure</span>
92 | <span class="sd"> and metadata in hts-log.txt files.</span>
93 | <span class="sd"> """</span>
94 |
95 | <div class="viewcode-block" id="HtTrackCrawler.__init__">
96 | <a class="viewcode-back" href="../../../../mcp_server_webcrawl.crawlers.httrack.html#mcp_server_webcrawl.crawlers.httrack.crawler.HtTrackCrawler.__init__">[docs]</a>
97 | <span class="k">def</span> <span class="fm">__init__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">datasrc</span><span class="p">:</span> <span class="n">Path</span><span class="p">):</span>
98 | <span class="w"> </span><span class="sd">"""</span>
99 | <span class="sd"> Initialize the HTTrack crawler with a data source directory.</span>
100 |
101 | <span class="sd"> Args:</span>
102 | <span class="sd"> datasrc: The input argument as Path, it must be a directory containing</span>
103 | <span class="sd"> HTTrack project directories, each potentially containing multiple domains</span>
104 |
105 | <span class="sd"> Raises:</span>
106 | <span class="sd"> AssertionError: If datasrc is None or not a directory</span>
107 | <span class="sd"> """</span>
108 | <span class="k">assert</span> <span class="n">datasrc</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span><span class="p">,</span> <span class="sa">f</span><span class="s2">"HtTrackCrawler needs a datasrc, regardless of action"</span>
109 | <span class="k">assert</span> <span class="n">datasrc</span><span class="o">.</span><span class="n">is_dir</span><span class="p">(),</span> <span class="s2">"HtTrackCrawler datasrc must be a directory"</span>
110 |
111 | <span class="nb">super</span><span class="p">()</span><span class="o">.</span><span class="fm">__init__</span><span class="p">(</span><span class="n">datasrc</span><span class="p">,</span> <span class="n">get_sites</span><span class="p">,</span> <span class="n">get_resources</span><span class="p">)</span></div>
112 | </div>
113 |
114 | </pre></div>
115 |
116 | </div>
117 | </div>
118 | <footer>
119 |
120 | <hr/>
121 |
122 | <div role="contentinfo">
123 | <p>© Copyright 2025, pragmar.</p>
124 | </div>
125 |
126 | Built with <a href="https://www.sphinx-doc.org/">Sphinx</a> using a
127 | <a href="https://github.com/readthedocs/sphinx_rtd_theme">theme</a>
128 | provided by <a href="https://readthedocs.org">Read the Docs</a>.
129 |
130 |
131 | </footer>
132 | </div>
133 | </div>
134 | </section>
135 | </div>
136 | <script>
137 | jQuery(function () {
138 | SphinxRtdTheme.Navigation.enable(true);
139 | });
140 | </script>
141 |
142 | </body>
143 | </html>
```
--------------------------------------------------------------------------------
/docs/prompts.html:
--------------------------------------------------------------------------------
```html
1 |
2 |
3 | <!DOCTYPE html>
4 | <html class="writer-html5" lang="en" data-content_root="./">
5 | <head>
6 | <meta charset="utf-8" /><meta name="viewport" content="width=device-width, initial-scale=1" />
7 |
8 | <meta name="viewport" content="width=device-width, initial-scale=1.0" />
9 | <title>Prompt Routines — mcp-server-webcrawl documentation</title>
10 | <link rel="stylesheet" type="text/css" href="_static/pygments.css?v=80d5e7a1" />
11 | <link rel="stylesheet" type="text/css" href="_static/css/theme.css?v=e59714d7" />
12 |
13 |
14 | <script src="_static/jquery.js?v=5d32c60e"></script>
15 | <script src="_static/_sphinx_javascript_frameworks_compat.js?v=2cd50e6c"></script>
16 | <script src="_static/documentation_options.js?v=5929fcd5"></script>
17 | <script src="_static/doctools.js?v=888ff710"></script>
18 | <script src="_static/sphinx_highlight.js?v=dc90522c"></script>
19 | <script src="_static/js/theme.js"></script>
20 | <link rel="index" title="Index" href="genindex.html" />
21 | <link rel="search" title="Search" href="search.html" />
22 | <link rel="next" title="mcp_server_webcrawl" href="modules.html" />
23 | <link rel="prev" title="Usage" href="usage.html" />
24 | </head>
25 |
26 | <body class="wy-body-for-nav">
27 | <div class="wy-grid-for-nav">
28 | <nav data-toggle="wy-nav-shift" class="wy-nav-side">
29 | <div class="wy-side-scroll">
30 | <div class="wy-side-nav-search" >
31 |
32 |
33 |
34 | <a href="index.html" class="icon icon-home">
35 | mcp-server-webcrawl
36 | </a>
37 | <div role="search">
38 | <form id="rtd-search-form" class="wy-form" action="search.html" method="get">
39 | <input type="text" name="q" placeholder="Search docs" aria-label="Search docs" />
40 | <input type="hidden" name="check_keywords" value="yes" />
41 | <input type="hidden" name="area" value="default" />
42 | </form>
43 | </div>
44 | </div><div class="wy-menu wy-menu-vertical" data-spy="affix" role="navigation" aria-label="Navigation menu">
45 | <p class="caption" role="heading"><span class="caption-text">Contents:</span></p>
46 | <ul class="current">
47 | <li class="toctree-l1"><a class="reference internal" href="installation.html">Installation</a></li>
48 | <li class="toctree-l1"><a class="reference internal" href="guides.html">Setup Guides</a></li>
49 | <li class="toctree-l1"><a class="reference internal" href="usage.html">Usage</a></li>
50 | <li class="toctree-l1 current"><a class="current reference internal" href="#">Prompt Routines</a></li>
51 | <li class="toctree-l1"><a class="reference internal" href="modules.html">mcp_server_webcrawl</a></li>
52 | </ul>
53 |
54 | </div>
55 | </div>
56 | </nav>
57 |
58 | <section data-toggle="wy-nav-shift" class="wy-nav-content-wrap"><nav class="wy-nav-top" aria-label="Mobile navigation menu" >
59 | <i data-toggle="wy-nav-top" class="fa fa-bars"></i>
60 | <a href="index.html">mcp-server-webcrawl</a>
61 | </nav>
62 |
63 | <div class="wy-nav-content">
64 | <div class="rst-content">
65 | <div role="navigation" aria-label="Page navigation">
66 | <ul class="wy-breadcrumbs">
67 | <li><a href="index.html" class="icon icon-home" aria-label="Home"></a></li>
68 | <li class="breadcrumb-item active">Prompt Routines</li>
69 | <li class="wy-breadcrumbs-aside">
70 | <a href="_sources/prompts.rst.txt" rel="nofollow"> View page source</a>
71 | </li>
72 | </ul>
73 | <hr/>
74 | </div>
75 | <div role="main" class="document" itemscope="itemscope" itemtype="http://schema.org/Article">
76 | <div itemprop="articleBody">
77 |
78 | <section id="prompt-routines">
79 | <h1>Prompt Routines<a class="headerlink" href="#prompt-routines" title="Link to this heading"></a></h1>
80 | <p><strong>mcp-server-webcrawl</strong> provides the toolkit necessary to search web crawl data freestyle, figuring it out as you go, reacting to each query. This is what it was designed for.</p>
81 | <p>It is also capable of running routines (as prompts). You can write these yourself, or use the ones provided. These prompts are <strong>copy and paste</strong>, used as raw Markdown. They are enabled by the advanced search provided to the LLM; queries and logic can be embedded in a procedural set of instructions, or even an input loop, as is the case with the Gopher Service.</p>
82 | <p>If you want to shortcut the site selection (one less query), paste the Markdown and, in the same request, type “run pasted for [site name or URL].” It will figure it out. When pasted without additional context, you will be prompted to select a site.</p>
83 | <table class="docutils align-default">
84 | <thead>
85 | <tr class="row-odd"><th class="head"><p>Prompt</p></th>
86 | <th class="head"><p>Download</p></th>
87 | <th class="head"><p>Category</p></th>
88 | <th class="head"><p>Description</p></th>
89 | </tr>
90 | </thead>
91 | <tbody>
92 | <tr class="row-even"><td><p>🔍 <strong>SEO Audit</strong></p></td>
93 | <td><p><a class="reference external" href="https://raw.githubusercontent.com/pragmar/mcp-server-webcrawl/master/prompts/auditseo.md">auditseo.md</a></p></td>
94 | <td><p>audit</p></td>
95 | <td><p>Technical SEO (search engine optimization) analysis. Covers the
96 | basics, with options to dive deeper.</p></td>
97 | </tr>
98 | <tr class="row-odd"><td><p>🔗 <strong>404 Audit</strong></p></td>
99 | <td><p><a class="reference external" href="https://raw.githubusercontent.com/pragmar/mcp-server-webcrawl/master/prompts/audit404.md">audit404.md</a></p></td>
100 | <td><p>audit</p></td>
101 | <td><p>Broken link detection and pattern analysis. Not only finds issues,
102 | but suggests fixes.</p></td>
103 | </tr>
104 | <tr class="row-even"><td><p>⚡ <strong>Performance Audit</strong></p></td>
105 | <td><p><a class="reference external" href="https://raw.githubusercontent.com/pragmar/mcp-server-webcrawl/master/prompts/auditperf.md">auditperf.md</a></p></td>
106 | <td><p>audit</p></td>
107 | <td><p>Website speed and optimization analysis. Real talk.</p></td>
108 | </tr>
109 | <tr class="row-odd"><td><p>📁 <strong>File Audit</strong></p></td>
110 | <td><p><a class="reference external" href="https://raw.githubusercontent.com/pragmar/mcp-server-webcrawl/master/prompts/auditfiles.md">auditfiles.md</a></p></td>
111 | <td><p>audit</p></td>
112 | <td><p>File organization and asset analysis. Discover the composition of
113 | your website.</p></td>
114 | </tr>
115 | <tr class="row-even"><td><p>🌐 <strong>Gopher Interface</strong></p></td>
116 | <td><p><a class="reference external" href="https://raw.githubusercontent.com/pragmar/mcp-server-webcrawl/master/prompts/gopher.md">gopher.md</a></p></td>
117 | <td><p>interface</p></td>
118 | <td><p>An old-fashioned search interface inspired by the Gopher clients of
119 | yesteryear.</p></td>
120 | </tr>
121 | <tr class="row-odd"><td><p>⚙️ <strong>Search Test</strong></p></td>
122 | <td><p><a class="reference external" href="https://raw.githubusercontent.com/pragmar/mcp-server-webcrawl/master/prompts/testsearch.md">testsearch.md</a></p></td>
123 | <td><p>self-test</p></td>
124 | <td><p>A battery of tests to check for Boolean logical inconsistencies in
125 | the search query parser and subsequent FTS5 conversion.</p></td>
126 | </tr>
127 | </tbody>
128 | </table>
129 | </section>
130 |
131 |
132 | </div>
133 | </div>
134 | <footer><div class="rst-footer-buttons" role="navigation" aria-label="Footer">
135 | <a href="usage.html" class="btn btn-neutral float-left" title="Usage" accesskey="p" rel="prev"><span class="fa fa-arrow-circle-left" aria-hidden="true"></span> Previous</a>
136 | <a href="modules.html" class="btn btn-neutral float-right" title="mcp_server_webcrawl" accesskey="n" rel="next">Next <span class="fa fa-arrow-circle-right" aria-hidden="true"></span></a>
137 | </div>
138 |
139 | <hr/>
140 |
141 | <div role="contentinfo">
142 | <p>© Copyright 2025, pragmar.</p>
143 | </div>
144 |
145 | Built with <a href="https://www.sphinx-doc.org/">Sphinx</a> using a
146 | <a href="https://github.com/readthedocs/sphinx_rtd_theme">theme</a>
147 | provided by <a href="https://readthedocs.org">Read the Docs</a>.
148 |
149 |
150 | </footer>
151 | </div>
152 | </div>
153 | </section>
154 | </div>
155 | <script>
156 | jQuery(function () {
157 | SphinxRtdTheme.Navigation.enable(true);
158 | });
159 | </script>
160 |
161 | </body>
162 | </html>
```
--------------------------------------------------------------------------------
/docs/_modules/mcp_server_webcrawl/crawlers/archivebox/crawler.html:
--------------------------------------------------------------------------------
```html
1 |
2 |
3 | <!DOCTYPE html>
4 | <html class="writer-html5" lang="en" data-content_root="../../../../">
5 | <head>
6 | <meta charset="utf-8" />
7 | <meta name="viewport" content="width=device-width, initial-scale=1.0" />
8 | <title>mcp_server_webcrawl.crawlers.archivebox.crawler — mcp-server-webcrawl documentation</title>
9 | <link rel="stylesheet" type="text/css" href="../../../../_static/pygments.css?v=80d5e7a1" />
10 | <link rel="stylesheet" type="text/css" href="../../../../_static/css/theme.css?v=e59714d7" />
11 |
12 |
13 | <script src="../../../../_static/jquery.js?v=5d32c60e"></script>
14 | <script src="../../../../_static/_sphinx_javascript_frameworks_compat.js?v=2cd50e6c"></script>
15 | <script src="../../../../_static/documentation_options.js?v=5929fcd5"></script>
16 | <script src="../../../../_static/doctools.js?v=888ff710"></script>
17 | <script src="../../../../_static/sphinx_highlight.js?v=dc90522c"></script>
18 | <script src="../../../../_static/js/theme.js"></script>
19 | <link rel="index" title="Index" href="../../../../genindex.html" />
20 | <link rel="search" title="Search" href="../../../../search.html" />
21 | </head>
22 |
23 | <body class="wy-body-for-nav">
24 | <div class="wy-grid-for-nav">
25 | <nav data-toggle="wy-nav-shift" class="wy-nav-side">
26 | <div class="wy-side-scroll">
27 | <div class="wy-side-nav-search" >
28 |
29 |
30 |
31 | <a href="../../../../index.html" class="icon icon-home">
32 | mcp-server-webcrawl
33 | </a>
34 | <div role="search">
35 | <form id="rtd-search-form" class="wy-form" action="../../../../search.html" method="get">
36 | <input type="text" name="q" placeholder="Search docs" aria-label="Search docs" />
37 | <input type="hidden" name="check_keywords" value="yes" />
38 | <input type="hidden" name="area" value="default" />
39 | </form>
40 | </div>
41 | </div><div class="wy-menu wy-menu-vertical" data-spy="affix" role="navigation" aria-label="Navigation menu">
42 | <p class="caption" role="heading"><span class="caption-text">Contents:</span></p>
43 | <ul>
44 | <li class="toctree-l1"><a class="reference internal" href="../../../../installation.html">Installation</a></li>
45 | <li class="toctree-l1"><a class="reference internal" href="../../../../guides.html">Setup Guides</a></li>
46 | <li class="toctree-l1"><a class="reference internal" href="../../../../usage.html">Usage</a></li>
47 | <li class="toctree-l1"><a class="reference internal" href="../../../../prompts.html">Prompt Routines</a></li>
48 | <li class="toctree-l1"><a class="reference internal" href="../../../../modules.html">mcp_server_webcrawl</a></li>
49 | </ul>
50 |
51 | </div>
52 | </div>
53 | </nav>
54 |
55 | <section data-toggle="wy-nav-shift" class="wy-nav-content-wrap"><nav class="wy-nav-top" aria-label="Mobile navigation menu" >
56 | <i data-toggle="wy-nav-top" class="fa fa-bars"></i>
57 | <a href="../../../../index.html">mcp-server-webcrawl</a>
58 | </nav>
59 |
60 | <div class="wy-nav-content">
61 | <div class="rst-content">
62 | <div role="navigation" aria-label="Page navigation">
63 | <ul class="wy-breadcrumbs">
64 | <li><a href="../../../../index.html" class="icon icon-home" aria-label="Home"></a></li>
65 | <li class="breadcrumb-item"><a href="../../../index.html">Module code</a></li>
66 | <li class="breadcrumb-item"><a href="../../crawlers.html">mcp_server_webcrawl.crawlers</a></li>
67 | <li class="breadcrumb-item active">mcp_server_webcrawl.crawlers.archivebox.crawler</li>
68 | <li class="wy-breadcrumbs-aside">
69 | </li>
70 | </ul>
71 | <hr/>
72 | </div>
73 | <div role="main" class="document" itemscope="itemscope" itemtype="http://schema.org/Article">
74 | <div itemprop="articleBody">
75 |
76 | <h1>Source code for mcp_server_webcrawl.crawlers.archivebox.crawler</h1><div class="highlight"><pre>
77 | <span></span><span class="kn">from</span> <span class="nn">pathlib</span> <span class="kn">import</span> <span class="n">Path</span>
78 |
79 | <span class="kn">from</span> <span class="nn">mcp_server_webcrawl.crawlers.base.indexed</span> <span class="kn">import</span> <span class="n">IndexedCrawler</span>
80 | <span class="kn">from</span> <span class="nn">mcp_server_webcrawl.crawlers.archivebox.adapter</span> <span class="kn">import</span> <span class="n">get_sites</span><span class="p">,</span> <span class="n">get_resources</span>
81 | <span class="kn">from</span> <span class="nn">mcp_server_webcrawl.utils.logger</span> <span class="kn">import</span> <span class="n">get_logger</span>
82 |
83 | <span class="n">logger</span> <span class="o">=</span> <span class="n">get_logger</span><span class="p">()</span>
84 |
85 | <div class="viewcode-block" id="ArchiveBoxCrawler">
86 | <a class="viewcode-back" href="../../../../mcp_server_webcrawl.crawlers.archivebox.html#mcp_server_webcrawl.crawlers.archivebox.crawler.ArchiveBoxCrawler">[docs]</a>
87 | <span class="k">class</span> <span class="nc">ArchiveBoxCrawler</span><span class="p">(</span><span class="n">IndexedCrawler</span><span class="p">):</span>
88 | <span class="w"> </span><span class="sd">"""</span>
89 | <span class="sd"> A crawler implementation for ArchiveBox archived sites.</span>
90 | <span class="sd"> Provides functionality for accessing and searching web content from ArchiveBox archives.</span>
91 | <span class="sd"> ArchiveBox creates single-URL archives with metadata stored in JSON files</span>
92 | <span class="sd"> and HTML content preserved in index.html files.</span>
93 | <span class="sd"> """</span>
94 |
95 | <div class="viewcode-block" id="ArchiveBoxCrawler.__init__">
96 | <a class="viewcode-back" href="../../../../mcp_server_webcrawl.crawlers.archivebox.html#mcp_server_webcrawl.crawlers.archivebox.crawler.ArchiveBoxCrawler.__init__">[docs]</a>
97 | <span class="k">def</span> <span class="fm">__init__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">datasrc</span><span class="p">:</span> <span class="n">Path</span><span class="p">):</span>
98 | <span class="w"> </span><span class="sd">"""</span>
99 | <span class="sd"> Initialize the ArchiveBox crawler with a data source directory.</span>
100 |
101 | <span class="sd"> Args:</span>
102 | <span class="sd"> datasrc: The input argument as Path, it must be a directory containing</span>
103 | <span class="sd"> ArchiveBox archive directories, each containing individual URL entries</span>
104 |
105 | <span class="sd"> Raises:</span>
106 | <span class="sd"> AssertionError: If datasrc is None or not a directory</span>
107 | <span class="sd"> """</span>
108 | <span class="k">assert</span> <span class="n">datasrc</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span><span class="p">,</span> <span class="sa">f</span><span class="s2">"ArchiveBoxCrawler needs a datasrc, regardless of action"</span>
109 | <span class="k">assert</span> <span class="n">datasrc</span><span class="o">.</span><span class="n">is_dir</span><span class="p">(),</span> <span class="s2">"ArchiveBoxCrawler datasrc must be a directory"</span>
110 |
111 | <span class="nb">super</span><span class="p">()</span><span class="o">.</span><span class="fm">__init__</span><span class="p">(</span><span class="n">datasrc</span><span class="p">,</span> <span class="n">get_sites</span><span class="p">,</span> <span class="n">get_resources</span><span class="p">)</span></div>
112 | </div>
113 |
114 | </pre></div>
115 |
116 | </div>
117 | </div>
118 | <footer>
119 |
120 | <hr/>
121 |
122 | <div role="contentinfo">
123 | <p>© Copyright 2025, pragmar.</p>
124 | </div>
125 |
126 | Built with <a href="https://www.sphinx-doc.org/">Sphinx</a> using a
127 | <a href="https://github.com/readthedocs/sphinx_rtd_theme">theme</a>
128 | provided by <a href="https://readthedocs.org">Read the Docs</a>.
129 |
130 |
131 | </footer>
132 | </div>
133 | </div>
134 | </section>
135 | </div>
136 | <script>
137 | jQuery(function () {
138 | SphinxRtdTheme.Navigation.enable(true);
139 | });
140 | </script>
141 |
142 | </body>
143 | </html>
```
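
For reference, a minimal usage sketch of the crawler rendered above. The `./archivebox/archive` path is illustrative, and the result-access pattern (`get_sites_api`, `get_resources_api`, `.total`, `._results`) mirrors the bundled test suites rather than a documented public contract:

```python
from pathlib import Path

from mcp_server_webcrawl.crawlers.archivebox.crawler import ArchiveBoxCrawler

# illustrative path: must be an existing directory of ArchiveBox archive directories
datasrc = Path("./archivebox/archive")
crawler = ArchiveBoxCrawler(datasrc)

# list archived sites, then search resources within the first one
sites = crawler.get_sites_api()
first_site_id = sites._results[0].id
resources = crawler.get_resources_api(sites=[first_site_id], query="status: 200", limit=5)
print(sites.total, resources.total)
```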
--------------------------------------------------------------------------------
/src/mcp_server_webcrawl/crawlers/katana/adapter.py:
--------------------------------------------------------------------------------
```python
1 | import re
2 | import sqlite3
3 |
4 | from itertools import chain
5 | from contextlib import closing
6 | from pathlib import Path
7 |
8 | from datetime import datetime, timezone
9 |
10 | from mcp_server_webcrawl.crawlers.base.adapter import (
11 | IndexState,
12 | IndexStatus,
13 | BaseManager,
14 | SitesGroup,
15 | INDEXED_BATCH_SIZE,
16 | )
17 | from mcp_server_webcrawl.crawlers.base.indexed import IndexedManager
18 | from mcp_server_webcrawl.models.resources import (
19 | ResourceResult,
20 | ResourceResultType,
21 | RESOURCES_LIMIT_DEFAULT,
22 | )
23 | from mcp_server_webcrawl.models.sites import (
24 | SiteResult,
25 | )
26 | from mcp_server_webcrawl.utils.logger import get_logger
27 |
28 | logger = get_logger()
29 |
30 | KATANA_REGEX_HTTP_STATUS = re.compile(r"HTTP/\d\.\d\s+(\d+)")
31 | KATANA_REGEX_CONTENT_TYPE = re.compile(r"Content-Type:\s*([^\r\n;]+)", re.IGNORECASE)
32 |
33 | class KatanaManager(IndexedManager):
34 | """
35 | Manages HTTP text files in in-memory SQLite databases.
36 | Provides connection pooling and caching for efficient access.
37 | """
38 |
39 | def __init__(self) -> None:
40 | """Initialize the HTTP text manager with empty cache and statistics."""
41 | super().__init__()
42 |
43 | def _load_site_data(self, connection: sqlite3.Connection, directory: Path,
44 | site_id: int, index_state: IndexState = None) -> None:
45 | """
46 | Load a site directory of HTTP text files into the database with parallel reading
47 | and batch SQL insertions.
48 |
49 | Args:
50 | connection: SQLite connection
51 | directory: path to the site directory
52 | site_id: ID for the site
53 | index_state: tracker for FTS indexing status
54 | """
55 |
56 | if not directory.exists() or not directory.is_dir():
57 | logger.error(f"Directory not found or not a directory: {directory}")
58 | return
59 |
60 | if index_state is not None:
61 | index_state.set_status(IndexStatus.INDEXING)
62 |
63 | file_paths = list(chain(
64 | directory.glob("*.txt"),
65 | directory.glob("*/*.txt") # katana stores offsite assets under hostname
66 | ))
67 |
68 | with closing(connection.cursor()) as cursor:
69 | for i in range(0, len(file_paths), INDEXED_BATCH_SIZE):
70 | if index_state is not None and index_state.is_timeout():
71 | index_state.set_status(IndexStatus.PARTIAL)
72 | return
73 |
74 | batch_file_paths: list[Path] = file_paths[i:i+INDEXED_BATCH_SIZE]
75 | batch_file_contents = BaseManager.read_files(batch_file_paths)
76 | batch_insert_resource_results: list[ResourceResult] = []
77 | for file_path, content in batch_file_contents.items():
78 | # avoid readme in repo, katana crawl files should be named 9080ef8...
79 | if file_path.name.lower().endswith("readme.txt"):
80 | continue
81 | try:
82 | record = self._prepare_katana_record(file_path, site_id, content)
83 | if record:
84 | batch_insert_resource_results.append(record)
85 | if index_state is not None:
86 | index_state.increment_processed()
87 | except Exception as ex:
88 | logger.error(f"Error processing file {file_path}: {ex}")
89 |
90 | self._execute_batch_insert(connection, cursor, batch_insert_resource_results)
91 |
92 | if index_state is not None and index_state.status == IndexStatus.INDEXING:
93 | index_state.set_status(IndexStatus.COMPLETE)
94 |
95 | def _prepare_katana_record(self, file_path: Path, site_id: int, content: str) -> ResourceResult | None:
96 | """
97 | Prepare a record for batch insertion.
98 |
99 | Args:
100 | file_path: path to the Katana crawl file record
101 | site_id: ID for the site
102 | content: loaded file content
103 |
104 | Returns:
105 | ResourceResult object ready for insertion, or None if processing fails
106 | """
107 | if file_path.is_file():
108 | file_stat = file_path.stat()
109 | # HTTP header modified mostly useless, change my mind
110 | file_created = datetime.fromtimestamp(file_stat.st_ctime, tz=timezone.utc)
111 | file_modified = datetime.fromtimestamp(file_stat.st_mtime, tz=timezone.utc)
112 | else:
113 | file_created = None
114 | file_modified = None
115 |
116 | # crawl format: <url>\n\n<request>\n\n<headers>...<response>
117 | parts: list[str] = content.split("\n\n", 2)
118 | if len(parts) < 3:
119 | logger.warning(f"Invalid HTTP text format in file {file_path}")
120 | return None
121 |
122 | url: str = parts[0].strip()
123 | response_data: str = parts[2].strip()
124 |
125 | try:
126 | response_parts: list[str] = response_data.split("\n\n", 1)
127 | headers: str = response_parts[0].strip()
128 | body: str = response_parts[1].strip() if len(response_parts) > 1 else ""
129 |
130 | if "Transfer-Encoding: chunked" in headers:
131 | body = body.split("\n", 1)[1].strip() # remove hex prefix
132 | body = body.rsplit("\n0", 1)[0].strip() # remove trailing "0" terminator
133 |
134 | # status from the first line of headers
135 |             status_match: re.Match[str] | None = KATANA_REGEX_HTTP_STATUS.search(headers.split("\n", 2)[0])
136 | status_code: int = int(status_match.group(1)) if status_match else 0
137 |
138 | content_type_match = KATANA_REGEX_CONTENT_TYPE.search(headers)
139 | content_type = content_type_match.group(1).strip() if content_type_match else ""
140 | resource_type = self._determine_resource_type(content_type)
141 | content_size = len(body)
142 | resource_id = BaseManager.string_to_id(url)
143 |
144 | return ResourceResult(
145 | id=resource_id,
146 | site=site_id,
147 | created=file_created,
148 | modified=file_modified,
149 | url=url,
150 | type=resource_type,
151 | headers=headers,
152 | content=body if self._is_text_content(content_type) else None,
153 | status=status_code,
154 | size=content_size,
155 | time=0 # time not available in file or Katana index
156 | )
157 |
158 | except Exception as ex:
159 | logger.error(f"Error processing HTTP response in file {file_path}: {ex}")
160 | return None
161 |
162 | manager: KatanaManager = KatanaManager()
163 |
164 | def get_sites(
165 | datasrc: Path,
166 | ids: list[int] | None = None,
167 | fields: list[str] | None = None
168 | ) -> list[SiteResult]:
169 | """
170 | List site directories in the datasrc directory as sites.
171 |
172 | Args:
173 | datasrc: path to the directory containing site subdirectories
174 | ids: optional list of site IDs to filter by
175 | fields: optional list of fields to include in the response
176 |
177 | Returns:
178 | List of SiteResult objects, one for each site directory
179 |
180 | Notes:
181 | Returns an empty list if the datasrc directory doesn't exist.
182 | """
183 | return manager.get_sites_for_directories(datasrc, ids, fields)
184 |
185 | def get_resources(
186 | datasrc: Path,
187 | ids: list[int] | None = None,
188 | sites: list[int] | None = None,
189 | query: str = "",
190 | types: list[ResourceResultType] | None = None,
191 | fields: list[str] | None = None,
192 | statuses: list[int] | None = None,
193 | sort: str | None = None,
194 | limit: int = RESOURCES_LIMIT_DEFAULT,
195 | offset: int = 0,
196 | ) -> tuple[list[ResourceResult], int, IndexState]:
197 | """
198 |     Get resources from Katana crawl directories using in-memory SQLite.
199 |
200 | Args:
201 |         datasrc: path to the directory containing Katana crawl captures
202 | ids: optional list of resource IDs to filter by
203 | sites: optional list of site IDs to filter by
204 | query: search query string
205 | types: optional list of resource types to filter by
206 | fields: optional list of fields to include in response
207 | statuses: optional list of HTTP status codes to filter by
208 | sort: sort order for results
209 | limit: maximum number of results to return
210 | offset: number of results to skip for pagination
211 |
212 | Returns:
213 |         Tuple of (list of ResourceResult objects, total count, IndexState)
214 | """
215 | sites_results: list[SiteResult] = get_sites(datasrc=datasrc, ids=sites)
216 | assert sites_results, "At least one site is required to search"
217 | site_paths = [site.path for site in sites_results]
218 | sites_group = SitesGroup(datasrc, sites, site_paths)
219 | return manager.get_resources_for_sites_group(sites_group, query, fields, sort, limit, offset)
220 |
```
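
A small standalone sketch of the record format `_prepare_katana_record` expects (`<url>\n\n<request>\n\n<response headers>\n\n<body>`), using the same regexes and split logic as the adapter; the sample payload is fabricated for illustration:

```python
import re

# same patterns as the adapter above
KATANA_REGEX_HTTP_STATUS = re.compile(r"HTTP/\d\.\d\s+(\d+)")
KATANA_REGEX_CONTENT_TYPE = re.compile(r"Content-Type:\s*([^\r\n;]+)", re.IGNORECASE)

# fabricated crawl record: url, request, then response headers and body
sample = (
    "https://example.com/\n\n"
    "GET / HTTP/1.1\nHost: example.com\n\n"
    "HTTP/1.1 200 OK\nContent-Type: text/html; charset=utf-8\n\n"
    "<html><body>hello</body></html>"
)

url, _request, response_data = sample.split("\n\n", 2)
headers, body = response_data.split("\n\n", 1)

status_match = KATANA_REGEX_HTTP_STATUS.search(headers.split("\n", 2)[0])
status = int(status_match.group(1)) if status_match else 0
content_type_match = KATANA_REGEX_CONTENT_TYPE.search(headers)
content_type = content_type_match.group(1).strip() if content_type_match else ""

print(url, status, content_type, len(body))  # https://example.com/ 200 text/html 31
```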
--------------------------------------------------------------------------------
/docs/mcp_server_webcrawl.templates.html:
--------------------------------------------------------------------------------
```html
1 |
2 |
3 | <!DOCTYPE html>
4 | <html class="writer-html5" lang="en" data-content_root="./">
5 | <head>
6 | <meta charset="utf-8" /><meta name="viewport" content="width=device-width, initial-scale=1" />
7 |
8 | <meta name="viewport" content="width=device-width, initial-scale=1.0" />
9 | <title>mcp_server_webcrawl.templates package — mcp-server-webcrawl documentation</title>
10 | <link rel="stylesheet" type="text/css" href="_static/pygments.css?v=80d5e7a1" />
11 | <link rel="stylesheet" type="text/css" href="_static/css/theme.css?v=e59714d7" />
12 |
13 |
14 | <script src="_static/jquery.js?v=5d32c60e"></script>
15 | <script src="_static/_sphinx_javascript_frameworks_compat.js?v=2cd50e6c"></script>
16 | <script src="_static/documentation_options.js?v=5929fcd5"></script>
17 | <script src="_static/doctools.js?v=888ff710"></script>
18 | <script src="_static/sphinx_highlight.js?v=dc90522c"></script>
19 | <script src="_static/js/theme.js"></script>
20 | <link rel="index" title="Index" href="genindex.html" />
21 | <link rel="search" title="Search" href="search.html" />
22 | <link rel="next" title="mcp_server_webcrawl.utils package" href="mcp_server_webcrawl.utils.html" />
23 | <link rel="prev" title="mcp_server_webcrawl.models package" href="mcp_server_webcrawl.models.html" />
24 | </head>
25 |
26 | <body class="wy-body-for-nav">
27 | <div class="wy-grid-for-nav">
28 | <nav data-toggle="wy-nav-shift" class="wy-nav-side">
29 | <div class="wy-side-scroll">
30 | <div class="wy-side-nav-search" >
31 |
32 |
33 |
34 | <a href="index.html" class="icon icon-home">
35 | mcp-server-webcrawl
36 | </a>
37 | <div role="search">
38 | <form id="rtd-search-form" class="wy-form" action="search.html" method="get">
39 | <input type="text" name="q" placeholder="Search docs" aria-label="Search docs" />
40 | <input type="hidden" name="check_keywords" value="yes" />
41 | <input type="hidden" name="area" value="default" />
42 | </form>
43 | </div>
44 | </div><div class="wy-menu wy-menu-vertical" data-spy="affix" role="navigation" aria-label="Navigation menu">
45 | <p class="caption" role="heading"><span class="caption-text">Contents:</span></p>
46 | <ul class="current">
47 | <li class="toctree-l1"><a class="reference internal" href="installation.html">Installation</a></li>
48 | <li class="toctree-l1"><a class="reference internal" href="guides.html">Setup Guides</a></li>
49 | <li class="toctree-l1"><a class="reference internal" href="usage.html">Usage</a></li>
50 | <li class="toctree-l1"><a class="reference internal" href="prompts.html">Prompt Routines</a></li>
51 | <li class="toctree-l1"><a class="reference internal" href="interactive.html">Interactive Mode</a></li>
52 | <li class="toctree-l1 current"><a class="reference internal" href="modules.html">mcp_server_webcrawl</a><ul class="current">
53 | <li class="toctree-l2 current"><a class="reference internal" href="mcp_server_webcrawl.html">mcp_server_webcrawl package</a></li>
54 | </ul>
55 | </li>
56 | </ul>
57 |
58 | </div>
59 | </div>
60 | </nav>
61 |
62 | <section data-toggle="wy-nav-shift" class="wy-nav-content-wrap"><nav class="wy-nav-top" aria-label="Mobile navigation menu" >
63 | <i data-toggle="wy-nav-top" class="fa fa-bars"></i>
64 | <a href="index.html">mcp-server-webcrawl</a>
65 | </nav>
66 |
67 | <div class="wy-nav-content">
68 | <div class="rst-content">
69 | <div role="navigation" aria-label="Page navigation">
70 | <ul class="wy-breadcrumbs">
71 | <li><a href="index.html" class="icon icon-home" aria-label="Home"></a></li>
72 | <li class="breadcrumb-item"><a href="modules.html">mcp_server_webcrawl</a></li>
73 | <li class="breadcrumb-item"><a href="mcp_server_webcrawl.html">mcp_server_webcrawl package</a></li>
74 | <li class="breadcrumb-item active">mcp_server_webcrawl.templates package</li>
75 | <li class="wy-breadcrumbs-aside">
76 | <a href="_sources/mcp_server_webcrawl.templates.rst.txt" rel="nofollow"> View page source</a>
77 | </li>
78 | </ul>
79 | <hr/>
80 | </div>
81 | <div role="main" class="document" itemscope="itemscope" itemtype="http://schema.org/Article">
82 | <div itemprop="articleBody">
83 |
84 | <section id="mcp-server-webcrawl-templates-package">
85 | <h1>mcp_server_webcrawl.templates package<a class="headerlink" href="#mcp-server-webcrawl-templates-package" title="Link to this heading"></a></h1>
86 | <section id="submodules">
87 | <h2>Submodules<a class="headerlink" href="#submodules" title="Link to this heading"></a></h2>
88 | </section>
89 | <section id="module-mcp_server_webcrawl.templates.tests">
90 | <span id="mcp-server-webcrawl-templates-tests-module"></span><h2>mcp_server_webcrawl.templates.tests module<a class="headerlink" href="#module-mcp_server_webcrawl.templates.tests" title="Link to this heading"></a></h2>
91 | <dl class="py class">
92 | <dt class="sig sig-object py" id="mcp_server_webcrawl.templates.tests.TemplateTests">
93 | <em class="property"><span class="pre">class</span><span class="w"> </span></em><span class="sig-name descname"><span class="pre">TemplateTests</span></span><a class="reference internal" href="_modules/mcp_server_webcrawl/templates/tests.html#TemplateTests"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#mcp_server_webcrawl.templates.tests.TemplateTests" title="Link to this definition"></a></dt>
94 | <dd><p>Bases: <code class="xref py py-class docutils literal notranslate"><span class="pre">TestCase</span></code></p>
95 | <p>Test suite for the custom HTML to markdown converter.
96 | Why custom? It’s a bit faster; that is the only reason.
97 | Maximum load is 100 transforms (1 per result for a max result
98 | of 100), so speed matters. The default result set is 20.
99 | This converter does a few things differently to tailor to LLM
100 | interaction.
101 | * aggressively removes images (html2text selectively renders)
102 | * links with block descendants will render like a <p></p>
103 | <blockquote>
104 | <div><p>(html2text treats as <a><br>)</p>
105 | </div></blockquote>
106 | <p>Create an instance of the class that will use the named test
107 | method when executed. Raises a ValueError if the instance does
108 | not have a method with the specified name.</p>
109 | <dl class="py method">
110 | <dt class="sig sig-object py" id="mcp_server_webcrawl.templates.tests.TemplateTests.setUp">
111 | <span class="sig-name descname"><span class="pre">setUp</span></span><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="reference internal" href="_modules/mcp_server_webcrawl/templates/tests.html#TemplateTests.setUp"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#mcp_server_webcrawl.templates.tests.TemplateTests.setUp" title="Link to this definition"></a></dt>
112 | <dd><p>Set up the test environment with fixture data.</p>
113 | </dd></dl>
114 |
115 | <dl class="py method">
116 | <dt class="sig sig-object py" id="mcp_server_webcrawl.templates.tests.TemplateTests.test_core_html">
117 | <span class="sig-name descname"><span class="pre">test_core_html</span></span><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="reference internal" href="_modules/mcp_server_webcrawl/templates/tests.html#TemplateTests.test_core_html"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#mcp_server_webcrawl.templates.tests.TemplateTests.test_core_html" title="Link to this definition"></a></dt>
118 | <dd></dd></dl>
119 |
120 | </dd></dl>
121 |
122 | </section>
123 | <section id="module-mcp_server_webcrawl.templates">
124 | <span id="module-contents"></span><h2>Module contents<a class="headerlink" href="#module-mcp_server_webcrawl.templates" title="Link to this heading"></a></h2>
125 | </section>
126 | </section>
127 |
128 |
129 | </div>
130 | </div>
131 | <footer><div class="rst-footer-buttons" role="navigation" aria-label="Footer">
132 | <a href="mcp_server_webcrawl.models.html" class="btn btn-neutral float-left" title="mcp_server_webcrawl.models package" accesskey="p" rel="prev"><span class="fa fa-arrow-circle-left" aria-hidden="true"></span> Previous</a>
133 | <a href="mcp_server_webcrawl.utils.html" class="btn btn-neutral float-right" title="mcp_server_webcrawl.utils package" accesskey="n" rel="next">Next <span class="fa fa-arrow-circle-right" aria-hidden="true"></span></a>
134 | </div>
135 |
136 | <hr/>
137 |
138 | <div role="contentinfo">
139 | <p>© Copyright 2025, pragmar.</p>
140 | </div>
141 |
142 | Built with <a href="https://www.sphinx-doc.org/">Sphinx</a> using a
143 | <a href="https://github.com/readthedocs/sphinx_rtd_theme">theme</a>
144 | provided by <a href="https://readthedocs.org">Read the Docs</a>.
145 |
146 |
147 | </footer>
148 | </div>
149 | </div>
150 | </section>
151 | </div>
152 | <script>
153 | jQuery(function () {
154 | SphinxRtdTheme.Navigation.enable(true);
155 | });
156 | </script>
157 |
158 | </body>
159 | </html>
```
--------------------------------------------------------------------------------
/docs/_modules/index.html:
--------------------------------------------------------------------------------
```html
1 |
2 |
3 | <!DOCTYPE html>
4 | <html class="writer-html5" lang="en" data-content_root="../">
5 | <head>
6 | <meta charset="utf-8" />
7 | <meta name="viewport" content="width=device-width, initial-scale=1.0" />
8 | <title>Overview: module code — mcp-server-webcrawl documentation</title>
9 | <link rel="stylesheet" type="text/css" href="../_static/pygments.css?v=80d5e7a1" />
10 | <link rel="stylesheet" type="text/css" href="../_static/css/theme.css?v=e59714d7" />
11 |
12 |
13 | <script src="../_static/jquery.js?v=5d32c60e"></script>
14 | <script src="../_static/_sphinx_javascript_frameworks_compat.js?v=2cd50e6c"></script>
15 | <script src="../_static/documentation_options.js?v=5929fcd5"></script>
16 | <script src="../_static/doctools.js?v=888ff710"></script>
17 | <script src="../_static/sphinx_highlight.js?v=dc90522c"></script>
18 | <script src="../_static/js/theme.js"></script>
19 | <link rel="index" title="Index" href="../genindex.html" />
20 | <link rel="search" title="Search" href="../search.html" />
21 | </head>
22 |
23 | <body class="wy-body-for-nav">
24 | <div class="wy-grid-for-nav">
25 | <nav data-toggle="wy-nav-shift" class="wy-nav-side">
26 | <div class="wy-side-scroll">
27 | <div class="wy-side-nav-search" >
28 |
29 |
30 |
31 | <a href="../index.html" class="icon icon-home">
32 | mcp-server-webcrawl
33 | </a>
34 | <div role="search">
35 | <form id="rtd-search-form" class="wy-form" action="../search.html" method="get">
36 | <input type="text" name="q" placeholder="Search docs" aria-label="Search docs" />
37 | <input type="hidden" name="check_keywords" value="yes" />
38 | <input type="hidden" name="area" value="default" />
39 | </form>
40 | </div>
41 | </div><div class="wy-menu wy-menu-vertical" data-spy="affix" role="navigation" aria-label="Navigation menu">
42 | <p class="caption" role="heading"><span class="caption-text">Contents:</span></p>
43 | <ul>
44 | <li class="toctree-l1"><a class="reference internal" href="../installation.html">Installation</a></li>
45 | <li class="toctree-l1"><a class="reference internal" href="../guides.html">Setup Guides</a></li>
46 | <li class="toctree-l1"><a class="reference internal" href="../usage.html">Usage</a></li>
47 | <li class="toctree-l1"><a class="reference internal" href="../prompts.html">Prompt Routines</a></li>
48 | <li class="toctree-l1"><a class="reference internal" href="../interactive.html">Interactive Mode</a></li>
49 | <li class="toctree-l1"><a class="reference internal" href="../modules.html">mcp_server_webcrawl</a></li>
50 | </ul>
51 |
52 | </div>
53 | </div>
54 | </nav>
55 |
56 | <section data-toggle="wy-nav-shift" class="wy-nav-content-wrap"><nav class="wy-nav-top" aria-label="Mobile navigation menu" >
57 | <i data-toggle="wy-nav-top" class="fa fa-bars"></i>
58 | <a href="../index.html">mcp-server-webcrawl</a>
59 | </nav>
60 |
61 | <div class="wy-nav-content">
62 | <div class="rst-content">
63 | <div role="navigation" aria-label="Page navigation">
64 | <ul class="wy-breadcrumbs">
65 | <li><a href="../index.html" class="icon icon-home" aria-label="Home"></a></li>
66 | <li class="breadcrumb-item active">Overview: module code</li>
67 | <li class="wy-breadcrumbs-aside">
68 | </li>
69 | </ul>
70 | <hr/>
71 | </div>
72 | <div role="main" class="document" itemscope="itemscope" itemtype="http://schema.org/Article">
73 | <div itemprop="articleBody">
74 |
75 | <h1>All modules for which code is available</h1>
76 | <ul><li><a href="mcp_server_webcrawl/crawlers.html">mcp_server_webcrawl.crawlers</a></li>
77 | <ul><li><a href="mcp_server_webcrawl/crawlers/archivebox/adapter.html">mcp_server_webcrawl.crawlers.archivebox.adapter</a></li>
78 | <li><a href="mcp_server_webcrawl/crawlers/archivebox/crawler.html">mcp_server_webcrawl.crawlers.archivebox.crawler</a></li>
79 | <li><a href="mcp_server_webcrawl/crawlers/archivebox/tests.html">mcp_server_webcrawl.crawlers.archivebox.tests</a></li>
80 | <li><a href="mcp_server_webcrawl/crawlers/base/adapter.html">mcp_server_webcrawl.crawlers.base.adapter</a></li>
81 | <li><a href="mcp_server_webcrawl/crawlers/base/api.html">mcp_server_webcrawl.crawlers.base.api</a></li>
82 | <li><a href="mcp_server_webcrawl/crawlers/base/crawler.html">mcp_server_webcrawl.crawlers.base.crawler</a></li>
83 | <li><a href="mcp_server_webcrawl/crawlers/base/indexed.html">mcp_server_webcrawl.crawlers.base.indexed</a></li>
84 | <li><a href="mcp_server_webcrawl/crawlers/base/tests.html">mcp_server_webcrawl.crawlers.base.tests</a></li>
85 | <li><a href="mcp_server_webcrawl/crawlers/httrack/adapter.html">mcp_server_webcrawl.crawlers.httrack.adapter</a></li>
86 | <li><a href="mcp_server_webcrawl/crawlers/httrack/crawler.html">mcp_server_webcrawl.crawlers.httrack.crawler</a></li>
87 | <li><a href="mcp_server_webcrawl/crawlers/httrack/tests.html">mcp_server_webcrawl.crawlers.httrack.tests</a></li>
88 | <li><a href="mcp_server_webcrawl/crawlers/interrobot/adapter.html">mcp_server_webcrawl.crawlers.interrobot.adapter</a></li>
89 | <li><a href="mcp_server_webcrawl/crawlers/interrobot/crawler.html">mcp_server_webcrawl.crawlers.interrobot.crawler</a></li>
90 | <li><a href="mcp_server_webcrawl/crawlers/interrobot/tests.html">mcp_server_webcrawl.crawlers.interrobot.tests</a></li>
91 | <li><a href="mcp_server_webcrawl/crawlers/katana/adapter.html">mcp_server_webcrawl.crawlers.katana.adapter</a></li>
92 | <li><a href="mcp_server_webcrawl/crawlers/katana/crawler.html">mcp_server_webcrawl.crawlers.katana.crawler</a></li>
93 | <li><a href="mcp_server_webcrawl/crawlers/katana/tests.html">mcp_server_webcrawl.crawlers.katana.tests</a></li>
94 | <li><a href="mcp_server_webcrawl/crawlers/siteone/adapter.html">mcp_server_webcrawl.crawlers.siteone.adapter</a></li>
95 | <li><a href="mcp_server_webcrawl/crawlers/siteone/crawler.html">mcp_server_webcrawl.crawlers.siteone.crawler</a></li>
96 | <li><a href="mcp_server_webcrawl/crawlers/siteone/tests.html">mcp_server_webcrawl.crawlers.siteone.tests</a></li>
97 | <li><a href="mcp_server_webcrawl/crawlers/warc/adapter.html">mcp_server_webcrawl.crawlers.warc.adapter</a></li>
98 | <li><a href="mcp_server_webcrawl/crawlers/warc/crawler.html">mcp_server_webcrawl.crawlers.warc.crawler</a></li>
99 | <li><a href="mcp_server_webcrawl/crawlers/warc/tests.html">mcp_server_webcrawl.crawlers.warc.tests</a></li>
100 | <li><a href="mcp_server_webcrawl/crawlers/wget/adapter.html">mcp_server_webcrawl.crawlers.wget.adapter</a></li>
101 | <li><a href="mcp_server_webcrawl/crawlers/wget/crawler.html">mcp_server_webcrawl.crawlers.wget.crawler</a></li>
102 | <li><a href="mcp_server_webcrawl/crawlers/wget/tests.html">mcp_server_webcrawl.crawlers.wget.tests</a></li>
103 | </ul><li><a href="mcp_server_webcrawl/extras/markdown.html">mcp_server_webcrawl.extras.markdown</a></li>
104 | <li><a href="mcp_server_webcrawl/extras/regex.html">mcp_server_webcrawl.extras.regex</a></li>
105 | <li><a href="mcp_server_webcrawl/extras/snippets.html">mcp_server_webcrawl.extras.snippets</a></li>
106 | <li><a href="mcp_server_webcrawl/extras/thumbnails.html">mcp_server_webcrawl.extras.thumbnails</a></li>
107 | <li><a href="mcp_server_webcrawl/extras/xpath.html">mcp_server_webcrawl.extras.xpath</a></li>
108 | <li><a href="mcp_server_webcrawl/interactive/highlights.html">mcp_server_webcrawl.interactive.highlights</a></li>
109 | <li><a href="mcp_server_webcrawl/interactive/search.html">mcp_server_webcrawl.interactive.search</a></li>
110 | <li><a href="mcp_server_webcrawl/interactive/session.html">mcp_server_webcrawl.interactive.session</a></li>
111 | <li><a href="mcp_server_webcrawl/interactive/ui.html">mcp_server_webcrawl.interactive.ui</a></li>
112 | <li><a href="mcp_server_webcrawl/main.html">mcp_server_webcrawl.main</a></li>
113 | <li><a href="mcp_server_webcrawl/models/resources.html">mcp_server_webcrawl.models.resources</a></li>
114 | <li><a href="mcp_server_webcrawl/models/sites.html">mcp_server_webcrawl.models.sites</a></li>
115 | <li><a href="mcp_server_webcrawl/templates/tests.html">mcp_server_webcrawl.templates.tests</a></li>
116 | <li><a href="mcp_server_webcrawl/utils.html">mcp_server_webcrawl.utils</a></li>
117 | <ul><li><a href="mcp_server_webcrawl/utils/cli.html">mcp_server_webcrawl.utils.cli</a></li>
118 | <li><a href="mcp_server_webcrawl/utils/logger.html">mcp_server_webcrawl.utils.logger</a></li>
119 | <li><a href="mcp_server_webcrawl/utils/server.html">mcp_server_webcrawl.utils.server</a></li>
120 | <li><a href="mcp_server_webcrawl/utils/tools.html">mcp_server_webcrawl.utils.tools</a></li>
121 | </ul><li><a href="namedtuple_InputRadioState.html">namedtuple_InputRadioState</a></li>
122 | </ul>
123 |
124 | </div>
125 | </div>
126 | <footer>
127 |
128 | <hr/>
129 |
130 | <div role="contentinfo">
131 | <p>© Copyright 2025, pragmar.</p>
132 | </div>
133 |
134 | Built with <a href="https://www.sphinx-doc.org/">Sphinx</a> using a
135 | <a href="https://github.com/readthedocs/sphinx_rtd_theme">theme</a>
136 | provided by <a href="https://readthedocs.org">Read the Docs</a>.
137 |
138 |
139 | </footer>
140 | </div>
141 | </div>
142 | </section>
143 | </div>
144 | <script>
145 | jQuery(function () {
146 | SphinxRtdTheme.Navigation.enable(true);
147 | });
148 | </script>
149 |
150 | </body>
151 | </html>
```
--------------------------------------------------------------------------------
/src/mcp_server_webcrawl/crawlers/httrack/tests.py:
--------------------------------------------------------------------------------
```python
1 | from mcp_server_webcrawl.crawlers.httrack.crawler import HtTrackCrawler
2 | from mcp_server_webcrawl.crawlers.httrack.adapter import HtTrackManager
3 | from mcp_server_webcrawl.crawlers.base.tests import BaseCrawlerTests
4 | from mcp_server_webcrawl.crawlers import get_fixture_directory
5 | from mcp_server_webcrawl.utils.logger import get_logger
6 |
7 | logger = get_logger()
8 |
9 | # Calculate using same hash function as adapter
10 | EXAMPLE_SITE_ID = HtTrackManager.string_to_id("example")
11 | PRAGMAR_SITE_ID = HtTrackManager.string_to_id("pragmar")
12 |
13 | class HtTrackTests(BaseCrawlerTests):
14 | """
15 | Test suite for the HTTrack crawler implementation.
16 | Uses all wrapped test methods from BaseCrawlerTests plus HTTrack-specific features.
17 | """
18 |
19 | def setUp(self):
20 | """
21 | Set up the test environment with fixture data.
22 | """
23 | super().setUp()
24 | self._datasrc = get_fixture_directory() / "httrack"
25 |
26 | def test_httrack_pulse(self):
27 | """
28 | Test basic crawler initialization.
29 | """
30 | crawler = HtTrackCrawler(self._datasrc)
31 | self.assertIsNotNone(crawler)
32 | self.assertTrue(self._datasrc.is_dir())
33 |
34 | def test_httrack_sites(self):
35 | """
36 | Test site retrieval API functionality.
37 | """
38 | crawler = HtTrackCrawler(self._datasrc)
39 | self.run_pragmar_site_tests(crawler, PRAGMAR_SITE_ID)
40 |
41 | def test_httrack_search(self):
42 | """
43 |         Test boolean search functionality.
44 | """
45 | crawler = HtTrackCrawler(self._datasrc)
46 | self.run_pragmar_search_tests(crawler, PRAGMAR_SITE_ID)
47 | pass
48 |
49 | def test_httrack_resources(self):
50 | """
51 | Test resource retrieval API functionality with various arguments.
52 | """
53 | crawler = HtTrackCrawler(self._datasrc)
54 | self.run_sites_resources_tests(crawler, PRAGMAR_SITE_ID, EXAMPLE_SITE_ID)
55 |
56 | def test_httrack_images(self):
57 | """
58 | Test HTTrack image handling and thumbnails.
59 | """
60 | crawler = HtTrackCrawler(self._datasrc)
61 | self.run_pragmar_image_tests(crawler, PRAGMAR_SITE_ID)
62 |
63 | def test_httrack_sorts(self):
64 | """
65 | Test random sort functionality using the sort argument.
66 | """
67 | crawler = HtTrackCrawler(self._datasrc)
68 | self.run_pragmar_sort_tests(crawler, PRAGMAR_SITE_ID)
69 |
70 | def test_httrack_content_parsing(self):
71 | """
72 | Test content type detection and parsing.
73 | """
74 | crawler = HtTrackCrawler(self._datasrc)
75 | self.run_pragmar_content_tests(crawler, PRAGMAR_SITE_ID, False)
76 |
77 | def test_httrack_tokenizer(self):
78 | """
79 | Test HTTrack-specific tokenizer functionality for hyphenated terms.
80 | """
81 | crawler = HtTrackCrawler(self._datasrc)
82 | self.run_pragmar_tokenizer_tests(crawler, PRAGMAR_SITE_ID)
83 |
84 | def test_httrack_log_parsing_features(self):
85 | """
86 | Test HTTrack-specific features related to hts-log.txt parsing.
87 | """
88 | crawler = HtTrackCrawler(self._datasrc)
89 |
90 | # Test that 404 errors from log are properly indexed
91 | error_resources = crawler.get_resources_api(
92 | sites=[PRAGMAR_SITE_ID],
93 | query="status: 404"
94 | )
95 | if error_resources.total > 0:
96 | for resource in error_resources._results:
97 | self.assertEqual(resource.status, 404, "404 status should be preserved from log parsing")
98 |
99 | # Test that redirects are properly indexed
100 | redirect_resources = crawler.get_resources_api(
101 | sites=[PRAGMAR_SITE_ID],
102 | query="status: 302"
103 | )
104 | if redirect_resources.total > 0:
105 | for resource in redirect_resources._results:
106 | self.assertEqual(resource.status, 302, "Redirect status should be detected from log")
107 |
108 | # Test successful resources default to 200
109 | success_resources = crawler.get_resources_api(
110 | sites=[PRAGMAR_SITE_ID],
111 | query="status: 200",
112 | limit=5
113 | )
114 | self.assertTrue(success_resources.total > 0, "Should have successful resources with status 200")
115 | for resource in success_resources._results:
116 | self.assertEqual(resource.status, 200)
117 |
118 | def test_httrack_url_reconstruction(self):
119 | """
120 | Test HTTrack URL reconstruction from project and domain structure.
121 | """
122 | crawler = HtTrackCrawler(self._datasrc)
123 |
124 | # Get all resources to test URL patterns
125 | all_resources = crawler.get_resources_api(
126 | sites=[PRAGMAR_SITE_ID],
127 | limit=10
128 | )
129 | self.assertTrue(all_resources.total > 0, "Should have resources with reconstructed URLs")
130 |
131 | for resource in all_resources._results:
132 | # URLs should be properly formatted
133 | self.assertTrue(resource.url.startswith("https://"),
134 | f"URL should start with https://: {resource.url}")
135 |
136 | # URLs should not contain file system artifacts
137 | self.assertNotIn("\\", resource.url, "URLs should not contain backslashes")
138 | self.assertNotIn("hts-", resource.url, "URLs should not contain HTTrack artifacts")
139 |
140 | def test_httrack_domain_detection(self):
141 | """
142 | Test HTTrack domain directory detection and multi-domain handling.
143 | """
144 | crawler = HtTrackCrawler(self._datasrc)
145 | sites_result = crawler.get_sites_api()
146 | self.assertTrue(sites_result.total > 0, "Should detect HTTrack project directories as sites")
147 |
148 | specific_site = crawler.get_sites_api(ids=[PRAGMAR_SITE_ID])
149 | if specific_site.total > 0:
150 | site_data = specific_site._results[0].to_dict()
151 | self.assertIn("urls", site_data, "Site should have URLs")
152 | self.assertTrue(len(site_data["urls"]) > 0, "Site should have at least one valid URL")
153 |
154 | def test_httrack_file_exclusion(self):
155 | """
156 | Test that HTTrack-generated files are properly excluded.
157 | """
158 | crawler = HtTrackCrawler(self._datasrc)
159 |
160 | # Search for any resources that might be HTTrack artifacts
161 | all_resources = crawler.get_resources_api(
162 | sites=[PRAGMAR_SITE_ID],
163 | query="",
164 | limit=50
165 | )
166 |
167 | for resource in all_resources._results:
168 | # Should not find project-level index.html (HTTrack-generated)
169 | if resource.url.endswith("/index.html"):
170 | # This should be domain-level index.html, not project-level
171 | self.assertNotEqual(resource.url, "https://pragmar/index.html",
172 | "Should not index project-level HTTrack-generated index.html")
173 |
174 | # Should not find hts-log.txt as a resource
175 | self.assertNotIn("hts-log.txt", resource.url, "Should not index hts-log.txt as resource")
176 | self.assertNotIn("hts-cache", resource.url, "Should not index hts-cache contents as resources")
177 |
178 | def test_httrack_advanced_features(self):
179 | """
180 | Test HTTrack-specific advanced features not covered by base tests.
181 | """
182 | crawler = HtTrackCrawler(self._datasrc)
183 |
184 | # Test field retrieval with HTTrack-specific metadata
185 | field_resources = crawler.get_resources_api(
186 | sites=[PRAGMAR_SITE_ID],
187 | query="type: html",
188 | fields=["content", "headers", "created", "modified"],
189 | limit=3
190 | )
191 |
192 | if field_resources.total > 0:
193 | resource_dict = field_resources._results[0].to_dict()
194 |
195 | # Test timestamps from file system
196 | self.assertIn("created", resource_dict, "Should have created timestamp from file stat")
197 | self.assertIn("modified", resource_dict, "Should have modified timestamp from file stat")
198 |
199 | # Test headers generation
200 | if "headers" in resource_dict and resource_dict["headers"]:
201 | headers = resource_dict["headers"]
202 | self.assertIn("Content-Type:", headers, "Should have generated Content-Type header")
203 | self.assertIn("Content-Length:", headers, "Should have generated Content-Length header")
204 |
205 | # Test that resources have proper size information
206 | size_resources = crawler.get_resources_api(
207 | sites=[PRAGMAR_SITE_ID],
208 | fields=["size"],
209 | limit=5
210 | )
211 |
212 | if size_resources.total > 0:
213 | for resource in size_resources._results:
214 | resource_dict = resource.to_dict()
215 | self.assertIn("size", resource_dict, "Resource should have size field")
216 | self.assertGreaterEqual(resource_dict["size"], 0, "Size should be non-negative")
217 |
218 | def test_report(self):
219 | """
220 | Run test report, save to data directory.
221 | """
222 | crawler = HtTrackCrawler(self._datasrc)
223 | logger.info(self.run_pragmar_report(crawler, PRAGMAR_SITE_ID, "HTTrack"))
```
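
The fixture site IDs above are derived from directory names with the adapter's hash helper; a quick sketch of computing one and running the suite (the unittest invocation is the standard module path, assumed from the repo layout):

```python
from mcp_server_webcrawl.crawlers.httrack.adapter import HtTrackManager

# hash-derived site ID for the "pragmar" fixture directory
print(HtTrackManager.string_to_id("pragmar"))

# run the HTTrack suite directly:
#   python -m unittest mcp_server_webcrawl.crawlers.httrack.tests -v
```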
--------------------------------------------------------------------------------
/src/mcp_server_webcrawl/crawlers/interrobot/adapter.py:
--------------------------------------------------------------------------------
```python
1 | import re
2 | import sqlite3
3 | import traceback
4 |
5 | from contextlib import closing
6 | from logging import Logger
7 | from pathlib import Path
8 | from typing import Final
9 | from urllib.parse import urlparse
10 |
11 | from mcp_server_webcrawl.crawlers.base.adapter import IndexState, IndexStatus, BaseManager, SitesGroup
12 | from mcp_server_webcrawl.models.resources import ResourceResult, RESOURCES_LIMIT_DEFAULT
13 | from mcp_server_webcrawl.models.sites import SiteResult, SiteType
14 | from mcp_server_webcrawl.utils import from_isoformat_zulu
15 | from mcp_server_webcrawl.utils.logger import get_logger
16 |
17 | # maybe dedupe with near match RESOURCES version
18 | INTERROBOT_RESOURCE_FIELD_MAPPING: Final[dict[str, str]] = {
19 | "id": "ResourcesFullText.Id",
20 | "site": "ResourcesFullText.Project",
21 | "created": "Resources.Created",
22 | "modified": "Resources.Modified",
23 | "url": "ResourcesFullText.Url",
24 | "status": "ResourcesFullText.Status",
25 | "size": "Resources.Size",
26 | "type": "ResourcesFullText.Type",
27 | "headers": "ResourcesFullText.Headers",
28 | "content": "ResourcesFullText.Content",
29 | "time": "ResourcesFullText.Time"
30 | }
31 |
32 | INTERROBOT_SITE_FIELD_REQUIRED: Final[set[str]] = set(["id", "name", "type", "urls"])
33 |
34 | # legit different from default version (extra robots)
35 | INTERROBOT_SITE_FIELD_MAPPING: Final[dict[str, str]] = {
36 | "id": "Project.Id",
37 | "name": "Project.Name",
38 | "type": "Project.Type",
39 | "urls": "Project.Urls",
40 | "created": "Project.Created",
41 | "modified": "Project.Modified",
42 | }
43 |
44 | logger: Logger = get_logger()
45 |
46 | class InterroBotManager(BaseManager):
47 | """
48 |     Manages connections to InterroBot crawler SQLite databases.
49 | Provides connection pooling and caching for efficient access.
50 | """
51 |
52 | def __init__(self) -> None:
53 | """Initialize the HTTP text manager with empty cache and statistics."""
54 | super().__init__()
55 |
56 | def get_connection(self, group: SitesGroup) -> tuple[sqlite3.Connection | None, IndexState]:
57 | """
58 | Get database connection for sites in the group, creating if needed.
59 |
60 | Args:
61 | group: Group of sites to connect to
62 |
63 | Returns:
64 | Tuple of (SQLite connection to in-memory database with data loaded or None if building,
65 | IndexState associated with this database)
66 | """
67 |
68 | index_state = IndexState()
69 | index_state.set_status(IndexStatus.REMOTE)
70 | connection: sqlite3.Connection
71 | try:
72 | # note, responsible for implementing closing() on other side
73 | connection = sqlite3.connect(group.datasrc)
74 | except sqlite3.Error as ex:
75 | logger.error(f"SQLite error reading database: {ex}\n{traceback.format_exc()}")
76 | except (FileNotFoundError, PermissionError) as ex:
77 | logger.error(f"Database access error: {group.datasrc}\n{traceback.format_exc()}")
78 | raise
79 | except Exception as ex:
80 | logger.error(f"Unexpected error reading database {group.datasrc}: {ex}\n{traceback.format_exc()}")
81 | raise
82 |
83 | return connection, index_state
84 |
85 | manager: InterroBotManager = InterroBotManager()
86 |
87 | def get_sites(datasrc: Path, ids: list[int] | None = None, fields: list[str] | None = None) -> list[SiteResult]:
88 | """
89 | Get sites based on the provided parameters.
90 |
91 | Args:
92 | datasrc: path to the database
93 | ids: optional list of site IDs
94 | fields: list of fields to include in response
95 |
96 | Returns:
97 | List of SiteResult objects
98 | """
99 | site_fields_required: list[str] = ["id", "name", "type", "urls"]
100 | site_fields_default: list[str] = site_fields_required + ["created", "modified"]
101 | site_fields_available: list[str] = list(INTERROBOT_SITE_FIELD_MAPPING.keys())
102 |
103 | # build query
104 | params: dict[str, int | str] = {}
105 |
106 | # these inputs are named parameters
107 | ids_clause: str = ""
108 | if ids and isinstance(ids, list) and len(ids) > 0:
109 | placeholders: list[str] = [f":id{i}" for i in range(len(ids))]
110 | ids_clause: str = f" WHERE Project.Id IN ({','.join(placeholders)})"
111 | params.update({f"id{i}": id_val for i, id_val in enumerate(ids)})
112 |
113 | # these inputs are not parameterized
114 | # fields will be returned from database, if found in INTERROBOT_SITE_FIELD_MAPPING
115 | selected_fields = set(site_fields_required)
116 | if fields and isinstance(fields, list):
117 | selected_fields.update(f for f in fields if f in site_fields_available)
118 | else:
119 | selected_fields.update(site_fields_default)
120 |
121 | safe_sql_fields = [INTERROBOT_SITE_FIELD_MAPPING[f] for f in selected_fields]
122 | assert all(re.match(r"^[A-Za-z\.]+$", field) for field in safe_sql_fields), "Unknown or unsafe field requested"
123 | safe_sql_fields_joined: str = ", ".join(safe_sql_fields)
124 |
125 | statement: str = f"SELECT {safe_sql_fields_joined} FROM Projects AS Project{ids_clause} ORDER BY Project.Name ASC"
126 | sql_results: list[dict[str, int | str | None]] = []
127 | try:
128 | if not statement.strip().upper().startswith("SELECT"):
129 | logger.error("Unauthorized SQL statement")
130 | raise ValueError("Only SELECT queries are permitted")
131 |
132 | with closing(sqlite3.connect(datasrc)) as conn:
133 | conn.row_factory = sqlite3.Row
134 | with closing(conn.cursor()) as cursor:
135 | cursor.execute(statement, params or {})
136 | sql_results = [{k.lower(): v for k, v in dict(row).items()} for row in cursor.fetchall()]
137 | except sqlite3.Error as ex:
138 | logger.error(f"SQLite error reading database: {ex}\n{traceback.format_exc()}")
139 | return []
140 | except Exception as ex:
141 | logger.error(f"Database error: {ex}")
142 | return []
143 |
144 | results: list[SiteResult] = []
145 | #for row in sql_results:
146 | # results.append(SiteResult(
147 | # path=datasrc,
148 | # id=row.get("id"),
149 | # url=row.get("url", ""),
150 | # created=from_isoformat_zulu(row.get("created")),
151 | # modified=from_isoformat_zulu(row.get("modified")),
152 | # robots=row.get("robotstext"),
153 | # metadata=None,
154 | # ))
155 |
156 | for row in sql_results:
157 | urls_list = __urls_from_text(row.get("urls", ""))
158 | site_type: SiteType
159 | db_type = row.get("type")
160 | if db_type == 1:
161 | site_type = SiteType.CRAWLED_URL
162 | elif db_type == 2:
163 | site_type = SiteType.CRAWLED_LIST
164 | else:
165 | site_type = SiteType.UNDEFINED
166 |
167 | results.append(SiteResult(
168 | path=datasrc,
169 | id=row.get("id"),
170 | name=row.get("name"), # NEW: directly from DB
171 | type=site_type, # NEW: from DB (needs mapping)
172 | urls=urls_list, # CHANGED: split into list
173 | created=from_isoformat_zulu(row.get("created")),
174 | modified=from_isoformat_zulu(row.get("modified")),
175 | robots=None, # Removed - not in new model
176 | metadata=None,
177 | ))
178 |
179 | return results
180 |
181 | def __urls_from_text(urls: str) -> list[str]:
182 | urls_list = []
183 | if urls:
184 | for url in urls.split('\n'):
185 | url = url.strip()
186 | if url:
187 | try:
188 | parsed = urlparse(url)
189 | if parsed.scheme:
190 | urls_list.append(url)
191 | except Exception:
192 | continue
193 | return urls_list
194 |
195 | def get_resources(
196 | datasrc: Path,
197 | sites: list[int] | None = None,
198 | query: str = "",
199 | fields: list[str] | None = None,
200 | sort: str | None = None,
201 | limit: int = RESOURCES_LIMIT_DEFAULT,
202 | offset: int = 0,
203 | ) -> tuple[list[ResourceResult], int, IndexState]:
204 | """
205 |     Get resources from the InterroBot database.
206 |
207 | Args:
208 |         datasrc: path to the InterroBot database
209 | sites: optional list of site IDs to filter by
210 | query: search query string
211 | fields: optional list of fields to include in response
212 | sort: sort order for results
213 | limit: maximum number of results to return
214 | offset: number of results to skip for pagination
215 |
216 | Returns:
217 |         Tuple of (list of ResourceResult objects, total count, IndexState)
218 | """
219 | sites_results: list[SiteResult] = get_sites(datasrc=datasrc, ids=sites)
220 | assert sites_results, "At least one site is required to search"
221 | site_paths = [site.path for site in sites_results]
222 | sites_group = SitesGroup(datasrc, sites, site_paths)
223 |
224 | # InterroBot uses ints in place of strings
225 | swap_values = {
226 | "type" : {
227 | "": 0, # UNDEFINED
228 | "html": 1, # PAGE
229 | "other": 2, # OTHER (could also be 5 or 12 depending on context)
230 | "rss": 3, # FEED
231 | "iframe": 4, # FRAME
232 | "img": 6, # IMAGE
233 | "audio": 7, # AUDIO
234 | "video": 8, # VIDEO
235 | "font": 9, # FONT
236 | "style": 10, # CSS
237 | "script": 11, # SCRIPT
238 | "text": 13, # TEXT
239 | "pdf": 14, # PDF
240 | "doc": 15 # DOC
241 | }
242 | }
243 | return manager.get_resources_for_sites_group(sites_group, query, fields, sort, limit, offset, swap_values)
244 |
```
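
The ids filter in `get_sites()` is built as named SQLite parameters rather than interpolated values; a standalone sketch of that pattern with a made-up id list:

```python
# reproduce the named-placeholder IN clause used by get_sites()
ids = [1, 2, 3]
placeholders = [f":id{i}" for i in range(len(ids))]
ids_clause = f" WHERE Project.Id IN ({','.join(placeholders)})"
params = {f"id{i}": id_val for i, id_val in enumerate(ids)}

print(ids_clause)  #  WHERE Project.Id IN (:id0,:id1,:id2)
print(params)      # {'id0': 1, 'id1': 2, 'id2': 3}
```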
--------------------------------------------------------------------------------
/src/mcp_server_webcrawl/extras/snippets.py:
--------------------------------------------------------------------------------
```python
1 |
2 | import re
3 | import lxml.html
4 |
5 | from lxml import etree
6 | from lxml.etree import ParserError
7 | from logging import Logger
8 | from typing import Final
9 |
10 | from mcp_server_webcrawl.utils.logger import get_logger
11 | from mcp_server_webcrawl.utils.search import SearchQueryParser
12 |
13 | MAX_SNIPPETS_MATCHED_COUNT: Final[int] = 15
14 | MAX_SNIPPETS_RETURNED_COUNT: Final[int] = 3
15 | MAX_SNIPPETS_CONTEXT_SIZE: Final[int] = 48
16 |
17 | __RE_SNIPPET_START_TRIM: Final[re.Pattern] = re.compile(r"^[^\w\[]+")
18 | __RE_SNIPPET_END_TRIM: Final[re.Pattern] = re.compile(r"[^\w\]]+$")
19 |
20 | logger: Logger = get_logger()
21 |
22 | class SnippetContentExtractor:
23 | """
24 | lxml-based HTML parser for extracting different types of content from HTML.
25 | Content separates into components: text, markup, attributes (values), and comments.
26 | These can be prioritized in search so that text is the displayed hit over noisier
27 | types.
28 | """
29 | PRIORITY_ORDER: list[str] = ["url", "document_text", "document_attributes",
30 | "document_comments", "headers", "document_markup"]
31 |
32 | __RE_SPLIT: re.Pattern = re.compile(r"[\s_]+|(?<!\w)-(?!\w)")
33 | __RE_WHITESPACE: re.Pattern = re.compile(r"\s+")
34 | __MAX_CONTENT_BYTES: int = 2 * 1024 * 1024 # 2MB
35 |
36 | def __init__(self, url: str, headers: str, content: str):
37 |
38 | self.__document: lxml.html.HtmlElement | None = None
39 |
40 | self.url: str = url
41 | self.content: str = ""
42 | # headers one liner to facilitate snippet
43 | self.headers: str = re.sub(r"\s+", " ", headers).strip()
44 | self.document_text: str = ""
45 | self.document_markup: str = ""
46 | self.document_attributes: str = ""
47 | self.document_comments: str = ""
48 |
49 | if len(content) > self.__MAX_CONTENT_BYTES:
50 | # ignore large files, slow
51 | return
52 | else:
53 | self.content = content
54 |
55 | load_success: bool = self.__load_content()
56 |         if load_success:
57 | _ = self.__extract()
58 | else:
59 | self.document_text = self.__normalize_whitespace(self.content)
60 |
61 | def __load_content(self) -> bool:
62 | """
63 | Load content string into lxml doc.
64 | """
65 |
66 | if not self.content or not self.content.strip():
67 | return False
68 |
69 | try:
70 | self.__document = lxml.html.fromstring(self.content.encode("utf-8"))
71 | return True
72 | except (ParserError, ValueError, UnicodeDecodeError):
73 | try:
74 | wrapped_content = f"<html><body>{self.content}</body></html>"
75 | self.__document = lxml.html.fromstring(wrapped_content.encode("utf-8"))
76 | return True
77 | except (ParserError, ValueError, UnicodeDecodeError):
78 | return False
79 |
80 | def __extract(self) -> bool:
81 | """
82 | Extract content from lxml doc.
83 | """
84 |
85 | if self.__document is None:
86 | return False
87 |
88 | text_values = []
89 | markup_values = []
90 | attribute_values = []
91 | comment_values = []
92 |
93 | element: lxml.html.HtmlElement | None = None
94 | for element in self.__document.iter():
95 |
96 | # HTML outliers
97 | if element.tag is etree.Comment or element.tag is etree.ProcessingInstruction:
98 | if element.text is not None:
99 | comment_values.append(str(element.text.strip()))
100 | # avoid regular element text processing
101 | continue
102 |
103 | if element.tag is etree.Entity or element.tag is etree.CDATA:
104 | if element.text is not None:
105 | text_values.append(str(element.text.strip()))
106 | continue
107 |
108 | # HTML tags and attributes
109 | if element.tag:
110 | markup_values.append(element.tag)
111 | if element.tag in ("script", "style"):
112 | continue
113 |
114 | if element.text:
115 | text_values.append(element.text.strip())
116 |
117 | if element.tail:
118 | text_values.append(element.tail.strip())
119 |
120 | for attr_name, attr_value in element.attrib.items():
121 | markup_values.append(attr_name)
122 | if attr_value:
123 | values = [v for v in self.__RE_SPLIT.split(attr_value) if v]
124 | attribute_values.extend(values)
125 |
126 | self.document_text = self.__normalize_values(text_values)
127 | self.document_markup = self.__normalize_values(markup_values)
128 | self.document_attributes = self.__normalize_values(attribute_values)
129 | self.document_comments = self.__normalize_values(comment_values)
130 |
131 | return True
132 |
133 | def __normalize_values(self, values: list[str]) -> str:
134 | """
135 | Concatenate values and normalize whitespace for list of values.
136 | """
137 | text = " ".join([value for value in values if value])
138 | return self.__normalize_whitespace(text)
139 |
140 | def __normalize_whitespace(self, text: str) -> str:
141 | """
142 | Normalize whitespace using pre-compiled pattern.
143 | """
144 | return self.__RE_WHITESPACE.sub(" ", text).strip()
145 |
146 | def get_snippets(url: str, headers: str, content: str, query: str) -> str | None:
147 | """
148 | Takes a query and content, reduces the HTML to text content and extracts hits
149 | as excerpts of text.
150 |
151 |     Args:
152 | headers: Header content to search
153 | content: The HTML or text content to search in
154 | query: The search query string
155 |
156 | Returns:
157 | A string of snippets with context around matched terms, separated by " ... " or None
158 | """
159 | if query in (None, ""):
160 | return None
161 |
162 | url = url or ""
163 | content = content or ""
164 | headers = headers or ""
165 |
166 | search_terms_parser = SearchQueryParser()
167 | search_terms: list[str] = search_terms_parser.get_fulltext_terms(query)
168 |
169 | if not search_terms:
170 | return None
171 |
172 | snippets = []
173 |     snippet_extractor = SnippetContentExtractor(url, headers, content)
174 |
175 | # priority order url, text, attributes, comments, headers, markup
176 | # most interesting to least, as search hits
177 |     for group_name in snippet_extractor.PRIORITY_ORDER:
178 |         search_group_text = getattr(snippet_extractor, group_name)
179 | if not search_group_text:
180 | continue
181 | group_snippets = find_snippets_in_text(search_group_text, search_terms,
182 | max_snippets=MAX_SNIPPETS_MATCHED_COUNT+1, group_name=group_name)
183 | snippets.extend(group_snippets)
184 | if len(snippets) > MAX_SNIPPETS_MATCHED_COUNT:
185 | break
186 |
187 | if snippets:
188 | total_snippets = len(snippets)
189 | displayed_snippets = snippets[:MAX_SNIPPETS_RETURNED_COUNT]
190 | result = " ... ".join(displayed_snippets)
191 |
192 | if total_snippets > MAX_SNIPPETS_MATCHED_COUNT:
193 | result += f" ... + >{MAX_SNIPPETS_MATCHED_COUNT} more"
194 | elif total_snippets > MAX_SNIPPETS_RETURNED_COUNT:
195 | remaining = total_snippets - MAX_SNIPPETS_RETURNED_COUNT
196 | result += f" ... +{remaining} more"
197 |
198 | return result
199 |
200 | return None
201 |
202 | def find_snippets_in_text(
203 | text: str,
204 | terms: list[str],
205 | max_snippets: int = MAX_SNIPPETS_MATCHED_COUNT,
206 | group_name: str = "") -> list[str]:
207 | """
208 | Searches for whole-word matches of the given terms in the text and extracts
209 | surrounding context to create highlighted snippets. Each snippet shows the matched term
210 | in context with markdown-style bold highlighting (**term**).
211 |
212 | Args:
213 | text: The text to search within
214 | terms: List of search terms to find (case-insensitive, whole words only)
215 | max_snippets: Maximum number of snippets to return (default: MAX_SNIPPETS_MATCHED_COUNT)
216 | group_name: Regex group identifier (reserved for future use)
217 |
218 | Returns:
219 | List of unique snippet strings with matched terms highlighted using **bold** markdown.
220 | Each snippet includes surrounding context up to MAX_SNIPPETS_CONTEXT_SIZE characters
221 | on each side of the match. Returns empty list if no matches found or invalid input.
222 | """
223 |
224 | if not text or not terms:
225 | return []
226 |
227 | snippets: list[str] = []
228 | seen_snippets: set[str] = set()
229 | text_lower: str = text.lower()
230 |
231 | escaped_terms = [re.escape(term) for term in terms]
232 | pattern: str = rf"\b({'|'.join(escaped_terms)})\b"
233 | highlight_patterns: list[tuple[re.Pattern, str]] = [
234 | (re.compile(rf"\b({re.escape(term)})\b",
235 | re.IGNORECASE), term) for term in terms
236 | ]
237 |
238 | matches = list(re.finditer(pattern, text_lower))
239 |
240 | for match in matches:
241 | if len(snippets) >= max_snippets:
242 | break
243 |
244 | context_start: int = max(0, match.start() - MAX_SNIPPETS_CONTEXT_SIZE)
245 | context_end: int = min(len(text), match.end() + MAX_SNIPPETS_CONTEXT_SIZE)
246 | if context_start > 0:
247 | while context_start > 0 and text[context_start].isalnum():
248 | context_start -= 1
249 | if context_end < len(text):
250 | while context_end < len(text) and text[context_end].isalnum():
251 | context_end += 1
252 |
253 | snippet: str = text[context_start:context_end].strip()
254 | snippet = __RE_SNIPPET_START_TRIM.sub("", snippet)
255 | snippet = __RE_SNIPPET_END_TRIM.sub("", snippet)
256 | highlighted_snippet: str = snippet
257 |
258 |         for highlight_pattern, _ in highlight_patterns:
259 |             highlighted_snippet = highlight_pattern.sub(r"**\1**", highlighted_snippet)
260 |
261 | if highlighted_snippet and highlighted_snippet not in seen_snippets:
262 | seen_snippets.add(highlighted_snippet)
263 | snippets.append(highlighted_snippet)
264 |
265 | return snippets
266 |
```
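
A quick standalone demonstration of the snippet extraction above; the sample text is made up, and the import path follows this file's location in the package:

```python
from mcp_server_webcrawl.extras.snippets import find_snippets_in_text

text = "The quick brown fox jumps over the lazy dog near the riverbank."
snippets = find_snippets_in_text(text, ["fox", "riverbank"])
for snippet in snippets:
    print(snippet)
# matched terms come back bold-highlighted with surrounding context, e.g.
# The quick brown **fox** jumps over the lazy dog near the **riverbank**
```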
--------------------------------------------------------------------------------
/src/mcp_server_webcrawl/interactive/views/base.py:
--------------------------------------------------------------------------------
```python
1 | import re
2 | import curses
3 |
4 | from abc import abstractmethod
5 | from typing import TYPE_CHECKING
6 |
7 | from mcp_server_webcrawl import __name__ as module_name, __version__ as module_version
8 | from mcp_server_webcrawl.interactive.ui import ThemeDefinition, ViewBounds
9 | from mcp_server_webcrawl.models.resources import ResourceResult
10 | from mcp_server_webcrawl.interactive.ui import safe_addstr
11 |
12 | if TYPE_CHECKING:
13 | from mcp_server_webcrawl.interactive.session import InteractiveSession
14 |
15 | REGEX_DISPLAY_URL_CLEAN = re.compile(r"^https?://|/$")
16 | OUTER_WIDTH_RIGHT_MARGIN = 1
17 |
18 | LAYOUT_FOOTER_SEPARATOR = " | "
19 | LAYOUT_FOOTER_SEPARATOR_LENGTH = len(LAYOUT_FOOTER_SEPARATOR)
20 | MIN_TERMINAL_HEIGHT = 8
21 | MIN_TERMINAL_WIDTH = 40
22 | CONTENT_MARGIN = 4
23 |
24 | class BaseCursesView:
25 | """
26 | Base class for all views with common interface.
27 | """
28 |
29 | def __init__(self, session: 'InteractiveSession'):
30 | self.session = session
31 | self.bounds = ViewBounds(x=0, y=0, width=0, height=0)
32 | self._focused = False
33 | self._selected_index: int = 0
34 |
35 | @property
36 | def focused(self) -> bool:
37 | return self._focused
38 |
39 | def set_bounds(self, bounds: ViewBounds):
40 | """
41 | Set the rendering bounds for this view.
42 |
43 | Args:
44 | bounds: The ViewBounds object defining the drawing area
45 | """
46 | self.bounds = bounds
47 |
48 | def set_focused(self, focused: bool):
49 | """
50 | Set the focus state for this view.
51 |
52 | Args:
53 | focused: True if this view should be focused, False otherwise
54 | """
55 | self._focused = focused
56 |
57 | @abstractmethod
58 | def render(self, stdscr: curses.window) -> None:
59 | """
60 | Render the view within its bounds.
61 |
62 | Args:
63 | stdscr: The curses window to render on
64 | """
65 | pass
66 |
67 | @abstractmethod
68 | def handle_input(self, key: int) -> bool:
69 | """
70 | Handle input. Return True if consumed, False to pass through.
71 |
72 | Args:
73 | key: The input key code
74 |
75 | Returns:
76 | bool: True if input was consumed, False to pass through
77 | """
78 | pass
79 |
80 | def focusable(self) -> bool:
81 | """
82 | Return True if this view can receive focus.
83 |
84 | Returns:
85 | bool: True if this view can receive focus
86 | """
87 | return True
88 |
89 | def draw_outer_footer(self, stdscr: curses.window, text: str) -> None:
90 | """
91 | Draw context-sensitive help footer with pipe-separated items.
92 |
93 | Args:
94 | stdscr: The curses window to draw on
95 | text: The footer text to display (pipe-separated items)
96 | """
97 | height, width = stdscr.getmaxyx()
98 | footer_line: int = height - 1
99 | footer_line_text: str = BaseCursesView._get_full_width_line(stdscr)
100 | outer_theme_pair: int = self.session.get_theme_color_pair(ThemeDefinition.HEADER_OUTER)
101 |
102 | safe_addstr(stdscr, footer_line, 0, footer_line_text, outer_theme_pair)
103 | items = [item.strip() for item in text.split(LAYOUT_FOOTER_SEPARATOR)]
104 | available_width = width - 4 - 2 # 4 for right margin, 2 for left padding
105 |
106 | display_text: str = ""
107 | test_text: str = ""
108 | test_text_length: int = 0
109 | for i in range(len(items)):
110 | test_text = LAYOUT_FOOTER_SEPARATOR.join(items[:i+1])
111 | test_text_length = len(test_text)
112 | if test_text_length <= available_width:
113 | display_text = test_text
114 | else:
115 | break
116 |
117 | # doesn't fit indicator
118 | display_text_length: int = len(display_text)
119 | if test_text_length > available_width:
120 | display_text += f"{(width - display_text_length - 5) * ' '} »"
121 |
122 | if display_text:
123 | outer_header_theme_pair: int = self.session.get_theme_color_pair(ThemeDefinition.HEADER_OUTER)
124 | safe_addstr(stdscr, footer_line, 1, display_text, outer_header_theme_pair)
125 |
126 | def draw_outer_header(self, stdscr: curses.window) -> None:
127 | """
128 | Draw the outer application header with the module name and version.
129 |
130 | Args:
131 | stdscr: The curses window to draw on
132 | """
133 | _, width = stdscr.getmaxyx()
134 | style: int = self.session.get_theme_color_pair(ThemeDefinition.HEADER_OUTER)
135 |
136 | full_width_line: str = BaseCursesView._get_full_width_line(stdscr)
137 | header_label_text: str = f"{module_name} --interactive"
138 | header_version_text: str = f"v{module_version}"
139 | header_version_x: int = max(0, width - len(header_version_text) - 2)
140 |
141 | safe_addstr(stdscr, 0, 0, full_width_line, style)
142 | if len(header_label_text) < width - 2:
143 | safe_addstr(stdscr, 0, 1, header_label_text, style)
144 |
145 | if header_version_x > len(header_label_text) + 3:
146 | safe_addstr(stdscr, 0, header_version_x, header_version_text, style)
147 |
148 | def draw_inner_footer(self, stdscr: curses.window, bounds: ViewBounds, text: str) -> None:
149 | """
150 | Draw context-sensitive help footer.
151 |
152 | Args:
153 | stdscr: The curses window to draw on
154 | bounds: The view bounds defining the drawing area
155 | text: The footer text to display
156 | """
157 | footer_y: int = bounds.y + bounds.height - 1
158 | line_of_whitespace: str = self._get_bounded_line()
159 | display_text: str = text or ""
160 | display_text_max: int = len(line_of_whitespace) - 2
161 | if len(display_text) > display_text_max:
162 | display_text = f"{display_text[:display_text_max - 1]}…"
163 |
164 | line: str = f" {display_text}".ljust(len(line_of_whitespace))
165 | safe_addstr(stdscr, footer_y, bounds.x, line, self._get_inner_header_style())
166 |
167 | def draw_inner_header(self, stdscr: curses.window, bounds: ViewBounds, text: str) -> None:
168 | """
169 | Draw the inner header for this view section.
170 |
171 | Args:
172 | stdscr: The curses window to draw on
173 | bounds: The view bounds defining the drawing area
174 | text: The header text to display
175 | """
176 |
177 | line_of_whitespace: str = self._get_bounded_line()
178 | display_text: str = text or ""
179 | max_text_width: int = len(line_of_whitespace) - 2
180 | if len(display_text) > max_text_width:
181 | display_text = f"{display_text[:max_text_width - 1]}…"
182 |
183 | line: str = f" {display_text}".ljust(len(line_of_whitespace))
184 | safe_addstr(stdscr, bounds.y, bounds.x, line, self._get_inner_header_style())
185 |
186 |
187 | @staticmethod
188 | def _get_full_width_line(stdscr: curses.window) -> str:
189 | """
190 | Get a line that fills the terminal width.
191 |
192 | Args:
193 | stdscr: The curses window to get dimensions from
194 |
195 | Returns:
196 | str: A string of spaces filling the terminal width
197 | """
198 | _, width = stdscr.getmaxyx()
199 | return " " * (width - OUTER_WIDTH_RIGHT_MARGIN)
200 |
201 | @staticmethod
202 | def url_for_display(url: str) -> str:
203 | """
204 | Remove protocol prefix and trailing slash from URL for display.
205 |
206 | Args:
207 | url: The URL to clean for display
208 |
209 | Returns:
210 | str: The cleaned URL without protocol and trailing slash
211 | """
212 | return REGEX_DISPLAY_URL_CLEAN.sub("", url)
213 |
214 | @staticmethod
215 | def humanized_bytes(result: ResourceResult) -> str:
216 | """
217 | Convert resource size to human-readable format (B, KB, MB).
218 |
219 | Args:
220 | result: The ResourceResult containing size information
221 |
222 | Returns:
223 | str: Human-readable size string (e.g., "1.5MB", "512KB", "128B")
224 | """
225 | display: str = ""
226 | if result is not None:
227 | size: int = result.size
228 | if isinstance(size, int):
229 | if size >= 1024 * 1024:
230 | display = f"{size/(1024*1024):.1f}MB"
231 | elif size >= 1024:
232 | display = f"{size/1024:.1f}KB"
233 | else:
234 | display = f"{size}B"
235 | return display
236 |
237 | def _get_inner_header_style(self) -> int:
238 | """
239 | Get the appropriate header style based on focus state.
240 |
241 | Returns:
242 | int: The theme color pair for the header
243 | """
244 | if self._focused:
245 | return self.session.get_theme_color_pair(ThemeDefinition.HEADER_ACTIVE)
246 | else:
247 | return self.session.get_theme_color_pair(ThemeDefinition.HEADER_INACTIVE)
248 |
249 | def _get_input_style(self) -> int:
250 | """
251 | Get the appropriate input style based on focus and selection state.
252 |
253 | Returns:
254 | int: The style attributes for input rendering
255 | """
256 | if self._focused and self._selected_index == 0:
257 | return curses.A_REVERSE
258 | else:
259 | return self.session.get_theme_color_pair(ThemeDefinition.INACTIVE_QUERY)
260 |
261 | def _get_bounded_line(self) -> str:
262 | """
263 | Get a line of spaces that fits within the view bounds.
264 |
265 | Returns:
266 | str: A string of spaces matching the view width
267 | """
268 | return " " * self.bounds.width
269 |
270 | def _renderable(self, stdscr: curses.window) -> bool:
271 | """
272 | Check if the view can be rendered within the current terminal bounds.
273 |
274 | Args:
275 | stdscr: The curses window to check dimensions against
276 |
277 | Returns:
278 | bool: True if the view can be rendered, False otherwise
279 | """
280 | terminal_height, terminal_width = stdscr.getmaxyx()
281 | if self.bounds.y >= terminal_height or self.bounds.x >= terminal_width or self.bounds.width <= 0 or self.bounds.height <= 0:
282 | return False
283 | return True
284 |
```
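A minimal sketch of how a concrete view might subclass BaseCursesView. The MessageView class and its message argument are hypothetical, used only to illustrate the render/handle_input contract and the helpers shown above; in the package itself, views are created, bounded, and focused by the InteractiveSession.

```python
import curses

from mcp_server_webcrawl.interactive.ui import safe_addstr
from mcp_server_webcrawl.interactive.views.base import BaseCursesView

class MessageView(BaseCursesView):
    """Hypothetical view that renders a single line of text within its bounds."""

    def __init__(self, session, message: str):
        super().__init__(session)
        self._message = message

    def render(self, stdscr: curses.window) -> None:
        # bail out when the terminal is smaller than the assigned bounds
        if not self._renderable(stdscr):
            return
        self.draw_inner_header(stdscr, self.bounds, "Message")
        max_width = max(0, self.bounds.width - 2)
        safe_addstr(stdscr, self.bounds.y + 1, self.bounds.x + 1,
                self._message[:max_width], self._get_input_style())

    def handle_input(self, key: int) -> bool:
        # consume nothing; navigation stays with the session
        return False
```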
--------------------------------------------------------------------------------
/src/mcp_server_webcrawl/crawlers/archivebox/tests.py:
--------------------------------------------------------------------------------
```python
1 | from mcp_server_webcrawl.crawlers.archivebox.crawler import ArchiveBoxCrawler
2 | from mcp_server_webcrawl.crawlers.archivebox.adapter import ArchiveBoxManager
3 | from mcp_server_webcrawl.crawlers.base.tests import BaseCrawlerTests
4 | from mcp_server_webcrawl.crawlers import get_fixture_directory
5 | from mcp_server_webcrawl.utils.logger import get_logger
6 |
7 | # calculate ids for ArchiveBox working directories using the same hash function as adapter
8 | EXAMPLE_SITE_ID = ArchiveBoxManager.string_to_id("example")
9 | PRAGMAR_SITE_ID = ArchiveBoxManager.string_to_id("pragmar")
10 |
11 | logger = get_logger()
12 |
13 | class ArchiveBoxTests(BaseCrawlerTests):
14 | """
15 | Test suite for the ArchiveBox crawler implementation.
16 | Uses wrapped test methods from BaseCrawlerTests adapted for ArchiveBox's multi-instance structure.
17 | """
18 |
19 | def setUp(self):
20 | """
21 | Set up the test environment with fixture data.
22 | """
23 | super().setUp()
24 | self._datasrc = get_fixture_directory() / "archivebox"
25 |
26 | def test_archivebox_pulse(self):
27 | """
28 | Test basic crawler initialization.
29 | """
30 | crawler = ArchiveBoxCrawler(self._datasrc)
31 | self.assertIsNotNone(crawler)
32 | self.assertTrue(self._datasrc.is_dir())
33 |
34 | def test_archivebox_sites(self):
35 | """
36 | Test site retrieval API functionality.
37 | """
38 | crawler = ArchiveBoxCrawler(self._datasrc)
39 |
40 | # should have multiple sites (example and pragmar working directories)
41 | sites_json = crawler.get_sites_api()
42 | self.assertGreaterEqual(sites_json.total, 2, "ArchiveBox should have multiple working directories as sites")
43 |
44 | # test pragmar site specifically
45 | self.run_pragmar_site_tests(crawler, PRAGMAR_SITE_ID)
46 |
47 | def test_archivebox_search(self):
48 | """
49 | Test boolean search functionality.
50 | """
51 | crawler = ArchiveBoxCrawler(self._datasrc)
52 | self.run_pragmar_search_tests(crawler, PRAGMAR_SITE_ID)
53 |
54 | def test_pragmar_tokenizer(self):
55 | """
56 | Test tokenizer search functionality.
57 | """
58 | crawler = ArchiveBoxCrawler(self._datasrc)
59 | self.run_pragmar_tokenizer_tests(crawler, PRAGMAR_SITE_ID)
60 |
61 | def test_archivebox_resources(self):
62 | """
63 | Test resource retrieval API functionality with various parameters.
64 | """
65 | crawler = ArchiveBoxCrawler(self._datasrc)
66 | self.run_sites_resources_tests(crawler, PRAGMAR_SITE_ID, EXAMPLE_SITE_ID)
67 |
68 | def test_archivebox_images(self):
69 | """
70 | Test ArchiveBox image handling and thumbnails.
71 | """
72 | crawler = ArchiveBoxCrawler(self._datasrc)
73 | self.run_pragmar_image_tests(crawler, PRAGMAR_SITE_ID)
74 |
75 | def test_archivebox_sorts(self):
76 | """
77 | Test random sort functionality using the '?' sort parameter.
78 | """
79 | crawler = ArchiveBoxCrawler(self._datasrc)
80 | self.run_pragmar_sort_tests(crawler, PRAGMAR_SITE_ID)
81 |
82 | def test_archivebox_content_parsing(self):
83 | """
84 | Test content type detection and parsing for ArchiveBox resources.
85 | """
86 | crawler = ArchiveBoxCrawler(self._datasrc)
87 | self.run_pragmar_content_tests(crawler, PRAGMAR_SITE_ID, False)
88 |
89 | def test_archivebox_url_reconstruction(self):
90 | """
91 | Test URL reconstruction from ArchiveBox metadata.
92 | """
93 | crawler = ArchiveBoxCrawler(self._datasrc)
94 |
95 | url_resources = crawler.get_resources_api(sites=[PRAGMAR_SITE_ID], limit=20)
96 | self.assertGreater(url_resources.total, 0, "Should have resources with reconstructed URLs")
97 |
98 | for resource in url_resources._results:
99 | # URLs should be valid HTTP/HTTPS (except for archivebox:// fallbacks)
100 | self.assertTrue(
101 | resource.url.startswith(('http://', 'https://', 'archivebox://')),
102 | f"URL should have valid scheme: {resource.url}"
103 | )
104 |
105 | # should not end with index.html (stripped during reconstruction)
106 | self.assertFalse(
107 | resource.url.endswith('/index.html'),
108 | f"URL should not end with index.html: {resource.url}"
109 | )
110 |
111 | def test_archivebox_deduplication(self):
112 | """
113 | Test resource deduplication across timestamped entries.
114 | """
115 | crawler = ArchiveBoxCrawler(self._datasrc)
116 |
117 | # get all resources from pragmar site
118 | all_resources = crawler.get_resources_api(sites=[PRAGMAR_SITE_ID], limit=100)
119 | self.assertGreater(all_resources.total, 0, "Should have resources")
120 |
121 | # check for URL uniqueness (deduplication should ensure unique URLs)
122 | urls_found = [r.url for r in all_resources._results]
123 | unique_urls = set(urls_found)
124 |
125 | # should have deduplication working (though some URLs might legitimately appear multiple times
126 | # if they're different resources, like different timestamps of the same page)
127 | self.assertLessEqual(len(unique_urls), len(urls_found),
128 | "URL deduplication should work properly")
129 |
130 | def test_archivebox_metadata_parsing(self):
131 | """
132 | Test JSON metadata parsing from ArchiveBox files.
133 | """
134 | crawler = ArchiveBoxCrawler(self._datasrc)
135 |
136 | # get resources with headers from pragmar site
137 | header_resources = crawler.get_resources_api(
138 | sites=[PRAGMAR_SITE_ID],
139 | fields=["headers"],
140 | limit=10
141 | )
142 |
143 | if header_resources.total > 0:
144 | headers_found = 0
145 | for resource in header_resources._results:
146 | resource_dict = resource.to_dict()
147 | if "headers" in resource_dict and resource_dict["headers"]:
148 | headers_found += 1
149 | self.assertIn("HTTP/1.0", resource_dict["headers"],
150 | "Headers should contain HTTP status line")
151 |
152 | # at least some resources should have parsed headers
153 | self.assertGreater(headers_found, 0, "Should find resources with parsed headers")
154 |
155 | def test_archivebox_timestamped_structure(self):
156 | """
157 | Test handling of ArchiveBox's timestamped entry structure.
158 | """
159 | crawler = ArchiveBoxCrawler(self._datasrc)
160 |
161 | # get resources with timestamps from pragmar site
162 | timestamp_resources = crawler.get_resources_api(
163 | sites=[PRAGMAR_SITE_ID],
164 | fields=["created", "modified"],
165 | limit=10
166 | )
167 |
168 | self.assertGreater(timestamp_resources.total, 0, "Should have timestamped resources")
169 |
170 | for resource in timestamp_resources._results:
171 | resource_dict = resource.to_dict()
172 |
173 | # should have timestamp information
174 | self.assertIsNotNone(resource_dict.get("created"),
175 | "Should have created timestamp from entry directory")
176 | self.assertIsNotNone(resource_dict.get("modified"),
177 | "Should have modified timestamp from entry directory")
178 |
179 | def test_archivebox_error_resilience(self):
180 | """
181 | Test resilience to malformed JSON and missing files.
182 | """
183 | crawler = ArchiveBoxCrawler(self._datasrc)
184 |
185 | # should continue processing despite any JSON parsing errors
186 | all_resources = crawler.get_resources_api(sites=[PRAGMAR_SITE_ID])
187 |
188 | # verify we got some resources despite potential errors
189 | self.assertGreater(all_resources.total, 0,
190 | "Should process entries even with JSON parsing errors")
191 |
192 | # verify resources have reasonable defaults
193 | for resource in all_resources._results:
194 | self.assertIsNotNone(resource.url, "URL should always be set")
195 | self.assertIsInstance(resource.status, int, "Status should be integer")
196 | self.assertGreaterEqual(resource.status, 0, "Status should be non-negative")
197 | self.assertLessEqual(resource.status, 599, "Status should be valid HTTP status")
198 |
199 | def test_archivebox_multi_site(self):
200 | """
201 | Test that multiple ArchiveBox working directories are treated as separate sites.
202 | """
203 | crawler = ArchiveBoxCrawler(self._datasrc)
204 |
205 | # get resources from each site separately
206 | example_resources = crawler.get_resources_api(sites=[EXAMPLE_SITE_ID], limit=10)
207 | pragmar_resources = crawler.get_resources_api(
208 | query="url: pragmar.com",
209 | sites=[PRAGMAR_SITE_ID],
210 | limit=10)
211 |
212 | # print(example_resources.to_dict())
213 | # print(pragmar_resources.to_dict())
214 |
215 | # both sites should have resources
216 | self.assertGreater(example_resources.total, 0, "Example site should have resources")
217 | self.assertGreater(pragmar_resources.total, 0, "Pragmar site should have resources")
218 |
219 | # URLs should reflect the appropriate domains
220 | example_urls = [r.url for r in example_resources._results]
221 | pragmar_urls = [r.url for r in pragmar_resources._results]
222 |
223 | # verify site separation (pragmar resources should be about pragmar.com)
224 | pragmar_domain_urls = [url for url in pragmar_urls if "pragmar.com" in url]
225 | self.assertGreater(len(pragmar_domain_urls), 0,
226 | "Pragmar site should contain pragmar.com URLs")
227 |
228 | def test_report(self):
229 | """
230 | Run test report for ArchiveBox archive.
231 | """
232 | crawler = ArchiveBoxCrawler(self._datasrc)
233 |
234 | # generate report using pragmar site ID
235 | report = self.run_pragmar_report(crawler, PRAGMAR_SITE_ID, "ArchiveBox")
236 | logger.info(report)
237 |
238 | # basic validation that report contains expected content
239 | self.assertIn("ArchiveBox", report, "Report should mention ArchiveBox")
240 | self.assertIn("Total pages:", report, "Report should show page counts")
```
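A brief sketch of running the ArchiveBox suite directly with unittest, assuming the fixture directory returned by get_fixture_directory() contains the "example" and "pragmar" working directories referenced above:

```python
import unittest

from mcp_server_webcrawl.crawlers.archivebox.tests import ArchiveBoxTests

# load and run the ArchiveBox test case; verbosity=2 prints one line per test
suite = unittest.defaultTestLoader.loadTestsFromTestCase(ArchiveBoxTests)
unittest.TextTestRunner(verbosity=2).run(suite)
```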
--------------------------------------------------------------------------------
/docs/index.html:
--------------------------------------------------------------------------------
```html
1 |
2 |
3 | <!DOCTYPE html>
4 | <html class="writer-html5" lang="en" data-content_root="./">
5 | <head>
6 | <meta charset="utf-8" /><meta name="viewport" content="width=device-width, initial-scale=1" />
7 |
8 | <meta name="viewport" content="width=device-width, initial-scale=1.0" />
9 | <title>mcp-server-webcrawl — mcp-server-webcrawl documentation</title>
10 | <link rel="stylesheet" type="text/css" href="_static/pygments.css?v=80d5e7a1" />
11 | <link rel="stylesheet" type="text/css" href="_static/css/theme.css?v=e59714d7" />
12 |
13 |
14 | <script src="_static/jquery.js?v=5d32c60e"></script>
15 | <script src="_static/_sphinx_javascript_frameworks_compat.js?v=2cd50e6c"></script>
16 | <script src="_static/documentation_options.js?v=5929fcd5"></script>
17 | <script src="_static/doctools.js?v=888ff710"></script>
18 | <script src="_static/sphinx_highlight.js?v=dc90522c"></script>
19 | <script src="_static/js/theme.js"></script>
20 | <link rel="index" title="Index" href="genindex.html" />
21 | <link rel="search" title="Search" href="search.html" />
22 | <link rel="next" title="Installation" href="installation.html" />
23 | </head>
24 |
25 | <body class="wy-body-for-nav">
26 | <div class="wy-grid-for-nav">
27 | <nav data-toggle="wy-nav-shift" class="wy-nav-side">
28 | <div class="wy-side-scroll">
29 | <div class="wy-side-nav-search" >
30 |
31 |
32 |
33 | <a href="#" class="icon icon-home">
34 | mcp-server-webcrawl
35 | </a>
36 | <div role="search">
37 | <form id="rtd-search-form" class="wy-form" action="search.html" method="get">
38 | <input type="text" name="q" placeholder="Search docs" aria-label="Search docs" />
39 | <input type="hidden" name="check_keywords" value="yes" />
40 | <input type="hidden" name="area" value="default" />
41 | </form>
42 | </div>
43 | </div><div class="wy-menu wy-menu-vertical" data-spy="affix" role="navigation" aria-label="Navigation menu">
44 | <p class="caption" role="heading"><span class="caption-text">Contents:</span></p>
45 | <ul>
46 | <li class="toctree-l1"><a class="reference internal" href="installation.html">Installation</a></li>
47 | <li class="toctree-l1"><a class="reference internal" href="guides.html">Setup Guides</a></li>
48 | <li class="toctree-l1"><a class="reference internal" href="usage.html">Usage</a></li>
49 | <li class="toctree-l1"><a class="reference internal" href="prompts.html">Prompt Routines</a></li>
50 | <li class="toctree-l1"><a class="reference internal" href="interactive.html">Interactive Mode</a></li>
51 | <li class="toctree-l1"><a class="reference internal" href="modules.html">mcp_server_webcrawl</a></li>
52 | </ul>
53 |
54 | </div>
55 | </div>
56 | </nav>
57 |
58 | <section data-toggle="wy-nav-shift" class="wy-nav-content-wrap"><nav class="wy-nav-top" aria-label="Mobile navigation menu" >
59 | <i data-toggle="wy-nav-top" class="fa fa-bars"></i>
60 | <a href="#">mcp-server-webcrawl</a>
61 | </nav>
62 |
63 | <div class="wy-nav-content">
64 | <div class="rst-content">
65 | <div role="navigation" aria-label="Page navigation">
66 | <ul class="wy-breadcrumbs">
67 | <li><a href="#" class="icon icon-home" aria-label="Home"></a></li>
68 | <li class="breadcrumb-item active">mcp-server-webcrawl</li>
69 | <li class="wy-breadcrumbs-aside">
70 | <a href="_sources/index.rst.txt" rel="nofollow"> View page source</a>
71 | </li>
72 | </ul>
73 | <hr/>
74 | </div>
75 | <div role="main" class="document" itemscope="itemscope" itemtype="http://schema.org/Article">
76 | <div itemprop="articleBody">
77 |
78 | <a class="reference internal image-reference" href="_images/mcpswc.svg"><img alt="mcp-server-webcrawl heading" class="align-center" src="_images/mcpswc.svg" width="100%" /></a>
79 | <div style="text-align: center; margin-bottom: 2em;">
80 | <a href="https://pragmar.com/mcp-server-webcrawl/" style="margin: 0 4px;">Website</a> |
81 | <a href="https://github.com/pragmar/mcp-server-webcrawl" style="margin: 0 4px;">GitHub</a> |
82 | <a href="https://pragmar.github.io/mcp-server-webcrawl/" style="margin: 0 4px;">Docs</a> |
83 | <a href="https://pypi.org/project/mcp-server-webcrawl/" style="margin: 0 4px;">PyPI</a>
84 |
85 | </div><section id="mcp-server-webcrawl">
86 | <h1>mcp-server-webcrawl<a class="headerlink" href="#mcp-server-webcrawl" title="Link to this heading"></a></h1>
87 | <p>Advanced search and retrieval for web crawler data. With <strong>mcp-server-webcrawl</strong>, your AI client filters
88 | and analyzes web content under your direction or autonomously. The server includes a full-text search
89 | interface with boolean support, and resource filtering by type, HTTP status, and more.</p>
90 | <p><strong>mcp-server-webcrawl</strong> provides the LLM a complete menu with which to search your web content, and works with
91 | a variety of web crawlers:</p>
92 | <table class="docutils align-default" id="id7">
93 | <caption><span class="caption-text">Supported Crawlers</span><a class="headerlink" href="#id7" title="Link to this table"></a></caption>
94 | <colgroup>
95 | <col style="width: 30.0%" />
96 | <col style="width: 50.0%" />
97 | <col style="width: 20.0%" />
98 | </colgroup>
99 | <thead>
100 | <tr class="row-odd"><th class="head"><p>Crawler/Format</p></th>
101 | <th class="head"><p>Description</p></th>
102 | <th class="head"><p>Setup Guide</p></th>
103 | </tr>
104 | </thead>
105 | <tbody>
106 | <tr class="row-even"><td><p><a class="reference external" href="https://archivebox.io">ArchiveBox</a></p></td>
107 | <td><p>Self-hosted web archiving tool</p></td>
108 | <td><p><a class="reference external" href="https://pragmar.github.io/mcp-server-webcrawl/guides/archivebox.html">Setup Guide</a></p></td>
109 | </tr>
110 | <tr class="row-odd"><td><p><a class="reference external" href="https://www.httrack.com">HTTrack</a></p></td>
111 | <td><p>GUI/CLI website mirroring tool</p></td>
112 | <td><p><a class="reference external" href="https://pragmar.github.io/mcp-server-webcrawl/guides/httrack.html">Setup Guide</a></p></td>
113 | </tr>
114 | <tr class="row-even"><td><p><a class="reference external" href="https://interro.bot">InterroBot</a></p></td>
115 | <td><p>GUI crawler and analyzer</p></td>
116 | <td><p><a class="reference external" href="https://pragmar.github.io/mcp-server-webcrawl/guides/interrobot.html">Setup Guide</a></p></td>
117 | </tr>
118 | <tr class="row-odd"><td><p><a class="reference external" href="https://github.com/projectdiscovery/katana">Katana</a></p></td>
119 | <td><p>CLI security-focused crawler</p></td>
120 | <td><p><a class="reference external" href="https://pragmar.github.io/mcp-server-webcrawl/guides/katana.html">Setup Guide</a></p></td>
121 | </tr>
122 | <tr class="row-even"><td><p><a class="reference external" href="https://crawler.siteone.io">SiteOne</a></p></td>
123 | <td><p>GUI crawler and analyzer</p></td>
124 | <td><p><a class="reference external" href="https://pragmar.github.io/mcp-server-webcrawl/guides/siteone.html">Setup Guide</a></p></td>
125 | </tr>
126 | <tr class="row-odd"><td><p><a class="reference external" href="https://en.wikipedia.org/wiki/WARC_(file_format)">WARC</a></p></td>
127 | <td><p>Standard web archive format</p></td>
128 | <td><p><a class="reference external" href="https://pragmar.github.io/mcp-server-webcrawl/guides/warc.html">Setup Guide</a></p></td>
129 | </tr>
130 | <tr class="row-even"><td><p><a class="reference external" href="https://en.wikipedia.org/wiki/Wget">wget</a></p></td>
131 | <td><p>CLI website mirroring tool</p></td>
132 | <td><p><a class="reference external" href="https://pragmar.github.io/mcp-server-webcrawl/guides/wget.html">Setup Guide</a></p></td>
133 | </tr>
134 | </tbody>
135 | </table>
136 | <p><strong>mcp-server-webcrawl</strong> is free and open source, and requires Claude Desktop and Python (>=3.10). It is installed from the command line via pip:</p>
137 | <div class="highlight-bash notranslate"><div class="highlight"><pre><span></span>pip<span class="w"> </span>install<span class="w"> </span>mcp-server-webcrawl
138 | </pre></div>
139 | </div>
140 | <iframe width="560" height="315" style="display: block;margin-bottom:1rem;" src="https://www.youtube.com/embed/Sid-GBxII1o" frameborder="0" allowfullscreen></iframe><div class="toctree-wrapper compound">
141 | <p class="caption" role="heading"><span class="caption-text">Contents:</span></p>
142 | <ul>
143 | <li class="toctree-l1"><a class="reference internal" href="installation.html">Installation</a></li>
144 | <li class="toctree-l1"><a class="reference internal" href="guides.html">Setup Guides</a></li>
145 | <li class="toctree-l1"><a class="reference internal" href="usage.html">Usage</a></li>
146 | <li class="toctree-l1"><a class="reference internal" href="prompts.html">Prompt Routines</a></li>
147 | <li class="toctree-l1"><a class="reference internal" href="interactive.html">Interactive Mode</a></li>
148 | <li class="toctree-l1"><a class="reference internal" href="modules.html">mcp_server_webcrawl</a></li>
149 | </ul>
150 | </div>
151 | <section id="indices-and-tables">
152 | <h2>Indices and tables<a class="headerlink" href="#indices-and-tables" title="Link to this heading"></a></h2>
153 | <ul class="simple">
154 | <li><p><a class="reference internal" href="genindex.html"><span class="std std-ref">Index</span></a></p></li>
155 | <li><p><a class="reference internal" href="py-modindex.html"><span class="std std-ref">Module Index</span></a></p></li>
156 | <li><p><a class="reference internal" href="search.html"><span class="std std-ref">Search Page</span></a></p></li>
157 | </ul>
158 | </section>
159 | </section>
160 |
161 |
162 | </div>
163 | </div>
164 | <footer><div class="rst-footer-buttons" role="navigation" aria-label="Footer">
165 | <a href="installation.html" class="btn btn-neutral float-right" title="Installation" accesskey="n" rel="next">Next <span class="fa fa-arrow-circle-right" aria-hidden="true"></span></a>
166 | </div>
167 |
168 | <hr/>
169 |
170 | <div role="contentinfo">
171 | <p>© Copyright 2025, pragmar.</p>
172 | </div>
173 |
174 | Built with <a href="https://www.sphinx-doc.org/">Sphinx</a> using a
175 | <a href="https://github.com/readthedocs/sphinx_rtd_theme">theme</a>
176 | provided by <a href="https://readthedocs.org">Read the Docs</a>.
177 |
178 |
179 | </footer>
180 | </div>
181 | </div>
182 | </section>
183 | </div>
184 | <script>
185 | jQuery(function () {
186 | SphinxRtdTheme.Navigation.enable(true);
187 | });
188 | </script>
189 |
190 | </body>
191 | </html>
```
--------------------------------------------------------------------------------
/src/mcp_server_webcrawl/crawlers/warc/adapter.py:
--------------------------------------------------------------------------------
```python
1 | import email.utils
2 | import os
3 | import sqlite3
4 | import warcio
5 |
6 | from contextlib import closing
7 | from datetime import datetime, timezone
8 | from pathlib import Path
9 | from typing import Final
10 | from warcio.recordloader import ArcWarcRecord
11 |
12 | from mcp_server_webcrawl.crawlers.base.adapter import (
13 | IndexState,
14 | IndexStatus,
15 | SitesGroup,
16 | INDEXED_BATCH_SIZE,
17 | INDEXED_WARC_EXTENSIONS,
18 | )
19 | from mcp_server_webcrawl.crawlers.base.indexed import IndexedManager
20 | from mcp_server_webcrawl.models.resources import (
21 | ResourceResult,
22 | ResourceResultType,
23 | RESOURCES_LIMIT_DEFAULT,
24 | )
25 | from mcp_server_webcrawl.models.sites import (
26 | SiteResult,
27 | SiteType,
28 | SITES_FIELDS_DEFAULT,
29 | SITES_FIELDS_BASE,
30 | )
31 | from mcp_server_webcrawl.utils.logger import get_logger
32 |
33 | logger = get_logger()
34 |
35 |
36 | class WarcManager(IndexedManager):
37 | """
38 | Manages WARC file data in in-memory SQLite databases.
39 | Provides connection pooling and caching for efficient access.
40 | """
41 |
42 | def __init__(self) -> None:
43 | """Initialize the WARC manager with empty cache and statistics."""
44 | super().__init__()
45 |
46 | def _load_site_data(self, connection: sqlite3.Connection, warc_path: Path,
47 | site_id: int, index_state: IndexState = None) -> None:
48 | """
49 | Load a WARC file into the database with batch processing for better performance.
50 |
51 | Args:
52 | connection: SQLite connection
53 | warc_path: path to the WARC file
54 | site_id: ID for the site
55 | index_state: IndexState object for tracking progress
56 | """
57 | if not warc_path.exists() or not warc_path.is_file():
58 | logger.error(f"WARC file not found or not a file: {warc_path}")
59 | return
60 |
61 | with closing(connection.cursor()) as cursor:
62 | if index_state is not None:
63 | index_state.set_status(IndexStatus.INDEXING)
64 | try:
65 | batch_insert_resource_results: list[ResourceResult] = []
66 | batch_count: int = 0
67 | with open(warc_path, "rb") as stream:
68 | for warc_record in warcio.ArchiveIterator(stream):
69 |
70 | if index_state is not None and index_state.is_timeout():
71 | index_state.set_status(IndexStatus.PARTIAL)
72 | # commit current batch and shut it down
73 | if batch_insert_resource_results:
74 | self._execute_batch_insert(connection, cursor, batch_insert_resource_results)
75 | return
76 |
77 | if warc_record is not None and warc_record.rec_type == "response":
78 | resource_result: ResourceResult = self._prepare_warc_record(warc_record, site_id)
79 | if resource_result:
80 | batch_insert_resource_results.append(resource_result)
81 | if index_state is not None:
82 | index_state.increment_processed()
83 |
84 | batch_count += 1
85 | if batch_count >= INDEXED_BATCH_SIZE:
86 | self._execute_batch_insert(connection, cursor, batch_insert_resource_results)
87 | batch_insert_resource_results = []
88 | batch_count = 0
89 |
90 | # batch insert remaining
91 | if batch_insert_resource_results:
92 | self._execute_batch_insert(connection, cursor, batch_insert_resource_results)
93 |
94 | if index_state is not None and index_state.status == IndexStatus.INDEXING:
95 | index_state.set_status(IndexStatus.COMPLETE)
96 |
97 | except Exception as ex:
98 | logger.error(f"Error processing WARC file {warc_path}: {ex}")
99 | if index_state is not None:
100 | index_state.set_status(IndexStatus.FAILED)
101 |
102 | def _prepare_warc_record(self, record: ArcWarcRecord, site_id: int) -> ResourceResult | None:
103 | """
104 | Prepare a WARC record for batch insertion.
105 |
106 | Args:
107 | record: a warcio record object
108 | site_id: ID for the site
109 |
110 | Returns:
111 | ResourceResult ready for batch insertion, or None if processing fails
112 | """
113 | try:
114 | url: str = record.rec_headers.get_header("WARC-Target-URI")
115 | content_type: str = record.http_headers.get_header("Content-Type", "")
116 | status: int = int(record.http_headers.get_statuscode() or 200)  # default when the status line is missing
117 | resource_type: ResourceResultType = self._determine_resource_type(content_type)
118 | content: bytes = record.content_stream().read()
119 | content_size: int = len(content)
120 |
121 | if self._is_text_content(content_type):
122 | try:
123 | content_str: str = content.decode("utf-8")
124 | except UnicodeDecodeError:
125 | content_str = None
126 | else:
127 | content_str = None
128 |
129 | warc_date = record.rec_headers.get_header("WARC-Date")
130 | if warc_date:
131 | try:
132 | file_created = datetime.fromisoformat(warc_date.replace('Z', '+00:00'))
133 | except ValueError:
134 | # Fallback to email date parser
135 | try:
136 | time_tuple = email.utils.parsedate_tz(warc_date)
137 | file_created = datetime.fromtimestamp(email.utils.mktime_tz(time_tuple), tz=timezone.utc)
138 | except (ValueError, TypeError):
139 | file_created = datetime.now(timezone.utc)
140 | else:
141 | file_created = None # don't pretend it is now, ResourceResult can survive
142 | file_modified = file_created # like file stat indexes, these are equivalent
143 |
144 | result = ResourceResult(
145 | id=IndexedManager.string_to_id(url),
146 | site=site_id,
147 | created=file_created,
148 | modified=file_modified,
149 | url=url,
150 | type=resource_type,
151 | status=status,
152 | headers=record.http_headers.to_str(),
153 | content=content_str,
154 | size=content_size,
155 | time=0 # time not available
156 | )
157 | return result
158 | except Exception as ex:
159 | logger.error(f"Error processing WARC record for URL {url if 'url' in locals() else 'unknown'}: {ex}")
160 | return None
161 |
162 | manager: WarcManager = WarcManager()
163 |
164 | def get_sites(
165 | datasrc: Path,
166 | ids: list[int] | None = None,
167 | fields: list[str] | None = None
168 | ) -> list[SiteResult]:
169 | """
170 | List WARC files in the datasrc directory as sites.
171 |
172 | Args:
173 | datasrc: path to the directory containing WARC files
174 | ids: optional list of site IDs to filter by
175 | fields: list of fields to include in the response
176 |
177 | Returns:
178 | List of SiteResult objects, one for each WARC file
179 | """
180 | assert datasrc is not None, f"datasrc not provided ({datasrc})"
181 |
182 | # nothing can be done, but don't crash the server either, keep chugging along
183 | if not datasrc.exists():
184 | logger.error(f"Directory not found ({datasrc})")
185 | return []
186 |
187 | # determine which fields to include
188 | selected_fields: set[str] = set(SITES_FIELDS_BASE)
189 | if fields:
190 | valid_fields: set[str] = set(SITES_FIELDS_DEFAULT)
191 | selected_fields.update(f for f in fields if f in valid_fields)
192 | else:
193 | selected_fields.update(SITES_FIELDS_DEFAULT)
194 |
195 | results: list[SiteResult] = []
196 |
197 | files_to_check: list[Path] = []
198 | for ext in INDEXED_WARC_EXTENSIONS:
199 | files_to_check.extend(datasrc.glob(f"*{ext}"))
200 |
201 | # map of file_id -> file_path for filtering
202 | file_id_map: dict[int, Path] = {WarcManager.string_to_id(str(os.path.basename(f))): f for f in files_to_check if f is not None}
203 |
204 | if ids:
205 | file_id_map = {id_val: path for id_val, path in file_id_map.items() if id_val in ids}
206 |
207 |
208 | # for site_id, file_path in sorted(file_id_map.items()):
209 | # file_stat = file_path.stat()
210 | # created_time: datetime = datetime.fromtimestamp(file_stat.st_ctime)
211 | # modified_time: datetime = datetime.fromtimestamp(file_stat.st_mtime)
212 | # site: SiteResult = SiteResult(
213 | # path=file_path,
214 | # id=site_id,
215 | # url=str(file_path.absolute()),
216 | # created=created_time if "created" in selected_fields else None,
217 | # modified=modified_time if "modified" in selected_fields else None,
218 | # )
219 | # results.append(site)
220 |
221 | for site_id, file_path in sorted(file_id_map.items()):
222 | file_stat = file_path.stat()
223 | created_time: datetime = datetime.fromtimestamp(file_stat.st_ctime)
224 | modified_time: datetime = datetime.fromtimestamp(file_stat.st_mtime)
225 | site: SiteResult = SiteResult(
226 | path=file_path,
227 | id=site_id,
228 | name=file_path.name, # filename doubles as the site name
229 | type=SiteType.CRAWLED_URL, # each WARC file is treated as a single-site crawl
230 | urls=[str(file_path.absolute())], # absolute file path stands in for the site URL
231 | created=created_time if "created" in selected_fields else None,
232 | modified=modified_time if "modified" in selected_fields else None,
233 | )
234 | results.append(site)
235 | return results
236 |
237 | def get_resources(
238 | datasrc: Path,
239 | sites: list[int] | None = None,
240 | query: str = "",
241 | fields: list[str] | None = None,
242 | sort: str | None = None,
243 | limit: int = RESOURCES_LIMIT_DEFAULT,
244 | offset: int = 0,
245 | ) -> tuple[list[ResourceResult], int, IndexState]:
246 | """
247 | Get resources from WARC files using in-memory SQLite.
248 |
249 | Args:
250 | datasrc: path to the directory containing WARC files
251 | sites: optional list of site IDs to filter by
252 | query: search query string
253 | fields: optional list of fields to include in response
254 | sort: sort order for results
255 | limit: maximum number of results to return
256 | offset: number of results to skip for pagination
257 |
258 | Returns:
259 | Tuple of (list of ResourceResult objects, total count, IndexState)
260 | """
261 | sites_results: list[SiteResult] = get_sites(datasrc=datasrc, ids=sites)
262 | assert sites_results, "At least one site is required to search"
263 | site_paths = [site.path for site in sites_results]
264 | sites_group = SitesGroup(datasrc, sites, site_paths)
265 | return manager.get_resources_for_sites_group(sites_group, query, fields, sort, limit, offset)
266 |
```
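A minimal sketch of querying the WARC adapter directly, outside the MCP server. The directory path and query string are placeholders; the field-query style matches the queries used in the tests elsewhere in this repository:

```python
from pathlib import Path

from mcp_server_webcrawl.crawlers.warc.adapter import get_resources, get_sites

# placeholder path to a directory of WARC captures (e.g. .warc / .warc.gz files)
datasrc = Path("/path/to/warc/files")

# each WARC file in the directory is exposed as a separate site
sites = get_sites(datasrc)
for site in sites:
    print(site.id, site.name)

if sites:
    # search the first WARC file; returns (results, total count, index state)
    resources, total, index_state = get_resources(
        datasrc,
        sites=[sites[0].id],
        query="url: example.com",  # placeholder field query
        limit=10,
    )
    print(f"{total} matches, showing {len(resources)}")
```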