# mcp_server_webcrawl
This is page 3 of 35. Use http://codebase.md/pragmar/mcp_server_webcrawl?lines=true&page={x} to view the full context.

# Directory Structure

```
├── .gitignore
├── CONTRIBUTING.md
├── docs
│   ├── _images
│   │   ├── interactive.document.webp
│   │   ├── interactive.search.webp
│   │   └── mcpswc.svg
│   ├── _modules
│   │   ├── index.html
│   │   ├── mcp_server_webcrawl
│   │   │   ├── crawlers
│   │   │   │   ├── archivebox
│   │   │   │   │   ├── adapter.html
│   │   │   │   │   ├── crawler.html
│   │   │   │   │   └── tests.html
│   │   │   │   ├── base
│   │   │   │   │   ├── adapter.html
│   │   │   │   │   ├── api.html
│   │   │   │   │   ├── crawler.html
│   │   │   │   │   ├── indexed.html
│   │   │   │   │   └── tests.html
│   │   │   │   ├── httrack
│   │   │   │   │   ├── adapter.html
│   │   │   │   │   ├── crawler.html
│   │   │   │   │   └── tests.html
│   │   │   │   ├── interrobot
│   │   │   │   │   ├── adapter.html
│   │   │   │   │   ├── crawler.html
│   │   │   │   │   └── tests.html
│   │   │   │   ├── katana
│   │   │   │   │   ├── adapter.html
│   │   │   │   │   ├── crawler.html
│   │   │   │   │   └── tests.html
│   │   │   │   ├── siteone
│   │   │   │   │   ├── adapter.html
│   │   │   │   │   ├── crawler.html
│   │   │   │   │   └── tests.html
│   │   │   │   ├── warc
│   │   │   │   │   ├── adapter.html
│   │   │   │   │   ├── crawler.html
│   │   │   │   │   └── tests.html
│   │   │   │   └── wget
│   │   │   │       ├── adapter.html
│   │   │   │       ├── crawler.html
│   │   │   │       └── tests.html
│   │   │   ├── crawlers.html
│   │   │   ├── extras
│   │   │   │   ├── markdown.html
│   │   │   │   ├── regex.html
│   │   │   │   ├── snippets.html
│   │   │   │   ├── thumbnails.html
│   │   │   │   └── xpath.html
│   │   │   ├── interactive
│   │   │   │   ├── highlights.html
│   │   │   │   ├── search.html
│   │   │   │   ├── session.html
│   │   │   │   └── ui.html
│   │   │   ├── main.html
│   │   │   ├── models
│   │   │   │   ├── resources.html
│   │   │   │   └── sites.html
│   │   │   ├── templates
│   │   │   │   └── tests.html
│   │   │   ├── utils
│   │   │   │   ├── blobs.html
│   │   │   │   ├── cli.html
│   │   │   │   ├── logger.html
│   │   │   │   ├── querycache.html
│   │   │   │   ├── server.html
│   │   │   │   └── tools.html
│   │   │   └── utils.html
│   │   └── re.html
│   ├── _sources
│   │   ├── guides
│   │   │   ├── archivebox.rst.txt
│   │   │   ├── httrack.rst.txt
│   │   │   ├── interrobot.rst.txt
│   │   │   ├── katana.rst.txt
│   │   │   ├── siteone.rst.txt
│   │   │   ├── warc.rst.txt
│   │   │   └── wget.rst.txt
│   │   ├── guides.rst.txt
│   │   ├── index.rst.txt
│   │   ├── installation.rst.txt
│   │   ├── interactive.rst.txt
│   │   ├── mcp_server_webcrawl.crawlers.archivebox.rst.txt
│   │   ├── mcp_server_webcrawl.crawlers.base.rst.txt
│   │   ├── mcp_server_webcrawl.crawlers.httrack.rst.txt
│   │   ├── mcp_server_webcrawl.crawlers.interrobot.rst.txt
│   │   ├── mcp_server_webcrawl.crawlers.katana.rst.txt
│   │   ├── mcp_server_webcrawl.crawlers.rst.txt
│   │   ├── mcp_server_webcrawl.crawlers.siteone.rst.txt
│   │   ├── mcp_server_webcrawl.crawlers.warc.rst.txt
│   │   ├── mcp_server_webcrawl.crawlers.wget.rst.txt
│   │   ├── mcp_server_webcrawl.extras.rst.txt
│   │   ├── mcp_server_webcrawl.interactive.rst.txt
│   │   ├── mcp_server_webcrawl.models.rst.txt
│   │   ├── mcp_server_webcrawl.rst.txt
│   │   ├── mcp_server_webcrawl.templates.rst.txt
│   │   ├── mcp_server_webcrawl.utils.rst.txt
│   │   ├── modules.rst.txt
│   │   ├── prompts.rst.txt
│   │   └── usage.rst.txt
│   ├── _static
│   │   ├── _sphinx_javascript_frameworks_compat.js
│   │   ├── basic.css
│   │   ├── css
│   │   │   ├── badge_only.css
│   │   │   ├── fonts
│   │   │   │   ├── fontawesome-webfont.eot
│   │   │   │   ├── fontawesome-webfont.svg
│   │   │   │   ├── fontawesome-webfont.ttf
│   │   │   │   ├── fontawesome-webfont.woff
│   │   │   │   ├── fontawesome-webfont.woff2
│   │   │   │   ├── lato-bold-italic.woff
│   │   │   │   ├── lato-bold-italic.woff2
│   │   │   │   ├── lato-bold.woff
│   │   │   │   ├── lato-bold.woff2
│   │   │   │   ├── lato-normal-italic.woff
│   │   │   │   ├── lato-normal-italic.woff2
│   │   │   │   ├── lato-normal.woff
│   │   │   │   ├── lato-normal.woff2
│   │   │   │   ├── Roboto-Slab-Bold.woff
│   │   │   │   ├── Roboto-Slab-Bold.woff2
│   │   │   │   ├── Roboto-Slab-Regular.woff
│   │   │   │   └── Roboto-Slab-Regular.woff2
│   │   │   └── theme.css
│   │   ├── doctools.js
│   │   ├── documentation_options.js
│   │   ├── file.png
│   │   ├── fonts
│   │   │   ├── Lato
│   │   │   │   ├── lato-bold.eot
│   │   │   │   ├── lato-bold.ttf
│   │   │   │   ├── lato-bold.woff
│   │   │   │   ├── lato-bold.woff2
│   │   │   │   ├── lato-bolditalic.eot
│   │   │   │   ├── lato-bolditalic.ttf
│   │   │   │   ├── lato-bolditalic.woff
│   │   │   │   ├── lato-bolditalic.woff2
│   │   │   │   ├── lato-italic.eot
│   │   │   │   ├── lato-italic.ttf
│   │   │   │   ├── lato-italic.woff
│   │   │   │   ├── lato-italic.woff2
│   │   │   │   ├── lato-regular.eot
│   │   │   │   ├── lato-regular.ttf
│   │   │   │   ├── lato-regular.woff
│   │   │   │   └── lato-regular.woff2
│   │   │   └── RobotoSlab
│   │   │       ├── roboto-slab-v7-bold.eot
│   │   │       ├── roboto-slab-v7-bold.ttf
│   │   │       ├── roboto-slab-v7-bold.woff
│   │   │       ├── roboto-slab-v7-bold.woff2
│   │   │       ├── roboto-slab-v7-regular.eot
│   │   │       ├── roboto-slab-v7-regular.ttf
│   │   │       ├── roboto-slab-v7-regular.woff
│   │   │       └── roboto-slab-v7-regular.woff2
│   │   ├── images
│   │   │   ├── interactive.document.png
│   │   │   ├── interactive.document.webp
│   │   │   ├── interactive.search.png
│   │   │   ├── interactive.search.webp
│   │   │   └── mcpswc.svg
│   │   ├── jquery.js
│   │   ├── js
│   │   │   ├── badge_only.js
│   │   │   ├── theme.js
│   │   │   └── versions.js
│   │   ├── language_data.js
│   │   ├── minus.png
│   │   ├── plus.png
│   │   ├── pygments.css
│   │   ├── searchtools.js
│   │   └── sphinx_highlight.js
│   ├── .buildinfo
│   ├── .nojekyll
│   ├── genindex.html
│   ├── guides
│   │   ├── archivebox.html
│   │   ├── httrack.html
│   │   ├── interrobot.html
│   │   ├── katana.html
│   │   ├── siteone.html
│   │   ├── warc.html
│   │   └── wget.html
│   ├── guides.html
│   ├── index.html
│   ├── installation.html
│   ├── interactive.html
│   ├── mcp_server_webcrawl.crawlers.archivebox.html
│   ├── mcp_server_webcrawl.crawlers.base.html
│   ├── mcp_server_webcrawl.crawlers.html
│   ├── mcp_server_webcrawl.crawlers.httrack.html
│   ├── mcp_server_webcrawl.crawlers.interrobot.html
│   ├── mcp_server_webcrawl.crawlers.katana.html
│   ├── mcp_server_webcrawl.crawlers.siteone.html
│   ├── mcp_server_webcrawl.crawlers.warc.html
│   ├── mcp_server_webcrawl.crawlers.wget.html
│   ├── mcp_server_webcrawl.extras.html
│   ├── mcp_server_webcrawl.html
│   ├── mcp_server_webcrawl.interactive.html
│   ├── mcp_server_webcrawl.models.html
│   ├── mcp_server_webcrawl.templates.html
│   ├── mcp_server_webcrawl.utils.html
│   ├── modules.html
│   ├── objects.inv
│   ├── prompts.html
│   ├── py-modindex.html
│   ├── search.html
│   ├── searchindex.js
│   └── usage.html
├── LICENSE
├── MANIFEST.in
├── prompts
│   ├── audit404.md
│   ├── auditfiles.md
│   ├── auditperf.md
│   ├── auditseo.md
│   ├── gopher.md
│   ├── README.md
│   └── testsearch.md
├── pyproject.toml
├── README.md
├── setup.py
├── sphinx
│   ├── _static
│   │   └── images
│   │       ├── interactive.document.png
│   │       ├── interactive.document.webp
│   │       ├── interactive.search.png
│   │       ├── interactive.search.webp
│   │       └── mcpswc.svg
│   ├── _templates
│   │   └── layout.html
│   ├── conf.py
│   ├── guides
│   │   ├── archivebox.rst
│   │   ├── httrack.rst
│   │   ├── interrobot.rst
│   │   ├── katana.rst
│   │   ├── siteone.rst
│   │   ├── warc.rst
│   │   └── wget.rst
│   ├── guides.rst
│   ├── index.rst
│   ├── installation.rst
│   ├── interactive.rst
│   ├── make.bat
│   ├── Makefile
│   ├── mcp_server_webcrawl.crawlers.archivebox.rst
│   ├── mcp_server_webcrawl.crawlers.base.rst
│   ├── mcp_server_webcrawl.crawlers.httrack.rst
│   ├── mcp_server_webcrawl.crawlers.interrobot.rst
│   ├── mcp_server_webcrawl.crawlers.katana.rst
│   ├── mcp_server_webcrawl.crawlers.rst
│   ├── mcp_server_webcrawl.crawlers.siteone.rst
│   ├── mcp_server_webcrawl.crawlers.warc.rst
│   ├── mcp_server_webcrawl.crawlers.wget.rst
│   ├── mcp_server_webcrawl.extras.rst
│   ├── mcp_server_webcrawl.interactive.rst
│   ├── mcp_server_webcrawl.models.rst
│   ├── mcp_server_webcrawl.rst
│   ├── mcp_server_webcrawl.templates.rst
│   ├── mcp_server_webcrawl.utils.rst
│   ├── modules.rst
│   ├── prompts.rst
│   ├── readme.txt
│   └── usage.rst
└── src
    └── mcp_server_webcrawl
        ├── __init__.py
        ├── crawlers
        │   ├── __init__.py
        │   ├── archivebox
        │   │   ├── __init__.py
        │   │   ├── adapter.py
        │   │   ├── crawler.py
        │   │   └── tests.py
        │   ├── base
        │   │   ├── __init__.py
        │   │   ├── adapter.py
        │   │   ├── api.py
        │   │   ├── crawler.py
        │   │   ├── indexed.py
        │   │   └── tests.py
        │   ├── httrack
        │   │   ├── __init__.py
        │   │   ├── adapter.py
        │   │   ├── crawler.py
        │   │   └── tests.py
        │   ├── interrobot
        │   │   ├── __init__.py
        │   │   ├── adapter.py
        │   │   ├── crawler.py
        │   │   └── tests.py
        │   ├── katana
        │   │   ├── __init__.py
        │   │   ├── adapter.py
        │   │   ├── crawler.py
        │   │   └── tests.py
        │   ├── siteone
        │   │   ├── __init__.py
        │   │   ├── adapter.py
        │   │   ├── crawler.py
        │   │   └── tests.py
        │   ├── warc
        │   │   ├── __init__.py
        │   │   ├── adapter.py
        │   │   ├── crawler.py
        │   │   └── tests.py
        │   └── wget
        │       ├── __init__.py
        │       ├── adapter.py
        │       ├── crawler.py
        │       └── tests.py
        ├── extras
        │   ├── __init__.py
        │   ├── markdown.py
        │   ├── regex.py
        │   ├── snippets.py
        │   ├── thumbnails.py
        │   └── xpath.py
        ├── interactive
        │   ├── __init__.py
        │   ├── highlights.py
        │   ├── search.py
        │   ├── session.py
        │   ├── ui.py
        │   └── views
        │       ├── base.py
        │       ├── document.py
        │       ├── help.py
        │       ├── requirements.py
        │       ├── results.py
        │       └── searchform.py
        ├── main.py
        ├── models
        │   ├── __init__.py
        │   ├── base.py
        │   ├── resources.py
        │   └── sites.py
        ├── settings.py
        ├── templates
        │   ├── __init__.py
        │   ├── markdown.xslt
        │   ├── tests_core.html
        │   └── tests.py
        └── utils
            ├── __init__.py
            ├── cli.py
            ├── logger.py
            ├── parser.py
            ├── parsetab.py
            ├── search.py
            ├── server.py
            ├── tests.py
            └── tools.py
```

# Files

--------------------------------------------------------------------------------
/docs/_modules/mcp_server_webcrawl/crawlers/siteone/crawler.html:
--------------------------------------------------------------------------------

```html
  1 | 
  2 | 
  3 | <!DOCTYPE html>
  4 | <html class="writer-html5" lang="en" data-content_root="../../../../">
  5 | <head>
  6 |   <meta charset="utf-8" />
  7 |   <meta name="viewport" content="width=device-width, initial-scale=1.0" />
  8 |   <title>mcp_server_webcrawl.crawlers.siteone.crawler &mdash; mcp-server-webcrawl  documentation</title>
  9 |       <link rel="stylesheet" type="text/css" href="../../../../_static/pygments.css?v=80d5e7a1" />
 10 |       <link rel="stylesheet" type="text/css" href="../../../../_static/css/theme.css?v=e59714d7" />
 11 | 
 12 |   
 13 |       <script src="../../../../_static/jquery.js?v=5d32c60e"></script>
 14 |       <script src="../../../../_static/_sphinx_javascript_frameworks_compat.js?v=2cd50e6c"></script>
 15 |       <script src="../../../../_static/documentation_options.js?v=5929fcd5"></script>
 16 |       <script src="../../../../_static/doctools.js?v=888ff710"></script>
 17 |       <script src="../../../../_static/sphinx_highlight.js?v=dc90522c"></script>
 18 |     <script src="../../../../_static/js/theme.js"></script>
 19 |     <link rel="index" title="Index" href="../../../../genindex.html" />
 20 |     <link rel="search" title="Search" href="../../../../search.html" /> 
 21 | </head>
 22 | 
 23 | <body class="wy-body-for-nav"> 
 24 |   <div class="wy-grid-for-nav">
 25 |     <nav data-toggle="wy-nav-shift" class="wy-nav-side">
 26 |       <div class="wy-side-scroll">
 27 |         <div class="wy-side-nav-search" >
 28 | 
 29 |           
 30 |           
 31 |           <a href="../../../../index.html" class="icon icon-home">
 32 |             mcp-server-webcrawl
 33 |           </a>
 34 | <div role="search">
 35 |   <form id="rtd-search-form" class="wy-form" action="../../../../search.html" method="get">
 36 |     <input type="text" name="q" placeholder="Search docs" aria-label="Search docs" />
 37 |     <input type="hidden" name="check_keywords" value="yes" />
 38 |     <input type="hidden" name="area" value="default" />
 39 |   </form>
 40 | </div>
 41 |         </div><div class="wy-menu wy-menu-vertical" data-spy="affix" role="navigation" aria-label="Navigation menu">
 42 |               <p class="caption" role="heading"><span class="caption-text">Contents:</span></p>
 43 | <ul>
 44 | <li class="toctree-l1"><a class="reference internal" href="../../../../installation.html">Installation</a></li>
 45 | <li class="toctree-l1"><a class="reference internal" href="../../../../guides.html">Setup Guides</a></li>
 46 | <li class="toctree-l1"><a class="reference internal" href="../../../../usage.html">Usage</a></li>
 47 | <li class="toctree-l1"><a class="reference internal" href="../../../../modules.html">mcp_server_webcrawl</a></li>
 48 | </ul>
 49 | 
 50 |         </div>
 51 |       </div>
 52 |     </nav>
 53 | 
 54 |     <section data-toggle="wy-nav-shift" class="wy-nav-content-wrap"><nav class="wy-nav-top" aria-label="Mobile navigation menu" >
 55 |           <i data-toggle="wy-nav-top" class="fa fa-bars"></i>
 56 |           <a href="../../../../index.html">mcp-server-webcrawl</a>
 57 |       </nav>
 58 | 
 59 |       <div class="wy-nav-content">
 60 |         <div class="rst-content">
 61 |           <div role="navigation" aria-label="Page navigation">
 62 |   <ul class="wy-breadcrumbs">
 63 |       <li><a href="../../../../index.html" class="icon icon-home" aria-label="Home"></a></li>
 64 |           <li class="breadcrumb-item"><a href="../../../index.html">Module code</a></li>
 65 |           <li class="breadcrumb-item"><a href="../../crawlers.html">mcp_server_webcrawl.crawlers</a></li>
 66 |       <li class="breadcrumb-item active">mcp_server_webcrawl.crawlers.siteone.crawler</li>
 67 |       <li class="wy-breadcrumbs-aside">
 68 |       </li>
 69 |   </ul>
 70 |   <hr/>
 71 | </div>
 72 |           <div role="main" class="document" itemscope="itemscope" itemtype="http://schema.org/Article">
 73 |            <div itemprop="articleBody">
 74 |              
 75 |   <h1>Source code for mcp_server_webcrawl.crawlers.siteone.crawler</h1><div class="highlight"><pre>
 76 | <span></span><span class="kn">from</span> <span class="nn">pathlib</span> <span class="kn">import</span> <span class="n">Path</span>
 77 | 
 78 | <span class="kn">from</span> <span class="nn">mcp_server_webcrawl.crawlers.base.indexed</span> <span class="kn">import</span> <span class="n">IndexedCrawler</span>
 79 | <span class="kn">from</span> <span class="nn">mcp_server_webcrawl.crawlers.siteone.adapter</span> <span class="kn">import</span> <span class="n">get_sites</span><span class="p">,</span> <span class="n">get_resources</span>
 80 | <span class="kn">from</span> <span class="nn">mcp_server_webcrawl.utils.logger</span> <span class="kn">import</span> <span class="n">get_logger</span>
 81 | 
 82 | <span class="n">logger</span> <span class="o">=</span> <span class="n">get_logger</span><span class="p">()</span>
 83 | 
 84 | <div class="viewcode-block" id="SiteOneCrawler">
 85 | <a class="viewcode-back" href="../../../../mcp_server_webcrawl.crawlers.siteone.html#mcp_server_webcrawl.crawlers.siteone.crawler.SiteOneCrawler">[docs]</a>
 86 | <span class="k">class</span> <span class="nc">SiteOneCrawler</span><span class="p">(</span><span class="n">IndexedCrawler</span><span class="p">):</span>
 87 | <span class="w">    </span><span class="sd">&quot;&quot;&quot;</span>
 88 | <span class="sd">    A crawler implementation for SiteOne captured sites.</span>
 89 | <span class="sd">    Provides functionality for accessing and searching web content from SiteOne captures.</span>
 90 | <span class="sd">    SiteOne merges a wget archive with a custom SiteOne generated log to acquire more</span>
 91 | <span class="sd">    fields than wget can alone.</span>
 92 | <span class="sd">    &quot;&quot;&quot;</span>
 93 | 
 94 | <div class="viewcode-block" id="SiteOneCrawler.__init__">
 95 | <a class="viewcode-back" href="../../../../mcp_server_webcrawl.crawlers.siteone.html#mcp_server_webcrawl.crawlers.siteone.crawler.SiteOneCrawler.__init__">[docs]</a>
 96 |     <span class="k">def</span> <span class="fm">__init__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">datasrc</span><span class="p">:</span> <span class="n">Path</span><span class="p">):</span>
 97 | <span class="w">        </span><span class="sd">&quot;&quot;&quot;</span>
 98 | <span class="sd">        Initialize the SiteOne crawler with a data source directory.</span>
 99 | 
100 | <span class="sd">        Args:</span>
101 | <span class="sd">            datasrc: The input argument as Path, it must be a directory containing</span>
102 | <span class="sd">                SiteOne captures organized as subdirectories</span>
103 | 
104 | <span class="sd">        Raises:</span>
105 | <span class="sd">            AssertionError: If datasrc is None or not a directory</span>
106 | <span class="sd">        &quot;&quot;&quot;</span>
107 |         <span class="k">assert</span> <span class="n">datasrc</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span><span class="p">,</span> <span class="sa">f</span><span class="s2">&quot;SiteOneCrawler needs a datasrc, regardless of action&quot;</span>
108 |         <span class="k">assert</span> <span class="n">datasrc</span><span class="o">.</span><span class="n">is_dir</span><span class="p">(),</span> <span class="s2">&quot;SiteOneCrawler datasrc must be a directory&quot;</span>
109 | 
110 |         <span class="nb">super</span><span class="p">()</span><span class="o">.</span><span class="fm">__init__</span><span class="p">(</span><span class="n">datasrc</span><span class="p">,</span> <span class="n">get_sites</span><span class="p">,</span> <span class="n">get_resources</span><span class="p">)</span></div>
111 | </div>
112 | 
113 | </pre></div>
114 | 
115 |            </div>
116 |           </div>
117 |           <footer>
118 | 
119 |   <hr/>
120 | 
121 |   <div role="contentinfo">
122 |     <p>&#169; Copyright 2025, pragmar.</p>
123 |   </div>
124 | 
125 |   Built with <a href="https://www.sphinx-doc.org/">Sphinx</a> using a
126 |     <a href="https://github.com/readthedocs/sphinx_rtd_theme">theme</a>
127 |     provided by <a href="https://readthedocs.org">Read the Docs</a>.
128 |    
129 | 
130 | </footer>
131 |         </div>
132 |       </div>
133 |     </section>
134 |   </div>
135 |   <script>
136 |       jQuery(function () {
137 |           SphinxRtdTheme.Navigation.enable(true);
138 |       });
139 |   </script> 
140 | 
141 | </body>
142 | </html>
```
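
A minimal instantiation sketch based on the `SiteOneCrawler` source rendered above; the capture directory path is hypothetical:

```python
from pathlib import Path

from mcp_server_webcrawl.crawlers.siteone.crawler import SiteOneCrawler

# hypothetical path: a directory of SiteOne captures, one subdirectory per site
datasrc = Path("/path/to/siteone/captures")
crawler = SiteOneCrawler(datasrc)  # asserts datasrc is a directory, then delegates to IndexedCrawler
```

The other `IndexedCrawler` subclasses in this repository (HTTrack, wget, and so on) appear to follow the same constructor pattern.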

--------------------------------------------------------------------------------
/src/mcp_server_webcrawl/crawlers/wget/adapter.py:
--------------------------------------------------------------------------------

```python
  1 | import os
  2 | import sqlite3
  3 | import traceback
  4 | import re
  5 | 
  6 | from datetime import timezone
  7 | from contextlib import closing
  8 | from datetime import datetime
  9 | from pathlib import Path
 10 | 
 11 | from mcp_server_webcrawl.crawlers.base.adapter import (
 12 |     BaseManager,
 13 |     IndexState,
 14 |     IndexStatus,
 15 |     SitesGroup,
 16 |     INDEXED_BATCH_SIZE,
 17 |     INDEXED_RESOURCE_DEFAULT_PROTOCOL,
 18 |     INDEXED_TYPE_MAPPING,
 19 |     INDEXED_IGNORE_DIRECTORIES,
 20 | )
 21 | from mcp_server_webcrawl.crawlers.base.indexed import IndexedManager
 22 | from mcp_server_webcrawl.models.resources import (
 23 |     ResourceResult,
 24 |     ResourceResultType,
 25 |     RESOURCES_LIMIT_DEFAULT,
 26 | )
 27 | from mcp_server_webcrawl.models.sites import (
 28 |     SiteResult,
 29 | )
 30 | from mcp_server_webcrawl.utils.logger import get_logger
 31 | 
 32 | logger = get_logger()
 33 | 
 34 | 
 35 | class WgetManager(IndexedManager):
 36 |     """
 37 |     Manages wget directory data in in-memory SQLite databases.
 38 |     Provides connection pooling and caching for efficient access.
 39 |     """
 40 | 
 41 |     def __init__(self) -> None:
 42 |         """Initialize the wget manager with empty cache and statistics."""
 43 |         super().__init__()
 44 | 
 45 |     def _load_site_data(self, connection: sqlite3.Connection, directory: Path,
 46 |         site_id: int, index_state: IndexState = None) -> None:
 47 |         """
 48 |         Load a wget directory into the database with parallel processing and batch SQL insertions.
 49 | 
 50 |         Args:
 51 |             connection: SQLite connection
 52 |             directory: path to the wget directory
 53 |             site_id: id for the site
 54 |             index_state: indexState object for tracking progress
 55 |         """
 56 |         if not directory.exists() or not directory.is_dir():
 57 |             logger.error(f"Directory not found or not a directory: {directory}")
 58 |             return
 59 | 
 60 |         if index_state is not None:
 61 |             index_state.set_status(IndexStatus.INDEXING)
 62 | 
 63 |         # collect files to process
 64 |         file_paths = []
 65 |         for root, _, files in os.walk(directory):
 66 |             for filename in files:
 67 |                 if filename == "robots.txt":
 68 |                     continue
 69 | 
 70 |                 rel_path = Path(root).relative_to(directory)
 71 |                 ignore_file = False
 72 |                 for ignore_dir in INDEXED_IGNORE_DIRECTORIES:
 73 |                     if ignore_dir in str(rel_path):
 74 |                         ignore_file = True
 75 |                         break
 76 | 
 77 |                 if not ignore_file:
 78 |                     file_paths.append(Path(root) / filename)
 79 | 
 80 |         # each crawler is a little different
 81 |         with closing(connection.cursor()) as cursor:
 82 |             for i in range(0, len(file_paths), INDEXED_BATCH_SIZE):
 83 |                 if index_state is not None and index_state.is_timeout():
 84 |                     index_state.set_status(IndexStatus.PARTIAL)
 85 |                     return
 86 | 
 87 |                 batch_file_paths: list[Path] = file_paths[i:i+INDEXED_BATCH_SIZE]
 88 |                 batch_file_contents = BaseManager.read_files(batch_file_paths)
 89 |                 batch_insert_resource_results: list[ResourceResult] = []
 90 |                 for file_path, content in batch_file_contents.items():
 91 |                     try:
 92 |                         result: ResourceResult = self._prepare_wget_record(file_path, site_id, directory, content)
 93 |                         if result:
 94 |                             batch_insert_resource_results.append(result)
 95 |                             if index_state is not None:
 96 |                                 index_state.increment_processed()
 97 |                     except Exception as ex:
 98 |                         logger.error(f"Error processing file {file_path}: {ex}\n{traceback.format_exc()}")
 99 | 
100 |                 self._execute_batch_insert(connection, cursor, batch_insert_resource_results)
101 | 
102 |             if index_state is not None and index_state.status == IndexStatus.INDEXING:
103 |                 index_state.set_status(IndexStatus.COMPLETE)
104 | 
105 |     def _prepare_wget_record(self, file_path: Path, site_id: int, base_dir: Path, content: str = None) -> ResourceResult | None:
106 |         """
107 |         Prepare a record for batch insertion from a wget file.
108 | 
109 |         Args:
110 |             file_path: path to the wget file
111 |             site_id: id for the site
112 |             base_dir: base directory for the wget capture
113 |             content: optional pre-loaded file content
114 | 
115 |         Returns:
 116 |             ResourceResult ready for insertion, or None if processing fails
117 |         """
118 |         try:
119 |             relative_path = file_path.relative_to(base_dir)
120 |             url = f"{INDEXED_RESOURCE_DEFAULT_PROTOCOL}{base_dir.name}/{str(relative_path).replace(os.sep, '/')}"
121 | 
122 |             # wget is creating ./index.html from ./ in most cases. eliminate it to preserve homepage sort
123 |             # which is way more important than the (wget manufactured) filename reference
124 |             url = re.sub(r"/index\.html($|\?)", r"/\1", url)
125 | 
126 |             decruftified_path = BaseManager.decruft_path(str(file_path))
127 |             extension = Path(decruftified_path).suffix.lower()
128 |             resource_type = INDEXED_TYPE_MAPPING.get(extension, ResourceResultType.OTHER)
129 |             file_stat = file_path.stat()
130 |             file_size = file_stat.st_size
131 |             file_created = datetime.fromtimestamp(file_stat.st_ctime, tz=timezone.utc)
132 |             file_modified = datetime.fromtimestamp(file_stat.st_mtime, tz=timezone.utc)
133 | 
134 |             # use pre-loaded content if available, otherwise rely on read_file_contents
135 |             file_content = content
136 |             if file_content is None:
137 |                 file_content = BaseManager.read_file_contents(file_path, resource_type)
138 | 
139 |             return ResourceResult(
140 |                 id=BaseManager.string_to_id(url),
141 |                 site=site_id,
142 |                 created=file_created,
143 |                 modified=file_modified,
144 |                 url=url,
145 |                 type=resource_type,
146 |                 status=200,
147 |                 headers=BaseManager.get_basic_headers(file_size, resource_type, file_path),
148 |                 content=file_content,
149 |                 size=file_size,
150 |                 time=0,
151 |             )
152 |         except Exception as ex:
153 |             logger.error(f"Error preparing record for file {file_path}: {ex}")
154 |             return None
155 | 
156 | 
157 | manager: WgetManager = WgetManager()
158 | 
159 | def get_sites(
160 |         datasrc: Path,
161 |         ids: list[int] | None = None,
162 |         fields: list[str] | None = None
163 |     ) -> list[SiteResult]:
164 |     """
165 |     List site directories in the datasrc directory as sites.
166 | 
167 |     Args:
168 |         datasrc: path to the directory containing site subdirectories
169 |         ids: optional list of site IDs to filter by
170 |         fields: optional list of fields to include in the response
171 | 
172 |     Returns:
173 |         List of SiteResult objects, one for each site directory
174 | 
175 |     Notes:
176 |         Returns an empty list if the datasrc directory doesn't exist.
177 |     """
178 |     return manager.get_sites_for_directories(datasrc, ids, fields)
179 | 
180 | def get_resources(
181 |     datasrc: Path,
182 |     sites: list[int] | None = None,
183 |     query: str = "",
184 |     fields: list[str] | None = None,
185 |     sort: str | None = None,
186 |     limit: int = RESOURCES_LIMIT_DEFAULT,
187 |     offset: int = 0,
188 | 
189 | ) -> tuple[list[ResourceResult], int, IndexState]:
190 |     """
191 |     Get resources from wget directories using in-memory SQLite.
192 | 
193 |     Args:
194 |         datasrc: path to the directory containing wget captures
195 |         sites: optional list of site IDs to filter by
196 |         query: search query string
197 |         fields: optional list of fields to include in response
198 |         sort: sort order for results
199 |         limit: maximum number of results to return
200 |         offset: number of results to skip for pagination
201 | 
202 |     Returns:
 203 |         Tuple of (list of ResourceResult objects, total count, IndexState)
204 |     """
205 |     sites_results: list[SiteResult] = get_sites(datasrc=datasrc, ids=sites)
206 |     assert sites_results, "At least one site is required to search"
207 |     site_paths = [site.path for site in sites_results]
208 |     sites_group = SitesGroup(datasrc, sites, site_paths)
209 |     return manager.get_resources_for_sites_group(sites_group, query, fields, sort, limit, offset)
210 | 
```
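
The module-level `get_sites` and `get_resources` functions at the bottom of the file are the adapter's public entry points. A minimal usage sketch, assuming a wget capture directory at the hypothetical path below:

```python
from pathlib import Path

from mcp_server_webcrawl.crawlers.wget.adapter import get_sites, get_resources

# hypothetical datasrc: a directory containing one wget site mirror per subdirectory
datasrc = Path("/path/to/wget/captures")

sites = get_sites(datasrc)  # one SiteResult per site subdirectory
results, total, index_state = get_resources(datasrc, query="contact", limit=10)
for resource in results:
    print(resource.url)
```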

--------------------------------------------------------------------------------
/docs/_modules/mcp_server_webcrawl/utils.html:
--------------------------------------------------------------------------------

```html
  1 | 
  2 | 
  3 | <!DOCTYPE html>
  4 | <html class="writer-html5" lang="en" data-content_root="../../">
  5 | <head>
  6 |   <meta charset="utf-8" />
  7 |   <meta name="viewport" content="width=device-width, initial-scale=1.0" />
  8 |   <title>mcp_server_webcrawl.utils &mdash; mcp-server-webcrawl  documentation</title>
  9 |       <link rel="stylesheet" type="text/css" href="../../_static/pygments.css?v=80d5e7a1" />
 10 |       <link rel="stylesheet" type="text/css" href="../../_static/css/theme.css?v=e59714d7" />
 11 | 
 12 |   
 13 |       <script src="../../_static/jquery.js?v=5d32c60e"></script>
 14 |       <script src="../../_static/_sphinx_javascript_frameworks_compat.js?v=2cd50e6c"></script>
 15 |       <script src="../../_static/documentation_options.js?v=5929fcd5"></script>
 16 |       <script src="../../_static/doctools.js?v=888ff710"></script>
 17 |       <script src="../../_static/sphinx_highlight.js?v=dc90522c"></script>
 18 |     <script src="../../_static/js/theme.js"></script>
 19 |     <link rel="index" title="Index" href="../../genindex.html" />
 20 |     <link rel="search" title="Search" href="../../search.html" /> 
 21 | </head>
 22 | 
 23 | <body class="wy-body-for-nav"> 
 24 |   <div class="wy-grid-for-nav">
 25 |     <nav data-toggle="wy-nav-shift" class="wy-nav-side">
 26 |       <div class="wy-side-scroll">
 27 |         <div class="wy-side-nav-search" >
 28 | 
 29 |           
 30 |           
 31 |           <a href="../../index.html" class="icon icon-home">
 32 |             mcp-server-webcrawl
 33 |           </a>
 34 | <div role="search">
 35 |   <form id="rtd-search-form" class="wy-form" action="../../search.html" method="get">
 36 |     <input type="text" name="q" placeholder="Search docs" aria-label="Search docs" />
 37 |     <input type="hidden" name="check_keywords" value="yes" />
 38 |     <input type="hidden" name="area" value="default" />
 39 |   </form>
 40 | </div>
 41 |         </div><div class="wy-menu wy-menu-vertical" data-spy="affix" role="navigation" aria-label="Navigation menu">
 42 |               <p class="caption" role="heading"><span class="caption-text">Contents:</span></p>
 43 | <ul>
 44 | <li class="toctree-l1"><a class="reference internal" href="../../installation.html">Installation</a></li>
 45 | <li class="toctree-l1"><a class="reference internal" href="../../guides.html">Setup Guides</a></li>
 46 | <li class="toctree-l1"><a class="reference internal" href="../../usage.html">Usage</a></li>
 47 | <li class="toctree-l1"><a class="reference internal" href="../../modules.html">mcp_server_webcrawl</a></li>
 48 | </ul>
 49 | 
 50 |         </div>
 51 |       </div>
 52 |     </nav>
 53 | 
 54 |     <section data-toggle="wy-nav-shift" class="wy-nav-content-wrap"><nav class="wy-nav-top" aria-label="Mobile navigation menu" >
 55 |           <i data-toggle="wy-nav-top" class="fa fa-bars"></i>
 56 |           <a href="../../index.html">mcp-server-webcrawl</a>
 57 |       </nav>
 58 | 
 59 |       <div class="wy-nav-content">
 60 |         <div class="rst-content">
 61 |           <div role="navigation" aria-label="Page navigation">
 62 |   <ul class="wy-breadcrumbs">
 63 |       <li><a href="../../index.html" class="icon icon-home" aria-label="Home"></a></li>
 64 |           <li class="breadcrumb-item"><a href="../index.html">Module code</a></li>
 65 |       <li class="breadcrumb-item active">mcp_server_webcrawl.utils</li>
 66 |       <li class="wy-breadcrumbs-aside">
 67 |       </li>
 68 |   </ul>
 69 |   <hr/>
 70 | </div>
 71 |           <div role="main" class="document" itemscope="itemscope" itemtype="http://schema.org/Article">
 72 |            <div itemprop="articleBody">
 73 |              
 74 |   <h1>Source code for mcp_server_webcrawl.utils</h1><div class="highlight"><pre>
 75 | <span></span><span class="kn">import</span> <span class="nn">re</span>
 76 | 
 77 | <span class="kn">from</span> <span class="nn">datetime</span> <span class="kn">import</span> <span class="n">datetime</span>
 78 | 
 79 | <div class="viewcode-block" id="to_isoformat_zulu">
 80 | <a class="viewcode-back" href="../../mcp_server_webcrawl.utils.html#mcp_server_webcrawl.utils.to_isoformat_zulu">[docs]</a>
 81 | <span class="k">def</span> <span class="nf">to_isoformat_zulu</span><span class="p">(</span><span class="n">dt</span><span class="p">:</span> <span class="n">datetime</span><span class="p">):</span>
 82 | <span class="w">    </span><span class="sd">&quot;&quot;&quot;</span>
 83 | <span class="sd">    Convert datetime to iso Z.</span>
 84 | 
 85 | <span class="sd">    python&lt;=3.10 struggles with Z and fractions of seconds, will</span>
 86 | <span class="sd">    throw. smooth out the iso string, second precision isn&#39;t key here</span>
 87 | <span class="sd">    &quot;&quot;&quot;</span>
 88 |     <span class="k">return</span> <span class="n">dt</span><span class="o">.</span><span class="n">isoformat</span><span class="p">()</span><span class="o">.</span><span class="n">replace</span><span class="p">(</span><span class="s2">&quot;+00:00&quot;</span><span class="p">,</span> <span class="s2">&quot;Z&quot;</span><span class="p">)</span></div>
 89 | 
 90 | 
 91 | <div class="viewcode-block" id="from_isoformat_zulu">
 92 | <a class="viewcode-back" href="../../mcp_server_webcrawl.utils.html#mcp_server_webcrawl.utils.from_isoformat_zulu">[docs]</a>
 93 | <span class="k">def</span> <span class="nf">from_isoformat_zulu</span><span class="p">(</span><span class="n">dt_string</span><span class="p">:</span> <span class="nb">str</span> <span class="o">|</span> <span class="kc">None</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">datetime</span><span class="p">:</span>
 94 | <span class="w">    </span><span class="sd">&quot;&quot;&quot;</span>
 95 | <span class="sd">    Convert ISO string to datetime.</span>
 96 | 
 97 | <span class="sd">    python&lt;=3.10 struggles with Z and fractions of seconds, will</span>
 98 | <span class="sd">    throw. smooth out the iso string, second precision isn&#39;t key here</span>
 99 | <span class="sd">    &quot;&quot;&quot;</span>
100 | 
101 |     <span class="k">if</span> <span class="ow">not</span> <span class="n">dt_string</span><span class="p">:</span>
102 |         <span class="k">return</span> <span class="kc">None</span>
103 |     <span class="n">dt_string</span> <span class="o">=</span> <span class="n">dt_string</span><span class="o">.</span><span class="n">replace</span><span class="p">(</span><span class="s2">&quot;Z&quot;</span><span class="p">,</span> <span class="s2">&quot;+00:00&quot;</span><span class="p">)</span>
104 |     <span class="n">match</span> <span class="o">=</span> <span class="n">re</span><span class="o">.</span><span class="n">match</span><span class="p">(</span><span class="sa">r</span><span class="s2">&quot;(.*\.\d</span><span class="si">{6}</span><span class="s2">)\d*([-+]\d</span><span class="si">{2}</span><span class="s2">:\d</span><span class="si">{2}</span><span class="s2">|$)&quot;</span><span class="p">,</span> <span class="n">dt_string</span><span class="p">)</span>
105 |     <span class="k">if</span> <span class="n">match</span><span class="p">:</span>
106 |         <span class="n">dt_string</span> <span class="o">=</span> <span class="n">match</span><span class="o">.</span><span class="n">group</span><span class="p">(</span><span class="mi">1</span><span class="p">)</span> <span class="o">+</span> <span class="p">(</span><span class="n">match</span><span class="o">.</span><span class="n">group</span><span class="p">(</span><span class="mi">2</span><span class="p">)</span> <span class="ow">or</span> <span class="s2">&quot;&quot;</span><span class="p">)</span>
107 |     <span class="k">return</span> <span class="n">datetime</span><span class="o">.</span><span class="n">fromisoformat</span><span class="p">(</span><span class="n">dt_string</span><span class="p">)</span></div>
108 | 
109 | </pre></div>
110 | 
111 |            </div>
112 |           </div>
113 |           <footer>
114 | 
115 |   <hr/>
116 | 
117 |   <div role="contentinfo">
118 |     <p>&#169; Copyright 2025, pragmar.</p>
119 |   </div>
120 | 
121 |   Built with <a href="https://www.sphinx-doc.org/">Sphinx</a> using a
122 |     <a href="https://github.com/readthedocs/sphinx_rtd_theme">theme</a>
123 |     provided by <a href="https://readthedocs.org">Read the Docs</a>.
124 |    
125 | 
126 | </footer>
127 |         </div>
128 |       </div>
129 |     </section>
130 |   </div>
131 |   <script>
132 |       jQuery(function () {
133 |           SphinxRtdTheme.Navigation.enable(true);
134 |       });
135 |   </script> 
136 | 
137 | </body>
138 | </html>
```
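
A quick round-trip sketch of the two helpers above; the timestamp is arbitrary:

```python
from datetime import datetime, timezone

from mcp_server_webcrawl.utils import to_isoformat_zulu, from_isoformat_zulu

dt = datetime(2025, 1, 1, 12, 30, 45, 123456, tzinfo=timezone.utc)
zulu = to_isoformat_zulu(dt)        # "2025-01-01T12:30:45.123456Z"
parsed = from_isoformat_zulu(zulu)  # back to a timezone-aware datetime
assert parsed == dt
```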

--------------------------------------------------------------------------------
/docs/_modules/mcp_server_webcrawl/crawlers/httrack/crawler.html:
--------------------------------------------------------------------------------

```html
  1 | 
  2 | 
  3 | <!DOCTYPE html>
  4 | <html class="writer-html5" lang="en" data-content_root="../../../../">
  5 | <head>
  6 |   <meta charset="utf-8" />
  7 |   <meta name="viewport" content="width=device-width, initial-scale=1.0" />
  8 |   <title>mcp_server_webcrawl.crawlers.httrack.crawler &mdash; mcp-server-webcrawl  documentation</title>
  9 |       <link rel="stylesheet" type="text/css" href="../../../../_static/pygments.css?v=80d5e7a1" />
 10 |       <link rel="stylesheet" type="text/css" href="../../../../_static/css/theme.css?v=e59714d7" />
 11 | 
 12 |   
 13 |       <script src="../../../../_static/jquery.js?v=5d32c60e"></script>
 14 |       <script src="../../../../_static/_sphinx_javascript_frameworks_compat.js?v=2cd50e6c"></script>
 15 |       <script src="../../../../_static/documentation_options.js?v=5929fcd5"></script>
 16 |       <script src="../../../../_static/doctools.js?v=888ff710"></script>
 17 |       <script src="../../../../_static/sphinx_highlight.js?v=dc90522c"></script>
 18 |     <script src="../../../../_static/js/theme.js"></script>
 19 |     <link rel="index" title="Index" href="../../../../genindex.html" />
 20 |     <link rel="search" title="Search" href="../../../../search.html" /> 
 21 | </head>
 22 | 
 23 | <body class="wy-body-for-nav"> 
 24 |   <div class="wy-grid-for-nav">
 25 |     <nav data-toggle="wy-nav-shift" class="wy-nav-side">
 26 |       <div class="wy-side-scroll">
 27 |         <div class="wy-side-nav-search" >
 28 | 
 29 |           
 30 |           
 31 |           <a href="../../../../index.html" class="icon icon-home">
 32 |             mcp-server-webcrawl
 33 |           </a>
 34 | <div role="search">
 35 |   <form id="rtd-search-form" class="wy-form" action="../../../../search.html" method="get">
 36 |     <input type="text" name="q" placeholder="Search docs" aria-label="Search docs" />
 37 |     <input type="hidden" name="check_keywords" value="yes" />
 38 |     <input type="hidden" name="area" value="default" />
 39 |   </form>
 40 | </div>
 41 |         </div><div class="wy-menu wy-menu-vertical" data-spy="affix" role="navigation" aria-label="Navigation menu">
 42 |               <p class="caption" role="heading"><span class="caption-text">Contents:</span></p>
 43 | <ul>
 44 | <li class="toctree-l1"><a class="reference internal" href="../../../../installation.html">Installation</a></li>
 45 | <li class="toctree-l1"><a class="reference internal" href="../../../../guides.html">Setup Guides</a></li>
 46 | <li class="toctree-l1"><a class="reference internal" href="../../../../usage.html">Usage</a></li>
 47 | <li class="toctree-l1"><a class="reference internal" href="../../../../prompts.html">Prompt Routines</a></li>
 48 | <li class="toctree-l1"><a class="reference internal" href="../../../../modules.html">mcp_server_webcrawl</a></li>
 49 | </ul>
 50 | 
 51 |         </div>
 52 |       </div>
 53 |     </nav>
 54 | 
 55 |     <section data-toggle="wy-nav-shift" class="wy-nav-content-wrap"><nav class="wy-nav-top" aria-label="Mobile navigation menu" >
 56 |           <i data-toggle="wy-nav-top" class="fa fa-bars"></i>
 57 |           <a href="../../../../index.html">mcp-server-webcrawl</a>
 58 |       </nav>
 59 | 
 60 |       <div class="wy-nav-content">
 61 |         <div class="rst-content">
 62 |           <div role="navigation" aria-label="Page navigation">
 63 |   <ul class="wy-breadcrumbs">
 64 |       <li><a href="../../../../index.html" class="icon icon-home" aria-label="Home"></a></li>
 65 |           <li class="breadcrumb-item"><a href="../../../index.html">Module code</a></li>
 66 |           <li class="breadcrumb-item"><a href="../../crawlers.html">mcp_server_webcrawl.crawlers</a></li>
 67 |       <li class="breadcrumb-item active">mcp_server_webcrawl.crawlers.httrack.crawler</li>
 68 |       <li class="wy-breadcrumbs-aside">
 69 |       </li>
 70 |   </ul>
 71 |   <hr/>
 72 | </div>
 73 |           <div role="main" class="document" itemscope="itemscope" itemtype="http://schema.org/Article">
 74 |            <div itemprop="articleBody">
 75 |              
 76 |   <h1>Source code for mcp_server_webcrawl.crawlers.httrack.crawler</h1><div class="highlight"><pre>
 77 | <span></span><span class="kn">from</span> <span class="nn">pathlib</span> <span class="kn">import</span> <span class="n">Path</span>
 78 | 
 79 | <span class="kn">from</span> <span class="nn">mcp_server_webcrawl.crawlers.base.indexed</span> <span class="kn">import</span> <span class="n">IndexedCrawler</span>
 80 | <span class="kn">from</span> <span class="nn">mcp_server_webcrawl.crawlers.httrack.adapter</span> <span class="kn">import</span> <span class="n">get_sites</span><span class="p">,</span> <span class="n">get_resources</span>
 81 | <span class="kn">from</span> <span class="nn">mcp_server_webcrawl.utils.logger</span> <span class="kn">import</span> <span class="n">get_logger</span>
 82 | 
 83 | <span class="n">logger</span> <span class="o">=</span> <span class="n">get_logger</span><span class="p">()</span>
 84 | 
 85 | <div class="viewcode-block" id="HtTrackCrawler">
 86 | <a class="viewcode-back" href="../../../../mcp_server_webcrawl.crawlers.httrack.html#mcp_server_webcrawl.crawlers.httrack.crawler.HtTrackCrawler">[docs]</a>
 87 | <span class="k">class</span> <span class="nc">HtTrackCrawler</span><span class="p">(</span><span class="n">IndexedCrawler</span><span class="p">):</span>
 88 | <span class="w">    </span><span class="sd">&quot;&quot;&quot;</span>
 89 | <span class="sd">    A crawler implementation for HTTrack captured sites.</span>
 90 | <span class="sd">    Provides functionality for accessing and searching web content from HTTrack projects.</span>
 91 | <span class="sd">    HTTrack creates offline mirrors of websites with preserved directory structure</span>
 92 | <span class="sd">    and metadata in hts-log.txt files.</span>
 93 | <span class="sd">    &quot;&quot;&quot;</span>
 94 | 
 95 | <div class="viewcode-block" id="HtTrackCrawler.__init__">
 96 | <a class="viewcode-back" href="../../../../mcp_server_webcrawl.crawlers.httrack.html#mcp_server_webcrawl.crawlers.httrack.crawler.HtTrackCrawler.__init__">[docs]</a>
 97 |     <span class="k">def</span> <span class="fm">__init__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">datasrc</span><span class="p">:</span> <span class="n">Path</span><span class="p">):</span>
 98 | <span class="w">        </span><span class="sd">&quot;&quot;&quot;</span>
 99 | <span class="sd">        Initialize the HTTrack crawler with a data source directory.</span>
100 | 
101 | <span class="sd">        Args:</span>
102 | <span class="sd">            datasrc: The input argument as Path, it must be a directory containing</span>
103 | <span class="sd">                HTTrack project directories, each potentially containing multiple domains</span>
104 | 
105 | <span class="sd">        Raises:</span>
106 | <span class="sd">            AssertionError: If datasrc is None or not a directory</span>
107 | <span class="sd">        &quot;&quot;&quot;</span>
108 |         <span class="k">assert</span> <span class="n">datasrc</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span><span class="p">,</span> <span class="sa">f</span><span class="s2">&quot;HtTrackCrawler needs a datasrc, regardless of action&quot;</span>
109 |         <span class="k">assert</span> <span class="n">datasrc</span><span class="o">.</span><span class="n">is_dir</span><span class="p">(),</span> <span class="s2">&quot;HtTrackCrawler datasrc must be a directory&quot;</span>
110 | 
111 |         <span class="nb">super</span><span class="p">()</span><span class="o">.</span><span class="fm">__init__</span><span class="p">(</span><span class="n">datasrc</span><span class="p">,</span> <span class="n">get_sites</span><span class="p">,</span> <span class="n">get_resources</span><span class="p">)</span></div>
112 | </div>
113 | 
114 | </pre></div>
115 | 
116 |            </div>
117 |           </div>
118 |           <footer>
119 | 
120 |   <hr/>
121 | 
122 |   <div role="contentinfo">
123 |     <p>&#169; Copyright 2025, pragmar.</p>
124 |   </div>
125 | 
126 |   Built with <a href="https://www.sphinx-doc.org/">Sphinx</a> using a
127 |     <a href="https://github.com/readthedocs/sphinx_rtd_theme">theme</a>
128 |     provided by <a href="https://readthedocs.org">Read the Docs</a>.
129 |    
130 | 
131 | </footer>
132 |         </div>
133 |       </div>
134 |     </section>
135 |   </div>
136 |   <script>
137 |       jQuery(function () {
138 |           SphinxRtdTheme.Navigation.enable(true);
139 |       });
140 |   </script> 
141 | 
142 | </body>
143 | </html>
```

--------------------------------------------------------------------------------
/docs/prompts.html:
--------------------------------------------------------------------------------

```html
  1 | 
  2 | 
  3 | <!DOCTYPE html>
  4 | <html class="writer-html5" lang="en" data-content_root="./">
  5 | <head>
  6 |   <meta charset="utf-8" /><meta name="viewport" content="width=device-width, initial-scale=1" />
  7 | 
  8 |   <meta name="viewport" content="width=device-width, initial-scale=1.0" />
  9 |   <title>Prompt Routines &mdash; mcp-server-webcrawl  documentation</title>
 10 |       <link rel="stylesheet" type="text/css" href="_static/pygments.css?v=80d5e7a1" />
 11 |       <link rel="stylesheet" type="text/css" href="_static/css/theme.css?v=e59714d7" />
 12 | 
 13 |   
 14 |       <script src="_static/jquery.js?v=5d32c60e"></script>
 15 |       <script src="_static/_sphinx_javascript_frameworks_compat.js?v=2cd50e6c"></script>
 16 |       <script src="_static/documentation_options.js?v=5929fcd5"></script>
 17 |       <script src="_static/doctools.js?v=888ff710"></script>
 18 |       <script src="_static/sphinx_highlight.js?v=dc90522c"></script>
 19 |     <script src="_static/js/theme.js"></script>
 20 |     <link rel="index" title="Index" href="genindex.html" />
 21 |     <link rel="search" title="Search" href="search.html" />
 22 |     <link rel="next" title="mcp_server_webcrawl" href="modules.html" />
 23 |     <link rel="prev" title="Usage" href="usage.html" /> 
 24 | </head>
 25 | 
 26 | <body class="wy-body-for-nav"> 
 27 |   <div class="wy-grid-for-nav">
 28 |     <nav data-toggle="wy-nav-shift" class="wy-nav-side">
 29 |       <div class="wy-side-scroll">
 30 |         <div class="wy-side-nav-search" >
 31 | 
 32 |           
 33 |           
 34 |           <a href="index.html" class="icon icon-home">
 35 |             mcp-server-webcrawl
 36 |           </a>
 37 | <div role="search">
 38 |   <form id="rtd-search-form" class="wy-form" action="search.html" method="get">
 39 |     <input type="text" name="q" placeholder="Search docs" aria-label="Search docs" />
 40 |     <input type="hidden" name="check_keywords" value="yes" />
 41 |     <input type="hidden" name="area" value="default" />
 42 |   </form>
 43 | </div>
 44 |         </div><div class="wy-menu wy-menu-vertical" data-spy="affix" role="navigation" aria-label="Navigation menu">
 45 |               <p class="caption" role="heading"><span class="caption-text">Contents:</span></p>
 46 | <ul class="current">
 47 | <li class="toctree-l1"><a class="reference internal" href="installation.html">Installation</a></li>
 48 | <li class="toctree-l1"><a class="reference internal" href="guides.html">Setup Guides</a></li>
 49 | <li class="toctree-l1"><a class="reference internal" href="usage.html">Usage</a></li>
 50 | <li class="toctree-l1 current"><a class="current reference internal" href="#">Prompt Routines</a></li>
 51 | <li class="toctree-l1"><a class="reference internal" href="modules.html">mcp_server_webcrawl</a></li>
 52 | </ul>
 53 | 
 54 |         </div>
 55 |       </div>
 56 |     </nav>
 57 | 
 58 |     <section data-toggle="wy-nav-shift" class="wy-nav-content-wrap"><nav class="wy-nav-top" aria-label="Mobile navigation menu" >
 59 |           <i data-toggle="wy-nav-top" class="fa fa-bars"></i>
 60 |           <a href="index.html">mcp-server-webcrawl</a>
 61 |       </nav>
 62 | 
 63 |       <div class="wy-nav-content">
 64 |         <div class="rst-content">
 65 |           <div role="navigation" aria-label="Page navigation">
 66 |   <ul class="wy-breadcrumbs">
 67 |       <li><a href="index.html" class="icon icon-home" aria-label="Home"></a></li>
 68 |       <li class="breadcrumb-item active">Prompt Routines</li>
 69 |       <li class="wy-breadcrumbs-aside">
 70 |             <a href="_sources/prompts.rst.txt" rel="nofollow"> View page source</a>
 71 |       </li>
 72 |   </ul>
 73 |   <hr/>
 74 | </div>
 75 |           <div role="main" class="document" itemscope="itemscope" itemtype="http://schema.org/Article">
 76 |            <div itemprop="articleBody">
 77 |              
 78 |   <section id="prompt-routines">
 79 | <h1>Prompt Routines<a class="headerlink" href="#prompt-routines" title="Link to this heading"></a></h1>
 80 | <p><strong>mcp-server-webcrawl</strong> provides the toolkit necessary to search web crawl data freestyle, figuring it out as you go, reacting to each query. This is what it was designed for.</p>
 81 | <p>It is also capable of running routines (as prompts). You can write these yourself, or use the ones provided. These prompts are <strong>copy and paste</strong>, and used as raw Markdown. They are enabled by the advanced search provided to the LLM; queries and logic can be embedded in a procedural set of instructions, or even an input loop as is the case with Gopher Service.</p>
 82 | <p>If you want to shortcut the site selection (one less query), paste the Markdown and in the same request, type “run pasted for [site name or URL].” It will figure it out. When pasted without additional context, you will be prompted to select a site (if no site is in context).</p>
 83 | <table class="docutils align-default">
 84 | <thead>
 85 | <tr class="row-odd"><th class="head"><p>Prompt</p></th>
 86 | <th class="head"><p>Download</p></th>
 87 | <th class="head"><p>Category</p></th>
 88 | <th class="head"><p>Description</p></th>
 89 | </tr>
 90 | </thead>
 91 | <tbody>
 92 | <tr class="row-even"><td><p>🔍 <strong>SEO Audit</strong></p></td>
 93 | <td><p><a class="reference external" href="https://raw.githubusercontent.com/pragmar/mcp-server-webcrawl/master/prompts/auditseo.md">auditseo.md</a></p></td>
 94 | <td><p>audit</p></td>
 95 | <td><p>Technical SEO (search engine optimization) analysis. Covers the
 96 | basics, with options to dive deeper.</p></td>
 97 | </tr>
 98 | <tr class="row-odd"><td><p>🔗 <strong>404 Audit</strong></p></td>
 99 | <td><p><a class="reference external" href="https://raw.githubusercontent.com/pragmar/mcp-server-webcrawl/master/prompts/audit404.md">audit404.md</a></p></td>
100 | <td><p>audit</p></td>
101 | <td><p>Broken link detection and pattern analysis. Not only finds issues,
102 | but suggests fixes.</p></td>
103 | </tr>
104 | <tr class="row-even"><td><p>⚡ <strong>Performance Audit</strong></p></td>
105 | <td><p><a class="reference external" href="https://raw.githubusercontent.com/pragmar/mcp-server-webcrawl/master/prompts/auditperf.md">auditperf.md</a></p></td>
106 | <td><p>audit</p></td>
107 | <td><p>Website speed and optimization analysis. Real talk.</p></td>
108 | </tr>
109 | <tr class="row-odd"><td><p>📁 <strong>File Audit</strong></p></td>
110 | <td><p><a class="reference external" href="https://raw.githubusercontent.com/pragmar/mcp-server-webcrawl/master/prompts/auditfiles.md">auditfiles.md</a></p></td>
111 | <td><p>audit</p></td>
112 | <td><p>File organization and asset analysis. Discover the composition of
113 | your website.</p></td>
114 | </tr>
115 | <tr class="row-even"><td><p>🌐 <strong>Gopher Interface</strong></p></td>
116 | <td><p><a class="reference external" href="https://raw.githubusercontent.com/pragmar/mcp-server-webcrawl/master/prompts/gopher.md">gopher.md</a></p></td>
117 | <td><p>interface</p></td>
118 | <td><p>An old-fashioned search interface inspired by the Gopher clients of
119 | yesteryear.</p></td>
120 | </tr>
121 | <tr class="row-odd"><td><p>⚙️ <strong>Search Test</strong></p></td>
122 | <td><p><a class="reference external" href="https://raw.githubusercontent.com/pragmar/mcp-server-webcrawl/master/prompts/testsearch.md">testsearch.md</a></p></td>
123 | <td><p>self-test</p></td>
124 | <td><p>A battery of tests to check for Boolean logical inconsistencies in
125 | the search query parser and subsequent FTS5 conversion.</p></td>
126 | </tr>
127 | </tbody>
128 | </table>
129 | </section>
130 | 
131 | 
132 |            </div>
133 |           </div>
134 |           <footer><div class="rst-footer-buttons" role="navigation" aria-label="Footer">
135 |         <a href="usage.html" class="btn btn-neutral float-left" title="Usage" accesskey="p" rel="prev"><span class="fa fa-arrow-circle-left" aria-hidden="true"></span> Previous</a>
136 |         <a href="modules.html" class="btn btn-neutral float-right" title="mcp_server_webcrawl" accesskey="n" rel="next">Next <span class="fa fa-arrow-circle-right" aria-hidden="true"></span></a>
137 |     </div>
138 | 
139 |   <hr/>
140 | 
141 |   <div role="contentinfo">
142 |     <p>&#169; Copyright 2025, pragmar.</p>
143 |   </div>
144 | 
145 |   Built with <a href="https://www.sphinx-doc.org/">Sphinx</a> using a
146 |     <a href="https://github.com/readthedocs/sphinx_rtd_theme">theme</a>
147 |     provided by <a href="https://readthedocs.org">Read the Docs</a>.
148 |    
149 | 
150 | </footer>
151 |         </div>
152 |       </div>
153 |     </section>
154 |   </div>
155 |   <script>
156 |       jQuery(function () {
157 |           SphinxRtdTheme.Navigation.enable(true);
158 |       });
159 |   </script> 
160 | 
161 | </body>
162 | </html>
```

--------------------------------------------------------------------------------
/docs/_modules/mcp_server_webcrawl/crawlers/archivebox/crawler.html:
--------------------------------------------------------------------------------

```html
  1 | 
  2 | 
  3 | <!DOCTYPE html>
  4 | <html class="writer-html5" lang="en" data-content_root="../../../../">
  5 | <head>
  6 |   <meta charset="utf-8" />
  7 |   <meta name="viewport" content="width=device-width, initial-scale=1.0" />
  8 |   <title>mcp_server_webcrawl.crawlers.archivebox.crawler &mdash; mcp-server-webcrawl  documentation</title>
  9 |       <link rel="stylesheet" type="text/css" href="../../../../_static/pygments.css?v=80d5e7a1" />
 10 |       <link rel="stylesheet" type="text/css" href="../../../../_static/css/theme.css?v=e59714d7" />
 11 | 
 12 |   
 13 |       <script src="../../../../_static/jquery.js?v=5d32c60e"></script>
 14 |       <script src="../../../../_static/_sphinx_javascript_frameworks_compat.js?v=2cd50e6c"></script>
 15 |       <script src="../../../../_static/documentation_options.js?v=5929fcd5"></script>
 16 |       <script src="../../../../_static/doctools.js?v=888ff710"></script>
 17 |       <script src="../../../../_static/sphinx_highlight.js?v=dc90522c"></script>
 18 |     <script src="../../../../_static/js/theme.js"></script>
 19 |     <link rel="index" title="Index" href="../../../../genindex.html" />
 20 |     <link rel="search" title="Search" href="../../../../search.html" /> 
 21 | </head>
 22 | 
 23 | <body class="wy-body-for-nav"> 
 24 |   <div class="wy-grid-for-nav">
 25 |     <nav data-toggle="wy-nav-shift" class="wy-nav-side">
 26 |       <div class="wy-side-scroll">
 27 |         <div class="wy-side-nav-search" >
 28 | 
 29 |           
 30 |           
 31 |           <a href="../../../../index.html" class="icon icon-home">
 32 |             mcp-server-webcrawl
 33 |           </a>
 34 | <div role="search">
 35 |   <form id="rtd-search-form" class="wy-form" action="../../../../search.html" method="get">
 36 |     <input type="text" name="q" placeholder="Search docs" aria-label="Search docs" />
 37 |     <input type="hidden" name="check_keywords" value="yes" />
 38 |     <input type="hidden" name="area" value="default" />
 39 |   </form>
 40 | </div>
 41 |         </div><div class="wy-menu wy-menu-vertical" data-spy="affix" role="navigation" aria-label="Navigation menu">
 42 |               <p class="caption" role="heading"><span class="caption-text">Contents:</span></p>
 43 | <ul>
 44 | <li class="toctree-l1"><a class="reference internal" href="../../../../installation.html">Installation</a></li>
 45 | <li class="toctree-l1"><a class="reference internal" href="../../../../guides.html">Setup Guides</a></li>
 46 | <li class="toctree-l1"><a class="reference internal" href="../../../../usage.html">Usage</a></li>
 47 | <li class="toctree-l1"><a class="reference internal" href="../../../../prompts.html">Prompt Routines</a></li>
 48 | <li class="toctree-l1"><a class="reference internal" href="../../../../modules.html">mcp_server_webcrawl</a></li>
 49 | </ul>
 50 | 
 51 |         </div>
 52 |       </div>
 53 |     </nav>
 54 | 
 55 |     <section data-toggle="wy-nav-shift" class="wy-nav-content-wrap"><nav class="wy-nav-top" aria-label="Mobile navigation menu" >
 56 |           <i data-toggle="wy-nav-top" class="fa fa-bars"></i>
 57 |           <a href="../../../../index.html">mcp-server-webcrawl</a>
 58 |       </nav>
 59 | 
 60 |       <div class="wy-nav-content">
 61 |         <div class="rst-content">
 62 |           <div role="navigation" aria-label="Page navigation">
 63 |   <ul class="wy-breadcrumbs">
 64 |       <li><a href="../../../../index.html" class="icon icon-home" aria-label="Home"></a></li>
 65 |           <li class="breadcrumb-item"><a href="../../../index.html">Module code</a></li>
 66 |           <li class="breadcrumb-item"><a href="../../crawlers.html">mcp_server_webcrawl.crawlers</a></li>
 67 |       <li class="breadcrumb-item active">mcp_server_webcrawl.crawlers.archivebox.crawler</li>
 68 |       <li class="wy-breadcrumbs-aside">
 69 |       </li>
 70 |   </ul>
 71 |   <hr/>
 72 | </div>
 73 |           <div role="main" class="document" itemscope="itemscope" itemtype="http://schema.org/Article">
 74 |            <div itemprop="articleBody">
 75 |              
 76 |   <h1>Source code for mcp_server_webcrawl.crawlers.archivebox.crawler</h1><div class="highlight"><pre>
 77 | <span></span><span class="kn">from</span> <span class="nn">pathlib</span> <span class="kn">import</span> <span class="n">Path</span>
 78 | 
 79 | <span class="kn">from</span> <span class="nn">mcp_server_webcrawl.crawlers.base.indexed</span> <span class="kn">import</span> <span class="n">IndexedCrawler</span>
 80 | <span class="kn">from</span> <span class="nn">mcp_server_webcrawl.crawlers.archivebox.adapter</span> <span class="kn">import</span> <span class="n">get_sites</span><span class="p">,</span> <span class="n">get_resources</span>
 81 | <span class="kn">from</span> <span class="nn">mcp_server_webcrawl.utils.logger</span> <span class="kn">import</span> <span class="n">get_logger</span>
 82 | 
 83 | <span class="n">logger</span> <span class="o">=</span> <span class="n">get_logger</span><span class="p">()</span>
 84 | 
 85 | <div class="viewcode-block" id="ArchiveBoxCrawler">
 86 | <a class="viewcode-back" href="../../../../mcp_server_webcrawl.crawlers.archivebox.html#mcp_server_webcrawl.crawlers.archivebox.crawler.ArchiveBoxCrawler">[docs]</a>
 87 | <span class="k">class</span> <span class="nc">ArchiveBoxCrawler</span><span class="p">(</span><span class="n">IndexedCrawler</span><span class="p">):</span>
 88 | <span class="w">    </span><span class="sd">&quot;&quot;&quot;</span>
 89 | <span class="sd">    A crawler implementation for ArchiveBox archived sites.</span>
 90 | <span class="sd">    Provides functionality for accessing and searching web content from ArchiveBox archives.</span>
 91 | <span class="sd">    ArchiveBox creates single-URL archives with metadata stored in JSON files</span>
 92 | <span class="sd">    and HTML content preserved in index.html files.</span>
 93 | <span class="sd">    &quot;&quot;&quot;</span>
 94 | 
 95 | <div class="viewcode-block" id="ArchiveBoxCrawler.__init__">
 96 | <a class="viewcode-back" href="../../../../mcp_server_webcrawl.crawlers.archivebox.html#mcp_server_webcrawl.crawlers.archivebox.crawler.ArchiveBoxCrawler.__init__">[docs]</a>
 97 |     <span class="k">def</span> <span class="fm">__init__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">datasrc</span><span class="p">:</span> <span class="n">Path</span><span class="p">):</span>
 98 | <span class="w">        </span><span class="sd">&quot;&quot;&quot;</span>
 99 | <span class="sd">        Initialize the ArchiveBox crawler with a data source directory.</span>
100 | 
101 | <span class="sd">        Args:</span>
102 | <span class="sd">            datasrc: The input argument as Path, it must be a directory containing</span>
103 | <span class="sd">                ArchiveBox archive directories, each containing individual URL entries</span>
104 | 
105 | <span class="sd">        Raises:</span>
106 | <span class="sd">            AssertionError: If datasrc is None or not a directory</span>
107 | <span class="sd">        &quot;&quot;&quot;</span>
108 |         <span class="k">assert</span> <span class="n">datasrc</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span><span class="p">,</span> <span class="sa">f</span><span class="s2">&quot;ArchiveBoxCrawler needs a datasrc, regardless of action&quot;</span>
109 |         <span class="k">assert</span> <span class="n">datasrc</span><span class="o">.</span><span class="n">is_dir</span><span class="p">(),</span> <span class="s2">&quot;ArchiveBoxCrawler datasrc must be a directory&quot;</span>
110 | 
111 |         <span class="nb">super</span><span class="p">()</span><span class="o">.</span><span class="fm">__init__</span><span class="p">(</span><span class="n">datasrc</span><span class="p">,</span> <span class="n">get_sites</span><span class="p">,</span> <span class="n">get_resources</span><span class="p">)</span></div>
112 | </div>
113 | 
114 | </pre></div>
115 | 
116 |            </div>
117 |           </div>
118 |           <footer>
119 | 
120 |   <hr/>
121 | 
122 |   <div role="contentinfo">
123 |     <p>&#169; Copyright 2025, pragmar.</p>
124 |   </div>
125 | 
126 |   Built with <a href="https://www.sphinx-doc.org/">Sphinx</a> using a
127 |     <a href="https://github.com/readthedocs/sphinx_rtd_theme">theme</a>
128 |     provided by <a href="https://readthedocs.org">Read the Docs</a>.
129 |    
130 | 
131 | </footer>
132 |         </div>
133 |       </div>
134 |     </section>
135 |   </div>
136 |   <script>
137 |       jQuery(function () {
138 |           SphinxRtdTheme.Navigation.enable(true);
139 |       });
140 |   </script> 
141 | 
142 | </body>
143 | </html>
```
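
A minimal usage sketch of the `ArchiveBoxCrawler` documented above. The archive path is hypothetical, and `get_sites_api` is the shared crawler API exercised by the test suites elsewhere in this repository:

```python
from pathlib import Path

from mcp_server_webcrawl.crawlers.archivebox.crawler import ArchiveBoxCrawler

# hypothetical ArchiveBox data directory; must exist and contain one
# subdirectory per archived URL, as the __init__ docstring describes
datasrc = Path.home() / "archivebox" / "archive"

crawler = ArchiveBoxCrawler(datasrc)  # asserts datasrc is a directory
print(crawler.get_sites_api())        # enumerate archived sites
```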

--------------------------------------------------------------------------------
/src/mcp_server_webcrawl/crawlers/katana/adapter.py:
--------------------------------------------------------------------------------

```python
  1 | import re
  2 | import sqlite3
  3 | 
  4 | from itertools import chain
  5 | from contextlib import closing
  6 | from pathlib import Path
  7 | 
  8 | from datetime import datetime, timezone
  9 | 
 10 | from mcp_server_webcrawl.crawlers.base.adapter import (
 11 |     IndexState,
 12 |     IndexStatus,
 13 |     BaseManager,
 14 |     SitesGroup,
 15 |     INDEXED_BATCH_SIZE,
 16 | )
 17 | from mcp_server_webcrawl.crawlers.base.indexed import IndexedManager
 18 | from mcp_server_webcrawl.models.resources import (
 19 |     ResourceResult,
 20 |     ResourceResultType,
 21 |     RESOURCES_LIMIT_DEFAULT,
 22 | )
 23 | from mcp_server_webcrawl.models.sites import (
 24 |     SiteResult,
 25 | )
 26 | from mcp_server_webcrawl.utils.logger import get_logger
 27 | 
 28 | logger = get_logger()
 29 | 
 30 | KATANA_REGEX_HTTP_STATUS = re.compile(r"HTTP/\d\.\d\s+(\d+)")
 31 | KATANA_REGEX_CONTENT_TYPE = re.compile(r"Content-Type:\s*([^\r\n;]+)", re.IGNORECASE)
 32 | 
 33 | class KatanaManager(IndexedManager):
 34 |     """
 35 |     Manages HTTP text files in in-memory SQLite databases.
 36 |     Provides connection pooling and caching for efficient access.
 37 |     """
 38 | 
 39 |     def __init__(self) -> None:
 40 |         """Initialize the HTTP text manager with empty cache and statistics."""
 41 |         super().__init__()
 42 | 
 43 |     def _load_site_data(self, connection: sqlite3.Connection, directory: Path,
 44 |             site_id: int, index_state: IndexState = None) -> None:
 45 |         """
 46 |         Load a site directory of HTTP text files into the database with parallel reading
 47 |         and batch SQL insertions.
 48 | 
 49 |         Args:
 50 |             connection: SQLite connection
 51 |             directory: path to the site directory
 52 |             site_id: ID for the site
 53 |             index_state: tracker for FTS indexing status
 54 |         """
 55 | 
 56 |         if not directory.exists() or not directory.is_dir():
 57 |             logger.error(f"Directory not found or not a directory: {directory}")
 58 |             return
 59 | 
 60 |         if index_state is not None:
 61 |             index_state.set_status(IndexStatus.INDEXING)
 62 | 
 63 |         file_paths = list(chain(
 64 |             directory.glob("*.txt"),
 65 |             directory.glob("*/*.txt")  # katana stores offsite assets under hostname
 66 |         ))
 67 | 
 68 |         with closing(connection.cursor()) as cursor:
 69 |             for i in range(0, len(file_paths), INDEXED_BATCH_SIZE):
 70 |                 if index_state is not None and index_state.is_timeout():
 71 |                     index_state.set_status(IndexStatus.PARTIAL)
 72 |                     return
 73 | 
 74 |                 batch_file_paths: list[Path] = file_paths[i:i+INDEXED_BATCH_SIZE]
 75 |                 batch_file_contents = BaseManager.read_files(batch_file_paths)
 76 |                 batch_insert_resource_results: list[ResourceResult] = []
 77 |                 for file_path, content in batch_file_contents.items():
 78 |                     # avoid readme in repo, katana crawl files should be named 9080ef8...
 79 |                     if file_path.name.lower().endswith("readme.txt"):
 80 |                         continue
 81 |                     try:
 82 |                         record = self._prepare_katana_record(file_path, site_id, content)
 83 |                         if record:
 84 |                             batch_insert_resource_results.append(record)
 85 |                             if index_state is not None:
 86 |                                 index_state.increment_processed()
 87 |                     except Exception as ex:
 88 |                         logger.error(f"Error processing file {file_path}: {ex}")
 89 | 
 90 |                 self._execute_batch_insert(connection, cursor, batch_insert_resource_results)
 91 | 
 92 |             if index_state is not None and index_state.status == IndexStatus.INDEXING:
 93 |                 index_state.set_status(IndexStatus.COMPLETE)
 94 | 
 95 |     def _prepare_katana_record(self, file_path: Path, site_id: int, content: str) -> ResourceResult | None:
 96 |         """
 97 |         Prepare a record for batch insertion.
 98 | 
 99 |         Args:
100 |             file_path: path to the Katana crawl file record
101 |             site_id: ID for the site
102 |             content: loaded file content
103 | 
104 |         Returns:
105 |             ResourceResult object ready for insertion, or None if processing fails
106 |         """
107 |         if file_path.is_file():
108 |             file_stat = file_path.stat()
109 |             # HTTP header modified mostly useless, change my mind
110 |             file_created = datetime.fromtimestamp(file_stat.st_ctime, tz=timezone.utc)
111 |             file_modified = datetime.fromtimestamp(file_stat.st_mtime, tz=timezone.utc)
112 |         else:
113 |             file_created = None
114 |             file_modified = None
115 | 
116 |         # crawl format: <url>\n\n<request>\n\n<headers>...<response>
117 |         parts: list[str] = content.split("\n\n", 2)
118 |         if len(parts) < 3:
119 |             logger.warning(f"Invalid HTTP text format in file {file_path}")
120 |             return None
121 | 
122 |         url: str = parts[0].strip()
123 |         response_data: str = parts[2].strip()
124 | 
125 |         try:
126 |             response_parts: list[str] = response_data.split("\n\n", 1)
127 |             headers: str = response_parts[0].strip()
128 |             body: str = response_parts[1].strip() if len(response_parts) > 1 else ""
129 | 
130 |             if "Transfer-Encoding: chunked" in headers:
131 |                 body = body.split("\n", 1)[1].strip()   # remove hex prefix
132 |                 body = body.rsplit("\n0", 1)[0].strip() # remove trailing "0" terminator
133 | 
134 |             # status from the first line of headers
135 |             status_match: re.Match | None = KATANA_REGEX_HTTP_STATUS.search(headers.split("\n", 2)[0])
136 |             status_code: int = int(status_match.group(1)) if status_match else 0
137 | 
138 |             content_type_match = KATANA_REGEX_CONTENT_TYPE.search(headers)
139 |             content_type = content_type_match.group(1).strip() if content_type_match else ""
140 |             resource_type = self._determine_resource_type(content_type)
141 |             content_size = len(body)
142 |             resource_id = BaseManager.string_to_id(url)
143 | 
144 |             return ResourceResult(
145 |                 id=resource_id,
146 |                 site=site_id,
147 |                 created=file_created,
148 |                 modified=file_modified,
149 |                 url=url,
150 |                 type=resource_type,
151 |                 headers=headers,
152 |                 content=body if self._is_text_content(content_type) else None,
153 |                 status=status_code,
154 |                 size=content_size,
155 |                 time=0  # time not available in file or Katana index
156 |             )
157 | 
158 |         except Exception as ex:
159 |             logger.error(f"Error processing HTTP response in file {file_path}: {ex}")
160 |             return None
161 | 
162 | manager: KatanaManager = KatanaManager()
163 | 
164 | def get_sites(
165 |         datasrc: Path,
166 |         ids: list[int] | None = None,
167 |         fields: list[str] | None = None
168 |     ) -> list[SiteResult]:
169 |     """
170 |     List site directories in the datasrc directory as sites.
171 | 
172 |     Args:
173 |         datasrc: path to the directory containing site subdirectories
174 |         ids: optional list of site IDs to filter by
175 |         fields: optional list of fields to include in the response
176 | 
177 |     Returns:
178 |         List of SiteResult objects, one for each site directory
179 | 
180 |     Notes:
181 |         Returns an empty list if the datasrc directory doesn't exist.
182 |     """
183 |     return manager.get_sites_for_directories(datasrc, ids, fields)
184 | 
185 | def get_resources(
186 |     datasrc: Path,
187 |     ids: list[int] | None = None,
188 |     sites: list[int] | None = None,
189 |     query: str = "",
190 |     types: list[ResourceResultType] | None = None,
191 |     fields: list[str] | None = None,
192 |     statuses: list[int] | None = None,
193 |     sort: str | None = None,
194 |     limit: int = RESOURCES_LIMIT_DEFAULT,
195 |     offset: int = 0,
196 | ) -> tuple[list[ResourceResult], int, IndexState]:
197 |     """
198 |     Get resources from Katana crawl directories using in-memory SQLite.
199 | 
200 |     Args:
201 |         datasrc: path to the directory containing Katana crawl output
202 |         ids: optional list of resource IDs to filter by
203 |         sites: optional list of site IDs to filter by
204 |         query: search query string
205 |         types: optional list of resource types to filter by
206 |         fields: optional list of fields to include in response
207 |         statuses: optional list of HTTP status codes to filter by
208 |         sort: sort order for results
209 |         limit: maximum number of results to return
210 |         offset: number of results to skip for pagination
211 | 
212 |     Returns:
213 |         Tuple of (list of ResourceResult objects, total count, index state)
214 |     """
215 |     sites_results: list[SiteResult] = get_sites(datasrc=datasrc, ids=sites)
216 |     assert sites_results, "At least one site is required to search"
217 |     site_paths = [site.path for site in sites_results]
218 |     sites_group = SitesGroup(datasrc, sites, site_paths)
219 |     return manager.get_resources_for_sites_group(sites_group, query, fields, sort, limit, offset)
220 | 
```
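
A self-contained sketch of the parse performed by `_prepare_katana_record` above. Katana stores each capture as `<url>\n\n<request>\n\n<headers + body>`; the adapter splits on blank lines and extracts status and content type with the two module-level regexes. The sample text below is hypothetical:

```python
import re

KATANA_REGEX_HTTP_STATUS = re.compile(r"HTTP/\d\.\d\s+(\d+)")
KATANA_REGEX_CONTENT_TYPE = re.compile(r"Content-Type:\s*([^\r\n;]+)", re.IGNORECASE)

# hypothetical crawl file content: url, request, then response headers and body
sample = (
    "https://example.com/\n\n"
    "GET / HTTP/1.1\nHost: example.com\n\n"
    "HTTP/1.1 200 OK\nContent-Type: text/html; charset=utf-8\n\n"
    "<html><body>Hello</body></html>"
)

url, _request, response = sample.split("\n\n", 2)
headers, body = response.split("\n\n", 1)

status_match = KATANA_REGEX_HTTP_STATUS.search(headers.split("\n", 1)[0])
status = int(status_match.group(1)) if status_match else 0
content_type_match = KATANA_REGEX_CONTENT_TYPE.search(headers)
content_type = content_type_match.group(1).strip() if content_type_match else ""

print(url, status, content_type, len(body))  # https://example.com/ 200 text/html 31
```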

--------------------------------------------------------------------------------
/docs/mcp_server_webcrawl.templates.html:
--------------------------------------------------------------------------------

```html
  1 | 
  2 | 
  3 | <!DOCTYPE html>
  4 | <html class="writer-html5" lang="en" data-content_root="./">
  5 | <head>
  6 |   <meta charset="utf-8" /><meta name="viewport" content="width=device-width, initial-scale=1" />
  7 | 
  8 |   <meta name="viewport" content="width=device-width, initial-scale=1.0" />
  9 |   <title>mcp_server_webcrawl.templates package &mdash; mcp-server-webcrawl  documentation</title>
 10 |       <link rel="stylesheet" type="text/css" href="_static/pygments.css?v=80d5e7a1" />
 11 |       <link rel="stylesheet" type="text/css" href="_static/css/theme.css?v=e59714d7" />
 12 | 
 13 |   
 14 |       <script src="_static/jquery.js?v=5d32c60e"></script>
 15 |       <script src="_static/_sphinx_javascript_frameworks_compat.js?v=2cd50e6c"></script>
 16 |       <script src="_static/documentation_options.js?v=5929fcd5"></script>
 17 |       <script src="_static/doctools.js?v=888ff710"></script>
 18 |       <script src="_static/sphinx_highlight.js?v=dc90522c"></script>
 19 |     <script src="_static/js/theme.js"></script>
 20 |     <link rel="index" title="Index" href="genindex.html" />
 21 |     <link rel="search" title="Search" href="search.html" />
 22 |     <link rel="next" title="mcp_server_webcrawl.utils package" href="mcp_server_webcrawl.utils.html" />
 23 |     <link rel="prev" title="mcp_server_webcrawl.models package" href="mcp_server_webcrawl.models.html" /> 
 24 | </head>
 25 | 
 26 | <body class="wy-body-for-nav"> 
 27 |   <div class="wy-grid-for-nav">
 28 |     <nav data-toggle="wy-nav-shift" class="wy-nav-side">
 29 |       <div class="wy-side-scroll">
 30 |         <div class="wy-side-nav-search" >
 31 | 
 32 |           
 33 |           
 34 |           <a href="index.html" class="icon icon-home">
 35 |             mcp-server-webcrawl
 36 |           </a>
 37 | <div role="search">
 38 |   <form id="rtd-search-form" class="wy-form" action="search.html" method="get">
 39 |     <input type="text" name="q" placeholder="Search docs" aria-label="Search docs" />
 40 |     <input type="hidden" name="check_keywords" value="yes" />
 41 |     <input type="hidden" name="area" value="default" />
 42 |   </form>
 43 | </div>
 44 |         </div><div class="wy-menu wy-menu-vertical" data-spy="affix" role="navigation" aria-label="Navigation menu">
 45 |               <p class="caption" role="heading"><span class="caption-text">Contents:</span></p>
 46 | <ul class="current">
 47 | <li class="toctree-l1"><a class="reference internal" href="installation.html">Installation</a></li>
 48 | <li class="toctree-l1"><a class="reference internal" href="guides.html">Setup Guides</a></li>
 49 | <li class="toctree-l1"><a class="reference internal" href="usage.html">Usage</a></li>
 50 | <li class="toctree-l1"><a class="reference internal" href="prompts.html">Prompt Routines</a></li>
 51 | <li class="toctree-l1"><a class="reference internal" href="interactive.html">Interactive Mode</a></li>
 52 | <li class="toctree-l1 current"><a class="reference internal" href="modules.html">mcp_server_webcrawl</a><ul class="current">
 53 | <li class="toctree-l2 current"><a class="reference internal" href="mcp_server_webcrawl.html">mcp_server_webcrawl package</a></li>
 54 | </ul>
 55 | </li>
 56 | </ul>
 57 | 
 58 |         </div>
 59 |       </div>
 60 |     </nav>
 61 | 
 62 |     <section data-toggle="wy-nav-shift" class="wy-nav-content-wrap"><nav class="wy-nav-top" aria-label="Mobile navigation menu" >
 63 |           <i data-toggle="wy-nav-top" class="fa fa-bars"></i>
 64 |           <a href="index.html">mcp-server-webcrawl</a>
 65 |       </nav>
 66 | 
 67 |       <div class="wy-nav-content">
 68 |         <div class="rst-content">
 69 |           <div role="navigation" aria-label="Page navigation">
 70 |   <ul class="wy-breadcrumbs">
 71 |       <li><a href="index.html" class="icon icon-home" aria-label="Home"></a></li>
 72 |           <li class="breadcrumb-item"><a href="modules.html">mcp_server_webcrawl</a></li>
 73 |           <li class="breadcrumb-item"><a href="mcp_server_webcrawl.html">mcp_server_webcrawl package</a></li>
 74 |       <li class="breadcrumb-item active">mcp_server_webcrawl.templates package</li>
 75 |       <li class="wy-breadcrumbs-aside">
 76 |             <a href="_sources/mcp_server_webcrawl.templates.rst.txt" rel="nofollow"> View page source</a>
 77 |       </li>
 78 |   </ul>
 79 |   <hr/>
 80 | </div>
 81 |           <div role="main" class="document" itemscope="itemscope" itemtype="http://schema.org/Article">
 82 |            <div itemprop="articleBody">
 83 |              
 84 |   <section id="mcp-server-webcrawl-templates-package">
 85 | <h1>mcp_server_webcrawl.templates package<a class="headerlink" href="#mcp-server-webcrawl-templates-package" title="Link to this heading"></a></h1>
 86 | <section id="submodules">
 87 | <h2>Submodules<a class="headerlink" href="#submodules" title="Link to this heading"></a></h2>
 88 | </section>
 89 | <section id="module-mcp_server_webcrawl.templates.tests">
 90 | <span id="mcp-server-webcrawl-templates-tests-module"></span><h2>mcp_server_webcrawl.templates.tests module<a class="headerlink" href="#module-mcp_server_webcrawl.templates.tests" title="Link to this heading"></a></h2>
 91 | <dl class="py class">
 92 | <dt class="sig sig-object py" id="mcp_server_webcrawl.templates.tests.TemplateTests">
 93 | <em class="property"><span class="pre">class</span><span class="w"> </span></em><span class="sig-name descname"><span class="pre">TemplateTests</span></span><a class="reference internal" href="_modules/mcp_server_webcrawl/templates/tests.html#TemplateTests"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#mcp_server_webcrawl.templates.tests.TemplateTests" title="Link to this definition"></a></dt>
 94 | <dd><p>Bases: <code class="xref py py-class docutils literal notranslate"><span class="pre">TestCase</span></code></p>
 95 | <p>Test suite for the custom HTML to markdown converter.
 96 | Why custom? It’s a bit faster, that is the only reason.
 97 | Maximum load is 100 transforms (1 per result for a max result
 98 | of 100), so speed matters. A default set is 20.
 99 | This converter does a few things differently to tailor to LLM
100 | interaction.
101 | * aggressively removes images (html2text selectively renders)
102 | * links with block descendants will render like a &lt;p&gt;</p>
103 | <blockquote>
104 | <div><p>(html2text treats as &lt;a&gt;&lt;br&gt;)</p>
105 | </div></blockquote>
106 | <p>Create an instance of the class that will use the named test
107 | method when executed. Raises a ValueError if the instance does
108 | not have a method with the specified name.</p>
109 | <dl class="py method">
110 | <dt class="sig sig-object py" id="mcp_server_webcrawl.templates.tests.TemplateTests.setUp">
111 | <span class="sig-name descname"><span class="pre">setUp</span></span><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="reference internal" href="_modules/mcp_server_webcrawl/templates/tests.html#TemplateTests.setUp"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#mcp_server_webcrawl.templates.tests.TemplateTests.setUp" title="Link to this definition"></a></dt>
112 | <dd><p>Set up the test environment with fixture data.</p>
113 | </dd></dl>
114 | 
115 | <dl class="py method">
116 | <dt class="sig sig-object py" id="mcp_server_webcrawl.templates.tests.TemplateTests.test_core_html">
117 | <span class="sig-name descname"><span class="pre">test_core_html</span></span><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="reference internal" href="_modules/mcp_server_webcrawl/templates/tests.html#TemplateTests.test_core_html"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#mcp_server_webcrawl.templates.tests.TemplateTests.test_core_html" title="Link to this definition"></a></dt>
118 | <dd></dd></dl>
119 | 
120 | </dd></dl>
121 | 
122 | </section>
123 | <section id="module-mcp_server_webcrawl.templates">
124 | <span id="module-contents"></span><h2>Module contents<a class="headerlink" href="#module-mcp_server_webcrawl.templates" title="Link to this heading"></a></h2>
125 | </section>
126 | </section>
127 | 
128 | 
129 |            </div>
130 |           </div>
131 |           <footer><div class="rst-footer-buttons" role="navigation" aria-label="Footer">
132 |         <a href="mcp_server_webcrawl.models.html" class="btn btn-neutral float-left" title="mcp_server_webcrawl.models package" accesskey="p" rel="prev"><span class="fa fa-arrow-circle-left" aria-hidden="true"></span> Previous</a>
133 |         <a href="mcp_server_webcrawl.utils.html" class="btn btn-neutral float-right" title="mcp_server_webcrawl.utils package" accesskey="n" rel="next">Next <span class="fa fa-arrow-circle-right" aria-hidden="true"></span></a>
134 |     </div>
135 | 
136 |   <hr/>
137 | 
138 |   <div role="contentinfo">
139 |     <p>&#169; Copyright 2025, pragmar.</p>
140 |   </div>
141 | 
142 |   Built with <a href="https://www.sphinx-doc.org/">Sphinx</a> using a
143 |     <a href="https://github.com/readthedocs/sphinx_rtd_theme">theme</a>
144 |     provided by <a href="https://readthedocs.org">Read the Docs</a>.
145 |    
146 | 
147 | </footer>
148 |         </div>
149 |       </div>
150 |     </section>
151 |   </div>
152 |   <script>
153 |       jQuery(function () {
154 |           SphinxRtdTheme.Navigation.enable(true);
155 |       });
156 |   </script> 
157 | 
158 | </body>
159 | </html>
```

--------------------------------------------------------------------------------
/docs/_modules/index.html:
--------------------------------------------------------------------------------

```html
  1 | 
  2 | 
  3 | <!DOCTYPE html>
  4 | <html class="writer-html5" lang="en" data-content_root="../">
  5 | <head>
  6 |   <meta charset="utf-8" />
  7 |   <meta name="viewport" content="width=device-width, initial-scale=1.0" />
  8 |   <title>Overview: module code &mdash; mcp-server-webcrawl  documentation</title>
  9 |       <link rel="stylesheet" type="text/css" href="../_static/pygments.css?v=80d5e7a1" />
 10 |       <link rel="stylesheet" type="text/css" href="../_static/css/theme.css?v=e59714d7" />
 11 | 
 12 |   
 13 |       <script src="../_static/jquery.js?v=5d32c60e"></script>
 14 |       <script src="../_static/_sphinx_javascript_frameworks_compat.js?v=2cd50e6c"></script>
 15 |       <script src="../_static/documentation_options.js?v=5929fcd5"></script>
 16 |       <script src="../_static/doctools.js?v=888ff710"></script>
 17 |       <script src="../_static/sphinx_highlight.js?v=dc90522c"></script>
 18 |     <script src="../_static/js/theme.js"></script>
 19 |     <link rel="index" title="Index" href="../genindex.html" />
 20 |     <link rel="search" title="Search" href="../search.html" /> 
 21 | </head>
 22 | 
 23 | <body class="wy-body-for-nav"> 
 24 |   <div class="wy-grid-for-nav">
 25 |     <nav data-toggle="wy-nav-shift" class="wy-nav-side">
 26 |       <div class="wy-side-scroll">
 27 |         <div class="wy-side-nav-search" >
 28 | 
 29 |           
 30 |           
 31 |           <a href="../index.html" class="icon icon-home">
 32 |             mcp-server-webcrawl
 33 |           </a>
 34 | <div role="search">
 35 |   <form id="rtd-search-form" class="wy-form" action="../search.html" method="get">
 36 |     <input type="text" name="q" placeholder="Search docs" aria-label="Search docs" />
 37 |     <input type="hidden" name="check_keywords" value="yes" />
 38 |     <input type="hidden" name="area" value="default" />
 39 |   </form>
 40 | </div>
 41 |         </div><div class="wy-menu wy-menu-vertical" data-spy="affix" role="navigation" aria-label="Navigation menu">
 42 |               <p class="caption" role="heading"><span class="caption-text">Contents:</span></p>
 43 | <ul>
 44 | <li class="toctree-l1"><a class="reference internal" href="../installation.html">Installation</a></li>
 45 | <li class="toctree-l1"><a class="reference internal" href="../guides.html">Setup Guides</a></li>
 46 | <li class="toctree-l1"><a class="reference internal" href="../usage.html">Usage</a></li>
 47 | <li class="toctree-l1"><a class="reference internal" href="../prompts.html">Prompt Routines</a></li>
 48 | <li class="toctree-l1"><a class="reference internal" href="../interactive.html">Interactive Mode</a></li>
 49 | <li class="toctree-l1"><a class="reference internal" href="../modules.html">mcp_server_webcrawl</a></li>
 50 | </ul>
 51 | 
 52 |         </div>
 53 |       </div>
 54 |     </nav>
 55 | 
 56 |     <section data-toggle="wy-nav-shift" class="wy-nav-content-wrap"><nav class="wy-nav-top" aria-label="Mobile navigation menu" >
 57 |           <i data-toggle="wy-nav-top" class="fa fa-bars"></i>
 58 |           <a href="../index.html">mcp-server-webcrawl</a>
 59 |       </nav>
 60 | 
 61 |       <div class="wy-nav-content">
 62 |         <div class="rst-content">
 63 |           <div role="navigation" aria-label="Page navigation">
 64 |   <ul class="wy-breadcrumbs">
 65 |       <li><a href="../index.html" class="icon icon-home" aria-label="Home"></a></li>
 66 |       <li class="breadcrumb-item active">Overview: module code</li>
 67 |       <li class="wy-breadcrumbs-aside">
 68 |       </li>
 69 |   </ul>
 70 |   <hr/>
 71 | </div>
 72 |           <div role="main" class="document" itemscope="itemscope" itemtype="http://schema.org/Article">
 73 |            <div itemprop="articleBody">
 74 |              
 75 |   <h1>All modules for which code is available</h1>
 76 | <ul><li><a href="mcp_server_webcrawl/crawlers.html">mcp_server_webcrawl.crawlers</a></li>
 77 | <ul><li><a href="mcp_server_webcrawl/crawlers/archivebox/adapter.html">mcp_server_webcrawl.crawlers.archivebox.adapter</a></li>
 78 | <li><a href="mcp_server_webcrawl/crawlers/archivebox/crawler.html">mcp_server_webcrawl.crawlers.archivebox.crawler</a></li>
 79 | <li><a href="mcp_server_webcrawl/crawlers/archivebox/tests.html">mcp_server_webcrawl.crawlers.archivebox.tests</a></li>
 80 | <li><a href="mcp_server_webcrawl/crawlers/base/adapter.html">mcp_server_webcrawl.crawlers.base.adapter</a></li>
 81 | <li><a href="mcp_server_webcrawl/crawlers/base/api.html">mcp_server_webcrawl.crawlers.base.api</a></li>
 82 | <li><a href="mcp_server_webcrawl/crawlers/base/crawler.html">mcp_server_webcrawl.crawlers.base.crawler</a></li>
 83 | <li><a href="mcp_server_webcrawl/crawlers/base/indexed.html">mcp_server_webcrawl.crawlers.base.indexed</a></li>
 84 | <li><a href="mcp_server_webcrawl/crawlers/base/tests.html">mcp_server_webcrawl.crawlers.base.tests</a></li>
 85 | <li><a href="mcp_server_webcrawl/crawlers/httrack/adapter.html">mcp_server_webcrawl.crawlers.httrack.adapter</a></li>
 86 | <li><a href="mcp_server_webcrawl/crawlers/httrack/crawler.html">mcp_server_webcrawl.crawlers.httrack.crawler</a></li>
 87 | <li><a href="mcp_server_webcrawl/crawlers/httrack/tests.html">mcp_server_webcrawl.crawlers.httrack.tests</a></li>
 88 | <li><a href="mcp_server_webcrawl/crawlers/interrobot/adapter.html">mcp_server_webcrawl.crawlers.interrobot.adapter</a></li>
 89 | <li><a href="mcp_server_webcrawl/crawlers/interrobot/crawler.html">mcp_server_webcrawl.crawlers.interrobot.crawler</a></li>
 90 | <li><a href="mcp_server_webcrawl/crawlers/interrobot/tests.html">mcp_server_webcrawl.crawlers.interrobot.tests</a></li>
 91 | <li><a href="mcp_server_webcrawl/crawlers/katana/adapter.html">mcp_server_webcrawl.crawlers.katana.adapter</a></li>
 92 | <li><a href="mcp_server_webcrawl/crawlers/katana/crawler.html">mcp_server_webcrawl.crawlers.katana.crawler</a></li>
 93 | <li><a href="mcp_server_webcrawl/crawlers/katana/tests.html">mcp_server_webcrawl.crawlers.katana.tests</a></li>
 94 | <li><a href="mcp_server_webcrawl/crawlers/siteone/adapter.html">mcp_server_webcrawl.crawlers.siteone.adapter</a></li>
 95 | <li><a href="mcp_server_webcrawl/crawlers/siteone/crawler.html">mcp_server_webcrawl.crawlers.siteone.crawler</a></li>
 96 | <li><a href="mcp_server_webcrawl/crawlers/siteone/tests.html">mcp_server_webcrawl.crawlers.siteone.tests</a></li>
 97 | <li><a href="mcp_server_webcrawl/crawlers/warc/adapter.html">mcp_server_webcrawl.crawlers.warc.adapter</a></li>
 98 | <li><a href="mcp_server_webcrawl/crawlers/warc/crawler.html">mcp_server_webcrawl.crawlers.warc.crawler</a></li>
 99 | <li><a href="mcp_server_webcrawl/crawlers/warc/tests.html">mcp_server_webcrawl.crawlers.warc.tests</a></li>
100 | <li><a href="mcp_server_webcrawl/crawlers/wget/adapter.html">mcp_server_webcrawl.crawlers.wget.adapter</a></li>
101 | <li><a href="mcp_server_webcrawl/crawlers/wget/crawler.html">mcp_server_webcrawl.crawlers.wget.crawler</a></li>
102 | <li><a href="mcp_server_webcrawl/crawlers/wget/tests.html">mcp_server_webcrawl.crawlers.wget.tests</a></li>
103 | </ul><li><a href="mcp_server_webcrawl/extras/markdown.html">mcp_server_webcrawl.extras.markdown</a></li>
104 | <li><a href="mcp_server_webcrawl/extras/regex.html">mcp_server_webcrawl.extras.regex</a></li>
105 | <li><a href="mcp_server_webcrawl/extras/snippets.html">mcp_server_webcrawl.extras.snippets</a></li>
106 | <li><a href="mcp_server_webcrawl/extras/thumbnails.html">mcp_server_webcrawl.extras.thumbnails</a></li>
107 | <li><a href="mcp_server_webcrawl/extras/xpath.html">mcp_server_webcrawl.extras.xpath</a></li>
108 | <li><a href="mcp_server_webcrawl/interactive/highlights.html">mcp_server_webcrawl.interactive.highlights</a></li>
109 | <li><a href="mcp_server_webcrawl/interactive/search.html">mcp_server_webcrawl.interactive.search</a></li>
110 | <li><a href="mcp_server_webcrawl/interactive/session.html">mcp_server_webcrawl.interactive.session</a></li>
111 | <li><a href="mcp_server_webcrawl/interactive/ui.html">mcp_server_webcrawl.interactive.ui</a></li>
112 | <li><a href="mcp_server_webcrawl/main.html">mcp_server_webcrawl.main</a></li>
113 | <li><a href="mcp_server_webcrawl/models/resources.html">mcp_server_webcrawl.models.resources</a></li>
114 | <li><a href="mcp_server_webcrawl/models/sites.html">mcp_server_webcrawl.models.sites</a></li>
115 | <li><a href="mcp_server_webcrawl/templates/tests.html">mcp_server_webcrawl.templates.tests</a></li>
116 | <li><a href="mcp_server_webcrawl/utils.html">mcp_server_webcrawl.utils</a></li>
117 | <ul><li><a href="mcp_server_webcrawl/utils/cli.html">mcp_server_webcrawl.utils.cli</a></li>
118 | <li><a href="mcp_server_webcrawl/utils/logger.html">mcp_server_webcrawl.utils.logger</a></li>
119 | <li><a href="mcp_server_webcrawl/utils/server.html">mcp_server_webcrawl.utils.server</a></li>
120 | <li><a href="mcp_server_webcrawl/utils/tools.html">mcp_server_webcrawl.utils.tools</a></li>
121 | </ul><li><a href="namedtuple_InputRadioState.html">namedtuple_InputRadioState</a></li>
122 | </ul>
123 | 
124 |            </div>
125 |           </div>
126 |           <footer>
127 | 
128 |   <hr/>
129 | 
130 |   <div role="contentinfo">
131 |     <p>&#169; Copyright 2025, pragmar.</p>
132 |   </div>
133 | 
134 |   Built with <a href="https://www.sphinx-doc.org/">Sphinx</a> using a
135 |     <a href="https://github.com/readthedocs/sphinx_rtd_theme">theme</a>
136 |     provided by <a href="https://readthedocs.org">Read the Docs</a>.
137 |    
138 | 
139 | </footer>
140 |         </div>
141 |       </div>
142 |     </section>
143 |   </div>
144 |   <script>
145 |       jQuery(function () {
146 |           SphinxRtdTheme.Navigation.enable(true);
147 |       });
148 |   </script> 
149 | 
150 | </body>
151 | </html>
```

--------------------------------------------------------------------------------
/src/mcp_server_webcrawl/crawlers/httrack/tests.py:
--------------------------------------------------------------------------------

```python
  1 | from mcp_server_webcrawl.crawlers.httrack.crawler import HtTrackCrawler
  2 | from mcp_server_webcrawl.crawlers.httrack.adapter import HtTrackManager
  3 | from mcp_server_webcrawl.crawlers.base.tests import BaseCrawlerTests
  4 | from mcp_server_webcrawl.crawlers import get_fixture_directory
  5 | from mcp_server_webcrawl.utils.logger import get_logger
  6 | 
  7 | logger = get_logger()
  8 | 
  9 | # Calculate using same hash function as adapter
 10 | EXAMPLE_SITE_ID = HtTrackManager.string_to_id("example")
 11 | PRAGMAR_SITE_ID = HtTrackManager.string_to_id("pragmar")
 12 | 
 13 | class HtTrackTests(BaseCrawlerTests):
 14 |     """
 15 |     Test suite for the HTTrack crawler implementation.
 16 |     Uses all wrapped test methods from BaseCrawlerTests plus HTTrack-specific features.
 17 |     """
 18 | 
 19 |     def setUp(self):
 20 |         """
 21 |         Set up the test environment with fixture data.
 22 |         """
 23 |         super().setUp()
 24 |         self._datasrc = get_fixture_directory() / "httrack"
 25 | 
 26 |     def test_httrack_pulse(self):
 27 |         """
 28 |         Test basic crawler initialization.
 29 |         """
 30 |         crawler = HtTrackCrawler(self._datasrc)
 31 |         self.assertIsNotNone(crawler)
 32 |         self.assertTrue(self._datasrc.is_dir())
 33 | 
 34 |     def test_httrack_sites(self):
 35 |         """
 36 |         Test site retrieval API functionality.
 37 |         """
 38 |         crawler = HtTrackCrawler(self._datasrc)
 39 |         self.run_pragmar_site_tests(crawler, PRAGMAR_SITE_ID)
 40 | 
 41 |     def test_httrack_search(self):
 42 |         """
 43 |         Test boolean search functionality
 44 |         """
 45 |         crawler = HtTrackCrawler(self._datasrc)
 46 |         self.run_pragmar_search_tests(crawler, PRAGMAR_SITE_ID)
 47 |         pass
 48 | 
 49 |     def test_httrack_resources(self):
 50 |         """
 51 |         Test resource retrieval API functionality with various arguments.
 52 |         """
 53 |         crawler = HtTrackCrawler(self._datasrc)
 54 |         self.run_sites_resources_tests(crawler, PRAGMAR_SITE_ID, EXAMPLE_SITE_ID)
 55 | 
 56 |     def test_httrack_images(self):
 57 |         """
 58 |         Test HTTrack image handling and thumbnails.
 59 |         """
 60 |         crawler = HtTrackCrawler(self._datasrc)
 61 |         self.run_pragmar_image_tests(crawler, PRAGMAR_SITE_ID)
 62 | 
 63 |     def test_httrack_sorts(self):
 64 |         """
 65 |         Test random sort functionality using the sort argument.
 66 |         """
 67 |         crawler = HtTrackCrawler(self._datasrc)
 68 |         self.run_pragmar_sort_tests(crawler, PRAGMAR_SITE_ID)
 69 | 
 70 |     def test_httrack_content_parsing(self):
 71 |         """
 72 |         Test content type detection and parsing.
 73 |         """
 74 |         crawler = HtTrackCrawler(self._datasrc)
 75 |         self.run_pragmar_content_tests(crawler, PRAGMAR_SITE_ID, False)
 76 | 
 77 |     def test_httrack_tokenizer(self):
 78 |         """
 79 |         Test HTTrack-specific tokenizer functionality for hyphenated terms.
 80 |         """
 81 |         crawler = HtTrackCrawler(self._datasrc)
 82 |         self.run_pragmar_tokenizer_tests(crawler, PRAGMAR_SITE_ID)
 83 | 
 84 |     def test_httrack_log_parsing_features(self):
 85 |         """
 86 |         Test HTTrack-specific features related to hts-log.txt parsing.
 87 |         """
 88 |         crawler = HtTrackCrawler(self._datasrc)
 89 | 
 90 |         # Test that 404 errors from log are properly indexed
 91 |         error_resources = crawler.get_resources_api(
 92 |             sites=[PRAGMAR_SITE_ID],
 93 |             query="status: 404"
 94 |         )
 95 |         if error_resources.total > 0:
 96 |             for resource in error_resources._results:
 97 |                 self.assertEqual(resource.status, 404, "404 status should be preserved from log parsing")
 98 | 
 99 |         # Test that redirects are properly indexed
100 |         redirect_resources = crawler.get_resources_api(
101 |             sites=[PRAGMAR_SITE_ID],
102 |             query="status: 302"
103 |         )
104 |         if redirect_resources.total > 0:
105 |             for resource in redirect_resources._results:
106 |                 self.assertEqual(resource.status, 302, "Redirect status should be detected from log")
107 | 
108 |         # Test successful resources default to 200
109 |         success_resources = crawler.get_resources_api(
110 |             sites=[PRAGMAR_SITE_ID],
111 |             query="status: 200",
112 |             limit=5
113 |         )
114 |         self.assertTrue(success_resources.total > 0, "Should have successful resources with status 200")
115 |         for resource in success_resources._results:
116 |             self.assertEqual(resource.status, 200)
117 | 
118 |     def test_httrack_url_reconstruction(self):
119 |         """
120 |         Test HTTrack URL reconstruction from project and domain structure.
121 |         """
122 |         crawler = HtTrackCrawler(self._datasrc)
123 | 
124 |         # Get all resources to test URL patterns
125 |         all_resources = crawler.get_resources_api(
126 |             sites=[PRAGMAR_SITE_ID],
127 |             limit=10
128 |         )
129 |         self.assertTrue(all_resources.total > 0, "Should have resources with reconstructed URLs")
130 | 
131 |         for resource in all_resources._results:
132 |             # URLs should be properly formatted
133 |             self.assertTrue(resource.url.startswith("https://"),
134 |                           f"URL should start with https://: {resource.url}")
135 | 
136 |             # URLs should not contain file system artifacts
137 |             self.assertNotIn("\\", resource.url, "URLs should not contain backslashes")
138 |             self.assertNotIn("hts-", resource.url, "URLs should not contain HTTrack artifacts")
139 | 
140 |     def test_httrack_domain_detection(self):
141 |         """
142 |         Test HTTrack domain directory detection and multi-domain handling.
143 |         """
144 |         crawler = HtTrackCrawler(self._datasrc)
145 |         sites_result = crawler.get_sites_api()
146 |         self.assertTrue(sites_result.total > 0, "Should detect HTTrack project directories as sites")
147 | 
148 |         specific_site = crawler.get_sites_api(ids=[PRAGMAR_SITE_ID])
149 |         if specific_site.total > 0:
150 |             site_data = specific_site._results[0].to_dict()
151 |             self.assertIn("urls", site_data, "Site should have URLs")
152 |             self.assertTrue(len(site_data["urls"]) > 0, "Site should have at least one valid URL")
153 | 
154 |     def test_httrack_file_exclusion(self):
155 |         """
156 |         Test that HTTrack-generated files are properly excluded.
157 |         """
158 |         crawler = HtTrackCrawler(self._datasrc)
159 | 
160 |         # Search for any resources that might be HTTrack artifacts
161 |         all_resources = crawler.get_resources_api(
162 |             sites=[PRAGMAR_SITE_ID],
163 |             query="",
164 |             limit=50
165 |         )
166 | 
167 |         for resource in all_resources._results:
168 |             # Should not find project-level index.html (HTTrack-generated)
169 |             if resource.url.endswith("/index.html"):
170 |                 # This should be domain-level index.html, not project-level
171 |                 self.assertNotEqual(resource.url, "https://pragmar/index.html",
172 |                                   "Should not index project-level HTTrack-generated index.html")
173 | 
174 |             # Should not find hts-log.txt as a resource
175 |             self.assertNotIn("hts-log.txt", resource.url, "Should not index hts-log.txt as resource")
176 |             self.assertNotIn("hts-cache", resource.url, "Should not index hts-cache contents as resources")
177 | 
178 |     def test_httrack_advanced_features(self):
179 |         """
180 |         Test HTTrack-specific advanced features not covered by base tests.
181 |         """
182 |         crawler = HtTrackCrawler(self._datasrc)
183 | 
184 |         # Test field retrieval with HTTrack-specific metadata
185 |         field_resources = crawler.get_resources_api(
186 |             sites=[PRAGMAR_SITE_ID],
187 |             query="type: html",
188 |             fields=["content", "headers", "created", "modified"],
189 |             limit=3
190 |         )
191 | 
192 |         if field_resources.total > 0:
193 |             resource_dict = field_resources._results[0].to_dict()
194 | 
195 |             # Test timestamps from file system
196 |             self.assertIn("created", resource_dict, "Should have created timestamp from file stat")
197 |             self.assertIn("modified", resource_dict, "Should have modified timestamp from file stat")
198 | 
199 |             # Test headers generation
200 |             if "headers" in resource_dict and resource_dict["headers"]:
201 |                 headers = resource_dict["headers"]
202 |                 self.assertIn("Content-Type:", headers, "Should have generated Content-Type header")
203 |                 self.assertIn("Content-Length:", headers, "Should have generated Content-Length header")
204 | 
205 |         # Test that resources have proper size information
206 |         size_resources = crawler.get_resources_api(
207 |             sites=[PRAGMAR_SITE_ID],
208 |             fields=["size"],
209 |             limit=5
210 |         )
211 | 
212 |         if size_resources.total > 0:
213 |             for resource in size_resources._results:
214 |                 resource_dict = resource.to_dict()
215 |                 self.assertIn("size", resource_dict, "Resource should have size field")
216 |                 self.assertGreaterEqual(resource_dict["size"], 0, "Size should be non-negative")
217 | 
218 |     def test_report(self):
219 |         """
220 |         Run test report, save to data directory.
221 |         """
222 |         crawler = HtTrackCrawler(self._datasrc)
223 |         logger.info(self.run_pragmar_report(crawler, PRAGMAR_SITE_ID, "HTTrack"))
```
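
The suite above uses the standard `unittest` layout; a minimal sketch for running only the HTTrack tests, assuming the package and its fixture data are importable:

```python
import unittest

# load the HTTrack test module by dotted name and run it with verbose output
suite = unittest.defaultTestLoader.loadTestsFromName(
    "mcp_server_webcrawl.crawlers.httrack.tests"
)
unittest.TextTestRunner(verbosity=2).run(suite)
```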

--------------------------------------------------------------------------------
/src/mcp_server_webcrawl/crawlers/interrobot/adapter.py:
--------------------------------------------------------------------------------

```python
  1 | import re
  2 | import sqlite3
  3 | import traceback
  4 | 
  5 | from contextlib import closing
  6 | from logging import Logger
  7 | from pathlib import Path
  8 | from typing import Final
  9 | from urllib.parse import urlparse
 10 | 
 11 | from mcp_server_webcrawl.crawlers.base.adapter import IndexState, IndexStatus, BaseManager, SitesGroup
 12 | from mcp_server_webcrawl.models.resources import ResourceResult, RESOURCES_LIMIT_DEFAULT
 13 | from mcp_server_webcrawl.models.sites import SiteResult, SiteType
 14 | from mcp_server_webcrawl.utils import from_isoformat_zulu
 15 | from mcp_server_webcrawl.utils.logger import get_logger
 16 | 
 17 | # maybe dedupe with near match RESOURCES version
 18 | INTERROBOT_RESOURCE_FIELD_MAPPING: Final[dict[str, str]] = {
 19 |     "id": "ResourcesFullText.Id",
 20 |     "site": "ResourcesFullText.Project",
 21 |     "created": "Resources.Created",
 22 |     "modified": "Resources.Modified",
 23 |     "url": "ResourcesFullText.Url",
 24 |     "status": "ResourcesFullText.Status",
 25 |     "size": "Resources.Size",
 26 |     "type": "ResourcesFullText.Type",
 27 |     "headers": "ResourcesFullText.Headers",
 28 |     "content": "ResourcesFullText.Content",
 29 |     "time": "ResourcesFullText.Time"
 30 | }
 31 | 
 32 | INTERROBOT_SITE_FIELD_REQUIRED: Final[set[str]] = set(["id", "name", "type", "urls"])
 33 | 
 34 | # legit different from default version (extra robots)
 35 | INTERROBOT_SITE_FIELD_MAPPING: Final[dict[str, str]] = {
 36 |     "id": "Project.Id",
 37 |     "name": "Project.Name",
 38 |     "type": "Project.Type",
 39 |     "urls": "Project.Urls",
 40 |     "created": "Project.Created",
 41 |     "modified": "Project.Modified",
 42 | }
 43 | 
 44 | logger: Logger = get_logger()
 45 | 
 46 | class InterroBotManager(BaseManager):
 47 |     """
 48 |     Manages connections to an existing InterroBot SQLite database on disk.
 49 |     Unlike the file-based adapters, content is queried in place rather than indexed into memory.
 50 |     """
 51 | 
 52 |     def __init__(self) -> None:
 53 |         """Initialize the InterroBot database manager."""
 54 |         super().__init__()
 55 | 
 56 |     def get_connection(self, group: SitesGroup) -> tuple[sqlite3.Connection | None, IndexState]:
 57 |         """
 58 |         Get database connection for sites in the group, creating if needed.
 59 | 
 60 |         Args:
 61 |             group: Group of sites to connect to
 62 | 
 63 |         Returns:
 64 |             Tuple of (SQLite connection to in-memory database with data loaded or None if building,
 65 |                      IndexState associated with this database)
 66 |         """
 67 | 
 68 |         index_state = IndexState()
 69 |         index_state.set_status(IndexStatus.REMOTE)
 70 |         connection: sqlite3.Connection | None = None
 71 |         try:
 72 |             # note, responsible for implementing closing() on other side
 73 |             connection = sqlite3.connect(group.datasrc)
 74 |         except sqlite3.Error as ex:
 75 |             logger.error(f"SQLite error reading database: {ex}\n{traceback.format_exc()}")
 76 |         except (FileNotFoundError, PermissionError) as ex:
 77 |             logger.error(f"Database access error: {group.datasrc}\n{traceback.format_exc()}")
 78 |             raise
 79 |         except Exception as ex:
 80 |             logger.error(f"Unexpected error reading database {group.datasrc}: {ex}\n{traceback.format_exc()}")
 81 |             raise
 82 | 
 83 |         return connection, index_state
 84 | 
 85 | manager: InterroBotManager = InterroBotManager()
 86 | 
 87 | def get_sites(datasrc: Path, ids=None, fields=None) -> list[SiteResult]:
 88 |     """
 89 |     Get sites based on the provided parameters.
 90 | 
 91 |     Args:
 92 |         datasrc: path to the database
 93 |         ids: optional list of site IDs
 94 |         fields: list of fields to include in response
 95 | 
 96 |     Returns:
 97 |         List of SiteResult objects
 98 |     """
 99 |     site_fields_required: list[str] = ["id", "name", "type", "urls"]
100 |     site_fields_default: list[str] = site_fields_required + ["created", "modified"]
101 |     site_fields_available: list[str] = list(INTERROBOT_SITE_FIELD_MAPPING.keys())
102 | 
103 |     # build query
104 |     params: dict[str, int | str] = {}
105 | 
106 |     # these inputs are named parameters
107 |     ids_clause: str = ""
108 |     if ids and isinstance(ids, list) and len(ids) > 0:
109 |         placeholders: list[str] = [f":id{i}" for i in range(len(ids))]
110 |         ids_clause: str = f" WHERE Project.Id IN ({','.join(placeholders)})"
111 |         params.update({f"id{i}": id_val for i, id_val in enumerate(ids)})
112 | 
113 |     # these inputs are not parameterized
114 |     # fields will be returned from database, if found in INTERROBOT_SITE_FIELD_MAPPING
115 |     selected_fields = set(site_fields_required)
116 |     if fields and isinstance(fields, list):
117 |         selected_fields.update(f for f in fields if f in site_fields_available)
118 |     else:
119 |         selected_fields.update(site_fields_default)
120 | 
121 |     safe_sql_fields = [INTERROBOT_SITE_FIELD_MAPPING[f] for f in selected_fields]
122 |     assert all(re.match(r"^[A-Za-z\.]+$", field) for field in safe_sql_fields), "Unknown or unsafe field requested"
123 |     safe_sql_fields_joined: str = ", ".join(safe_sql_fields)
124 | 
125 |     statement: str = f"SELECT {safe_sql_fields_joined} FROM Projects AS Project{ids_clause} ORDER BY Project.Name ASC"
126 |     sql_results: list[dict[str, int | str | None]] = []
127 |     try:
128 |         if not statement.strip().upper().startswith("SELECT"):
129 |             logger.error("Unauthorized SQL statement")
130 |             raise ValueError("Only SELECT queries are permitted")
131 | 
132 |         with closing(sqlite3.connect(datasrc)) as conn:
133 |             conn.row_factory = sqlite3.Row
134 |             with closing(conn.cursor()) as cursor:
135 |                 cursor.execute(statement, params or {})
136 |                 sql_results = [{k.lower(): v for k, v in dict(row).items()} for row in cursor.fetchall()]
137 |     except sqlite3.Error as ex:
138 |         logger.error(f"SQLite error reading database: {ex}\n{traceback.format_exc()}")
139 |         return []
140 |     except Exception as ex:
141 |         logger.error(f"Database error: {ex}")
142 |         return []
143 | 
144 |     results: list[SiteResult] = []
145 |     #for row in sql_results:
146 |     #    results.append(SiteResult(
147 |     #        path=datasrc,
148 |     #        id=row.get("id"),
149 |     #        url=row.get("url", ""),
150 |     #        created=from_isoformat_zulu(row.get("created")),
151 |     #        modified=from_isoformat_zulu(row.get("modified")),
152 |     #        robots=row.get("robotstext"),
153 |     #        metadata=None,
154 |     #    ))
155 | 
156 |     for row in sql_results:
157 |         urls_list = __urls_from_text(row.get("urls", ""))
158 |         site_type: SiteType
159 |         db_type = row.get("type")
160 |         if db_type == 1:
161 |             site_type = SiteType.CRAWLED_URL
162 |         elif db_type == 2:
163 |             site_type = SiteType.CRAWLED_LIST
164 |         else:
165 |             site_type = SiteType.UNDEFINED
166 | 
167 |         results.append(SiteResult(
168 |             path=datasrc,
169 |             id=row.get("id"),
170 |             name=row.get("name"),  # project name directly from the Projects table
171 |             type=site_type,  # mapped from the integer Project.Type value
172 |             urls=urls_list,  # newline-delimited Project.Urls split into a list
173 |             created=from_isoformat_zulu(row.get("created")),
174 |             modified=from_isoformat_zulu(row.get("modified")),
175 |             robots=None,  # Removed - not in new model
176 |             metadata=None,
177 |         ))
178 | 
179 |     return results
180 | 
181 | def __urls_from_text(urls: str) -> list[str]:
182 |     urls_list = []
183 |     if urls:
184 |         for url in urls.split('\n'):
185 |             url = url.strip()
186 |             if url:
187 |                 try:
188 |                     parsed = urlparse(url)
189 |                     if parsed.scheme:
190 |                         urls_list.append(url)
191 |                 except Exception:
192 |                     continue
193 |     return urls_list
194 | 
195 | def get_resources(
196 |     datasrc: Path,
197 |     sites: list[int] | None = None,
198 |     query: str = "",
199 |     fields: list[str] | None = None,
200 |     sort: str | None = None,
201 |     limit: int = RESOURCES_LIMIT_DEFAULT,
202 |     offset: int = 0,
203 | ) -> tuple[list[ResourceResult], int, IndexState]:
204 |     """
205 |     Get resources from the InterroBot database.
206 | 
207 |     Args:
208 |         datasrc: path to the InterroBot SQLite database
209 |         sites: optional list of site IDs to filter by
210 |         query: search query string
211 |         fields: optional list of fields to include in response
212 |         sort: sort order for results
213 |         limit: maximum number of results to return
214 |         offset: number of results to skip for pagination
215 | 
216 |     Returns:
217 |         Tuple of (list of ResourceResult objects, total count, index state)
218 |     """
219 |     sites_results: list[SiteResult] = get_sites(datasrc=datasrc, ids=sites)
220 |     assert sites_results, "At least one site is required to search"
221 |     site_paths = [site.path for site in sites_results]
222 |     sites_group = SitesGroup(datasrc, sites, site_paths)
223 | 
224 |     # InterroBot uses ints in place of strings
225 |     swap_values = {
226 |         "type" : {
227 |             "": 0,             # UNDEFINED
228 |             "html": 1,         # PAGE
229 |             "other": 2,        # OTHER (could also be 5 or 12 depending on context)
230 |             "rss": 3,          # FEED
231 |             "iframe": 4,       # FRAME
232 |             "img": 6,          # IMAGE
233 |             "audio": 7,        # AUDIO
234 |             "video": 8,        # VIDEO
235 |             "font": 9,         # FONT
236 |             "style": 10,       # CSS
237 |             "script": 11,      # SCRIPT
238 |             "text": 13,        # TEXT
239 |             "pdf": 14,         # PDF
240 |             "doc": 15          # DOC
241 |         }
242 |     }
243 |     return manager.get_resources_for_sites_group(sites_group, query, fields, sort, limit, offset, swap_values)
244 | 
```
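
The adapter functions above can be exercised directly, without the MCP server in the loop. A minimal sketch, assuming an InterroBot SQLite database at a placeholder path (the path and query are invented for the example):

```python
from pathlib import Path

# hypothetical location of an InterroBot database; adjust to your install
datasrc = Path("/path/to/interrobot.db")

# list configured projects (sites), optionally narrowing the returned fields
sites = get_sites(datasrc, fields=["created", "modified"])
for site in sites:
    print(site.id, site.name, site.urls)

# full-text search scoped to one site; returns (results, total, index_state)
if sites:
    results, total, _ = get_resources(datasrc, sites=[sites[0].id], query="contact", limit=5)
    print(f"{total} matching resources")
```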

--------------------------------------------------------------------------------
/src/mcp_server_webcrawl/extras/snippets.py:
--------------------------------------------------------------------------------

```python
  1 | 
  2 | import re
  3 | import lxml.html
  4 | 
  5 | from lxml import etree
  6 | from lxml.etree import ParserError
  7 | from logging import Logger
  8 | from typing import Final
  9 | 
 10 | from mcp_server_webcrawl.utils.logger import get_logger
 11 | from mcp_server_webcrawl.utils.search import SearchQueryParser
 12 | 
 13 | MAX_SNIPPETS_MATCHED_COUNT: Final[int] = 15
 14 | MAX_SNIPPETS_RETURNED_COUNT: Final[int] = 3
 15 | MAX_SNIPPETS_CONTEXT_SIZE: Final[int] = 48
 16 | 
 17 | __RE_SNIPPET_START_TRIM: Final[re.Pattern] = re.compile(r"^[^\w\[]+")
 18 | __RE_SNIPPET_END_TRIM: Final[re.Pattern] = re.compile(r"[^\w\]]+$")
 19 | 
 20 | logger: Logger = get_logger()
 21 | 
 22 | class SnippetContentExtractor:
 23 |     """
 24 |     lxml-based HTML parser for extracting different types of content from HTML.
 25 |     Content separates into components: text, markup, attributes (values), and comments.
 26 |     These can be prioritized in search so that text is the displayed hit over noisier
 27 |     types.
 28 |     """
 29 |     PRIORITY_ORDER: list[str] = ["url", "document_text", "document_attributes",
 30 |         "document_comments", "headers", "document_markup"]
 31 | 
 32 |     __RE_SPLIT: re.Pattern = re.compile(r"[\s_]+|(?<!\w)-(?!\w)")
 33 |     __RE_WHITESPACE: re.Pattern = re.compile(r"\s+")
 34 |     __MAX_CONTENT_BYTES: int = 2 * 1024 * 1024 # 2MB
 35 | 
 36 |     def __init__(self, url: str, headers: str, content: str):
 37 | 
 38 |         self.__document: lxml.html.HtmlElement | None = None
 39 | 
 40 |         self.url: str = url
 41 |         self.content: str = ""
 42 |         # headers one liner to facilitate snippet
 43 |         self.headers: str = re.sub(r"\s+", " ", headers).strip()
 44 |         self.document_text: str = ""
 45 |         self.document_markup: str = ""
 46 |         self.document_attributes: str = ""
 47 |         self.document_comments: str = ""
 48 | 
 49 |         if len(content) > self.__MAX_CONTENT_BYTES:
 50 |             # ignore large files, slow
 51 |             return
 52 |         else:
 53 |             self.content = content
 54 | 
 55 |         load_success: bool = self.__load_content()
 56 |         if load_success:
 57 |             _ = self.__extract()
 58 |         else:
 59 |             self.document_text = self.__normalize_whitespace(self.content)
 60 | 
 61 |     def __load_content(self) -> bool:
 62 |         """
 63 |         Load content string into lxml doc.
 64 |         """
 65 | 
 66 |         if not self.content or not self.content.strip():
 67 |             return False
 68 | 
 69 |         try:
 70 |             self.__document = lxml.html.fromstring(self.content.encode("utf-8"))
 71 |             return True
 72 |         except (ParserError, ValueError, UnicodeDecodeError):
 73 |             try:
 74 |                 wrapped_content = f"<html><body>{self.content}</body></html>"
 75 |                 self.__document = lxml.html.fromstring(wrapped_content.encode("utf-8"))
 76 |                 return True
 77 |             except (ParserError, ValueError, UnicodeDecodeError):
 78 |                 return False
 79 | 
 80 |     def __extract(self) -> bool:
 81 |         """
 82 |         Extract content from lxml doc.
 83 |         """
 84 | 
 85 |         if self.__document is None:
 86 |             return False
 87 | 
 88 |         text_values = []
 89 |         markup_values = []
 90 |         attribute_values = []
 91 |         comment_values = []
 92 | 
 93 |         element: lxml.html.HtmlElement | None = None
 94 |         for element in self.__document.iter():
 95 | 
 96 |             # HTML outliers
 97 |             if element.tag is etree.Comment or element.tag is etree.ProcessingInstruction:
 98 |                 if element.text is not None:
 99 |                     comment_values.append(str(element.text.strip()))
100 |                 # avoid regular element text processing
101 |                 continue
102 | 
103 |             if element.tag is etree.Entity or element.tag is etree.CDATA:
104 |                 if element.text is not None:
105 |                     text_values.append(str(element.text.strip()))
106 |                 continue
107 | 
108 |             # HTML tags and attributes
109 |             if element.tag:
110 |                 markup_values.append(element.tag)
111 |                 if element.tag in ("script", "style"):
112 |                     continue
113 | 
114 |             if element.text:
115 |                 text_values.append(element.text.strip())
116 | 
117 |             if element.tail:
118 |                 text_values.append(element.tail.strip())
119 | 
120 |             for attr_name, attr_value in element.attrib.items():
121 |                 markup_values.append(attr_name)
122 |                 if attr_value:
123 |                     values = [v for v in self.__RE_SPLIT.split(attr_value) if v]
124 |                     attribute_values.extend(values)
125 | 
126 |         self.document_text = self.__normalize_values(text_values)
127 |         self.document_markup = self.__normalize_values(markup_values)
128 |         self.document_attributes = self.__normalize_values(attribute_values)
129 |         self.document_comments = self.__normalize_values(comment_values)
130 | 
131 |         return True
132 | 
133 |     def __normalize_values(self, values: list[str]) -> str:
134 |         """
135 |         Concatenate values and normalize whitespace for list of values.
136 |         """
137 |         text = " ".join([value for value in values if value])
138 |         return self.__normalize_whitespace(text)
139 | 
140 |     def __normalize_whitespace(self, text: str) -> str:
141 |         """
142 |         Normalize whitespace using pre-compiled pattern.
143 |         """
144 |         return self.__RE_WHITESPACE.sub(" ", text).strip()
145 | 
146 | def get_snippets(url: str, headers: str, content: str, query: str) -> str | None:
147 |     """
148 |     Takes a query and content, reduces the HTML to text content and extracts hits
149 |     as excerpts of text.
150 | 
151 |     Arguments:
152 |         url, headers: The resource URL and header content to search
153 |         content: The HTML or text content to search in
154 |         query: The search query string
155 | 
156 |     Returns:
157 |         A string of snippets with context around matched terms, separated by " ... " or None
158 |     """
159 |     if query in (None, ""):
160 |         return None
161 | 
162 |     url = url or ""
163 |     content = content or ""
164 |     headers = headers or ""
165 | 
166 |     search_terms_parser = SearchQueryParser()
167 |     search_terms: list[str] = search_terms_parser.get_fulltext_terms(query)
168 | 
169 |     if not search_terms:
170 |         return None
171 | 
172 |     snippets = []
173 |     content_extractor = SnippetContentExtractor(url, headers, content)
174 | 
175 |     # priority order url, text, attributes, comments, headers, markup
176 |     # most interesting to least, as search hits
177 |     for group_name in content_extractor.PRIORITY_ORDER:
178 |         search_group_text = getattr(content_extractor, group_name)
179 |         if not search_group_text:
180 |             continue
181 |         group_snippets = find_snippets_in_text(search_group_text, search_terms,
182 |                 max_snippets=MAX_SNIPPETS_MATCHED_COUNT+1, group_name=group_name)
183 |         snippets.extend(group_snippets)
184 |         if len(snippets) > MAX_SNIPPETS_MATCHED_COUNT:
185 |             break
186 | 
187 |     if snippets:
188 |         total_snippets = len(snippets)
189 |         displayed_snippets = snippets[:MAX_SNIPPETS_RETURNED_COUNT]
190 |         result = " ... ".join(displayed_snippets)
191 | 
192 |         if total_snippets > MAX_SNIPPETS_MATCHED_COUNT:
193 |             result += f" ... + >{MAX_SNIPPETS_MATCHED_COUNT} more"
194 |         elif total_snippets > MAX_SNIPPETS_RETURNED_COUNT:
195 |             remaining = total_snippets - MAX_SNIPPETS_RETURNED_COUNT
196 |             result += f" ... +{remaining} more"
197 | 
198 |         return result
199 | 
200 |     return None
201 | 
202 | def find_snippets_in_text(
203 |         text: str,
204 |         terms: list[str],
205 |         max_snippets: int = MAX_SNIPPETS_MATCHED_COUNT,
206 |         group_name: str = "") -> list[str]:
207 |     """
208 |     Searches for whole-word matches of the given terms in the text and extracts
209 |     surrounding context to create highlighted snippets. Each snippet shows the matched term
210 |     in context with markdown-style bold highlighting (**term**).
211 | 
212 |     Args:
213 |         text: The text to search within
214 |         terms: List of search terms to find (case-insensitive, whole words only)
215 |         max_snippets: Maximum number of snippets to return (default: MAX_SNIPPETS_MATCHED_COUNT)
216 |         group_name: Regex group identifier (reserved for future use)
217 | 
218 |     Returns:
219 |         List of unique snippet strings with matched terms highlighted using **bold** markdown.
220 |         Each snippet includes surrounding context up to MAX_SNIPPETS_CONTEXT_SIZE characters
221 |         on each side of the match. Returns empty list if no matches found or invalid input.
222 |     """
223 | 
224 |     if not text or not terms:
225 |         return []
226 | 
227 |     snippets: list[str] = []
228 |     seen_snippets: set[str] = set()
229 |     text_lower: str = text.lower()
230 | 
231 |     escaped_terms = [re.escape(term.lower()) for term in terms]  # lowercased to match against text_lower
232 |     pattern: str = rf"\b({'|'.join(escaped_terms)})\b"
233 |     highlight_patterns: list[tuple[re.Pattern, str]] = [
234 |         (re.compile(rf"\b({re.escape(term)})\b",
235 |         re.IGNORECASE), term) for term in terms
236 |     ]
237 | 
238 |     matches = list(re.finditer(pattern, text_lower))
239 | 
240 |     for match in matches:
241 |         if len(snippets) >= max_snippets:
242 |             break
243 | 
244 |         context_start: int = max(0, match.start() - MAX_SNIPPETS_CONTEXT_SIZE)
245 |         context_end: int = min(len(text), match.end() + MAX_SNIPPETS_CONTEXT_SIZE)
246 |         if context_start > 0:
247 |             while context_start > 0 and text[context_start].isalnum():
248 |                 context_start -= 1
249 |         if context_end < len(text):
250 |             while context_end < len(text) and text[context_end].isalnum():
251 |                 context_end += 1
252 | 
253 |         snippet: str = text[context_start:context_end].strip()
254 |         snippet = __RE_SNIPPET_START_TRIM.sub("", snippet)
255 |         snippet = __RE_SNIPPET_END_TRIM.sub("", snippet)
256 |         highlighted_snippet: str = snippet
257 | 
258 |         for pattern, _ in highlight_patterns:
259 |             highlighted_snippet = pattern.sub(r"**\1**", highlighted_snippet)
260 | 
261 |         if highlighted_snippet and highlighted_snippet not in seen_snippets:
262 |             seen_snippets.add(highlighted_snippet)
263 |             snippets.append(highlighted_snippet)
264 | 
265 |     return snippets
266 | 
```
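
To make the extraction-and-highlight flow above concrete, a short usage sketch; the HTML, headers, and query are invented for the example:

```python
html = (
    "<html><head><title>Contact Pragmar</title></head>"
    "<body><p>Use the contact form to reach support.</p>"
    "<!-- contact form added last year --></body></html>"
)

snippet = get_snippets(
    url="https://example.com/contact",
    headers="HTTP/1.1 200 OK\nContent-Type: text/html",
    content=html,
    query="contact",
)

# the result is up to three " ... "-joined excerpts with hits wrapped in **bold**,
# drawn from the URL and document text before attributes, comments, and markup
print(snippet)
```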

--------------------------------------------------------------------------------
/src/mcp_server_webcrawl/interactive/views/base.py:
--------------------------------------------------------------------------------

```python
  1 | import re
  2 | import curses
  3 | 
  4 | from abc import abstractmethod
  5 | from typing import TYPE_CHECKING
  6 | 
  7 | from mcp_server_webcrawl import __name__ as module_name, __version__ as module_version
  8 | from mcp_server_webcrawl.interactive.ui import ThemeDefinition, ViewBounds
  9 | from mcp_server_webcrawl.models.resources import ResourceResult
 10 | from mcp_server_webcrawl.interactive.ui import safe_addstr
 11 | 
 12 | if TYPE_CHECKING:
 13 |     from mcp_server_webcrawl.interactive.session import InteractiveSession
 14 | 
 15 | REGEX_DISPLAY_URL_CLEAN = re.compile(r"^https?://|/$")
 16 | OUTER_WIDTH_RIGHT_MARGIN = 1
 17 | 
 18 | LAYOUT_FOOTER_SEPARATOR = " | "
 19 | LAYOUT_FOOTER_SEPARATOR_LENGTH = len(LAYOUT_FOOTER_SEPARATOR)
 20 | MIN_TERMINAL_HEIGHT = 8
 21 | MIN_TERMINAL_WIDTH = 40
 22 | CONTENT_MARGIN = 4
 23 | 
 24 | class BaseCursesView:
 25 |     """
 26 |     Base class for all views with common interface.
 27 |     """
 28 | 
 29 |     def __init__(self, session: 'InteractiveSession'):
 30 |         self.session = session
 31 |         self.bounds = ViewBounds(x=0, y=0, width=0, height=0)
 32 |         self._focused = False
 33 |         self._selected_index: int = 0
 34 | 
 35 |     @property
 36 |     def focused(self) -> bool:
 37 |         return self._focused
 38 | 
 39 |     def set_bounds(self, bounds: ViewBounds):
 40 |         """
 41 |         Set the rendering bounds for this view.
 42 |         
 43 |         Args:
 44 |             bounds: The ViewBounds object defining the drawing area
 45 |         """
 46 |         self.bounds = bounds
 47 | 
 48 |     def set_focused(self, focused: bool):
 49 |         """
 50 |         Set the focus state for this view.
 51 |         
 52 |         Args:
 53 |             focused: True if this view should be focused, False otherwise
 54 |         """
 55 |         self._focused = focused
 56 | 
 57 |     @abstractmethod
 58 |     def render(self, stdscr: curses.window) -> None:
 59 |         """
 60 |         Render the view within its bounds.
 61 |         
 62 |         Args:
 63 |             stdscr: The curses window to render on
 64 |         """
 65 |         pass
 66 | 
 67 |     @abstractmethod
 68 |     def handle_input(self, key: int) -> bool:
 69 |         """
 70 |         Handle input. Return True if consumed, False to pass through.
 71 |         
 72 |         Args:
 73 |             key: The input key code
 74 |             
 75 |         Returns:
 76 |             bool: True if input was consumed, False to pass through
 77 |         """
 78 |         pass
 79 | 
 80 |     def focusable(self) -> bool:
 81 |         """
 82 |         Return True if this view can receive focus.
 83 |         
 84 |         Returns:
 85 |             bool: True if this view can receive focus
 86 |         """
 87 |         return True
 88 | 
 89 |     def draw_outer_footer(self, stdscr: curses.window, text: str) -> None:
 90 |         """
 91 |         Draw context-sensitive help footer with pipe-separated items.
 92 |         
 93 |         Args:
 94 |             stdscr: The curses window to draw on
 95 |             text: The footer text to display (pipe-separated items)
 96 |         """
 97 |         height, width = stdscr.getmaxyx()
 98 |         footer_line: int = height - 1
 99 |         footer_line_text: str = BaseCursesView._get_full_width_line(stdscr)
100 |         outer_theme_pair: int = self.session.get_theme_color_pair(ThemeDefinition.HEADER_OUTER)
101 | 
102 |         safe_addstr(stdscr, footer_line, 0, footer_line_text, outer_theme_pair)
103 |         items = [item.strip() for item in text.split(LAYOUT_FOOTER_SEPARATOR)]
104 |         available_width = width - 4 - 2  # 4 for right margin, 2 for left padding
105 | 
106 |         display_text: str = ""
107 |         test_text: str = ""
108 |         test_text_length: int = 0
109 |         for i in range(len(items)):
110 |             test_text = LAYOUT_FOOTER_SEPARATOR.join(items[:i+1])
111 |             test_text_length = len(test_text)
112 |             if test_text_length <= available_width:
113 |                 display_text = test_text
114 |             else:
115 |                 break
116 | 
117 |         # append a "»" indicator when the full footer text does not fit
118 |         display_text_length: int = len(display_text)
119 |         if test_text_length > available_width:
120 |             display_text += f"{(width - display_text_length - 5) * ' '} »"
121 | 
122 |         if display_text:
123 |             outer_header_theme_pair: int = self.session.get_theme_color_pair(ThemeDefinition.HEADER_OUTER)
124 |             safe_addstr(stdscr, footer_line, 1, display_text, outer_header_theme_pair)
125 | 
126 |     def draw_outer_header(self, stdscr: curses.window) -> None:
127 |         """
128 |         Draw the application header with module name and version.
129 |         
130 |         Args:
131 |             stdscr: The curses window to draw on
132 |         """
133 |         _, width = stdscr.getmaxyx()
134 |         style: int = self.session.get_theme_color_pair(ThemeDefinition.HEADER_OUTER)
135 | 
136 |         full_width_line: str = BaseCursesView._get_full_width_line(stdscr)
137 |         header_label_text: str = f"{module_name} --interactive"
138 |         header_version_text: str = f"v{module_version}"
139 |         header_version_x: int = max(0, width - len(header_version_text) - 2)
140 | 
141 |         safe_addstr(stdscr, 0, 0, full_width_line, style)
142 |         if len(header_label_text) < width - 2:
143 |             safe_addstr(stdscr, 0, 1, header_label_text, style)
144 | 
145 |         if header_version_x > len(header_label_text) + 3:
146 |             safe_addstr(stdscr, 0, header_version_x, header_version_text, style)
147 | 
148 |     def draw_inner_footer(self, stdscr: curses.window, bounds: ViewBounds, text: str) -> None:
149 |         """
150 |         Draw context-sensitive help footer.
151 |         
152 |         Args:
153 |             stdscr: The curses window to draw on
154 |             bounds: The view bounds defining the drawing area
155 |             text: The footer text to display
156 |         """
157 |         footer_y: int = bounds.y + bounds.height - 1
158 |         line_of_whitespace: str = self._get_bounded_line()
159 |         display_text: str = text or ""
160 |         display_text_max: int = len(line_of_whitespace) - 2
161 |         if len(display_text) > display_text_max:
162 |             display_text = f"{display_text[:display_text_max - 1]}…"
163 | 
164 |         line: str = f" {display_text}".ljust(len(line_of_whitespace))
165 |         safe_addstr(stdscr, footer_y, bounds.x, line, self._get_inner_header_style())
166 | 
167 |     def draw_inner_header(self, stdscr: curses.window, bounds: ViewBounds, text: str) -> None:
168 |         """
169 |         Draw the inner header for this view section.
170 |         
171 |         Args:
172 |             stdscr: The curses window to draw on
173 |             bounds: The view bounds defining the drawing area
174 |             text: The header text to display
175 |         """
176 | 
177 |         line_of_whitespace: str = self._get_bounded_line()
178 |         display_text: str = text or ""
179 |         max_text_width: int = len(line_of_whitespace) - 2
180 |         if len(display_text) > max_text_width:
181 |             display_text = f"{display_text[:max_text_width - 1]}…"
182 | 
183 |         line: str = f" {display_text}".ljust(len(line_of_whitespace))
184 |         safe_addstr(stdscr, bounds.y, bounds.x, line, self._get_inner_header_style())
185 | 
186 | 
187 |     @staticmethod
188 |     def _get_full_width_line(stdscr: curses.window) -> str:
189 |         """
190 |         Get a line that fills the terminal width.
191 |         
192 |         Args:
193 |             stdscr: The curses window to get dimensions from
194 |             
195 |         Returns:
196 |             str: A string of spaces filling the terminal width
197 |         """
198 |         _, width = stdscr.getmaxyx()
199 |         return " " * (width - OUTER_WIDTH_RIGHT_MARGIN)
200 | 
201 |     @staticmethod
202 |     def url_for_display(url: str) -> str:
203 |         """
204 |         Remove protocol prefix and trailing slash from URL for display.
205 |         
206 |         Args:
207 |             url: The URL to clean for display
208 |             
209 |         Returns:
210 |             str: The cleaned URL without protocol and trailing slash
211 |         """
212 |         return REGEX_DISPLAY_URL_CLEAN.sub("", url)
213 | 
214 |     @staticmethod
215 |     def humanized_bytes(result: ResourceResult) -> str:
216 |         """
217 |         Convert resource size to human-readable format (B, KB, MB).
218 |         
219 |         Args:
220 |             result: The ResourceResult containing size information
221 |             
222 |         Returns:
223 |             str: Human-readable size string (e.g., "1.5MB", "512KB", "128B")
224 |         """
225 |         display: str = ""
226 |         if result is not None:
227 |             size: int = result.size
228 |             if isinstance(size, int):
229 |                 if size >= 1024 * 1024:
230 |                     display = f"{size/(1024*1024):.1f}MB"
231 |                 elif size >= 1024:
232 |                     display = f"{size/1024:.1f}KB"
233 |                 else:
234 |                     display = f"{size}B"
235 |         return display
236 | 
237 |     def _get_inner_header_style(self) -> int:
238 |         """
239 |         Get the appropriate header style based on focus state.
240 |         
241 |         Returns:
242 |             int: The theme color pair for the header
243 |         """
244 |         if self._focused:
245 |             return self.session.get_theme_color_pair(ThemeDefinition.HEADER_ACTIVE)
246 |         else:
247 |             return self.session.get_theme_color_pair(ThemeDefinition.HEADER_INACTIVE)
248 | 
249 |     def _get_input_style(self) -> int:
250 |         """
251 |         Get the appropriate input style based on focus and selection state.
252 |         
253 |         Returns:
254 |             int: The style attributes for input rendering
255 |         """
256 |         if self._focused and self._selected_index == 0:
257 |             return curses.A_REVERSE
258 |         else:
259 |             return self.session.get_theme_color_pair(ThemeDefinition.INACTIVE_QUERY)
260 | 
261 |     def _get_bounded_line(self) -> str:
262 |         """
263 |         Get a line of spaces that fits within the view bounds.
264 |         
265 |         Returns:
266 |             str: A string of spaces matching the view width
267 |         """
268 |         return " " * self.bounds.width
269 | 
270 |     def _renderable(self, stdscr: curses.window) -> bool:
271 |         """
272 |         Check if the view can be rendered within the current terminal bounds.
273 |         
274 |         Args:
275 |             stdscr: The curses window to check dimensions against
276 |             
277 |         Returns:
278 |             bool: True if the view can be rendered, False otherwise
279 |         """
280 |         terminal_height, terminal_width = stdscr.getmaxyx()
281 |         if self.bounds.y >= terminal_height or self.bounds.x >= terminal_width or self.bounds.width <= 0 or self.bounds.height <= 0:
282 |             return False
283 |         return True
284 | 
```
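
A concrete view only needs to implement render and handle_input; bounds, focus, and the header/footer helpers come from the base class above. A minimal sketch, assuming an InteractiveSession supplied by the interactive runtime (the view and its message are invented for the example):

```python
import curses

class MessageView(BaseCursesView):
    """Hypothetical view that prints a single line of text inside its bounds."""

    def __init__(self, session: "InteractiveSession", message: str):
        super().__init__(session)
        self._message = message

    def render(self, stdscr: curses.window) -> None:
        if not self._renderable(stdscr):
            return
        self.draw_inner_header(stdscr, self.bounds, "message")
        # one line below the inner header, clipped to the view width
        clipped = self._message[: max(0, self.bounds.width - 2)]
        safe_addstr(stdscr, self.bounds.y + 1, self.bounds.x + 1, clipped, self._get_input_style())
        self.draw_inner_footer(stdscr, self.bounds, "Esc to close")

    def handle_input(self, key: int) -> bool:
        # nothing consumed here; let the session route keys elsewhere
        return False
```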

--------------------------------------------------------------------------------
/src/mcp_server_webcrawl/crawlers/archivebox/tests.py:
--------------------------------------------------------------------------------

```python
  1 | from mcp_server_webcrawl.crawlers.archivebox.crawler import ArchiveBoxCrawler
  2 | from mcp_server_webcrawl.crawlers.archivebox.adapter import ArchiveBoxManager
  3 | from mcp_server_webcrawl.crawlers.base.tests import BaseCrawlerTests
  4 | from mcp_server_webcrawl.crawlers import get_fixture_directory
  5 | from mcp_server_webcrawl.utils.logger import get_logger
  6 | 
  7 | # calculate ids for ArchiveBox working directories using the same hash function as adapter
  8 | EXAMPLE_SITE_ID = ArchiveBoxManager.string_to_id("example")
  9 | PRAGMAR_SITE_ID = ArchiveBoxManager.string_to_id("pragmar")
 10 | 
 11 | logger = get_logger()
 12 | 
 13 | class ArchiveBoxTests(BaseCrawlerTests):
 14 |     """
 15 |     Test suite for the ArchiveBox crawler implementation.
 16 |     Uses wrapped test methods from BaseCrawlerTests adapted for ArchiveBox's multi-instance structure.
 17 |     """
 18 | 
 19 |     def setUp(self):
 20 |         """
 21 |         Set up the test environment with fixture data.
 22 |         """
 23 |         super().setUp()
 24 |         self._datasrc = get_fixture_directory() / "archivebox"
 25 | 
 26 |     def test_archivebox_pulse(self):
 27 |         """
 28 |         Test basic crawler initialization.
 29 |         """
 30 |         crawler = ArchiveBoxCrawler(self._datasrc)
 31 |         self.assertIsNotNone(crawler)
 32 |         self.assertTrue(self._datasrc.is_dir())
 33 | 
 34 |     def test_archivebox_sites(self):
 35 |         """
 36 |         Test site retrieval API functionality.
 37 |         """
 38 |         crawler = ArchiveBoxCrawler(self._datasrc)
 39 | 
 40 |         # should have multiple sites (example and pragmar working directories)
 41 |         sites_json = crawler.get_sites_api()
 42 |         self.assertGreaterEqual(sites_json.total, 2, "ArchiveBox should have multiple working directories as sites")
 43 | 
 44 |         # test pragmar site specifically
 45 |         self.run_pragmar_site_tests(crawler, PRAGMAR_SITE_ID)
 46 | 
 47 |     def test_archivebox_search(self):
 48 |         """
 49 |         Test boolean search functionality.
 50 |         """
 51 |         crawler = ArchiveBoxCrawler(self._datasrc)
 52 |         self.run_pragmar_search_tests(crawler, PRAGMAR_SITE_ID)
 53 | 
 54 |     def test_archivebox_tokenizer(self):
 55 |         """
 56 |         Test tokenizer search functionality.
 57 |         """
 58 |         crawler = ArchiveBoxCrawler(self._datasrc)
 59 |         self.run_pragmar_tokenizer_tests(crawler, PRAGMAR_SITE_ID)
 60 | 
 61 |     def test_archivebox_resources(self):
 62 |         """
 63 |         Test resource retrieval API functionality with various parameters.
 64 |         """
 65 |         crawler = ArchiveBoxCrawler(self._datasrc)
 66 |         self.run_sites_resources_tests(crawler, PRAGMAR_SITE_ID, EXAMPLE_SITE_ID)
 67 | 
 68 |     def test_archivebox_images(self):
 69 |         """
 70 |         Test ArchiveBox image handling and thumbnails.
 71 |         """
 72 |         crawler = ArchiveBoxCrawler(self._datasrc)
 73 |         self.run_pragmar_image_tests(crawler, PRAGMAR_SITE_ID)
 74 | 
 75 |     def test_archivebox_sorts(self):
 76 |         """
 77 |         Test random sort functionality using the '?' sort parameter.
 78 |         """
 79 |         crawler = ArchiveBoxCrawler(self._datasrc)
 80 |         self.run_pragmar_sort_tests(crawler, PRAGMAR_SITE_ID)
 81 | 
 82 |     def test_archivebox_content_parsing(self):
 83 |         """
 84 |         Test content type detection and parsing for ArchiveBox resources.
 85 |         """
 86 |         crawler = ArchiveBoxCrawler(self._datasrc)
 87 |         self.run_pragmar_content_tests(crawler, PRAGMAR_SITE_ID, False)
 88 | 
 89 |     def test_archivebox_url_reconstruction(self):
 90 |         """
 91 |         Test URL reconstruction from ArchiveBox metadata.
 92 |         """
 93 |         crawler = ArchiveBoxCrawler(self._datasrc)
 94 | 
 95 |         url_resources = crawler.get_resources_api(sites=[PRAGMAR_SITE_ID], limit=20)
 96 |         self.assertGreater(url_resources.total, 0, "Should have resources with reconstructed URLs")
 97 | 
 98 |         for resource in url_resources._results:
 99 |             # URLs should be valid HTTP/HTTPS (except for archivebox:// fallbacks)
100 |             self.assertTrue(
101 |                 resource.url.startswith(('http://', 'https://', 'archivebox://')),
102 |                 f"URL should have valid scheme: {resource.url}"
103 |             )
104 | 
105 |             # should not end with index.html (stripped during reconstruction)
106 |             self.assertFalse(
107 |                 resource.url.endswith('/index.html'),
108 |                 f"URL should not end with index.html: {resource.url}"
109 |             )
110 | 
111 |     def test_archivebox_deduplication(self):
112 |         """
113 |         Test resource deduplication across timestamped entries.
114 |         """
115 |         crawler = ArchiveBoxCrawler(self._datasrc)
116 | 
117 |         # get all resources from pragmar site
118 |         all_resources = crawler.get_resources_api(sites=[PRAGMAR_SITE_ID], limit=100)
119 |         self.assertGreater(all_resources.total, 0, "Should have resources")
120 | 
121 |         # check for URL uniqueness (deduplication should ensure unique URLs)
122 |         urls_found = [r.url for r in all_resources._results]
123 |         unique_urls = set(urls_found)
124 | 
125 |         # should have deduplication working (though some URLs might legitimately appear multiple times
126 |         # if they're different resources, like different timestamps of the same page)
127 |         self.assertLessEqual(len(unique_urls), len(urls_found),
128 |                             "URL deduplication should work properly")
129 | 
130 |     def test_archivebox_metadata_parsing(self):
131 |         """
132 |         Test JSON metadata parsing from ArchiveBox files.
133 |         """
134 |         crawler = ArchiveBoxCrawler(self._datasrc)
135 | 
136 |         # get resources with headers from pragmar site
137 |         header_resources = crawler.get_resources_api(
138 |             sites=[PRAGMAR_SITE_ID],
139 |             fields=["headers"],
140 |             limit=10
141 |         )
142 | 
143 |         if header_resources.total > 0:
144 |             headers_found = 0
145 |             for resource in header_resources._results:
146 |                 resource_dict = resource.to_dict()
147 |                 if "headers" in resource_dict and resource_dict["headers"]:
148 |                     headers_found += 1
149 |                     self.assertIn("HTTP/1.0", resource_dict["headers"],
150 |                                 "Headers should contain HTTP status line")
151 | 
152 |             # at least some resources should have parsed headers
153 |             self.assertGreater(headers_found, 0, "Should find resources with parsed headers")
154 | 
155 |     def test_archivebox_timestamped_structure(self):
156 |         """
157 |         Test handling of ArchiveBox's timestamped entry structure.
158 |         """
159 |         crawler = ArchiveBoxCrawler(self._datasrc)
160 | 
161 |         # get resources with timestamps from pragmar site
162 |         timestamp_resources = crawler.get_resources_api(
163 |             sites=[PRAGMAR_SITE_ID],
164 |             fields=["created", "modified"],
165 |             limit=10
166 |         )
167 | 
168 |         self.assertGreater(timestamp_resources.total, 0, "Should have timestamped resources")
169 | 
170 |         for resource in timestamp_resources._results:
171 |             resource_dict = resource.to_dict()
172 | 
173 |             # should have timestamp information
174 |             self.assertIsNotNone(resource_dict.get("created"),
175 |                                 "Should have created timestamp from entry directory")
176 |             self.assertIsNotNone(resource_dict.get("modified"),
177 |                                 "Should have modified timestamp from entry directory")
178 | 
179 |     def test_archivebox_error_resilience(self):
180 |         """
181 |         Test resilience to malformed JSON and missing files.
182 |         """
183 |         crawler = ArchiveBoxCrawler(self._datasrc)
184 | 
185 |         # should continue processing despite any JSON parsing errors
186 |         all_resources = crawler.get_resources_api(sites=[PRAGMAR_SITE_ID])
187 | 
188 |         # verify we got some resources despite potential errors
189 |         self.assertGreater(all_resources.total, 0,
190 |                           "Should process entries even with JSON parsing errors")
191 | 
192 |         # verify resources have reasonable defaults
193 |         for resource in all_resources._results:
194 |             self.assertIsNotNone(resource.url, "URL should always be set")
195 |             self.assertIsInstance(resource.status, int, "Status should be integer")
196 |             self.assertGreaterEqual(resource.status, 0, "Status should be non-negative")
197 |             self.assertLessEqual(resource.status, 599, "Status should be valid HTTP status")
198 | 
199 |     def test_archivebox_multi_site(self):
200 |         """
201 |         Test that multiple ArchiveBox working directories are treated as separate sites.
202 |         """
203 |         crawler = ArchiveBoxCrawler(self._datasrc)
204 | 
205 |         # get resources from each site separately
206 |         example_resources = crawler.get_resources_api(sites=[EXAMPLE_SITE_ID], limit=10)
207 |         pragmar_resources = crawler.get_resources_api(
208 |             query="url: pragmar.com",
209 |             sites=[PRAGMAR_SITE_ID],
210 |             limit=10)
211 | 
212 |         # print(example_resources.to_dict())
213 |         # print(pragmar_resources.to_dict())
214 | 
215 |         # both sites should have resources
216 |         self.assertGreater(example_resources.total, 0, "Example site should have resources")
217 |         self.assertGreater(pragmar_resources.total, 0, "Pragmar site should have resources")
218 | 
219 |         # URLs should reflect the appropriate domains
220 |         example_urls = [r.url for r in example_resources._results]
221 |         pragmar_urls = [r.url for r in pragmar_resources._results]
222 | 
223 |         # verify site separation (pragmar resources should be about pragmar.com)
224 |         pragmar_domain_urls = [url for url in pragmar_urls if "pragmar.com" in url]
225 |         self.assertGreater(len(pragmar_domain_urls), 0,
226 |                           "Pragmar site should contain pragmar.com URLs")
227 | 
228 |     def test_report(self):
229 |         """
230 |         Run test report for ArchiveBox archive.
231 |         """
232 |         crawler = ArchiveBoxCrawler(self._datasrc)
233 | 
234 |         # generate report using pragmar site ID
235 |         report = self.run_pragmar_report(crawler, PRAGMAR_SITE_ID, "ArchiveBox")
236 |         logger.info(report)
237 | 
238 |         # basic validation that report contains expected content
239 |         self.assertIn("ArchiveBox", report, "Report should mention ArchiveBox")
240 |         self.assertIn("Total pages:", report, "Report should show page counts")
```
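
The site IDs used throughout the suite are hashed from the ArchiveBox working-directory names with the same function the adapter uses, so ad-hoc queries can be written the same way. A short sketch, assuming a fixture layout like the one the tests rely on (the path is a placeholder):

```python
from pathlib import Path

from mcp_server_webcrawl.crawlers.archivebox.adapter import ArchiveBoxManager
from mcp_server_webcrawl.crawlers.archivebox.crawler import ArchiveBoxCrawler

# hypothetical datasrc containing "example" and "pragmar" working directories
datasrc = Path("/path/to/fixtures/archivebox")

pragmar_id = ArchiveBoxManager.string_to_id("pragmar")
crawler = ArchiveBoxCrawler(datasrc)

# the same call the tests make: scope to one site and cap the result count
resources = crawler.get_resources_api(sites=[pragmar_id], query="url: pragmar.com", limit=5)
print(resources.total)
```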

--------------------------------------------------------------------------------
/docs/index.html:
--------------------------------------------------------------------------------

```html
  1 | 
  2 | 
  3 | <!DOCTYPE html>
  4 | <html class="writer-html5" lang="en" data-content_root="./">
  5 | <head>
  6 |   <meta charset="utf-8" /><meta name="viewport" content="width=device-width, initial-scale=1" />
  7 | 
  8 |   <meta name="viewport" content="width=device-width, initial-scale=1.0" />
  9 |   <title>mcp-server-webcrawl &mdash; mcp-server-webcrawl  documentation</title>
 10 |       <link rel="stylesheet" type="text/css" href="_static/pygments.css?v=80d5e7a1" />
 11 |       <link rel="stylesheet" type="text/css" href="_static/css/theme.css?v=e59714d7" />
 12 | 
 13 |   
 14 |       <script src="_static/jquery.js?v=5d32c60e"></script>
 15 |       <script src="_static/_sphinx_javascript_frameworks_compat.js?v=2cd50e6c"></script>
 16 |       <script src="_static/documentation_options.js?v=5929fcd5"></script>
 17 |       <script src="_static/doctools.js?v=888ff710"></script>
 18 |       <script src="_static/sphinx_highlight.js?v=dc90522c"></script>
 19 |     <script src="_static/js/theme.js"></script>
 20 |     <link rel="index" title="Index" href="genindex.html" />
 21 |     <link rel="search" title="Search" href="search.html" />
 22 |     <link rel="next" title="Installation" href="installation.html" /> 
 23 | </head>
 24 | 
 25 | <body class="wy-body-for-nav"> 
 26 |   <div class="wy-grid-for-nav">
 27 |     <nav data-toggle="wy-nav-shift" class="wy-nav-side">
 28 |       <div class="wy-side-scroll">
 29 |         <div class="wy-side-nav-search" >
 30 | 
 31 |           
 32 |           
 33 |           <a href="#" class="icon icon-home">
 34 |             mcp-server-webcrawl
 35 |           </a>
 36 | <div role="search">
 37 |   <form id="rtd-search-form" class="wy-form" action="search.html" method="get">
 38 |     <input type="text" name="q" placeholder="Search docs" aria-label="Search docs" />
 39 |     <input type="hidden" name="check_keywords" value="yes" />
 40 |     <input type="hidden" name="area" value="default" />
 41 |   </form>
 42 | </div>
 43 |         </div><div class="wy-menu wy-menu-vertical" data-spy="affix" role="navigation" aria-label="Navigation menu">
 44 |               <p class="caption" role="heading"><span class="caption-text">Contents:</span></p>
 45 | <ul>
 46 | <li class="toctree-l1"><a class="reference internal" href="installation.html">Installation</a></li>
 47 | <li class="toctree-l1"><a class="reference internal" href="guides.html">Setup Guides</a></li>
 48 | <li class="toctree-l1"><a class="reference internal" href="usage.html">Usage</a></li>
 49 | <li class="toctree-l1"><a class="reference internal" href="prompts.html">Prompt Routines</a></li>
 50 | <li class="toctree-l1"><a class="reference internal" href="interactive.html">Interactive Mode</a></li>
 51 | <li class="toctree-l1"><a class="reference internal" href="modules.html">mcp_server_webcrawl</a></li>
 52 | </ul>
 53 | 
 54 |         </div>
 55 |       </div>
 56 |     </nav>
 57 | 
 58 |     <section data-toggle="wy-nav-shift" class="wy-nav-content-wrap"><nav class="wy-nav-top" aria-label="Mobile navigation menu" >
 59 |           <i data-toggle="wy-nav-top" class="fa fa-bars"></i>
 60 |           <a href="#">mcp-server-webcrawl</a>
 61 |       </nav>
 62 | 
 63 |       <div class="wy-nav-content">
 64 |         <div class="rst-content">
 65 |           <div role="navigation" aria-label="Page navigation">
 66 |   <ul class="wy-breadcrumbs">
 67 |       <li><a href="#" class="icon icon-home" aria-label="Home"></a></li>
 68 |       <li class="breadcrumb-item active">mcp-server-webcrawl</li>
 69 |       <li class="wy-breadcrumbs-aside">
 70 |             <a href="_sources/index.rst.txt" rel="nofollow"> View page source</a>
 71 |       </li>
 72 |   </ul>
 73 |   <hr/>
 74 | </div>
 75 |           <div role="main" class="document" itemscope="itemscope" itemtype="http://schema.org/Article">
 76 |            <div itemprop="articleBody">
 77 |              
 78 |   <a class="reference internal image-reference" href="_images/mcpswc.svg"><img alt="mcp-server-webcrawl heading" class="align-center" src="_images/mcpswc.svg" width="100%" /></a>
 79 | <div style="text-align: center; margin-bottom: 2em;">
 80 |   <a href="https://pragmar.com/mcp-server-webcrawl/" style="margin: 0 4px;">Website</a> |
 81 |   <a href="https://github.com/pragmar/mcp-server-webcrawl" style="margin: 0 4px;">Github</a> |
 82 |   <a href="https://pragmar.github.io/mcp-server-webcrawl/" style="margin: 0 4px;">Docs</a> |
 83 |   <a href="https://pypi.org/project/mcp-server-webcrawl/" style="margin: 0 4px;">PyPI</a>
 84 | 
 85 | </div><section id="mcp-server-webcrawl">
 86 | <h1>mcp-server-webcrawl<a class="headerlink" href="#mcp-server-webcrawl" title="Link to this heading"></a></h1>
 87 | <p>Advanced search and retrieval for web crawler data. With <strong>mcp-server-webcrawl</strong>, your AI client filters
 88 | and analyzes web content under your direction or autonomously. The server includes a full-text search
 89 | interface with boolean support, and resource filtering by type, HTTP status, and more.</p>
 90 | <p><strong>mcp-server-webcrawl</strong> provides the LLM a complete menu with which to search your web content, and works with
 91 | a variety of web crawlers:</p>
 92 | <table class="docutils align-default" id="id7">
 93 | <caption><span class="caption-text">Supported Crawlers</span><a class="headerlink" href="#id7" title="Link to this table"></a></caption>
 94 | <colgroup>
 95 | <col style="width: 30.0%" />
 96 | <col style="width: 50.0%" />
 97 | <col style="width: 20.0%" />
 98 | </colgroup>
 99 | <thead>
100 | <tr class="row-odd"><th class="head"><p>Crawler/Format</p></th>
101 | <th class="head"><p>Description</p></th>
102 | <th class="head"><p>Setup Guide</p></th>
103 | </tr>
104 | </thead>
105 | <tbody>
106 | <tr class="row-even"><td><p><a class="reference external" href="https://archivebox.io">ArchiveBox</a></p></td>
107 | <td><p>Self-hosted web archiving tool</p></td>
108 | <td><p><a class="reference external" href="https://pragmar.github.io/mcp-server-webcrawl/guides/archivebox.html">Setup Guide</a></p></td>
109 | </tr>
110 | <tr class="row-odd"><td><p><a class="reference external" href="https://www.httrack.com">HTTrack</a></p></td>
111 | <td><p>GUI/CLI website mirroring tool</p></td>
112 | <td><p><a class="reference external" href="https://pragmar.github.io/mcp-server-webcrawl/guides/httrack.html">Setup Guide</a></p></td>
113 | </tr>
114 | <tr class="row-even"><td><p><a class="reference external" href="https://interro.bot">InterroBot</a></p></td>
115 | <td><p>GUI crawler and analyzer</p></td>
116 | <td><p><a class="reference external" href="https://pragmar.github.io/mcp-server-webcrawl/guides/interrobot.html">Setup Guide</a></p></td>
117 | </tr>
118 | <tr class="row-odd"><td><p><a class="reference external" href="https://github.com/projectdiscovery/katana">Katana</a></p></td>
119 | <td><p>CLI security-focused crawler</p></td>
120 | <td><p><a class="reference external" href="https://pragmar.github.io/mcp-server-webcrawl/guides/katana.html">Setup Guide</a></p></td>
121 | </tr>
122 | <tr class="row-even"><td><p><a class="reference external" href="https://crawler.siteone.io">SiteOne</a></p></td>
123 | <td><p>GUI crawler and analyzer</p></td>
124 | <td><p><a class="reference external" href="https://pragmar.github.io/mcp-server-webcrawl/guides/siteone.html">Setup Guide</a></p></td>
125 | </tr>
126 | <tr class="row-odd"><td><p><a class="reference external" href="https://en.wikipedia.org/wiki/WARC_(file_format)">WARC</a></p></td>
127 | <td><p>Standard web archive format</p></td>
128 | <td><p><a class="reference external" href="https://pragmar.github.io/mcp-server-webcrawl/guides/warc.html">Setup Guide</a></p></td>
129 | </tr>
130 | <tr class="row-even"><td><p><a class="reference external" href="https://en.wikipedia.org/wiki/Wget">wget</a></p></td>
131 | <td><p>CLI website mirroring tool</p></td>
132 | <td><p><a class="reference external" href="https://pragmar.github.io/mcp-server-webcrawl/guides/wget.html">Setup Guide</a></p></td>
133 | </tr>
134 | </tbody>
135 | </table>
136 | <p><strong>mcp-server-webcrawl</strong> is free and open source, and requires Claude Desktop and Python (&gt;=3.10). It is installed from the command line via pip:</p>
137 | <div class="highlight-bash notranslate"><div class="highlight"><pre><span></span>pip<span class="w"> </span>install<span class="w"> </span>mcp-server-webcrawl
138 | </pre></div>
139 | </div>
140 | <iframe width="560" height="315" style="display: block;margin-bottom:1rem;" src="https://www.youtube.com/embed/Sid-GBxII1o" frameborder="0" allowfullscreen></iframe><div class="toctree-wrapper compound">
141 | <p class="caption" role="heading"><span class="caption-text">Contents:</span></p>
142 | <ul>
143 | <li class="toctree-l1"><a class="reference internal" href="installation.html">Installation</a></li>
144 | <li class="toctree-l1"><a class="reference internal" href="guides.html">Setup Guides</a></li>
145 | <li class="toctree-l1"><a class="reference internal" href="usage.html">Usage</a></li>
146 | <li class="toctree-l1"><a class="reference internal" href="prompts.html">Prompt Routines</a></li>
147 | <li class="toctree-l1"><a class="reference internal" href="interactive.html">Interactive Mode</a></li>
148 | <li class="toctree-l1"><a class="reference internal" href="modules.html">mcp_server_webcrawl</a></li>
149 | </ul>
150 | </div>
151 | <section id="indices-and-tables">
152 | <h2>Indices and tables<a class="headerlink" href="#indices-and-tables" title="Link to this heading"></a></h2>
153 | <ul class="simple">
154 | <li><p><a class="reference internal" href="genindex.html"><span class="std std-ref">Index</span></a></p></li>
155 | <li><p><a class="reference internal" href="py-modindex.html"><span class="std std-ref">Module Index</span></a></p></li>
156 | <li><p><a class="reference internal" href="search.html"><span class="std std-ref">Search Page</span></a></p></li>
157 | </ul>
158 | </section>
159 | </section>
160 | 
161 | 
162 |            </div>
163 |           </div>
164 |           <footer><div class="rst-footer-buttons" role="navigation" aria-label="Footer">
165 |         <a href="installation.html" class="btn btn-neutral float-right" title="Installation" accesskey="n" rel="next">Next <span class="fa fa-arrow-circle-right" aria-hidden="true"></span></a>
166 |     </div>
167 | 
168 |   <hr/>
169 | 
170 |   <div role="contentinfo">
171 |     <p>&#169; Copyright 2025, pragmar.</p>
172 |   </div>
173 | 
174 |   Built with <a href="https://www.sphinx-doc.org/">Sphinx</a> using a
175 |     <a href="https://github.com/readthedocs/sphinx_rtd_theme">theme</a>
176 |     provided by <a href="https://readthedocs.org">Read the Docs</a>.
177 |    
178 | 
179 | </footer>
180 |         </div>
181 |       </div>
182 |     </section>
183 |   </div>
184 |   <script>
185 |       jQuery(function () {
186 |           SphinxRtdTheme.Navigation.enable(true);
187 |       });
188 |   </script> 
189 | 
190 | </body>
191 | </html>
```

--------------------------------------------------------------------------------
/src/mcp_server_webcrawl/crawlers/warc/adapter.py:
--------------------------------------------------------------------------------

```python
  1 | import email.utils
  2 | import os
  3 | import sqlite3
  4 | import warcio
  5 | 
  6 | from contextlib import closing
  7 | from datetime import datetime, timezone
  8 | from pathlib import Path
  9 | from typing import Final
 10 | from warcio.recordloader import ArcWarcRecord
 11 | 
 12 | from mcp_server_webcrawl.crawlers.base.adapter import (
 13 |     IndexState,
 14 |     IndexStatus,
 15 |     SitesGroup,
 16 |     INDEXED_BATCH_SIZE,
 17 |     INDEXED_WARC_EXTENSIONS,
 18 | )
 19 | from mcp_server_webcrawl.crawlers.base.indexed import IndexedManager
 20 | from mcp_server_webcrawl.models.resources import (
 21 |     ResourceResult,
 22 |     ResourceResultType,
 23 |     RESOURCES_LIMIT_DEFAULT,
 24 | )
 25 | from mcp_server_webcrawl.models.sites import (
 26 |     SiteResult,
 27 |     SiteType,
 28 |     SITES_FIELDS_DEFAULT,
 29 |     SITES_FIELDS_BASE,
 30 | )
 31 | from mcp_server_webcrawl.utils.logger import get_logger
 32 | 
 33 | logger = get_logger()
 34 | 
 35 | 
 36 | class WarcManager(IndexedManager):
 37 |     """
 38 |     Manages WARC file data in in-memory SQLite databases.
 39 |     Provides connection pooling and caching for efficient access.
 40 |     """
 41 | 
 42 |     def __init__(self) -> None:
 43 |         """Initialize the WARC manager with empty cache and statistics."""
 44 |         super().__init__()
 45 | 
 46 |     def _load_site_data(self, connection: sqlite3.Connection, warc_path: Path,
 47 |         site_id: int, index_state: IndexState = None) -> None:
 48 |         """
 49 |         Load a WARC file into the database with batch processing for better performance.
 50 | 
 51 |         Args:
 52 |             connection: SQLite connection
 53 |             warc_path: path to the WARC file
 54 |             site_id: ID for the site
 55 |             index_state: IndexState object for tracking progress
 56 |         """
 57 |         if not warc_path.exists() or not warc_path.is_file():
 58 |             logger.error(f"WARC file not found or not a file: {warc_path}")
 59 |             return
 60 | 
 61 |         with closing(connection.cursor()) as cursor:
 62 |             if index_state is not None:
 63 |                 index_state.set_status(IndexStatus.INDEXING)
 64 |             try:
 65 |                 batch_insert_resource_results: list[ResourceResult] = []
 66 |                 batch_count: int = 0
 67 |                 with open(warc_path, "rb") as stream:
 68 |                     for warc_record in warcio.ArchiveIterator(stream):
 69 | 
 70 |                         if index_state is not None and index_state.is_timeout():
 71 |                             index_state.set_status(IndexStatus.PARTIAL)
 72 |                             # commit current batch and shut it down
 73 |                             if batch_insert_resource_results:
 74 |                                 self._execute_batch_insert(connection, cursor, batch_insert_resource_results)
 75 |                             return
 76 | 
 77 |                         if warc_record is not None and warc_record.rec_type == "response":
 78 |                             resource_result: ResourceResult = self._prepare_warc_record(warc_record, site_id)
 79 |                             if resource_result:
 80 |                                 batch_insert_resource_results.append(resource_result)
 81 |                                 if index_state is not None:
 82 |                                     index_state.increment_processed()
 83 | 
 84 |                                 batch_count += 1
 85 |                                 if batch_count >= INDEXED_BATCH_SIZE:
 86 |                                     self._execute_batch_insert(connection, cursor, batch_insert_resource_results)
 87 |                                     batch_insert_resource_results = []
 88 |                                     batch_count = 0
 89 | 
 90 |                 # batch insert remaining
 91 |                 if batch_insert_resource_results:
 92 |                     self._execute_batch_insert(connection, cursor, batch_insert_resource_results)
 93 | 
 94 |                 if index_state is not None and index_state.status == IndexStatus.INDEXING:
 95 |                     index_state.set_status(IndexStatus.COMPLETE)
 96 | 
 97 |             except Exception as ex:
 98 |                 logger.error(f"Error processing WARC file {warc_path}: {ex}")
 99 |                 if index_state is not None:
100 |                     index_state.set_status(IndexStatus.FAILED)
101 | 
102 |     def _prepare_warc_record(self, record: ArcWarcRecord, site_id: int) -> ResourceResult | None:
103 |         """
104 |         Prepare a WARC record for batch insertion.
105 | 
106 |         Args:
107 |             record: a warcio record object
108 |             site_id: ID for the site
109 | 
110 |         Returns:
111 |             Tuple of values ready for insertion, or None if processing fails
112 |         """
113 |         try:
114 |             url: str = record.rec_headers.get_header("WARC-Target-URI")
115 |             content_type: str = record.http_headers.get_header("Content-Type", "")
116 |             status: int = int(record.http_headers.get_statuscode()) or 200
117 |             resource_type: ResourceResultType = self._determine_resource_type(content_type)
118 |             content: bytes = record.content_stream().read()
119 |             content_size: int = len(content)
120 | 
121 |             if self._is_text_content(content_type):
122 |                 try:
123 |                     content_str: str = content.decode("utf-8")
124 |                 except UnicodeDecodeError:
125 |                     content_str = None
126 |             else:
127 |                 content_str = None
128 | 
129 |             warc_date = record.rec_headers.get_header("WARC-Date")
130 |             if warc_date:
131 |                 try:
132 |                     file_created = datetime.fromisoformat(warc_date.replace('Z', '+00:00'))
133 |                 except ValueError:
134 |                     # Fallback to email date parser
135 |                     try:
136 |                         time_tuple = email.utils.parsedate_tz(warc_date)
137 |                         file_created = datetime.fromtimestamp(email.utils.mktime_tz(time_tuple), tz=timezone.utc)
138 |                     except (ValueError, TypeError):
139 |                         file_created = datetime.now(timezone.utc)
140 |             else:
141 |                 file_created = None # don't pretend it is now, ResourceResult can survive
142 |             file_modified = file_created # like file stat indexes, these are equivalent
143 | 
144 |             result = ResourceResult(
145 |                 id=IndexedManager.string_to_id(url),
146 |                 site=site_id,
147 |                 created=file_created,
148 |                 modified=file_modified,
149 |                 url=url,
150 |                 type=resource_type,
151 |                 status=status,
152 |                 headers=record.http_headers.to_str(),
153 |                 content=content_str,
154 |                 size=content_size,
155 |                 time=0  # time not available
156 |             )
157 |             return result
158 |         except Exception as ex:
159 |             logger.error(f"Error processing WARC record for URL {url if 'url' in locals() else 'unknown'}: {ex}")
160 |             return None
161 | 
162 | manager: WarcManager = WarcManager()
163 | 
164 | def get_sites(
165 |     datasrc: Path,
166 |     ids: list[int] | None = None,
167 |     fields: list[str] | None = None
168 | ) -> list[SiteResult]:
169 |     """
170 |     List WARC files in the datasrc directory as sites.
171 | 
172 |     Args:
173 |         datasrc: path to the directory containing WARC files
174 |         ids: optional list of site IDs to filter by
175 |         fields: list of fields to include in the response
176 | 
177 |     Returns:
178 |         List of SiteResult objects, one for each WARC file
179 |     """
180 |     assert datasrc is not None, f"datasrc not provided ({datasrc})"
181 | 
182 |     # nothing can be done, but don't crash the server either, keep chugging along
183 |     if not datasrc.exists():
184 |         logger.error(f"Directory not found ({datasrc})")
185 |         return []
186 | 
187 |     # determine which fields to include
188 |     selected_fields: set[str] = set(SITES_FIELDS_BASE)
189 |     if fields:
190 |         valid_fields: set[str] = set(SITES_FIELDS_DEFAULT)
191 |         selected_fields.update(f for f in fields if f in valid_fields)
192 |     else:
193 |         selected_fields.update(SITES_FIELDS_DEFAULT)
194 | 
195 |     results: list[SiteResult] = []
196 | 
197 |     files_to_check: list[Path] = []
198 |     for ext in INDEXED_WARC_EXTENSIONS:
199 |         files_to_check.extend(datasrc.glob(f"*{ext}"))
200 | 
201 |     # map of file_id -> file_path for filtering
202 |     file_id_map: dict[int, Path] = {WarcManager.string_to_id(str(os.path.basename(f))): f for f in files_to_check if f is not None}
203 | 
204 |     if ids:
205 |         file_id_map = {id_val: path for id_val, path in file_id_map.items() if id_val in ids}
206 | 
221 |     for site_id, file_path in sorted(file_id_map.items()):
222 |         file_stat = file_path.stat()
223 |         created_time: datetime = datetime.fromtimestamp(file_stat.st_ctime)
224 |         modified_time: datetime = datetime.fromtimestamp(file_stat.st_mtime)
225 |         site: SiteResult = SiteResult(
226 |             path=file_path,
227 |             id=site_id,
228 |             name=file_path.name,  # the filename serves as the site name
229 |             type=SiteType.CRAWLED_URL,  # each WARC file is treated as a single-site crawl
230 |             urls=[str(file_path.absolute())],  # list form, with the file path standing in for the crawl URL
231 |             created=created_time if "created" in selected_fields else None,
232 |             modified=modified_time if "modified" in selected_fields else None,
233 |         )
234 |         results.append(site)
235 |     return results
236 | 
237 | def get_resources(
238 |     datasrc: Path,
239 |     sites: list[int] | None = None,
240 |     query: str = "",
241 |     fields: list[str] | None = None,
242 |     sort: str | None = None,
243 |     limit: int = RESOURCES_LIMIT_DEFAULT,
244 |     offset: int = 0,
245 | ) -> tuple[list[ResourceResult], int, IndexState]:
246 |     """
247 |     Get resources from WARC files using in-memory SQLite.
248 | 
249 |     Args:
250 |         datasrc: path to the directory containing WARC files
251 |         sites: optional list of site IDs to filter by
252 |         query: search query string
253 |         fields: optional list of fields to include in response
254 |         sort: sort order for results
255 |         limit: maximum number of results to return
256 |         offset: number of results to skip for pagination
257 | 
258 |     Returns:
259 |         Tuple of (list of ResourceResult objects, total count, index state)
260 |     """
261 |     sites_results: list[SiteResult] = get_sites(datasrc=datasrc, ids=sites)
262 |     assert sites_results, "At least one site is required to search"
263 |     site_paths = [site.path for site in sites_results]
264 |     sites_group = SitesGroup(datasrc, sites, site_paths)
265 |     return manager.get_resources_for_sites_group(sites_group, query, fields, sort, limit, offset)
266 | 
```
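For orientation, here is a minimal usage sketch of the two module-level functions above. It is not part of the module itself; the import path (`mcp_server_webcrawl.crawlers.warc.adapter`) and the `./warcs` directory are assumptions, so adjust both to your installation and data layout.

```python
# Hypothetical usage sketch: list WARC files as sites, then search them.
# Assumes this adapter module is importable at the path below and that
# ./warcs contains one or more .warc / .warc.gz files.
from pathlib import Path

from mcp_server_webcrawl.crawlers.warc.adapter import get_sites, get_resources

datasrc = Path("./warcs")

# each WARC file in the directory is surfaced as one SiteResult
sites = get_sites(datasrc)
for site in sites:
    print(site.id, site.name)

# query the selected sites; returns the matching resources, the total
# match count, and the IndexState of the in-memory SQLite index
results, total, index_state = get_resources(
    datasrc,
    sites=[site.id for site in sites],
    query="contact",
    limit=10,
)
print(f"{total} matches, showing {len(results)}")
```

Since the index is built in memory per sites group, the first query against a given set of WARC files pays the indexing cost; whether later queries reuse that index depends on the manager's caching, which lies outside this excerpt.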