This is page 8 of 33. Use http://codebase.md/pragmar/mcp_server_webcrawl?page={x} to view the full context.
# Directory Structure
```
├── .gitignore
├── CONTRIBUTING.md
├── docs
│ ├── _images
│ │ ├── interactive.document.webp
│ │ ├── interactive.search.webp
│ │ └── mcpswc.svg
│ ├── _modules
│ │ ├── index.html
│ │ ├── mcp_server_webcrawl
│ │ │ ├── crawlers
│ │ │ │ ├── archivebox
│ │ │ │ │ ├── adapter.html
│ │ │ │ │ ├── crawler.html
│ │ │ │ │ └── tests.html
│ │ │ │ ├── base
│ │ │ │ │ ├── adapter.html
│ │ │ │ │ ├── api.html
│ │ │ │ │ ├── crawler.html
│ │ │ │ │ ├── indexed.html
│ │ │ │ │ └── tests.html
│ │ │ │ ├── httrack
│ │ │ │ │ ├── adapter.html
│ │ │ │ │ ├── crawler.html
│ │ │ │ │ └── tests.html
│ │ │ │ ├── interrobot
│ │ │ │ │ ├── adapter.html
│ │ │ │ │ ├── crawler.html
│ │ │ │ │ └── tests.html
│ │ │ │ ├── katana
│ │ │ │ │ ├── adapter.html
│ │ │ │ │ ├── crawler.html
│ │ │ │ │ └── tests.html
│ │ │ │ ├── siteone
│ │ │ │ │ ├── adapter.html
│ │ │ │ │ ├── crawler.html
│ │ │ │ │ └── tests.html
│ │ │ │ ├── warc
│ │ │ │ │ ├── adapter.html
│ │ │ │ │ ├── crawler.html
│ │ │ │ │ └── tests.html
│ │ │ │ └── wget
│ │ │ │ ├── adapter.html
│ │ │ │ ├── crawler.html
│ │ │ │ └── tests.html
│ │ │ ├── crawlers.html
│ │ │ ├── extras
│ │ │ │ ├── markdown.html
│ │ │ │ ├── regex.html
│ │ │ │ ├── snippets.html
│ │ │ │ ├── thumbnails.html
│ │ │ │ └── xpath.html
│ │ │ ├── interactive
│ │ │ │ ├── highlights.html
│ │ │ │ ├── search.html
│ │ │ │ ├── session.html
│ │ │ │ └── ui.html
│ │ │ ├── main.html
│ │ │ ├── models
│ │ │ │ ├── resources.html
│ │ │ │ └── sites.html
│ │ │ ├── templates
│ │ │ │ └── tests.html
│ │ │ ├── utils
│ │ │ │ ├── blobs.html
│ │ │ │ ├── cli.html
│ │ │ │ ├── logger.html
│ │ │ │ ├── querycache.html
│ │ │ │ ├── server.html
│ │ │ │ └── tools.html
│ │ │ └── utils.html
│ │ └── re.html
│ ├── _sources
│ │ ├── guides
│ │ │ ├── archivebox.rst.txt
│ │ │ ├── httrack.rst.txt
│ │ │ ├── interrobot.rst.txt
│ │ │ ├── katana.rst.txt
│ │ │ ├── siteone.rst.txt
│ │ │ ├── warc.rst.txt
│ │ │ └── wget.rst.txt
│ │ ├── guides.rst.txt
│ │ ├── index.rst.txt
│ │ ├── installation.rst.txt
│ │ ├── interactive.rst.txt
│ │ ├── mcp_server_webcrawl.crawlers.archivebox.rst.txt
│ │ ├── mcp_server_webcrawl.crawlers.base.rst.txt
│ │ ├── mcp_server_webcrawl.crawlers.httrack.rst.txt
│ │ ├── mcp_server_webcrawl.crawlers.interrobot.rst.txt
│ │ ├── mcp_server_webcrawl.crawlers.katana.rst.txt
│ │ ├── mcp_server_webcrawl.crawlers.rst.txt
│ │ ├── mcp_server_webcrawl.crawlers.siteone.rst.txt
│ │ ├── mcp_server_webcrawl.crawlers.warc.rst.txt
│ │ ├── mcp_server_webcrawl.crawlers.wget.rst.txt
│ │ ├── mcp_server_webcrawl.extras.rst.txt
│ │ ├── mcp_server_webcrawl.interactive.rst.txt
│ │ ├── mcp_server_webcrawl.models.rst.txt
│ │ ├── mcp_server_webcrawl.rst.txt
│ │ ├── mcp_server_webcrawl.templates.rst.txt
│ │ ├── mcp_server_webcrawl.utils.rst.txt
│ │ ├── modules.rst.txt
│ │ ├── prompts.rst.txt
│ │ └── usage.rst.txt
│ ├── _static
│ │ ├── _sphinx_javascript_frameworks_compat.js
│ │ ├── basic.css
│ │ ├── css
│ │ │ ├── badge_only.css
│ │ │ ├── fonts
│ │ │ │ ├── fontawesome-webfont.eot
│ │ │ │ ├── fontawesome-webfont.svg
│ │ │ │ ├── fontawesome-webfont.ttf
│ │ │ │ ├── fontawesome-webfont.woff
│ │ │ │ ├── fontawesome-webfont.woff2
│ │ │ │ ├── lato-bold-italic.woff
│ │ │ │ ├── lato-bold-italic.woff2
│ │ │ │ ├── lato-bold.woff
│ │ │ │ ├── lato-bold.woff2
│ │ │ │ ├── lato-normal-italic.woff
│ │ │ │ ├── lato-normal-italic.woff2
│ │ │ │ ├── lato-normal.woff
│ │ │ │ ├── lato-normal.woff2
│ │ │ │ ├── Roboto-Slab-Bold.woff
│ │ │ │ ├── Roboto-Slab-Bold.woff2
│ │ │ │ ├── Roboto-Slab-Regular.woff
│ │ │ │ └── Roboto-Slab-Regular.woff2
│ │ │ └── theme.css
│ │ ├── doctools.js
│ │ ├── documentation_options.js
│ │ ├── file.png
│ │ ├── fonts
│ │ │ ├── Lato
│ │ │ │ ├── lato-bold.eot
│ │ │ │ ├── lato-bold.ttf
│ │ │ │ ├── lato-bold.woff
│ │ │ │ ├── lato-bold.woff2
│ │ │ │ ├── lato-bolditalic.eot
│ │ │ │ ├── lato-bolditalic.ttf
│ │ │ │ ├── lato-bolditalic.woff
│ │ │ │ ├── lato-bolditalic.woff2
│ │ │ │ ├── lato-italic.eot
│ │ │ │ ├── lato-italic.ttf
│ │ │ │ ├── lato-italic.woff
│ │ │ │ ├── lato-italic.woff2
│ │ │ │ ├── lato-regular.eot
│ │ │ │ ├── lato-regular.ttf
│ │ │ │ ├── lato-regular.woff
│ │ │ │ └── lato-regular.woff2
│ │ │ └── RobotoSlab
│ │ │ ├── roboto-slab-v7-bold.eot
│ │ │ ├── roboto-slab-v7-bold.ttf
│ │ │ ├── roboto-slab-v7-bold.woff
│ │ │ ├── roboto-slab-v7-bold.woff2
│ │ │ ├── roboto-slab-v7-regular.eot
│ │ │ ├── roboto-slab-v7-regular.ttf
│ │ │ ├── roboto-slab-v7-regular.woff
│ │ │ └── roboto-slab-v7-regular.woff2
│ │ ├── images
│ │ │ ├── interactive.document.png
│ │ │ ├── interactive.document.webp
│ │ │ ├── interactive.search.png
│ │ │ ├── interactive.search.webp
│ │ │ └── mcpswc.svg
│ │ ├── jquery.js
│ │ ├── js
│ │ │ ├── badge_only.js
│ │ │ ├── theme.js
│ │ │ └── versions.js
│ │ ├── language_data.js
│ │ ├── minus.png
│ │ ├── plus.png
│ │ ├── pygments.css
│ │ ├── searchtools.js
│ │ └── sphinx_highlight.js
│ ├── .buildinfo
│ ├── .nojekyll
│ ├── genindex.html
│ ├── guides
│ │ ├── archivebox.html
│ │ ├── httrack.html
│ │ ├── interrobot.html
│ │ ├── katana.html
│ │ ├── siteone.html
│ │ ├── warc.html
│ │ └── wget.html
│ ├── guides.html
│ ├── index.html
│ ├── installation.html
│ ├── interactive.html
│ ├── mcp_server_webcrawl.crawlers.archivebox.html
│ ├── mcp_server_webcrawl.crawlers.base.html
│ ├── mcp_server_webcrawl.crawlers.html
│ ├── mcp_server_webcrawl.crawlers.httrack.html
│ ├── mcp_server_webcrawl.crawlers.interrobot.html
│ ├── mcp_server_webcrawl.crawlers.katana.html
│ ├── mcp_server_webcrawl.crawlers.siteone.html
│ ├── mcp_server_webcrawl.crawlers.warc.html
│ ├── mcp_server_webcrawl.crawlers.wget.html
│ ├── mcp_server_webcrawl.extras.html
│ ├── mcp_server_webcrawl.html
│ ├── mcp_server_webcrawl.interactive.html
│ ├── mcp_server_webcrawl.models.html
│ ├── mcp_server_webcrawl.templates.html
│ ├── mcp_server_webcrawl.utils.html
│ ├── modules.html
│ ├── objects.inv
│ ├── prompts.html
│ ├── py-modindex.html
│ ├── search.html
│ ├── searchindex.js
│ └── usage.html
├── LICENSE
├── MANIFEST.in
├── prompts
│ ├── audit404.md
│ ├── auditfiles.md
│ ├── auditperf.md
│ ├── auditseo.md
│ ├── gopher.md
│ ├── README.md
│ └── testsearch.md
├── pyproject.toml
├── README.md
├── setup.py
├── sphinx
│ ├── _static
│ │ └── images
│ │ ├── interactive.document.png
│ │ ├── interactive.document.webp
│ │ ├── interactive.search.png
│ │ ├── interactive.search.webp
│ │ └── mcpswc.svg
│ ├── _templates
│ │ └── layout.html
│ ├── conf.py
│ ├── guides
│ │ ├── archivebox.rst
│ │ ├── httrack.rst
│ │ ├── interrobot.rst
│ │ ├── katana.rst
│ │ ├── siteone.rst
│ │ ├── warc.rst
│ │ └── wget.rst
│ ├── guides.rst
│ ├── index.rst
│ ├── installation.rst
│ ├── interactive.rst
│ ├── make.bat
│ ├── Makefile
│ ├── mcp_server_webcrawl.crawlers.archivebox.rst
│ ├── mcp_server_webcrawl.crawlers.base.rst
│ ├── mcp_server_webcrawl.crawlers.httrack.rst
│ ├── mcp_server_webcrawl.crawlers.interrobot.rst
│ ├── mcp_server_webcrawl.crawlers.katana.rst
│ ├── mcp_server_webcrawl.crawlers.rst
│ ├── mcp_server_webcrawl.crawlers.siteone.rst
│ ├── mcp_server_webcrawl.crawlers.warc.rst
│ ├── mcp_server_webcrawl.crawlers.wget.rst
│ ├── mcp_server_webcrawl.extras.rst
│ ├── mcp_server_webcrawl.interactive.rst
│ ├── mcp_server_webcrawl.models.rst
│ ├── mcp_server_webcrawl.rst
│ ├── mcp_server_webcrawl.templates.rst
│ ├── mcp_server_webcrawl.utils.rst
│ ├── modules.rst
│ ├── prompts.rst
│ ├── readme.txt
│ └── usage.rst
└── src
└── mcp_server_webcrawl
├── __init__.py
├── crawlers
│ ├── __init__.py
│ ├── archivebox
│ │ ├── __init__.py
│ │ ├── adapter.py
│ │ ├── crawler.py
│ │ └── tests.py
│ ├── base
│ │ ├── __init__.py
│ │ ├── adapter.py
│ │ ├── api.py
│ │ ├── crawler.py
│ │ ├── indexed.py
│ │ └── tests.py
│ ├── httrack
│ │ ├── __init__.py
│ │ ├── adapter.py
│ │ ├── crawler.py
│ │ └── tests.py
│ ├── interrobot
│ │ ├── __init__.py
│ │ ├── adapter.py
│ │ ├── crawler.py
│ │ └── tests.py
│ ├── katana
│ │ ├── __init__.py
│ │ ├── adapter.py
│ │ ├── crawler.py
│ │ └── tests.py
│ ├── siteone
│ │ ├── __init__.py
│ │ ├── adapter.py
│ │ ├── crawler.py
│ │ └── tests.py
│ ├── warc
│ │ ├── __init__.py
│ │ ├── adapter.py
│ │ ├── crawler.py
│ │ └── tests.py
│ └── wget
│ ├── __init__.py
│ ├── adapter.py
│ ├── crawler.py
│ └── tests.py
├── extras
│ ├── __init__.py
│ ├── markdown.py
│ ├── regex.py
│ ├── snippets.py
│ ├── thumbnails.py
│ └── xpath.py
├── interactive
│ ├── __init__.py
│ ├── highlights.py
│ ├── search.py
│ ├── session.py
│ ├── ui.py
│ └── views
│ ├── base.py
│ ├── document.py
│ ├── help.py
│ ├── requirements.py
│ ├── results.py
│ └── searchform.py
├── main.py
├── models
│ ├── __init__.py
│ ├── base.py
│ ├── resources.py
│ └── sites.py
├── settings.py
├── templates
│ ├── __init__.py
│ ├── markdown.xslt
│ ├── tests_core.html
│ └── tests.py
└── utils
├── __init__.py
├── cli.py
├── logger.py
├── parser.py
├── parsetab.py
├── search.py
├── server.py
├── tests.py
└── tools.py
```
# Files
--------------------------------------------------------------------------------
/docs/mcp_server_webcrawl.crawlers.warc.html:
--------------------------------------------------------------------------------
```html
<!DOCTYPE html>
<html class="writer-html5" lang="en" data-content_root="./">
<head>
<meta charset="utf-8" /><meta name="viewport" content="width=device-width, initial-scale=1" />
<title>mcp_server_webcrawl.crawlers.warc package — mcp-server-webcrawl documentation</title>
<link rel="stylesheet" type="text/css" href="_static/pygments.css?v=80d5e7a1" />
<link rel="stylesheet" type="text/css" href="_static/css/theme.css?v=e59714d7" />
<script src="_static/jquery.js?v=5d32c60e"></script>
<script src="_static/_sphinx_javascript_frameworks_compat.js?v=2cd50e6c"></script>
<script src="_static/documentation_options.js?v=5929fcd5"></script>
<script src="_static/doctools.js?v=888ff710"></script>
<script src="_static/sphinx_highlight.js?v=dc90522c"></script>
<script src="_static/js/theme.js"></script>
<link rel="index" title="Index" href="genindex.html" />
<link rel="search" title="Search" href="search.html" />
<link rel="next" title="mcp_server_webcrawl.crawlers.wget package" href="mcp_server_webcrawl.crawlers.wget.html" />
<link rel="prev" title="mcp_server_webcrawl.crawlers.siteone package" href="mcp_server_webcrawl.crawlers.siteone.html" />
</head>
<body class="wy-body-for-nav">
<div class="wy-grid-for-nav">
<nav data-toggle="wy-nav-shift" class="wy-nav-side">
<div class="wy-side-scroll">
<div class="wy-side-nav-search" >
<a href="index.html" class="icon icon-home">
mcp-server-webcrawl
</a>
<div role="search">
<form id="rtd-search-form" class="wy-form" action="search.html" method="get">
<input type="text" name="q" placeholder="Search docs" aria-label="Search docs" />
<input type="hidden" name="check_keywords" value="yes" />
<input type="hidden" name="area" value="default" />
</form>
</div>
</div><div class="wy-menu wy-menu-vertical" data-spy="affix" role="navigation" aria-label="Navigation menu">
<p class="caption" role="heading"><span class="caption-text">Contents:</span></p>
<ul class="current">
<li class="toctree-l1"><a class="reference internal" href="installation.html">Installation</a></li>
<li class="toctree-l1"><a class="reference internal" href="guides.html">Setup Guides</a></li>
<li class="toctree-l1"><a class="reference internal" href="usage.html">Usage</a></li>
<li class="toctree-l1"><a class="reference internal" href="prompts.html">Prompt Routines</a></li>
<li class="toctree-l1"><a class="reference internal" href="interactive.html">Interactive Mode</a></li>
<li class="toctree-l1 current"><a class="reference internal" href="modules.html">mcp_server_webcrawl</a><ul class="current">
<li class="toctree-l2 current"><a class="reference internal" href="mcp_server_webcrawl.html">mcp_server_webcrawl package</a></li>
</ul>
</li>
</ul>
</div>
</div>
</nav>
<section data-toggle="wy-nav-shift" class="wy-nav-content-wrap"><nav class="wy-nav-top" aria-label="Mobile navigation menu" >
<i data-toggle="wy-nav-top" class="fa fa-bars"></i>
<a href="index.html">mcp-server-webcrawl</a>
</nav>
<div class="wy-nav-content">
<div class="rst-content">
<div role="navigation" aria-label="Page navigation">
<ul class="wy-breadcrumbs">
<li><a href="index.html" class="icon icon-home" aria-label="Home"></a></li>
<li class="breadcrumb-item"><a href="modules.html">mcp_server_webcrawl</a></li>
<li class="breadcrumb-item"><a href="mcp_server_webcrawl.html">mcp_server_webcrawl package</a></li>
<li class="breadcrumb-item"><a href="mcp_server_webcrawl.crawlers.html">mcp_server_webcrawl.crawlers package</a></li>
<li class="breadcrumb-item active">mcp_server_webcrawl.crawlers.warc package</li>
<li class="wy-breadcrumbs-aside">
<a href="_sources/mcp_server_webcrawl.crawlers.warc.rst.txt" rel="nofollow"> View page source</a>
</li>
</ul>
<hr/>
</div>
<div role="main" class="document" itemscope="itemscope" itemtype="http://schema.org/Article">
<div itemprop="articleBody">
<section id="mcp-server-webcrawl-crawlers-warc-package">
<h1>mcp_server_webcrawl.crawlers.warc package<a class="headerlink" href="#mcp-server-webcrawl-crawlers-warc-package" title="Link to this heading"></a></h1>
<section id="submodules">
<h2>Submodules<a class="headerlink" href="#submodules" title="Link to this heading"></a></h2>
</section>
<section id="module-mcp_server_webcrawl.crawlers.warc.adapter">
<span id="mcp-server-webcrawl-crawlers-warc-adapter-module"></span><h2>mcp_server_webcrawl.crawlers.warc.adapter module<a class="headerlink" href="#module-mcp_server_webcrawl.crawlers.warc.adapter" title="Link to this heading"></a></h2>
<dl class="py class">
<dt class="sig sig-object py" id="mcp_server_webcrawl.crawlers.warc.adapter.WarcManager">
<em class="property"><span class="pre">class</span><span class="w"> </span></em><span class="sig-name descname"><span class="pre">WarcManager</span></span><a class="reference internal" href="_modules/mcp_server_webcrawl/crawlers/warc/adapter.html#WarcManager"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#mcp_server_webcrawl.crawlers.warc.adapter.WarcManager" title="Link to this definition"></a></dt>
<dd><p>Bases: <a class="reference internal" href="mcp_server_webcrawl.crawlers.base.html#mcp_server_webcrawl.crawlers.base.indexed.IndexedManager" title="mcp_server_webcrawl.crawlers.base.indexed.IndexedManager"><code class="xref py py-class docutils literal notranslate"><span class="pre">IndexedManager</span></code></a></p>
<p>Manages WARC file data in in-memory SQLite databases.
Provides connection pooling and caching for efficient access.</p>
<p>Initialize the WARC manager with empty cache and statistics.</p>
<dl class="py method">
<dt class="sig sig-object py" id="mcp_server_webcrawl.crawlers.warc.adapter.WarcManager.__init__">
<span class="sig-name descname"><span class="pre">__init__</span></span><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="reference internal" href="_modules/mcp_server_webcrawl/crawlers/warc/adapter.html#WarcManager.__init__"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#mcp_server_webcrawl.crawlers.warc.adapter.WarcManager.__init__" title="Link to this definition"></a></dt>
<dd><p>Initialize the WARC manager with empty cache and statistics.</p>
<dl class="field-list simple">
<dt class="field-odd">Return type<span class="colon">:</span></dt>
<dd class="field-odd"><p>None</p>
</dd>
</dl>
</dd></dl>
</dd></dl>
<dl class="py function">
<dt class="sig sig-object py" id="mcp_server_webcrawl.crawlers.warc.adapter.get_sites">
<span class="sig-name descname"><span class="pre">get_sites</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">datasrc</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">ids</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">fields</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">None</span></span></em><span class="sig-paren">)</span><a class="reference internal" href="_modules/mcp_server_webcrawl/crawlers/warc/adapter.html#get_sites"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#mcp_server_webcrawl.crawlers.warc.adapter.get_sites" title="Link to this definition"></a></dt>
<dd><p>List WARC files in the datasrc directory as sites.</p>
<dl class="field-list simple">
<dt class="field-odd">Parameters<span class="colon">:</span></dt>
<dd class="field-odd"><ul class="simple">
<li><p><strong>datasrc</strong> (<a class="reference external" href="https://docs.python.org/3/library/pathlib.html#pathlib.Path" title="(in Python v3.14)"><em>Path</em></a>) – path to the directory containing WARC files</p></li>
<li><p><strong>ids</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#list" title="(in Python v3.14)"><em>list</em></a><em>[</em><a class="reference external" href="https://docs.python.org/3/library/functions.html#int" title="(in Python v3.14)"><em>int</em></a><em>] </em><em>| </em><em>None</em>) – optional list of site IDs to filter by</p></li>
<li><p><strong>fields</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#list" title="(in Python v3.14)"><em>list</em></a><em>[</em><a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.14)"><em>str</em></a><em>] </em><em>| </em><em>None</em>) – list of fields to include in the response</p></li>
</ul>
</dd>
<dt class="field-even">Returns<span class="colon">:</span></dt>
<dd class="field-even"><p>List of SiteResult objects, one for each WARC file</p>
</dd>
<dt class="field-odd">Return type<span class="colon">:</span></dt>
<dd class="field-odd"><p><a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#list" title="(in Python v3.14)">list</a>[<a class="reference internal" href="mcp_server_webcrawl.models.html#mcp_server_webcrawl.models.sites.SiteResult" title="mcp_server_webcrawl.models.sites.SiteResult"><em>SiteResult</em></a>]</p>
</dd>
</dl>
</dd></dl>
<dl class="py function">
<dt class="sig sig-object py" id="mcp_server_webcrawl.crawlers.warc.adapter.get_resources">
<span class="sig-name descname"><span class="pre">get_resources</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">datasrc</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">sites</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">query</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">''</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">fields</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">sort</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">limit</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">20</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">offset</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">0</span></span></em><span class="sig-paren">)</span><a class="reference internal" href="_modules/mcp_server_webcrawl/crawlers/warc/adapter.html#get_resources"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#mcp_server_webcrawl.crawlers.warc.adapter.get_resources" title="Link to this definition"></a></dt>
<dd><p>Get resources from WARC files using in-memory SQLite.</p>
<dl class="field-list simple">
<dt class="field-odd">Parameters<span class="colon">:</span></dt>
<dd class="field-odd"><ul class="simple">
<li><p><strong>datasrc</strong> (<a class="reference external" href="https://docs.python.org/3/library/pathlib.html#pathlib.Path" title="(in Python v3.14)"><em>Path</em></a>) – path to the directory containing WARC files</p></li>
<li><p><strong>sites</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#list" title="(in Python v3.14)"><em>list</em></a><em>[</em><a class="reference external" href="https://docs.python.org/3/library/functions.html#int" title="(in Python v3.14)"><em>int</em></a><em>] </em><em>| </em><em>None</em>) – optional list of site IDs to filter by</p></li>
<li><p><strong>query</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.14)"><em>str</em></a>) – search query string</p></li>
<li><p><strong>fields</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#list" title="(in Python v3.14)"><em>list</em></a><em>[</em><a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.14)"><em>str</em></a><em>] </em><em>| </em><em>None</em>) – optional list of fields to include in response</p></li>
<li><p><strong>sort</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.14)"><em>str</em></a><em> | </em><em>None</em>) – sort order for results</p></li>
<li><p><strong>limit</strong> (<a class="reference external" href="https://docs.python.org/3/library/functions.html#int" title="(in Python v3.14)"><em>int</em></a>) – maximum number of results to return</p></li>
<li><p><strong>offset</strong> (<a class="reference external" href="https://docs.python.org/3/library/functions.html#int" title="(in Python v3.14)"><em>int</em></a>) – number of results to skip for pagination</p></li>
</ul>
</dd>
<dt class="field-even">Returns<span class="colon">:</span></dt>
<dd class="field-even"><p>Tuple of (list of ResourceResult objects, total count)</p>
</dd>
<dt class="field-odd">Return type<span class="colon">:</span></dt>
<dd class="field-odd"><p><a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#tuple" title="(in Python v3.14)">tuple</a>[<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#list" title="(in Python v3.14)">list</a>[<a class="reference internal" href="mcp_server_webcrawl.models.html#mcp_server_webcrawl.models.resources.ResourceResult" title="mcp_server_webcrawl.models.resources.ResourceResult"><em>ResourceResult</em></a>], <a class="reference external" href="https://docs.python.org/3/library/functions.html#int" title="(in Python v3.14)">int</a>, <a class="reference internal" href="mcp_server_webcrawl.crawlers.base.html#mcp_server_webcrawl.crawlers.base.adapter.IndexState" title="mcp_server_webcrawl.crawlers.base.adapter.IndexState"><em>IndexState</em></a>]</p>
</dd>
</dl>
</dd></dl>
</section>
<section id="module-mcp_server_webcrawl.crawlers.warc.crawler">
<span id="mcp-server-webcrawl-crawlers-warc-crawler-module"></span><h2>mcp_server_webcrawl.crawlers.warc.crawler module<a class="headerlink" href="#module-mcp_server_webcrawl.crawlers.warc.crawler" title="Link to this heading"></a></h2>
<dl class="py class">
<dt class="sig sig-object py" id="mcp_server_webcrawl.crawlers.warc.crawler.WarcCrawler">
<em class="property"><span class="pre">class</span><span class="w"> </span></em><span class="sig-name descname"><span class="pre">WarcCrawler</span></span><a class="reference internal" href="_modules/mcp_server_webcrawl/crawlers/warc/crawler.html#WarcCrawler"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#mcp_server_webcrawl.crawlers.warc.crawler.WarcCrawler" title="Link to this definition"></a></dt>
<dd><p>Bases: <a class="reference internal" href="mcp_server_webcrawl.crawlers.base.html#mcp_server_webcrawl.crawlers.base.indexed.IndexedCrawler" title="mcp_server_webcrawl.crawlers.base.indexed.IndexedCrawler"><code class="xref py py-class docutils literal notranslate"><span class="pre">IndexedCrawler</span></code></a></p>
<p>A crawler implementation for WARC (Web ARChive) files.
Provides functionality for accessing and searching web archive content.</p>
<p>Initialize the WARC crawler with a data source directory.
Supported file types: .txt, .warc, and .warc.gz</p>
<dl class="field-list simple">
<dt class="field-odd">Parameters<span class="colon">:</span></dt>
<dd class="field-odd"><p><strong>datasrc</strong> – the input argument as Path, must be a directory containing WARC files</p>
</dd>
<dt class="field-even">Raises<span class="colon">:</span></dt>
<dd class="field-even"><p><a class="reference external" href="https://docs.python.org/3/library/exceptions.html#AssertionError" title="(in Python v3.14)"><strong>AssertionError</strong></a> – If datasrc is None or not a directory</p>
</dd>
</dl>
<dl class="py method">
<dt class="sig sig-object py" id="mcp_server_webcrawl.crawlers.warc.crawler.WarcCrawler.__init__">
<span class="sig-name descname"><span class="pre">__init__</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">datasrc</span></span></em><span class="sig-paren">)</span><a class="reference internal" href="_modules/mcp_server_webcrawl/crawlers/warc/crawler.html#WarcCrawler.__init__"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#mcp_server_webcrawl.crawlers.warc.crawler.WarcCrawler.__init__" title="Link to this definition"></a></dt>
<dd><p>Initialize the WARC crawler with a data source directory.
Supported file types: .txt, .warc, and .warc.gz</p>
<dl class="field-list simple">
<dt class="field-odd">Parameters<span class="colon">:</span></dt>
<dd class="field-odd"><p><strong>datasrc</strong> (<a class="reference external" href="https://docs.python.org/3/library/pathlib.html#pathlib.Path" title="(in Python v3.14)"><em>Path</em></a>) – the input argument as Path, must be a directory containing WARC files</p>
</dd>
<dt class="field-even">Raises<span class="colon">:</span></dt>
<dd class="field-even"><p><a class="reference external" href="https://docs.python.org/3/library/exceptions.html#AssertionError" title="(in Python v3.14)"><strong>AssertionError</strong></a> – If datasrc is None or not a directory</p>
</dd>
</dl>
</dd></dl>
</dd></dl>
</section>
<section id="module-mcp_server_webcrawl.crawlers.warc.tests">
<span id="mcp-server-webcrawl-crawlers-warc-tests-module"></span><h2>mcp_server_webcrawl.crawlers.warc.tests module<a class="headerlink" href="#module-mcp_server_webcrawl.crawlers.warc.tests" title="Link to this heading"></a></h2>
<dl class="py class">
<dt class="sig sig-object py" id="mcp_server_webcrawl.crawlers.warc.tests.WarcTests">
<em class="property"><span class="pre">class</span><span class="w"> </span></em><span class="sig-name descname"><span class="pre">WarcTests</span></span><a class="reference internal" href="_modules/mcp_server_webcrawl/crawlers/warc/tests.html#WarcTests"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#mcp_server_webcrawl.crawlers.warc.tests.WarcTests" title="Link to this definition"></a></dt>
<dd><p>Bases: <a class="reference internal" href="mcp_server_webcrawl.crawlers.base.html#mcp_server_webcrawl.crawlers.base.tests.BaseCrawlerTests" title="mcp_server_webcrawl.crawlers.base.tests.BaseCrawlerTests"><code class="xref py py-class docutils literal notranslate"><span class="pre">BaseCrawlerTests</span></code></a></p>
<p>Test suite for the WARC crawler implementation.
Uses all wrapped test methods from BaseCrawlerTests.</p>
<p>Create an instance of the class that will use the named test
method when executed. Raises a ValueError if the instance does
not have a method with the specified name.</p>
<dl class="py method">
<dt class="sig sig-object py" id="mcp_server_webcrawl.crawlers.warc.tests.WarcTests.setUp">
<span class="sig-name descname"><span class="pre">setUp</span></span><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="reference internal" href="_modules/mcp_server_webcrawl/crawlers/warc/tests.html#WarcTests.setUp"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#mcp_server_webcrawl.crawlers.warc.tests.WarcTests.setUp" title="Link to this definition"></a></dt>
<dd><p>Set up the test environment with fixture data.</p>
</dd></dl>
<dl class="py method">
<dt class="sig sig-object py" id="mcp_server_webcrawl.crawlers.warc.tests.WarcTests.test_warc_pulse">
<span class="sig-name descname"><span class="pre">test_warc_pulse</span></span><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="reference internal" href="_modules/mcp_server_webcrawl/crawlers/warc/tests.html#WarcTests.test_warc_pulse"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#mcp_server_webcrawl.crawlers.warc.tests.WarcTests.test_warc_pulse" title="Link to this definition"></a></dt>
<dd><p>Test basic crawler initialization.</p>
</dd></dl>
<dl class="py method">
<dt class="sig sig-object py" id="mcp_server_webcrawl.crawlers.warc.tests.WarcTests.test_warc_sites">
<span class="sig-name descname"><span class="pre">test_warc_sites</span></span><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="reference internal" href="_modules/mcp_server_webcrawl/crawlers/warc/tests.html#WarcTests.test_warc_sites"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#mcp_server_webcrawl.crawlers.warc.tests.WarcTests.test_warc_sites" title="Link to this definition"></a></dt>
<dd><p>Test site retrieval API functionality.</p>
</dd></dl>
<dl class="py method">
<dt class="sig sig-object py" id="mcp_server_webcrawl.crawlers.warc.tests.WarcTests.test_warc_search">
<span class="sig-name descname"><span class="pre">test_warc_search</span></span><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="reference internal" href="_modules/mcp_server_webcrawl/crawlers/warc/tests.html#WarcTests.test_warc_search"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#mcp_server_webcrawl.crawlers.warc.tests.WarcTests.test_warc_search" title="Link to this definition"></a></dt>
<dd><p>Test boolean search functionality</p>
</dd></dl>
<dl class="py method">
<dt class="sig sig-object py" id="mcp_server_webcrawl.crawlers.warc.tests.WarcTests.test_warc_resources">
<span class="sig-name descname"><span class="pre">test_warc_resources</span></span><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="reference internal" href="_modules/mcp_server_webcrawl/crawlers/warc/tests.html#WarcTests.test_warc_resources"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#mcp_server_webcrawl.crawlers.warc.tests.WarcTests.test_warc_resources" title="Link to this definition"></a></dt>
<dd><p>Test resource retrieval API functionality with various parameters.</p>
</dd></dl>
<dl class="py method">
<dt class="sig sig-object py" id="mcp_server_webcrawl.crawlers.warc.tests.WarcTests.test_warc_sorts">
<span class="sig-name descname"><span class="pre">test_warc_sorts</span></span><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="reference internal" href="_modules/mcp_server_webcrawl/crawlers/warc/tests.html#WarcTests.test_warc_sorts"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#mcp_server_webcrawl.crawlers.warc.tests.WarcTests.test_warc_sorts" title="Link to this definition"></a></dt>
<dd><p>Test random sort functionality using the ‘?’ sort parameter.</p>
</dd></dl>
<dl class="py method">
<dt class="sig sig-object py" id="mcp_server_webcrawl.crawlers.warc.tests.WarcTests.test_warc_content_parsing">
<span class="sig-name descname"><span class="pre">test_warc_content_parsing</span></span><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="reference internal" href="_modules/mcp_server_webcrawl/crawlers/warc/tests.html#WarcTests.test_warc_content_parsing"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#mcp_server_webcrawl.crawlers.warc.tests.WarcTests.test_warc_content_parsing" title="Link to this definition"></a></dt>
<dd><p>Test content type detection and parsing for WARC files.</p>
</dd></dl>
<dl class="py method">
<dt class="sig sig-object py" id="mcp_server_webcrawl.crawlers.warc.tests.WarcTests.test_report">
<span class="sig-name descname"><span class="pre">test_report</span></span><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="reference internal" href="_modules/mcp_server_webcrawl/crawlers/warc/tests.html#WarcTests.test_report"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#mcp_server_webcrawl.crawlers.warc.tests.WarcTests.test_report" title="Link to this definition"></a></dt>
<dd><p>Run test report, save to data directory.</p>
</dd></dl>
</dd></dl>
</section>
<section id="module-mcp_server_webcrawl.crawlers.warc">
<span id="module-contents"></span><h2>Module contents<a class="headerlink" href="#module-mcp_server_webcrawl.crawlers.warc" title="Link to this heading"></a></h2>
</section>
</section>
</div>
</div>
<footer><div class="rst-footer-buttons" role="navigation" aria-label="Footer">
<a href="mcp_server_webcrawl.crawlers.siteone.html" class="btn btn-neutral float-left" title="mcp_server_webcrawl.crawlers.siteone package" accesskey="p" rel="prev"><span class="fa fa-arrow-circle-left" aria-hidden="true"></span> Previous</a>
<a href="mcp_server_webcrawl.crawlers.wget.html" class="btn btn-neutral float-right" title="mcp_server_webcrawl.crawlers.wget package" accesskey="n" rel="next">Next <span class="fa fa-arrow-circle-right" aria-hidden="true"></span></a>
</div>
<hr/>
<div role="contentinfo">
<p>© Copyright 2025, pragmar.</p>
</div>
Built with <a href="https://www.sphinx-doc.org/">Sphinx</a> using a
<a href="https://github.com/readthedocs/sphinx_rtd_theme">theme</a>
provided by <a href="https://readthedocs.org">Read the Docs</a>.
</footer>
</div>
</div>
</section>
</div>
<script>
jQuery(function () {
SphinxRtdTheme.Navigation.enable(true);
});
</script>
</body>
</html>
```
--------------------------------------------------------------------------------
/docs/mcp_server_webcrawl.crawlers.wget.html:
--------------------------------------------------------------------------------
```html
<!DOCTYPE html>
<html class="writer-html5" lang="en" data-content_root="./">
<head>
<meta charset="utf-8" /><meta name="viewport" content="width=device-width, initial-scale=1" />
<meta name="viewport" content="width=device-width, initial-scale=1.0" />
<title>mcp_server_webcrawl.crawlers.wget package — mcp-server-webcrawl documentation</title>
<link rel="stylesheet" type="text/css" href="_static/pygments.css?v=80d5e7a1" />
<link rel="stylesheet" type="text/css" href="_static/css/theme.css?v=e59714d7" />
<script src="_static/jquery.js?v=5d32c60e"></script>
<script src="_static/_sphinx_javascript_frameworks_compat.js?v=2cd50e6c"></script>
<script src="_static/documentation_options.js?v=5929fcd5"></script>
<script src="_static/doctools.js?v=888ff710"></script>
<script src="_static/sphinx_highlight.js?v=dc90522c"></script>
<script src="_static/js/theme.js"></script>
<link rel="index" title="Index" href="genindex.html" />
<link rel="search" title="Search" href="search.html" />
<link rel="next" title="mcp_server_webcrawl.extras package" href="mcp_server_webcrawl.extras.html" />
<link rel="prev" title="mcp_server_webcrawl.crawlers.warc package" href="mcp_server_webcrawl.crawlers.warc.html" />
</head>
<body class="wy-body-for-nav">
<div class="wy-grid-for-nav">
<nav data-toggle="wy-nav-shift" class="wy-nav-side">
<div class="wy-side-scroll">
<div class="wy-side-nav-search" >
<a href="index.html" class="icon icon-home">
mcp-server-webcrawl
</a>
<div role="search">
<form id="rtd-search-form" class="wy-form" action="search.html" method="get">
<input type="text" name="q" placeholder="Search docs" aria-label="Search docs" />
<input type="hidden" name="check_keywords" value="yes" />
<input type="hidden" name="area" value="default" />
</form>
</div>
</div><div class="wy-menu wy-menu-vertical" data-spy="affix" role="navigation" aria-label="Navigation menu">
<p class="caption" role="heading"><span class="caption-text">Contents:</span></p>
<ul class="current">
<li class="toctree-l1"><a class="reference internal" href="installation.html">Installation</a></li>
<li class="toctree-l1"><a class="reference internal" href="guides.html">Setup Guides</a></li>
<li class="toctree-l1"><a class="reference internal" href="usage.html">Usage</a></li>
<li class="toctree-l1"><a class="reference internal" href="prompts.html">Prompt Routines</a></li>
<li class="toctree-l1"><a class="reference internal" href="interactive.html">Interactive Mode</a></li>
<li class="toctree-l1 current"><a class="reference internal" href="modules.html">mcp_server_webcrawl</a><ul class="current">
<li class="toctree-l2 current"><a class="reference internal" href="mcp_server_webcrawl.html">mcp_server_webcrawl package</a></li>
</ul>
</li>
</ul>
</div>
</div>
</nav>
<section data-toggle="wy-nav-shift" class="wy-nav-content-wrap"><nav class="wy-nav-top" aria-label="Mobile navigation menu" >
<i data-toggle="wy-nav-top" class="fa fa-bars"></i>
<a href="index.html">mcp-server-webcrawl</a>
</nav>
<div class="wy-nav-content">
<div class="rst-content">
<div role="navigation" aria-label="Page navigation">
<ul class="wy-breadcrumbs">
<li><a href="index.html" class="icon icon-home" aria-label="Home"></a></li>
<li class="breadcrumb-item"><a href="modules.html">mcp_server_webcrawl</a></li>
<li class="breadcrumb-item"><a href="mcp_server_webcrawl.html">mcp_server_webcrawl package</a></li>
<li class="breadcrumb-item"><a href="mcp_server_webcrawl.crawlers.html">mcp_server_webcrawl.crawlers package</a></li>
<li class="breadcrumb-item active">mcp_server_webcrawl.crawlers.wget package</li>
<li class="wy-breadcrumbs-aside">
<a href="_sources/mcp_server_webcrawl.crawlers.wget.rst.txt" rel="nofollow"> View page source</a>
</li>
</ul>
<hr/>
</div>
<div role="main" class="document" itemscope="itemscope" itemtype="http://schema.org/Article">
<div itemprop="articleBody">
<section id="mcp-server-webcrawl-crawlers-wget-package">
<h1>mcp_server_webcrawl.crawlers.wget package<a class="headerlink" href="#mcp-server-webcrawl-crawlers-wget-package" title="Link to this heading"></a></h1>
<section id="submodules">
<h2>Submodules<a class="headerlink" href="#submodules" title="Link to this heading"></a></h2>
</section>
<section id="module-mcp_server_webcrawl.crawlers.wget.adapter">
<span id="mcp-server-webcrawl-crawlers-wget-adapter-module"></span><h2>mcp_server_webcrawl.crawlers.wget.adapter module<a class="headerlink" href="#module-mcp_server_webcrawl.crawlers.wget.adapter" title="Link to this heading"></a></h2>
<dl class="py class">
<dt class="sig sig-object py" id="mcp_server_webcrawl.crawlers.wget.adapter.WgetManager">
<em class="property"><span class="pre">class</span><span class="w"> </span></em><span class="sig-name descname"><span class="pre">WgetManager</span></span><a class="reference internal" href="_modules/mcp_server_webcrawl/crawlers/wget/adapter.html#WgetManager"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#mcp_server_webcrawl.crawlers.wget.adapter.WgetManager" title="Link to this definition"></a></dt>
<dd><p>Bases: <a class="reference internal" href="mcp_server_webcrawl.crawlers.base.html#mcp_server_webcrawl.crawlers.base.indexed.IndexedManager" title="mcp_server_webcrawl.crawlers.base.indexed.IndexedManager"><code class="xref py py-class docutils literal notranslate"><span class="pre">IndexedManager</span></code></a></p>
<p>Manages wget directory data in in-memory SQLite databases.
Provides connection pooling and caching for efficient access.</p>
<p>Initialize the wget manager with empty cache and statistics.</p>
<dl class="py method">
<dt class="sig sig-object py" id="mcp_server_webcrawl.crawlers.wget.adapter.WgetManager.__init__">
<span class="sig-name descname"><span class="pre">__init__</span></span><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="reference internal" href="_modules/mcp_server_webcrawl/crawlers/wget/adapter.html#WgetManager.__init__"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#mcp_server_webcrawl.crawlers.wget.adapter.WgetManager.__init__" title="Link to this definition"></a></dt>
<dd><p>Initialize the wget manager with empty cache and statistics.</p>
<dl class="field-list simple">
<dt class="field-odd">Return type<span class="colon">:</span></dt>
<dd class="field-odd"><p>None</p>
</dd>
</dl>
</dd></dl>
</dd></dl>
<dl class="py function">
<dt class="sig sig-object py" id="mcp_server_webcrawl.crawlers.wget.adapter.get_sites">
<span class="sig-name descname"><span class="pre">get_sites</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">datasrc</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">ids</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">fields</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">None</span></span></em><span class="sig-paren">)</span><a class="reference internal" href="_modules/mcp_server_webcrawl/crawlers/wget/adapter.html#get_sites"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#mcp_server_webcrawl.crawlers.wget.adapter.get_sites" title="Link to this definition"></a></dt>
<dd><p>List site directories in the datasrc directory as sites.</p>
<dl class="field-list simple">
<dt class="field-odd">Parameters<span class="colon">:</span></dt>
<dd class="field-odd"><ul class="simple">
<li><p><strong>datasrc</strong> (<a class="reference external" href="https://docs.python.org/3/library/pathlib.html#pathlib.Path" title="(in Python v3.14)"><em>Path</em></a>) – path to the directory containing site subdirectories</p></li>
<li><p><strong>ids</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#list" title="(in Python v3.14)"><em>list</em></a><em>[</em><a class="reference external" href="https://docs.python.org/3/library/functions.html#int" title="(in Python v3.14)"><em>int</em></a><em>] </em><em>| </em><em>None</em>) – optional list of site IDs to filter by</p></li>
<li><p><strong>fields</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#list" title="(in Python v3.14)"><em>list</em></a><em>[</em><a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.14)"><em>str</em></a><em>] </em><em>| </em><em>None</em>) – optional list of fields to include in the response</p></li>
</ul>
</dd>
<dt class="field-even">Returns<span class="colon">:</span></dt>
<dd class="field-even"><p>List of SiteResult objects, one for each site directory</p>
</dd>
<dt class="field-odd">Return type<span class="colon">:</span></dt>
<dd class="field-odd"><p><a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#list" title="(in Python v3.14)">list</a>[<a class="reference internal" href="mcp_server_webcrawl.models.html#mcp_server_webcrawl.models.sites.SiteResult" title="mcp_server_webcrawl.models.sites.SiteResult"><em>SiteResult</em></a>]</p>
</dd>
</dl>
<p class="rubric">Notes</p>
<p>Returns an empty list if the datasrc directory doesn’t exist.</p>
</dd></dl>
<dl class="py function">
<dt class="sig sig-object py" id="mcp_server_webcrawl.crawlers.wget.adapter.get_resources">
<span class="sig-name descname"><span class="pre">get_resources</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">datasrc</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">sites</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">query</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">''</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">fields</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">sort</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">limit</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">20</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">offset</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">0</span></span></em><span class="sig-paren">)</span><a class="reference internal" href="_modules/mcp_server_webcrawl/crawlers/wget/adapter.html#get_resources"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#mcp_server_webcrawl.crawlers.wget.adapter.get_resources" title="Link to this definition"></a></dt>
<dd><p>Get resources from wget directories using in-memory SQLite.</p>
<dl class="field-list simple">
<dt class="field-odd">Parameters<span class="colon">:</span></dt>
<dd class="field-odd"><ul class="simple">
<li><p><strong>datasrc</strong> (<a class="reference external" href="https://docs.python.org/3/library/pathlib.html#pathlib.Path" title="(in Python v3.14)"><em>Path</em></a>) – path to the directory containing wget captures</p></li>
<li><p><strong>sites</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#list" title="(in Python v3.14)"><em>list</em></a><em>[</em><a class="reference external" href="https://docs.python.org/3/library/functions.html#int" title="(in Python v3.14)"><em>int</em></a><em>] </em><em>| </em><em>None</em>) – optional list of site IDs to filter by</p></li>
<li><p><strong>query</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.14)"><em>str</em></a>) – search query string</p></li>
<li><p><strong>fields</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#list" title="(in Python v3.14)"><em>list</em></a><em>[</em><a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.14)"><em>str</em></a><em>] </em><em>| </em><em>None</em>) – optional list of fields to include in response</p></li>
<li><p><strong>sort</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.14)"><em>str</em></a><em> | </em><em>None</em>) – sort order for results</p></li>
<li><p><strong>limit</strong> (<a class="reference external" href="https://docs.python.org/3/library/functions.html#int" title="(in Python v3.14)"><em>int</em></a>) – maximum number of results to return</p></li>
<li><p><strong>offset</strong> (<a class="reference external" href="https://docs.python.org/3/library/functions.html#int" title="(in Python v3.14)"><em>int</em></a>) – number of results to skip for pagination</p></li>
</ul>
</dd>
<dt class="field-even">Returns<span class="colon">:</span></dt>
<dd class="field-even"><p>Tuple of (list of ResourceResult objects, total count, index state)</p>
</dd>
<dt class="field-odd">Return type<span class="colon">:</span></dt>
<dd class="field-odd"><p><a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#tuple" title="(in Python v3.14)">tuple</a>[<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#list" title="(in Python v3.14)">list</a>[<a class="reference internal" href="mcp_server_webcrawl.models.html#mcp_server_webcrawl.models.resources.ResourceResult" title="mcp_server_webcrawl.models.resources.ResourceResult"><em>ResourceResult</em></a>], <a class="reference external" href="https://docs.python.org/3/library/functions.html#int" title="(in Python v3.14)">int</a>, <a class="reference internal" href="mcp_server_webcrawl.crawlers.base.html#mcp_server_webcrawl.crawlers.base.adapter.IndexState" title="mcp_server_webcrawl.crawlers.base.adapter.IndexState"><em>IndexState</em></a>]</p>
</dd>
</dl>
</dd></dl>
</section>
<section id="module-mcp_server_webcrawl.crawlers.wget.crawler">
<span id="mcp-server-webcrawl-crawlers-wget-crawler-module"></span><h2>mcp_server_webcrawl.crawlers.wget.crawler module<a class="headerlink" href="#module-mcp_server_webcrawl.crawlers.wget.crawler" title="Link to this heading"></a></h2>
<dl class="py class">
<dt class="sig sig-object py" id="mcp_server_webcrawl.crawlers.wget.crawler.WgetCrawler">
<em class="property"><span class="pre">class</span><span class="w"> </span></em><span class="sig-name descname"><span class="pre">WgetCrawler</span></span><a class="reference internal" href="_modules/mcp_server_webcrawl/crawlers/wget/crawler.html#WgetCrawler"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#mcp_server_webcrawl.crawlers.wget.crawler.WgetCrawler" title="Link to this definition"></a></dt>
<dd><p>Bases: <a class="reference internal" href="mcp_server_webcrawl.crawlers.base.html#mcp_server_webcrawl.crawlers.base.indexed.IndexedCrawler" title="mcp_server_webcrawl.crawlers.base.indexed.IndexedCrawler"><code class="xref py py-class docutils literal notranslate"><span class="pre">IndexedCrawler</span></code></a></p>
<p>A crawler implementation for wget captured sites.
Provides functionality for accessing and searching web content from wget captures.</p>
<p>Initialize the wget crawler with a data source directory.</p>
<dl class="field-list simple">
<dt class="field-odd">Parameters<span class="colon">:</span></dt>
<dd class="field-odd"><p><strong>datasrc</strong> – the input argument as Path; it must be a directory containing
wget captures organized as subdirectories</p>
</dd>
<dt class="field-even">Raises<span class="colon">:</span></dt>
<dd class="field-even"><p><a class="reference external" href="https://docs.python.org/3/library/exceptions.html#AssertionError" title="(in Python v3.14)"><strong>AssertionError</strong></a> – If datasrc is None or not a directory</p>
</dd>
</dl>
<dl class="py method">
<dt class="sig sig-object py" id="mcp_server_webcrawl.crawlers.wget.crawler.WgetCrawler.__init__">
<span class="sig-name descname"><span class="pre">__init__</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">datasrc</span></span></em><span class="sig-paren">)</span><a class="reference internal" href="_modules/mcp_server_webcrawl/crawlers/wget/crawler.html#WgetCrawler.__init__"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#mcp_server_webcrawl.crawlers.wget.crawler.WgetCrawler.__init__" title="Link to this definition"></a></dt>
<dd><p>Initialize the wget crawler with a data source directory.</p>
<dl class="field-list simple">
<dt class="field-odd">Parameters<span class="colon">:</span></dt>
<dd class="field-odd"><p><strong>datasrc</strong> (<a class="reference external" href="https://docs.python.org/3/library/pathlib.html#pathlib.Path" title="(in Python v3.14)"><em>Path</em></a>) – the input argument as Path; it must be a directory containing
wget captures organized as subdirectories</p>
</dd>
<dt class="field-even">Raises<span class="colon">:</span></dt>
<dd class="field-even"><p><a class="reference external" href="https://docs.python.org/3/library/exceptions.html#AssertionError" title="(in Python v3.14)"><strong>AssertionError</strong></a> – If datasrc is None or not a directory</p>
</dd>
</dl>
</dd></dl>
</dd></dl>
</section>
<section id="module-mcp_server_webcrawl.crawlers.wget.tests">
<span id="mcp-server-webcrawl-crawlers-wget-tests-module"></span><h2>mcp_server_webcrawl.crawlers.wget.tests module<a class="headerlink" href="#module-mcp_server_webcrawl.crawlers.wget.tests" title="Link to this heading"></a></h2>
<dl class="py class">
<dt class="sig sig-object py" id="mcp_server_webcrawl.crawlers.wget.tests.WgetTests">
<em class="property"><span class="pre">class</span><span class="w"> </span></em><span class="sig-name descname"><span class="pre">WgetTests</span></span><a class="reference internal" href="_modules/mcp_server_webcrawl/crawlers/wget/tests.html#WgetTests"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#mcp_server_webcrawl.crawlers.wget.tests.WgetTests" title="Link to this definition"></a></dt>
<dd><p>Bases: <a class="reference internal" href="mcp_server_webcrawl.crawlers.base.html#mcp_server_webcrawl.crawlers.base.tests.BaseCrawlerTests" title="mcp_server_webcrawl.crawlers.base.tests.BaseCrawlerTests"><code class="xref py py-class docutils literal notranslate"><span class="pre">BaseCrawlerTests</span></code></a></p>
<p>Test suite for the wget crawler implementation.
Uses all wrapped test methods from BaseCrawlerTests.</p>
<p>Create an instance of the class that will use the named test
method when executed. Raises a ValueError if the instance does
not have a method with the specified name.</p>
<dl class="py method">
<dt class="sig sig-object py" id="mcp_server_webcrawl.crawlers.wget.tests.WgetTests.setUp">
<span class="sig-name descname"><span class="pre">setUp</span></span><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="reference internal" href="_modules/mcp_server_webcrawl/crawlers/wget/tests.html#WgetTests.setUp"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#mcp_server_webcrawl.crawlers.wget.tests.WgetTests.setUp" title="Link to this definition"></a></dt>
<dd><p>Set up the test environment with fixture data.</p>
</dd></dl>
<dl class="py method">
<dt class="sig sig-object py" id="mcp_server_webcrawl.crawlers.wget.tests.WgetTests.test_wget_pulse">
<span class="sig-name descname"><span class="pre">test_wget_pulse</span></span><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="reference internal" href="_modules/mcp_server_webcrawl/crawlers/wget/tests.html#WgetTests.test_wget_pulse"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#mcp_server_webcrawl.crawlers.wget.tests.WgetTests.test_wget_pulse" title="Link to this definition"></a></dt>
<dd><p>Test basic crawler initialization.</p>
</dd></dl>
<dl class="py method">
<dt class="sig sig-object py" id="mcp_server_webcrawl.crawlers.wget.tests.WgetTests.test_wget_sites">
<span class="sig-name descname"><span class="pre">test_wget_sites</span></span><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="reference internal" href="_modules/mcp_server_webcrawl/crawlers/wget/tests.html#WgetTests.test_wget_sites"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#mcp_server_webcrawl.crawlers.wget.tests.WgetTests.test_wget_sites" title="Link to this definition"></a></dt>
<dd><p>Test site retrieval API functionality.</p>
</dd></dl>
<dl class="py method">
<dt class="sig sig-object py" id="mcp_server_webcrawl.crawlers.wget.tests.WgetTests.test_wget_search">
<span class="sig-name descname"><span class="pre">test_wget_search</span></span><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="reference internal" href="_modules/mcp_server_webcrawl/crawlers/wget/tests.html#WgetTests.test_wget_search"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#mcp_server_webcrawl.crawlers.wget.tests.WgetTests.test_wget_search" title="Link to this definition"></a></dt>
<dd><p>Test boolean search functionality.</p>
</dd></dl>
<dl class="py method">
<dt class="sig sig-object py" id="mcp_server_webcrawl.crawlers.wget.tests.WgetTests.test_wget_resources">
<span class="sig-name descname"><span class="pre">test_wget_resources</span></span><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="reference internal" href="_modules/mcp_server_webcrawl/crawlers/wget/tests.html#WgetTests.test_wget_resources"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#mcp_server_webcrawl.crawlers.wget.tests.WgetTests.test_wget_resources" title="Link to this definition"></a></dt>
<dd><p>Test resource retrieval API functionality with various parameters.</p>
</dd></dl>
<dl class="py method">
<dt class="sig sig-object py" id="mcp_server_webcrawl.crawlers.wget.tests.WgetTests.test_wget_sorts">
<span class="sig-name descname"><span class="pre">test_wget_sorts</span></span><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="reference internal" href="_modules/mcp_server_webcrawl/crawlers/wget/tests.html#WgetTests.test_wget_sorts"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#mcp_server_webcrawl.crawlers.wget.tests.WgetTests.test_wget_sorts" title="Link to this definition"></a></dt>
<dd><p>Test random sort functionality using the ‘?’ sort parameter.</p>
</dd></dl>
<dl class="py method">
<dt class="sig sig-object py" id="mcp_server_webcrawl.crawlers.wget.tests.WgetTests.test_wget_content_parsing">
<span class="sig-name descname"><span class="pre">test_wget_content_parsing</span></span><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="reference internal" href="_modules/mcp_server_webcrawl/crawlers/wget/tests.html#WgetTests.test_wget_content_parsing"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#mcp_server_webcrawl.crawlers.wget.tests.WgetTests.test_wget_content_parsing" title="Link to this definition"></a></dt>
<dd><p>Test content type detection and parsing.</p>
</dd></dl>
<dl class="py method">
<dt class="sig sig-object py" id="mcp_server_webcrawl.crawlers.wget.tests.WgetTests.test_report">
<span class="sig-name descname"><span class="pre">test_report</span></span><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="reference internal" href="_modules/mcp_server_webcrawl/crawlers/wget/tests.html#WgetTests.test_report"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#mcp_server_webcrawl.crawlers.wget.tests.WgetTests.test_report" title="Link to this definition"></a></dt>
<dd><p>Run test report, save to data directory.</p>
</dd></dl>
</dd></dl>
</section>
<section id="module-mcp_server_webcrawl.crawlers.wget">
<span id="module-contents"></span><h2>Module contents<a class="headerlink" href="#module-mcp_server_webcrawl.crawlers.wget" title="Link to this heading"></a></h2>
</section>
</section>
</div>
</div>
<footer><div class="rst-footer-buttons" role="navigation" aria-label="Footer">
<a href="mcp_server_webcrawl.crawlers.warc.html" class="btn btn-neutral float-left" title="mcp_server_webcrawl.crawlers.warc package" accesskey="p" rel="prev"><span class="fa fa-arrow-circle-left" aria-hidden="true"></span> Previous</a>
<a href="mcp_server_webcrawl.extras.html" class="btn btn-neutral float-right" title="mcp_server_webcrawl.extras package" accesskey="n" rel="next">Next <span class="fa fa-arrow-circle-right" aria-hidden="true"></span></a>
</div>
<hr/>
<div role="contentinfo">
<p>© Copyright 2025, pragmar.</p>
</div>
Built with <a href="https://www.sphinx-doc.org/">Sphinx</a> using a
<a href="https://github.com/readthedocs/sphinx_rtd_theme">theme</a>
provided by <a href="https://readthedocs.org">Read the Docs</a>.
</footer>
</div>
</div>
</section>
</div>
<script>
jQuery(function () {
SphinxRtdTheme.Navigation.enable(true);
});
</script>
</body>
</html>
```
--------------------------------------------------------------------------------
/docs/_modules/mcp_server_webcrawl/crawlers/base/api.html:
--------------------------------------------------------------------------------
```html
<!DOCTYPE html>
<html class="writer-html5" lang="en" data-content_root="../../../../">
<head>
<meta charset="utf-8" />
<meta name="viewport" content="width=device-width, initial-scale=1.0" />
<title>mcp_server_webcrawl.crawlers.base.api — mcp-server-webcrawl documentation</title>
<link rel="stylesheet" type="text/css" href="../../../../_static/pygments.css?v=80d5e7a1" />
<link rel="stylesheet" type="text/css" href="../../../../_static/css/theme.css?v=e59714d7" />
<script src="../../../../_static/jquery.js?v=5d32c60e"></script>
<script src="../../../../_static/_sphinx_javascript_frameworks_compat.js?v=2cd50e6c"></script>
<script src="../../../../_static/documentation_options.js?v=5929fcd5"></script>
<script src="../../../../_static/doctools.js?v=888ff710"></script>
<script src="../../../../_static/sphinx_highlight.js?v=dc90522c"></script>
<script src="../../../../_static/js/theme.js"></script>
<link rel="index" title="Index" href="../../../../genindex.html" />
<link rel="search" title="Search" href="../../../../search.html" />
</head>
<body class="wy-body-for-nav">
<div class="wy-grid-for-nav">
<nav data-toggle="wy-nav-shift" class="wy-nav-side">
<div class="wy-side-scroll">
<div class="wy-side-nav-search" >
<a href="../../../../index.html" class="icon icon-home">
mcp-server-webcrawl
</a>
<div role="search">
<form id="rtd-search-form" class="wy-form" action="../../../../search.html" method="get">
<input type="text" name="q" placeholder="Search docs" aria-label="Search docs" />
<input type="hidden" name="check_keywords" value="yes" />
<input type="hidden" name="area" value="default" />
</form>
</div>
</div><div class="wy-menu wy-menu-vertical" data-spy="affix" role="navigation" aria-label="Navigation menu">
<p class="caption" role="heading"><span class="caption-text">Contents:</span></p>
<ul>
<li class="toctree-l1"><a class="reference internal" href="../../../../installation.html">Installation</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../../guides.html">Setup Guides</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../../usage.html">Usage</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../../prompts.html">Prompt Routines</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../../interactive.html">Interactive Mode</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../../modules.html">mcp_server_webcrawl</a></li>
</ul>
</div>
</div>
</nav>
<section data-toggle="wy-nav-shift" class="wy-nav-content-wrap"><nav class="wy-nav-top" aria-label="Mobile navigation menu" >
<i data-toggle="wy-nav-top" class="fa fa-bars"></i>
<a href="../../../../index.html">mcp-server-webcrawl</a>
</nav>
<div class="wy-nav-content">
<div class="rst-content">
<div role="navigation" aria-label="Page navigation">
<ul class="wy-breadcrumbs">
<li><a href="../../../../index.html" class="icon icon-home" aria-label="Home"></a></li>
<li class="breadcrumb-item"><a href="../../../index.html">Module code</a></li>
<li class="breadcrumb-item"><a href="../../crawlers.html">mcp_server_webcrawl.crawlers</a></li>
<li class="breadcrumb-item active">mcp_server_webcrawl.crawlers.base.api</li>
<li class="wy-breadcrumbs-aside">
</li>
</ul>
<hr/>
</div>
<div role="main" class="document" itemscope="itemscope" itemtype="http://schema.org/Article">
<div itemprop="articleBody">
<h1>Source code for mcp_server_webcrawl.crawlers.base.api</h1><div class="highlight"><pre>
<span></span><span class="kn">import</span> <span class="nn">json</span>
<span class="kn">from</span> <span class="nn">datetime</span> <span class="kn">import</span> <span class="n">datetime</span><span class="p">,</span> <span class="n">timezone</span>
<span class="kn">from</span> <span class="nn">time</span> <span class="kn">import</span> <span class="n">time</span>
<span class="kn">from</span> <span class="nn">typing</span> <span class="kn">import</span> <span class="n">Any</span><span class="p">,</span> <span class="n">Final</span>
<span class="kn">from</span> <span class="nn">mcp_server_webcrawl.crawlers.base.adapter</span> <span class="kn">import</span> <span class="n">IndexState</span>
<span class="kn">from</span> <span class="nn">mcp_server_webcrawl.models.base</span> <span class="kn">import</span> <span class="n">METADATA_VALUE_TYPE</span>
<span class="kn">from</span> <span class="nn">mcp_server_webcrawl.models.resources</span> <span class="kn">import</span> <span class="n">ResourceResult</span><span class="p">,</span> <span class="n">ResourceResultType</span>
<span class="kn">from</span> <span class="nn">mcp_server_webcrawl.models.sites</span> <span class="kn">import</span> <span class="n">SiteResult</span>
<span class="kn">from</span> <span class="nn">mcp_server_webcrawl.utils</span> <span class="kn">import</span> <span class="n">to_isoformat_zulu</span>
<span class="kn">from</span> <span class="nn">mcp_server_webcrawl.utils.logger</span> <span class="kn">import</span> <span class="n">get_logger</span>
<span class="n">logger</span> <span class="o">=</span> <span class="n">get_logger</span><span class="p">()</span>
<span class="n">OVERRIDE_ERROR_MESSAGE</span><span class="p">:</span> <span class="n">Final</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="s2">"BaseCrawler subclasses must implement </span><span class="se">\</span>
<span class="s2">the following methods: handle_list_tools, handle_call_tool"</span>
<div class="viewcode-block" id="BaseJsonApiEncoder">
<a class="viewcode-back" href="../../../../mcp_server_webcrawl.crawlers.base.html#mcp_server_webcrawl.crawlers.base.api.BaseJsonApiEncoder">[docs]</a>
<span class="k">class</span> <span class="nc">BaseJsonApiEncoder</span><span class="p">(</span><span class="n">json</span><span class="o">.</span><span class="n">JSONEncoder</span><span class="p">):</span>
<span class="w"> </span><span class="sd">"""</span>
<span class="sd"> Custom JSON encoder for BaseJsonApi objects and ResourceResultType enums.</span>
<span class="sd"> """</span>
<div class="viewcode-block" id="BaseJsonApiEncoder.default">
<a class="viewcode-back" href="../../../../mcp_server_webcrawl.crawlers.base.html#mcp_server_webcrawl.crawlers.base.api.BaseJsonApiEncoder.default">[docs]</a>
<span class="k">def</span> <span class="nf">default</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">obj</span><span class="p">)</span> <span class="o">-></span> <span class="n">Any</span><span class="p">:</span>
<span class="w"> </span><span class="sd">"""</span>
<span class="sd"> Override default encoder to handle custom types.</span>
<span class="sd"> Args:</span>
<span class="sd"> obj: Object to encode</span>
<span class="sd"> Returns:</span>
<span class="sd"> JSON serializable representation of the object</span>
<span class="sd"> """</span>
<span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">obj</span><span class="p">,</span> <span class="n">BaseJsonApi</span><span class="p">):</span>
<span class="k">return</span> <span class="n">obj</span><span class="o">.</span><span class="vm">__dict__</span>
<span class="k">elif</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">obj</span><span class="p">,</span> <span class="n">ResourceResultType</span><span class="p">):</span>
<span class="k">return</span> <span class="n">obj</span><span class="o">.</span><span class="n">value</span>
<span class="k">elif</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">obj</span><span class="p">,</span> <span class="n">datetime</span><span class="p">):</span>
<span class="k">return</span> <span class="n">to_isoformat_zulu</span><span class="p">(</span><span class="n">obj</span><span class="p">)</span>
<span class="k">return</span> <span class="nb">super</span><span class="p">()</span><span class="o">.</span><span class="n">default</span><span class="p">(</span><span class="n">obj</span><span class="p">)</span></div>
</div>
<div class="viewcode-block" id="BaseJsonApi">
<a class="viewcode-back" href="../../../../mcp_server_webcrawl.crawlers.base.html#mcp_server_webcrawl.crawlers.base.api.BaseJsonApi">[docs]</a>
<span class="k">class</span> <span class="nc">BaseJsonApi</span><span class="p">:</span>
<span class="w"> </span><span class="sd">"""</span>
<span class="sd"> Base class for JSON API responses.</span>
<span class="sd"> Provides a standardized structure for API responses including metadata,</span>
<span class="sd"> results, and error handling.</span>
<span class="sd"> """</span>
<div class="viewcode-block" id="BaseJsonApi.__init__">
<a class="viewcode-back" href="../../../../mcp_server_webcrawl.crawlers.base.html#mcp_server_webcrawl.crawlers.base.api.BaseJsonApi.__init__">[docs]</a>
<span class="k">def</span> <span class="fm">__init__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">method</span><span class="p">:</span> <span class="nb">str</span><span class="p">,</span> <span class="n">args</span><span class="p">:</span> <span class="nb">dict</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="n">Any</span><span class="p">],</span> <span class="n">index_state</span><span class="p">:</span> <span class="n">IndexState</span> <span class="o">|</span> <span class="kc">None</span> <span class="o">=</span> <span class="kc">None</span><span class="p">):</span>
<span class="w"> </span><span class="sd">"""</span>
<span class="sd"> Construct with the arguments of creation (aoc), these will be echoed back in</span>
<span class="sd"> JSON response. This is an object that collapses into json on json dumps. This is</span>
<span class="sd"> done with everything within implementing to_dict.</span>
<span class="sd"> Args:</span>
<span class="sd"> method: API method name</span>
<span class="sd"> args: Dictionary of API arguments</span>
<span class="sd"> index_state: indexing, complete, remote, etc.</span>
<span class="sd"> """</span>
<span class="kn">from</span> <span class="nn">mcp_server_webcrawl</span> <span class="kn">import</span> <span class="n">__version__</span><span class="p">,</span> <span class="vm">__name__</span>
<span class="bp">self</span><span class="o">.</span><span class="n">_start_time</span> <span class="o">=</span> <span class="n">time</span><span class="p">()</span>
<span class="bp">self</span><span class="o">.</span><span class="n">method</span> <span class="o">=</span> <span class="n">method</span>
<span class="bp">self</span><span class="o">.</span><span class="n">args</span> <span class="o">=</span> <span class="n">args</span>
<span class="bp">self</span><span class="o">.</span><span class="n">meta_generator</span> <span class="o">=</span> <span class="sa">f</span><span class="s2">"</span><span class="si">{</span><span class="vm">__name__</span><span class="si">}</span><span class="s2"> (</span><span class="si">{</span><span class="n">__version__</span><span class="si">}</span><span class="s2">)"</span>
<span class="bp">self</span><span class="o">.</span><span class="n">meta_generated</span> <span class="o">=</span> <span class="n">to_isoformat_zulu</span><span class="p">(</span><span class="n">datetime</span><span class="o">.</span><span class="n">now</span><span class="p">(</span><span class="n">timezone</span><span class="o">.</span><span class="n">utc</span><span class="p">))</span>
<span class="bp">self</span><span class="o">.</span><span class="n">meta_index</span> <span class="o">=</span> <span class="n">index_state</span><span class="o">.</span><span class="n">to_dict</span><span class="p">()</span> <span class="k">if</span> <span class="n">index_state</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span> <span class="k">else</span> <span class="kc">None</span>
<span class="bp">self</span><span class="o">.</span><span class="n">_results</span><span class="p">:</span> <span class="nb">list</span><span class="p">[</span><span class="n">SiteResult</span> <span class="o">|</span> <span class="n">ResourceResult</span><span class="p">]</span> <span class="o">=</span> <span class="p">[]</span>
<span class="bp">self</span><span class="o">.</span><span class="n">_results_total</span><span class="p">:</span> <span class="nb">int</span> <span class="o">=</span> <span class="mi">0</span>
<span class="bp">self</span><span class="o">.</span><span class="n">_results_offset</span><span class="p">:</span> <span class="nb">int</span> <span class="o">=</span> <span class="mi">0</span>
<span class="bp">self</span><span class="o">.</span><span class="n">_results_limit</span><span class="p">:</span> <span class="nb">int</span> <span class="o">=</span> <span class="mi">0</span>
<span class="bp">self</span><span class="o">.</span><span class="n">_errors</span><span class="p">:</span> <span class="nb">list</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="p">[]</span></div>
<span class="nd">@property</span>
<span class="k">def</span> <span class="nf">total</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-></span> <span class="nb">int</span><span class="p">:</span>
<span class="w"> </span><span class="sd">"""</span>
<span class="sd"> Returns the total number of results.</span>
<span class="sd"> Returns:</span>
<span class="sd"> Integer count of total results</span>
<span class="sd"> """</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_results_total</span>
<div class="viewcode-block" id="BaseJsonApi.get_results">
<a class="viewcode-back" href="../../../../mcp_server_webcrawl.crawlers.base.html#mcp_server_webcrawl.crawlers.base.api.BaseJsonApi.get_results">[docs]</a>
<span class="k">def</span> <span class="nf">get_results</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-></span> <span class="nb">list</span><span class="p">[</span><span class="n">SiteResult</span> <span class="o">|</span> <span class="n">ResourceResult</span><span class="p">]:</span>
<span class="w"> </span><span class="sd">"""</span>
<span class="sd"> Returns list of results.</span>
<span class="sd"> Returns:</span>
<span class="sd"> Results of type SiteResult or ResourceResult</span>
<span class="sd"> """</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_results</span><span class="o">.</span><span class="n">copy</span><span class="p">()</span></div>
<div class="viewcode-block" id="BaseJsonApi.set_results">
<a class="viewcode-back" href="../../../../mcp_server_webcrawl.crawlers.base.html#mcp_server_webcrawl.crawlers.base.api.BaseJsonApi.set_results">[docs]</a>
<span class="k">def</span> <span class="nf">set_results</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">results</span><span class="p">:</span> <span class="nb">list</span><span class="p">[</span><span class="n">SiteResult</span> <span class="o">|</span> <span class="n">ResourceResult</span><span class="p">],</span> <span class="n">total</span><span class="p">:</span> <span class="nb">int</span><span class="p">,</span> <span class="n">offset</span><span class="p">:</span> <span class="nb">int</span><span class="p">,</span> <span class="n">limit</span><span class="p">:</span> <span class="nb">int</span><span class="p">)</span> <span class="o">-></span> <span class="kc">None</span><span class="p">:</span>
<span class="w"> </span><span class="sd">"""</span>
<span class="sd"> Set the results of the API response.</span>
<span class="sd"> Args:</span>
<span class="sd"> results: List of result objects</span>
<span class="sd"> total: Total number of results (including those beyond limit)</span>
<span class="sd"> offset: Starting position in the full result set</span>
<span class="sd"> limit: Maximum number of results to include</span>
<span class="sd"> """</span>
<span class="bp">self</span><span class="o">.</span><span class="n">_results</span> <span class="o">=</span> <span class="n">results</span>
<span class="bp">self</span><span class="o">.</span><span class="n">_results_total</span> <span class="o">=</span> <span class="n">total</span>
<span class="bp">self</span><span class="o">.</span><span class="n">_results_offset</span> <span class="o">=</span> <span class="n">offset</span>
<span class="bp">self</span><span class="o">.</span><span class="n">_results_limit</span> <span class="o">=</span> <span class="n">limit</span></div>
<div class="viewcode-block" id="BaseJsonApi.append_error">
<a class="viewcode-back" href="../../../../mcp_server_webcrawl.crawlers.base.html#mcp_server_webcrawl.crawlers.base.api.BaseJsonApi.append_error">[docs]</a>
<span class="k">def</span> <span class="nf">append_error</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">message</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-></span> <span class="kc">None</span><span class="p">:</span>
<span class="w"> </span><span class="sd">"""</span>
<span class="sd"> Add an error to the JSON response, visible to the endpoint LLM.</span>
<span class="sd"> Args:</span>
<span class="sd"> message: Error message to add</span>
<span class="sd"> """</span>
<span class="bp">self</span><span class="o">.</span><span class="n">_errors</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="n">message</span><span class="p">)</span></div>
<div class="viewcode-block" id="BaseJsonApi.to_dict">
<a class="viewcode-back" href="../../../../mcp_server_webcrawl.crawlers.base.html#mcp_server_webcrawl.crawlers.base.api.BaseJsonApi.to_dict">[docs]</a>
<span class="k">def</span> <span class="nf">to_dict</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-></span> <span class="nb">dict</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="n">METADATA_VALUE_TYPE</span><span class="p">]:</span>
<span class="w"> </span><span class="sd">"""</span>
<span class="sd"> Convert the object to a JSON-serializable dictionary.</span>
<span class="sd"> Returns:</span>
<span class="sd"> Dictionary representation of the API response</span>
<span class="sd"> """</span>
<span class="n">response</span><span class="p">:</span> <span class="nb">dict</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="n">Any</span><span class="p">]</span> <span class="o">=</span> <span class="p">{</span>
<span class="s2">"__meta__"</span><span class="p">:</span> <span class="p">{</span>
<span class="s2">"generator"</span><span class="p">:</span> <span class="sa">f</span><span class="s2">"</span><span class="si">{</span><span class="bp">self</span><span class="o">.</span><span class="n">meta_generator</span><span class="si">}</span><span class="s2">"</span><span class="p">,</span>
<span class="s2">"generated"</span><span class="p">:</span> <span class="sa">f</span><span class="s2">"</span><span class="si">{</span><span class="bp">self</span><span class="o">.</span><span class="n">meta_generated</span><span class="si">}</span><span class="s2">"</span><span class="p">,</span>
<span class="s2">"request"</span><span class="p">:</span> <span class="p">{</span>
<span class="s2">"method"</span><span class="p">:</span> <span class="sa">f</span><span class="s2">"</span><span class="si">{</span><span class="bp">self</span><span class="o">.</span><span class="n">method</span><span class="si">}</span><span class="s2">"</span><span class="p">,</span>
<span class="s2">"arguments"</span><span class="p">:</span> <span class="bp">self</span><span class="o">.</span><span class="n">args</span><span class="p">,</span>
<span class="s2">"time"</span><span class="p">:</span> <span class="n">time</span><span class="p">()</span> <span class="o">-</span> <span class="bp">self</span><span class="o">.</span><span class="n">_start_time</span><span class="p">,</span>
<span class="p">},</span>
<span class="s2">"results"</span><span class="p">:</span> <span class="p">{</span>
<span class="s2">"total"</span><span class="p">:</span> <span class="bp">self</span><span class="o">.</span><span class="n">_results_total</span><span class="p">,</span>
<span class="s2">"offset"</span><span class="p">:</span> <span class="bp">self</span><span class="o">.</span><span class="n">_results_offset</span><span class="p">,</span>
<span class="s2">"limit"</span><span class="p">:</span> <span class="bp">self</span><span class="o">.</span><span class="n">_results_limit</span><span class="p">,</span>
<span class="p">},</span>
<span class="p">},</span>
<span class="s2">"results"</span><span class="p">:</span> <span class="p">[</span><span class="n">r</span><span class="o">.</span><span class="n">to_forcefield_dict</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">args</span><span class="p">[</span><span class="s2">"fields"</span><span class="p">])</span> <span class="k">if</span> <span class="nb">hasattr</span><span class="p">(</span><span class="n">r</span><span class="p">,</span> <span class="s2">"to_forcefield_dict"</span><span class="p">)</span> <span class="k">else</span> <span class="n">r</span> <span class="k">for</span> <span class="n">r</span> <span class="ow">in</span> <span class="bp">self</span><span class="o">.</span><span class="n">_results</span><span class="p">]</span>
<span class="p">}</span>
<span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">meta_index</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span><span class="p">:</span>
<span class="n">response</span><span class="p">[</span><span class="s2">"__meta__"</span><span class="p">][</span><span class="s2">"index"</span><span class="p">]</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">meta_index</span>
<span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">_errors</span><span class="p">:</span>
<span class="n">response</span><span class="p">[</span><span class="s2">"__meta__"</span><span class="p">][</span><span class="s2">"errors"</span><span class="p">]</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_errors</span>
<span class="k">return</span> <span class="n">response</span></div>
<div class="viewcode-block" id="BaseJsonApi.to_json">
<a class="viewcode-back" href="../../../../mcp_server_webcrawl.crawlers.base.html#mcp_server_webcrawl.crawlers.base.api.BaseJsonApi.to_json">[docs]</a>
<span class="k">def</span> <span class="nf">to_json</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-></span> <span class="nb">str</span><span class="p">:</span>
<span class="w"> </span><span class="sd">"""</span>
<span class="sd"> Return a JSON serializable representation of this object.</span>
<span class="sd"> Returns:</span>
<span class="sd"> JSON string representation of the API response</span>
<span class="sd"> """</span>
<span class="k">return</span> <span class="n">json</span><span class="o">.</span><span class="n">dumps</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">to_dict</span><span class="p">(),</span> <span class="n">indent</span><span class="o">=</span><span class="mi">1</span><span class="p">,</span> <span class="bp">cls</span><span class="o">=</span><span class="n">BaseJsonApiEncoder</span><span class="p">)</span></div>
</div>
</pre></div>
</div>
</div>
<footer>
<hr/>
<div role="contentinfo">
<p>© Copyright 2025, pragmar.</p>
</div>
Built with <a href="https://www.sphinx-doc.org/">Sphinx</a> using a
<a href="https://github.com/readthedocs/sphinx_rtd_theme">theme</a>
provided by <a href="https://readthedocs.org">Read the Docs</a>.
</footer>
</div>
</div>
</section>
</div>
<script>
jQuery(function () {
SphinxRtdTheme.Navigation.enable(true);
});
</script>
</body>
</html>
```
--------------------------------------------------------------------------------
/docs/_modules/mcp_server_webcrawl/crawlers/interrobot/tests.html:
--------------------------------------------------------------------------------
```html
<!DOCTYPE html>
<html class="writer-html5" lang="en" data-content_root="../../../../">
<head>
<meta charset="utf-8" />
<meta name="viewport" content="width=device-width, initial-scale=1.0" />
<title>mcp_server_webcrawl.crawlers.interrobot.tests — mcp-server-webcrawl documentation</title>
<link rel="stylesheet" type="text/css" href="../../../../_static/pygments.css?v=80d5e7a1" />
<link rel="stylesheet" type="text/css" href="../../../../_static/css/theme.css?v=e59714d7" />
<script src="../../../../_static/jquery.js?v=5d32c60e"></script>
<script src="../../../../_static/_sphinx_javascript_frameworks_compat.js?v=2cd50e6c"></script>
<script src="../../../../_static/documentation_options.js?v=5929fcd5"></script>
<script src="../../../../_static/doctools.js?v=888ff710"></script>
<script src="../../../../_static/sphinx_highlight.js?v=dc90522c"></script>
<script src="../../../../_static/js/theme.js"></script>
<link rel="index" title="Index" href="../../../../genindex.html" />
<link rel="search" title="Search" href="../../../../search.html" />
</head>
<body class="wy-body-for-nav">
<div class="wy-grid-for-nav">
<nav data-toggle="wy-nav-shift" class="wy-nav-side">
<div class="wy-side-scroll">
<div class="wy-side-nav-search" >
<a href="../../../../index.html" class="icon icon-home">
mcp-server-webcrawl
</a>
<div role="search">
<form id="rtd-search-form" class="wy-form" action="../../../../search.html" method="get">
<input type="text" name="q" placeholder="Search docs" aria-label="Search docs" />
<input type="hidden" name="check_keywords" value="yes" />
<input type="hidden" name="area" value="default" />
</form>
</div>
</div><div class="wy-menu wy-menu-vertical" data-spy="affix" role="navigation" aria-label="Navigation menu">
<p class="caption" role="heading"><span class="caption-text">Contents:</span></p>
<ul>
<li class="toctree-l1"><a class="reference internal" href="../../../../installation.html">Installation</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../../guides.html">Setup Guides</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../../usage.html">Usage</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../../prompts.html">Prompt Routines</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../../interactive.html">Interactive Mode</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../../modules.html">mcp_server_webcrawl</a></li>
</ul>
</div>
</div>
</nav>
<section data-toggle="wy-nav-shift" class="wy-nav-content-wrap"><nav class="wy-nav-top" aria-label="Mobile navigation menu" >
<i data-toggle="wy-nav-top" class="fa fa-bars"></i>
<a href="../../../../index.html">mcp-server-webcrawl</a>
</nav>
<div class="wy-nav-content">
<div class="rst-content">
<div role="navigation" aria-label="Page navigation">
<ul class="wy-breadcrumbs">
<li><a href="../../../../index.html" class="icon icon-home" aria-label="Home"></a></li>
<li class="breadcrumb-item"><a href="../../../index.html">Module code</a></li>
<li class="breadcrumb-item"><a href="../../crawlers.html">mcp_server_webcrawl.crawlers</a></li>
<li class="breadcrumb-item active">mcp_server_webcrawl.crawlers.interrobot.tests</li>
<li class="wy-breadcrumbs-aside">
</li>
</ul>
<hr/>
</div>
<div role="main" class="document" itemscope="itemscope" itemtype="http://schema.org/Article">
<div itemprop="articleBody">
<h1>Source code for mcp_server_webcrawl.crawlers.interrobot.tests</h1><div class="highlight"><pre>
<span></span><span class="kn">import</span> <span class="nn">asyncio</span>
<span class="kn">from</span> <span class="nn">logging</span> <span class="kn">import</span> <span class="n">Logger</span>
<span class="kn">from</span> <span class="nn">mcp.types</span> <span class="kn">import</span> <span class="n">EmbeddedResource</span><span class="p">,</span> <span class="n">ImageContent</span><span class="p">,</span> <span class="n">TextContent</span>
<span class="kn">from</span> <span class="nn">mcp_server_webcrawl.crawlers.base.tests</span> <span class="kn">import</span> <span class="n">BaseCrawlerTests</span>
<span class="kn">from</span> <span class="nn">mcp_server_webcrawl.crawlers.interrobot.crawler</span> <span class="kn">import</span> <span class="n">InterroBotCrawler</span>
<span class="kn">from</span> <span class="nn">mcp_server_webcrawl.models.resources</span> <span class="kn">import</span> <span class="n">RESOURCES_TOOL_NAME</span>
<span class="kn">from</span> <span class="nn">mcp_server_webcrawl.crawlers</span> <span class="kn">import</span> <span class="n">get_fixture_directory</span>
<span class="kn">from</span> <span class="nn">mcp_server_webcrawl.utils.logger</span> <span class="kn">import</span> <span class="n">get_logger</span>
<span class="c1"># these IDs belong to the db test fixture (interrobot.v2.db)</span>
<span class="n">EXAMPLE_SITE_ID</span> <span class="o">=</span> <span class="mi">1</span>
<span class="n">PRAGMAR_SITE_ID</span> <span class="o">=</span> <span class="mi">2</span>
<span class="n">logger</span><span class="p">:</span> <span class="n">Logger</span> <span class="o">=</span> <span class="n">get_logger</span><span class="p">()</span>
<div class="viewcode-block" id="InterroBotTests">
<a class="viewcode-back" href="../../../../mcp_server_webcrawl.crawlers.interrobot.html#mcp_server_webcrawl.crawlers.interrobot.tests.InterroBotTests">[docs]</a>
<span class="k">class</span> <span class="nc">InterroBotTests</span><span class="p">(</span><span class="n">BaseCrawlerTests</span><span class="p">):</span>
<span class="w"> </span><span class="sd">"""</span>
<span class="sd"> Test suite for the InterroBot crawler implementation.</span>
<span class="sd"> Uses all wrapped test methods from BaseCrawlerTests plus InterroBot-specific features.</span>
<span class="sd"> """</span>
<div class="viewcode-block" id="InterroBotTests.setUp">
<a class="viewcode-back" href="../../../../mcp_server_webcrawl.crawlers.interrobot.html#mcp_server_webcrawl.crawlers.interrobot.tests.InterroBotTests.setUp">[docs]</a>
<span class="k">def</span> <span class="nf">setUp</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
<span class="w"> </span><span class="sd">"""</span>
<span class="sd"> Set up the test environment with fixture data.</span>
<span class="sd"> """</span>
<span class="nb">super</span><span class="p">()</span><span class="o">.</span><span class="n">setUp</span><span class="p">()</span>
<span class="bp">self</span><span class="o">.</span><span class="n">fixture_path</span> <span class="o">=</span> <span class="n">get_fixture_directory</span><span class="p">()</span> <span class="o">/</span> <span class="s2">"interrobot"</span> <span class="o">/</span> <span class="s2">"interrobot.v2.db"</span></div>
<div class="viewcode-block" id="InterroBotTests.test_interrobot_pulse">
<a class="viewcode-back" href="../../../../mcp_server_webcrawl.crawlers.interrobot.html#mcp_server_webcrawl.crawlers.interrobot.tests.InterroBotTests.test_interrobot_pulse">[docs]</a>
<span class="k">def</span> <span class="nf">test_interrobot_pulse</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
<span class="w"> </span><span class="sd">"""</span>
<span class="sd"> Test basic crawler initialization.</span>
<span class="sd"> """</span>
<span class="n">crawler</span> <span class="o">=</span> <span class="n">InterroBotCrawler</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">fixture_path</span><span class="p">)</span>
<span class="bp">self</span><span class="o">.</span><span class="n">assertIsNotNone</span><span class="p">(</span><span class="n">crawler</span><span class="p">)</span></div>
<div class="viewcode-block" id="InterroBotTests.test_interrobot_sites">
<a class="viewcode-back" href="../../../../mcp_server_webcrawl.crawlers.interrobot.html#mcp_server_webcrawl.crawlers.interrobot.tests.InterroBotTests.test_interrobot_sites">[docs]</a>
<span class="k">def</span> <span class="nf">test_interrobot_sites</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
<span class="w"> </span><span class="sd">"""</span>
<span class="sd"> Test site retrieval API functionality.</span>
<span class="sd"> """</span>
<span class="n">crawler</span> <span class="o">=</span> <span class="n">InterroBotCrawler</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">fixture_path</span><span class="p">)</span>
<span class="c1"># Note: InterroBot uses site ID 2 for pragmar instead of calculating from string</span>
<span class="bp">self</span><span class="o">.</span><span class="n">run_pragmar_site_tests</span><span class="p">(</span><span class="n">crawler</span><span class="p">,</span> <span class="n">PRAGMAR_SITE_ID</span><span class="p">)</span></div>
<div class="viewcode-block" id="InterroBotTests.test_interrobot_search">
<a class="viewcode-back" href="../../../../mcp_server_webcrawl.crawlers.interrobot.html#mcp_server_webcrawl.crawlers.interrobot.tests.InterroBotTests.test_interrobot_search">[docs]</a>
<span class="k">def</span> <span class="nf">test_interrobot_search</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
<span class="w"> </span><span class="sd">"""</span>
<span class="sd"> Test boolean search functionality</span>
<span class="sd"> """</span>
<span class="n">crawler</span> <span class="o">=</span> <span class="n">InterroBotCrawler</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">fixture_path</span><span class="p">)</span>
<span class="bp">self</span><span class="o">.</span><span class="n">run_pragmar_search_tests</span><span class="p">(</span><span class="n">crawler</span><span class="p">,</span> <span class="n">PRAGMAR_SITE_ID</span><span class="p">)</span></div>
<div class="viewcode-block" id="InterroBotTests.test_interrobot_resources">
<a class="viewcode-back" href="../../../../mcp_server_webcrawl.crawlers.interrobot.html#mcp_server_webcrawl.crawlers.interrobot.tests.InterroBotTests.test_interrobot_resources">[docs]</a>
<span class="k">def</span> <span class="nf">test_interrobot_resources</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
<span class="w"> </span><span class="sd">"""</span>
<span class="sd"> Test resource retrieval API functionality with various parameters.</span>
<span class="sd"> """</span>
<span class="n">crawler</span> <span class="o">=</span> <span class="n">InterroBotCrawler</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">fixture_path</span><span class="p">)</span>
<span class="bp">self</span><span class="o">.</span><span class="n">run_sites_resources_tests</span><span class="p">(</span><span class="n">crawler</span><span class="p">,</span> <span class="n">PRAGMAR_SITE_ID</span><span class="p">,</span> <span class="n">EXAMPLE_SITE_ID</span><span class="p">)</span></div>
<div class="viewcode-block" id="InterroBotTests.test_interrobot_images">
<a class="viewcode-back" href="../../../../mcp_server_webcrawl.crawlers.interrobot.html#mcp_server_webcrawl.crawlers.interrobot.tests.InterroBotTests.test_interrobot_images">[docs]</a>
<span class="k">def</span> <span class="nf">test_interrobot_images</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
<span class="w"> </span><span class="sd">"""</span>
<span class="sd"> Test InterroBot-specific image handling and thumbnails.</span>
<span class="sd"> """</span>
<span class="n">crawler</span> <span class="o">=</span> <span class="n">InterroBotCrawler</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">fixture_path</span><span class="p">)</span>
<span class="bp">self</span><span class="o">.</span><span class="n">run_pragmar_image_tests</span><span class="p">(</span><span class="n">crawler</span><span class="p">,</span> <span class="n">PRAGMAR_SITE_ID</span><span class="p">)</span></div>
<div class="viewcode-block" id="InterroBotTests.test_interrobot_sorts">
<a class="viewcode-back" href="../../../../mcp_server_webcrawl.crawlers.interrobot.html#mcp_server_webcrawl.crawlers.interrobot.tests.InterroBotTests.test_interrobot_sorts">[docs]</a>
<span class="k">def</span> <span class="nf">test_interrobot_sorts</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
<span class="w"> </span><span class="sd">"""</span>
<span class="sd"> Test random sort functionality using the '?' sort parameter.</span>
<span class="sd"> """</span>
<span class="n">crawler</span> <span class="o">=</span> <span class="n">InterroBotCrawler</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">fixture_path</span><span class="p">)</span>
<span class="bp">self</span><span class="o">.</span><span class="n">run_pragmar_sort_tests</span><span class="p">(</span><span class="n">crawler</span><span class="p">,</span> <span class="n">PRAGMAR_SITE_ID</span><span class="p">)</span></div>
<div class="viewcode-block" id="InterroBotTests.test_interrobot_content_parsing">
<a class="viewcode-back" href="../../../../mcp_server_webcrawl.crawlers.interrobot.html#mcp_server_webcrawl.crawlers.interrobot.tests.InterroBotTests.test_interrobot_content_parsing">[docs]</a>
<span class="k">def</span> <span class="nf">test_interrobot_content_parsing</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
<span class="w"> </span><span class="sd">"""</span>
<span class="sd"> Test content type detection and parsing.</span>
<span class="sd"> """</span>
<span class="n">crawler</span> <span class="o">=</span> <span class="n">InterroBotCrawler</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">fixture_path</span><span class="p">)</span>
<span class="bp">self</span><span class="o">.</span><span class="n">run_pragmar_content_tests</span><span class="p">(</span><span class="n">crawler</span><span class="p">,</span> <span class="n">PRAGMAR_SITE_ID</span><span class="p">,</span> <span class="kc">False</span><span class="p">)</span></div>
<div class="viewcode-block" id="InterroBotTests.test_interrobot_mcp_features">
<a class="viewcode-back" href="../../../../mcp_server_webcrawl.crawlers.interrobot.html#mcp_server_webcrawl.crawlers.interrobot.tests.InterroBotTests.test_interrobot_mcp_features">[docs]</a>
<span class="k">def</span> <span class="nf">test_interrobot_mcp_features</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
<span class="w"> </span><span class="sd">"""</span>
<span class="sd"> Test InterroBot-specific MCP tool functionality.</span>
<span class="sd"> """</span>
<span class="n">crawler</span> <span class="o">=</span> <span class="n">InterroBotCrawler</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">fixture_path</span><span class="p">)</span>
<span class="n">list_tools_result</span> <span class="o">=</span> <span class="n">asyncio</span><span class="o">.</span><span class="n">run</span><span class="p">(</span><span class="n">crawler</span><span class="o">.</span><span class="n">mcp_list_tools</span><span class="p">())</span>
<span class="bp">self</span><span class="o">.</span><span class="n">assertIsNotNone</span><span class="p">(</span><span class="n">list_tools_result</span><span class="p">)</span></div>
<div class="viewcode-block" id="InterroBotTests.test_thumbnails_sync">
<a class="viewcode-back" href="../../../../mcp_server_webcrawl.crawlers.interrobot.html#mcp_server_webcrawl.crawlers.interrobot.tests.InterroBotTests.test_thumbnails_sync">[docs]</a>
<span class="k">def</span> <span class="nf">test_thumbnails_sync</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
<span class="w"> </span><span class="sd">"""</span>
<span class="sd"> Test thumbnail generation functionality.</span>
<span class="sd"> """</span>
<span class="n">asyncio</span><span class="o">.</span><span class="n">run</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">__test_thumbnails</span><span class="p">())</span></div>
<span class="k">async</span> <span class="k">def</span> <span class="nf">__test_thumbnails</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
<span class="w"> </span><span class="sd">"""</span>
<span class="sd"> Test thumbnails are a special case for InterroBot. Other fixtures are</span>
<span class="sd"> not dependable, either images removed to slim archive, or not captured</span>
<span class="sd"> with defaults. Testing thumbnails here is enough.</span>
<span class="sd"> """</span>
<span class="n">crawler</span> <span class="o">=</span> <span class="n">InterroBotCrawler</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">fixture_path</span><span class="p">)</span>
<span class="n">thumbnail_args</span> <span class="o">=</span> <span class="p">{</span>
<span class="s2">"datasrc"</span><span class="p">:</span> <span class="n">crawler</span><span class="o">.</span><span class="n">datasrc</span><span class="p">,</span>
<span class="s2">"sites"</span><span class="p">:</span> <span class="p">[</span><span class="n">PRAGMAR_SITE_ID</span><span class="p">],</span>
<span class="s2">"extras"</span><span class="p">:</span> <span class="p">[</span><span class="s2">"thumbnails"</span><span class="p">],</span>
<span class="s2">"query"</span><span class="p">:</span> <span class="s2">"type: img AND url: *.png"</span><span class="p">,</span>
<span class="s2">"limit"</span><span class="p">:</span> <span class="mi">4</span><span class="p">,</span>
<span class="p">}</span>
<span class="n">thumbnail_result</span><span class="p">:</span> <span class="nb">list</span><span class="p">[</span><span class="n">TextContent</span> <span class="o">|</span> <span class="n">ImageContent</span> <span class="o">|</span> <span class="n">EmbeddedResource</span><span class="p">]</span> <span class="o">=</span> <span class="k">await</span> <span class="n">crawler</span><span class="o">.</span><span class="n">mcp_call_tool</span><span class="p">(</span>
<span class="n">RESOURCES_TOOL_NAME</span><span class="p">,</span> <span class="n">thumbnail_args</span>
<span class="p">)</span>
<span class="k">if</span> <span class="nb">len</span><span class="p">(</span><span class="n">thumbnail_result</span><span class="p">)</span> <span class="o">></span> <span class="mi">1</span><span class="p">:</span>
<span class="bp">self</span><span class="o">.</span><span class="n">assertTrue</span><span class="p">(</span>
<span class="n">thumbnail_result</span><span class="p">[</span><span class="mi">1</span><span class="p">]</span><span class="o">.</span><span class="n">type</span> <span class="o">==</span> <span class="s2">"image"</span><span class="p">,</span>
<span class="s2">"ImageContent should be included in thumbnails response"</span>
<span class="p">)</span>
<div class="viewcode-block" id="InterroBotTests.test_interrobot_advanced_site_features">
<a class="viewcode-back" href="../../../../mcp_server_webcrawl.crawlers.interrobot.html#mcp_server_webcrawl.crawlers.interrobot.tests.InterroBotTests.test_interrobot_advanced_site_features">[docs]</a>
<span class="k">def</span> <span class="nf">test_interrobot_advanced_site_features</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
<span class="w"> </span><span class="sd">"""</span>
<span class="sd"> Test InterroBot-specific site features like robots field.</span>
<span class="sd"> """</span>
<span class="n">crawler</span> <span class="o">=</span> <span class="n">InterroBotCrawler</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">fixture_path</span><span class="p">)</span>
<span class="c1"># robots field retrieval</span>
<span class="n">site_one_field_json</span> <span class="o">=</span> <span class="n">crawler</span><span class="o">.</span><span class="n">get_sites_api</span><span class="p">(</span><span class="n">ids</span><span class="o">=</span><span class="p">[</span><span class="mi">1</span><span class="p">],</span> <span class="n">fields</span><span class="o">=</span><span class="p">[</span><span class="s2">"urls"</span><span class="p">])</span>
<span class="k">if</span> <span class="n">site_one_field_json</span><span class="o">.</span><span class="n">total</span> <span class="o">></span> <span class="mi">0</span><span class="p">:</span>
<span class="n">result_dict</span> <span class="o">=</span> <span class="n">site_one_field_json</span><span class="o">.</span><span class="n">_results</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span><span class="o">.</span><span class="n">to_dict</span><span class="p">()</span>
<span class="bp">self</span><span class="o">.</span><span class="n">assertIn</span><span class="p">(</span><span class="s2">"urls"</span><span class="p">,</span> <span class="n">result_dict</span><span class="p">,</span> <span class="s2">"robots field should be present in response"</span><span class="p">)</span>
<span class="c1"># multiple custom fields</span>
<span class="n">site_multiple_fields_json</span> <span class="o">=</span> <span class="n">crawler</span><span class="o">.</span><span class="n">get_sites_api</span><span class="p">(</span><span class="n">ids</span><span class="o">=</span><span class="p">[</span><span class="mi">1</span><span class="p">],</span> <span class="n">fields</span><span class="o">=</span><span class="p">[</span><span class="s2">"urls"</span><span class="p">,</span> <span class="s2">"created"</span><span class="p">])</span>
<span class="k">if</span> <span class="n">site_multiple_fields_json</span><span class="o">.</span><span class="n">total</span> <span class="o">></span> <span class="mi">0</span><span class="p">:</span>
<span class="n">result</span> <span class="o">=</span> <span class="n">site_multiple_fields_json</span><span class="o">.</span><span class="n">_results</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span><span class="o">.</span><span class="n">to_dict</span><span class="p">()</span>
<span class="bp">self</span><span class="o">.</span><span class="n">assertIn</span><span class="p">(</span><span class="s2">"urls"</span><span class="p">,</span> <span class="n">result</span><span class="p">,</span> <span class="s2">"robots field should be present in response"</span><span class="p">)</span>
<span class="bp">self</span><span class="o">.</span><span class="n">assertIn</span><span class="p">(</span><span class="s2">"created"</span><span class="p">,</span> <span class="n">result</span><span class="p">,</span> <span class="s2">"created field should be present in response"</span><span class="p">)</span></div>
<div class="viewcode-block" id="InterroBotTests.test_report">
<a class="viewcode-back" href="../../../../mcp_server_webcrawl.crawlers.interrobot.html#mcp_server_webcrawl.crawlers.interrobot.tests.InterroBotTests.test_report">[docs]</a>
<span class="k">def</span> <span class="nf">test_report</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
<span class="w"> </span><span class="sd">"""</span>
<span class="sd"> Run test report, save to data directory.</span>
<span class="sd"> """</span>
<span class="n">crawler</span> <span class="o">=</span> <span class="n">InterroBotCrawler</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">fixture_path</span><span class="p">)</span>
<span class="n">logger</span><span class="o">.</span><span class="n">info</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">run_pragmar_report</span><span class="p">(</span><span class="n">crawler</span><span class="p">,</span> <span class="n">PRAGMAR_SITE_ID</span><span class="p">,</span> <span class="s2">"InterroBot"</span><span class="p">))</span></div>
</div>
</pre></div>
</div>
</div>
<footer>
<hr/>
<div role="contentinfo">
<p>© Copyright 2025, pragmar.</p>
</div>
Built with <a href="https://www.sphinx-doc.org/">Sphinx</a> using a
<a href="https://github.com/readthedocs/sphinx_rtd_theme">theme</a>
provided by <a href="https://readthedocs.org">Read the Docs</a>.
</footer>
</div>
</div>
</section>
</div>
<script>
jQuery(function () {
SphinxRtdTheme.Navigation.enable(true);
});
</script>
</body>
</html>
```
--------------------------------------------------------------------------------
/src/mcp_server_webcrawl/crawlers/base/adapter.py:
--------------------------------------------------------------------------------
```python
import os
import hashlib
import mimetypes
import re
import sqlite3
import traceback
from contextlib import closing
from concurrent.futures import ThreadPoolExecutor
from datetime import datetime, timedelta
from enum import Enum
from pathlib import Path
from dataclasses import dataclass
from datetime import timezone
from typing import Final
from mcp_server_webcrawl.models.resources import (
ResourceResult,
ResourceResultType,
RESOURCES_DEFAULT_FIELD_MAPPING,
RESOURCES_DEFAULT_SORT_MAPPING,
RESOURCES_FIELDS_BASE,
RESOURCES_ENUMERATED_TYPE_MAPPING,
RESOURCES_LIMIT_MAX,
)
from mcp_server_webcrawl.utils import to_isoformat_zulu, from_isoformat_zulu
from mcp_server_webcrawl.utils.search import SearchQueryParser, SearchSubquery
from mcp_server_webcrawl.utils.logger import get_logger
# in the interest of sane imports (avoiding circulars), INDEXED_* constants
# live here, happily, as denizens of adapterville

# number of files read and inserted per indexing batch
INDEXED_BATCH_SIZE: Final[int] = 256

# extensions treated as binary and skipped by the fulltext indexer;
# the resource record (URL, headers, type) is still kept, only the
# body content is not read or indexed
INDEXED_BINARY_EXTENSIONS: Final[tuple[str, ...]] = (
    ".woff",".woff2",".ttf",".otf",".eot",
    ".jpeg",".jpg",".png",".webp",".gif",".bmp",".tiff",".tif",".svg",".ico",".heic",".heif",
    ".mp3",".wav",".ogg",".flac",".aac",".m4a",".wma",
    ".mp4",".webm",".avi",".mov",".wmv",".mkv",".flv",".m4v",".mpg",".mpeg",
    ".pdf",".doc",".docx",".xls",".xlsx",".ppt",".pptx",
    ".zip",".rar",".7z",".tar",".gz",".bz2",".xz",
    ".exe",".dll",".so",".dylib",".bin",".apk",".app",
    ".swf",".svgz",".dat",".db",".sqlite",".class",".pyc",".o"
)
# multipliers for parsing human-readable size tokens (e.g. "2kb" -> 2048);
# fix: the original listed an ad-hoc subset of case variants ("kb"/"kB" but
# not "KB"/"Kb", etc.) -- all case combinations are enumerated here so a
# case-sensitive lookup cannot miss, and existing keys are unchanged
INDEXED_BYTE_MULTIPLIER: Final[dict[str, int]] = {
    "b": 1,
    "B": 1,
    "kb": 1024, "kB": 1024, "Kb": 1024, "KB": 1024,
    "mb": 1024*1024, "mB": 1024*1024, "Mb": 1024*1024, "MB": 1024*1024,
    "gb": 1024*1024*1024, "gB": 1024*1024*1024, "Gb": 1024*1024*1024, "GB": 1024*1024*1024,
}
# canonical extension -> MIME type lookup, used when synthesizing HTTP
# headers for resources read from disk (see BaseManager.get_basic_headers)
INDEXED_EXTENSION_MAPPING: Final[dict[str, str]] = {
    # image/*
    ".jpg": "image/jpeg",
    ".jpeg": "image/jpeg",
    ".png": "image/png",
    ".gif": "image/gif",
    ".webp": "image/webp",
    ".svg": "image/svg+xml",
    ".bmp": "image/bmp",
    ".ico": "image/x-icon",
    ".tiff": "image/tiff",
    ".tif": "image/tiff",
    ".heic": "image/heic",
    ".heif": "image/heif",
    # text/* (note: .js/.json/.xml intentionally map to application/* types)
    ".html": "text/html",
    ".htm": "text/html",
    ".css": "text/css",
    ".js": "application/javascript",
    ".json": "application/json",
    ".xml": "application/xml",
    ".txt": "text/plain",
    # application/*
    ".pdf": "application/pdf",
    ".doc": "application/msword",
    ".docx": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
    ".xls": "application/vnd.ms-excel",
    ".xlsx": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
    # audio/*
    ".mp3": "audio/mpeg",
    ".wav": "audio/wav",
    ".ogg": "audio/ogg",
    ".flac": "audio/flac",
    ".aac": "audio/aac",
    ".m4a": "audio/mp4",
    # video/*
    ".mp4": "video/mp4",
    ".webm": "video/webm",
    ".avi": "video/x-msvideo",
    ".mov": "video/quicktime",
    ".mkv": "video/x-matroska",
    # font/*
    ".woff": "font/woff",
    ".woff2": "font/woff2",
    ".ttf": "font/ttf",
    ".otf": "font/otf",
    ".eot": "application/vnd.ms-fontobject",
}
# directory names skipped while walking crawl output on disk
INDEXED_IGNORE_DIRECTORIES: Final[list[str]] = ["http-client-cache", "result-storage"]

# maximum indexes held in cache, an index is a unique list[site-ids] argument
INDEXED_MANAGER_CACHE_MAX: Final[int] = 20

# 2MB max HTTP content, anything larger passed over by fulltext indexer
INDEXED_MAX_FILE_SIZE: Final[int] = 2000000

# max indexing time may need a cli arg to override at some point,
# but for now, this is a fan spinner--just make sure it doesn't run away
INDEXED_MAX_PROCESS_TIME: Final[timedelta] = timedelta(minutes=10)

# thread pool width for concurrent file reads, capped at 8
INDEXED_MAX_WORKERS: Final[int] = min(8, os.cpu_count() or 4)

# default MIME per resource type, used when no extension mapping applies
INDEXED_MIME_FALLBACKS: Final[dict[ResourceResultType, str]] = {
    ResourceResultType.PAGE: "text/html",
    ResourceResultType.CSS: "text/css",
    ResourceResultType.SCRIPT: "application/javascript",
    ResourceResultType.IMAGE: "image/jpeg", # default for type, override
    ResourceResultType.PDF: "application/pdf",
    ResourceResultType.TEXT: "text/plain",
    ResourceResultType.DOC: "application/msword",
    ResourceResultType.AUDIO: "audio/mpeg", # default for type, override
    ResourceResultType.VIDEO: "video/mp4", # default for type, override
    ResourceResultType.OTHER: "application/octet-stream"
}

# MIME substring/prefix -> resource type classification
INDEXED_MIME_MAPPING: Final[dict[str, ResourceResultType]] = {
    "html": ResourceResultType.PAGE,
    "javascript": ResourceResultType.SCRIPT,
    "css": ResourceResultType.CSS,
    "image/": ResourceResultType.IMAGE,
    "pdf": ResourceResultType.PDF,
    "text/": ResourceResultType.TEXT,
    "audio/": ResourceResultType.AUDIO,
    "video/": ResourceResultType.VIDEO,
    "application/json": ResourceResultType.TEXT,
    "application/xml": ResourceResultType.TEXT
}

# files on disk will need default for reassembly {proto}{dir}
# these things are already approximations (perhaps) having passed through wget
# filtering (--adjust-extension) representative of the file on disk, also https
# is what the LLM is going to guess in all cases
INDEXED_RESOURCE_DEFAULT_PROTOCOL: Final[str] = "https://"

# application/* MIME types that are nonetheless textual and safe to index
INDEXED_TEXT_APPLICATION_TYPES: Final[tuple[str, ...]] = (
    "application/json", "application/xml", "application/javascript",
    "application/atom+xml", "application/ld+json", "application/rss+xml",
    "application/x-www-form-urlencoded",
)
# file extension -> resource type classification; "" (no extension) is
# assumed to be a served page
INDEXED_TYPE_MAPPING: Final[dict[str, ResourceResultType]] = {
    "": ResourceResultType.PAGE,
    ".html": ResourceResultType.PAGE,
    ".htm": ResourceResultType.PAGE,
    ".php": ResourceResultType.PAGE,
    ".asp": ResourceResultType.PAGE,
    ".aspx": ResourceResultType.PAGE,
    ".js": ResourceResultType.SCRIPT,
    ".css": ResourceResultType.CSS,
    ".jpg": ResourceResultType.IMAGE,
    ".jpeg": ResourceResultType.IMAGE,
    ".png": ResourceResultType.IMAGE,
    ".gif": ResourceResultType.IMAGE,
    ".svg": ResourceResultType.IMAGE,
    ".tif": ResourceResultType.IMAGE,
    ".tiff": ResourceResultType.IMAGE,
    ".webp": ResourceResultType.IMAGE,
    ".bmp": ResourceResultType.IMAGE,
    ".pdf": ResourceResultType.PDF,
    ".txt": ResourceResultType.TEXT,
    ".xml": ResourceResultType.TEXT,
    ".json": ResourceResultType.TEXT,
    ".doc": ResourceResultType.DOC,
    ".docx": ResourceResultType.DOC,
    ".mov": ResourceResultType.VIDEO,
    ".mp4": ResourceResultType.VIDEO,
    ".mp3": ResourceResultType.AUDIO,
    ".ogg": ResourceResultType.AUDIO,
}

# extensions scanned when discovering WARC archives
# (.txt included for text-based capture formats)
INDEXED_WARC_EXTENSIONS: Final[list[str]] = [".warc", ".warc.gz", ".txt"]

# module-level logger
logger = get_logger()
class IndexStatus(Enum):
    """
    Lifecycle states for a site search index.
    """
    UNDEFINED = ""          # not yet started
    IDLE = "idle"           # no indexing activity
    INDEXING = "indexing"   # build in progress
    PARTIAL = "partial" # incomplete, but stable and searchable (timeout)
    COMPLETE = "complete"   # fully indexed
    REMOTE = "remote"       # index maintained by a remote service
    FAILED = "failed"       # indexing aborted on error
@dataclass
class IndexState:
    """Shared state between crawler and manager for indexing progress"""

    # current lifecycle status of the index
    status: IndexStatus = IndexStatus.UNDEFINED
    # number of resources processed so far
    processed: int = 0
    # set on the first transition out of UNDEFINED
    time_start: datetime | None = None
    # set when status reaches COMPLETE or PARTIAL
    time_end: datetime | None = None

    def set_status(self, status: IndexStatus) -> None:
        """
        Transition to a new status, maintaining start/end timestamps.

        Args:
            status: the status to transition to
        """
        if self.status == IndexStatus.UNDEFINED:
            # first transition starts the clock and resets counters
            self.time_start = datetime.now(timezone.utc)
            self.processed = 0
            self.time_end = None
        elif status in (IndexStatus.COMPLETE, IndexStatus.PARTIAL):
            if self.time_end is None:
                self.time_end = datetime.now(timezone.utc)
            if status == IndexStatus.PARTIAL:
                # fix: message previously used a backslash continuation inside
                # the f-string (embedding a run of indentation whitespace) and
                # rendered a timedelta followed by the word "minutes"
                logger.info(f"Indexing timeout ({INDEXED_MAX_PROCESS_TIME}) reached. "
                        "Index status has been set to PARTIAL, and further indexing halted.")
        self.status = status

    def increment_processed(self) -> None:
        """Increment the processed resource counter by one."""
        self.processed += 1

    @property
    def duration(self) -> str:
        """Elapsed indexing time as HH:MM:SS.mmm (zeros if never started)."""
        if not self.time_start:
            return "00:00:00.000"
        end = self.time_end or datetime.now(timezone.utc)
        total_seconds = (end - self.time_start).total_seconds()
        hours = int(total_seconds // 3600)
        minutes = int((total_seconds % 3600) // 60)
        seconds = int(total_seconds % 60)
        milliseconds = int((total_seconds % 1) * 1000)
        # HH:MM:SS.mmm
        return f"{hours:02d}:{minutes:02d}:{seconds:02d}.{milliseconds:03d}"

    def is_timeout(self) -> bool:
        """
        Check if the indexing operation has exceeded the timeout threshold
        """
        if not self.time_start:
            return False
        return (datetime.now(timezone.utc) - self.time_start) > INDEXED_MAX_PROCESS_TIME

    def to_dict(self) -> dict:
        """
        Convert the IndexState to a dictionary representation
        """
        # tolerate status having been assigned as a plain string
        status = self.status.value if hasattr(self.status, "value") else self.status
        result = { "status": status }
        if self.status not in (IndexStatus.REMOTE, IndexStatus.UNDEFINED):
            result["processed"] = self.processed
            result["time_start"] = to_isoformat_zulu(self.time_start) if self.time_start else None
            result["time_end"] = to_isoformat_zulu(self.time_end) if self.time_end else None
            result["duration"] = self.duration
        return result
class SitesGroup:
    """
    Container supporting search across one or more sites at once.
    """

    def __init__(self, datasrc: Path, site_ids: list[int], site_paths: list[Path]) -> None:
        """
        Initialize the group.

        Args:
            datasrc: site datasrc
            site_ids: site ids of the sites
            site_paths: paths to site contents (directories)
        """
        self.datasrc: Path = datasrc
        self.ids: list[int] = site_ids
        self.paths: list[Path] = site_paths
        # hashable key of stringified ids, used for connection caching
        self.cache_key = frozenset(str(site_id) for site_id in site_ids)

    def __str__(self) -> str:
        return f"[SitesGroup {self.cache_key}]"

    def get_sites(self) -> dict[int, str]:
        """Return a mapping of site id -> site content path (as string)."""
        return dict(zip(self.ids, (str(site_path) for site_path in self.paths)))
class SitesStat:
    def __init__(self, group: SitesGroup, cached: bool) -> None:
        """
        Some basic bookkeeping, for troubleshooting

        Args:
            group: SitesGroup to track statistics for
            cached: whether the group was retrieved from cache
        """
        # the group this stat entry refers to
        self.group: Final[SitesGroup] = group
        # local (naive) timestamp of when the group was requested
        self.timestamp: Final[datetime] = datetime.now()
        # True if served from the connection cache rather than rebuilt
        self.cached: Final[bool] = cached
class BaseManager:
"""
Base class for managing web crawler data in in-memory SQLite databases.
Provides connection pooling and caching for efficient access.
"""
def __init__(self) -> None:
    """Set up an empty statistics ledger."""
    self._stats: list[SitesStat] = []
@staticmethod
def string_to_id(value: str) -> int:
    """
    Convert a string, such as a directory name, to a numeric ID
    suitable for a database primary key.

    Hash space and collision probability notes:
    - [:8] = 32 bits (4.29 billion values) - ~1% collision chance with 10,000 items
    - [:12] = 48 bits (280 trillion values) - ~0.0000001% collision chance with 10,000 items
    - [:16] = 64 bits (max safe SQLite INTEGER) - near-zero collision, 9.22 quintillion values
    - SQLite INTEGER type is 64-bit signed, with max value of 9,223,372,036,854,775,807.
    - The big problem with larger hashspaces is the length of the ids they generate for presentation.

    Args:
        value: Input string to convert to an ID

    Returns:
        Integer ID derived from the input string
    """
    # 48 bits of the SHA-1 digest: short enough to present, wide enough to avoid collisions
    digest = hashlib.sha1(value.encode()).hexdigest()
    return int(digest[:12], 16)
@staticmethod
def get_basic_headers(file_size: int, resource_type: ResourceResultType, path: Path) -> str:
    """
    Generate basic HTTP headers for a resource.

    Args:
        file_size: size of the file in bytes
        resource_type: type of resource to generate headers for
        path: file path used for MIME type detection

    Returns:
        HTTP headers string with content type and length
    """
    default_mime = "application/octet-stream"
    media_types = (ResourceResultType.IMAGE, ResourceResultType.AUDIO, ResourceResultType.VIDEO)
    if resource_type in media_types:
        # extension lookup is more specific than the per-type fallback
        content_type = (INDEXED_EXTENSION_MAPPING.get(path.suffix.lower())
                or INDEXED_MIME_FALLBACKS.get(resource_type, default_mime))
    elif resource_type == ResourceResultType.OTHER:
        # unknown type: acquire from the file itself
        guessed, _ = mimetypes.guess_type(str(path))
        content_type = guessed if guessed is not None else default_mime
    else:
        # normal one-to-one mapping
        content_type = INDEXED_MIME_FALLBACKS.get(resource_type, default_mime)
    return f"HTTP/1.0 200 OK\r\nContent-Type: {content_type}\r\nContent-Length: {file_size}\r\n\r\n"
@staticmethod
def read_files(paths: list[Path]) -> dict[Path, str | None]:
    """
    Read content from multiple files concurrently.

    Args:
        paths: list of file paths to read

    Returns:
        dictionary mapping file paths to their content or None for binary/unreadable files
    """
    with ThreadPoolExecutor(max_workers=INDEXED_MAX_WORKERS) as executor:
        results = executor.map(BaseManager.__read_files_contents, paths)
        # binary/unreadable files come back as None content and are dropped
        return {file_path: text for file_path, text in results if text is not None}
@staticmethod
def __read_files_contents(file_path: Path) -> tuple[Path, str | None]:
    """
    Read content from text files with better error handling and encoding detection.

    Args:
        file_path: path to the file to read

    Returns:
        tuple of file path and content string, or None for binary/unreadable files
    """
    # a null result just means we're not dealing with the content
    # which has been determined to be binary or of unknown format
    # we can still maintain a record the URL/headers/whatever as Resource
    # fix: annotation previously claimed tuple[Path, str] for a (Path, None) value
    null_result: tuple[Path, None] = file_path, None
    extension = os.path.splitext(file_path)[1].lower()
    if (extension in INDEXED_BINARY_EXTENSIONS or
            os.path.getsize(file_path) > INDEXED_MAX_FILE_SIZE):
        return null_result
    mime_type, _ = mimetypes.guess_type(file_path)
    if mime_type and not mime_type.startswith("text/") and mime_type not in INDEXED_TEXT_APPLICATION_TYPES:
        return null_result
    content = None
    try:
        # errors="ignore" or "replace" required to read Katana txt files with
        # data payloads and still capture url, headers, etc. replace supposedly
        # softer touch generally, but not any better for Katana specifically
        # as payload will not be stored
        with open(file_path, "r", encoding="utf-8", errors="replace") as f:
            content = f.read()
    except UnicodeDecodeError:
        logger.debug(f"Could not decode file as UTF-8: {file_path}")
        return null_result
    except Exception as ex:
        # fix: exception was caught as ex but never logged, hiding the cause
        logger.error(f"Error reading file {file_path}: {ex}")
        return null_result
    return file_path, content
@staticmethod
def read_file_contents(file_path: Path, resource_type: ResourceResultType) -> str | None:
    """
    Read content from text files with better error handling and encoding detection.

    Args:
        file_path: path to the file to read
        resource_type: type of resource to determine if content should be read

    Returns:
        file content as string or None for binary/unreadable files
    """
    # only textual resource types are read at all
    textual_types = (ResourceResultType.PAGE, ResourceResultType.TEXT,
            ResourceResultType.CSS, ResourceResultType.SCRIPT, ResourceResultType.OTHER)
    if resource_type not in textual_types:
        return None
    if os.path.getsize(file_path) > INDEXED_MAX_FILE_SIZE:
        return None
    if os.path.splitext(file_path)[1].lower() in INDEXED_BINARY_EXTENSIONS:
        return None
    mime_type, _ = mimetypes.guess_type(file_path)
    if mime_type and not mime_type.startswith("text/"):
        # allow textual application/* types (json, xml, javascript, ...)
        is_textual_application = any(mime_type.startswith(prefix) for prefix in INDEXED_TEXT_APPLICATION_TYPES)
        if not is_textual_application:
            return None
    content = None
    try:
        with open(file_path, "r", encoding="utf-8") as f:
            content = f.read()
    except UnicodeDecodeError:
        logger.warning(f"Could not decode file as UTF-8: {file_path}")
    return content
@staticmethod
def decruft_path(path: str) -> str:
    """
    Very light touch cleanup of file naming, these tmps are creating noise
    and extensions are useful in classifying resources

    Args:
        path: file path string to clean up

    Returns:
        cleaned path string with temp files and weird extensions normalized
    """
    # strip wget temp-file artifacts and 12-digit timestamp suffixes
    cleaned = str(path).lower()
    cleaned = re.sub(r"[\u00b7·]?\d+\.tmp|\d{12}|\.tmp", "", cleaned)
    # S1/wget can generate weird extensions with URL args appended
    # (e.g. main.min.js202505251919); trim the non-word tail after the extension
    cleaned = re.sub(r'\.(\w+)[^\w]*$', r'.\1', cleaned)
    return cleaned
def get_stats(self) -> list[SitesStat]:
    """Return a shallow copy of the accumulated site statistics."""
    return list(self._stats)
def get_resources_for_sites_group(
    self,
    sites_group: SitesGroup,
    query: str,
    fields: list[str] | None,
    sort: str | None,
    limit: int,
    offset: int,
    swap_values: dict | None = None
) -> tuple[list[ResourceResult], int, IndexState]:
    """
    Get resources from directories using structured query parsing with SearchQueryParser.

    This method extracts types, fields, and statuses from the querystring instead of
    accepting them as separate arguments, using the new SearchSubquery functionality.

    Args:
        sites_group: Group of sites to search in
        query: Search query string that can include field:value syntax for filtering
        fields: resource fields to be returned by the API (Content, Headers, etc.)
        sort: Sort order for results
        limit: Maximum number of results to return
        offset: Number of results to skip for pagination
        swap_values: per-field parameterized values to check for (and replace);
            defaults to an empty dict when not provided

    Returns:
        Tuple of (list of ResourceResult objects, total count, connection_index_state)

    Notes:
        Returns empty results if sites is empty or not provided.
        If the database is being built, it will log a message and return empty results.
        This method extracts field-specific filters from the query string using SearchQueryParser:
        - type:html (to filter by resource type)
        - status:200 (to filter by HTTP status)
        Any fields present in the SearchSubquery will be included in the response.
    """
    # get_connection must be defined in subclass
    assert hasattr(self, "get_connection"), "get_connection not found"
    # fix: mutable default argument ({}) replaced with a None sentinel so a
    # single dict instance is not shared (and potentially mutated) across calls
    if swap_values is None:
        swap_values = {}
    null_result: tuple[list[ResourceResult], int, IndexState | None] = [], 0, None
    # fix: validate sites_group BEFORE dereferencing .ids; original read
    # sites_group.ids first, raising AttributeError on a None group instead
    # of returning the documented empty result
    if sites_group is None or not sites_group.ids:
        return null_result
    sites: list[int] = sites_group.ids
    connection: sqlite3.Connection
    connection_index_state: IndexState
    connection, connection_index_state = self.get_connection(sites_group)
    if connection is None:
        # database is currently being built
        logger.info(f"Database for sites {sites} is currently being built, try again later")
        return null_result
    parser: SearchQueryParser = SearchQueryParser()
    parsed_query: list[SearchSubquery] = []
    if query.strip():
        try:
            parsed_query = parser.parse(query.strip())
        except Exception as ex:
            logger.error(f"Error parsing query: {ex}")
            # fall back to simple text search
            parsed_query = parsed_query or []
    # if status not explicitly in query, add status >=100
    status_applied: bool = any(squery.field == "status" for squery in parsed_query)
    if not status_applied:
        # add default status constraint ANDed at end
        http_status_received = SearchSubquery("status", 100, "term", [], "AND", comparator=">=")
        parsed_query.append(http_status_received)
    # determine fields to be retrieved; base fields always included
    selected_fields: set[str] = set(RESOURCES_FIELDS_BASE)
    if fields:
        selected_fields.update(f for f in fields if f in RESOURCES_DEFAULT_FIELD_MAPPING)
    safe_sql_fields = [RESOURCES_DEFAULT_FIELD_MAPPING[f] for f in selected_fields]
    # defense in depth: mapped SQL identifiers must be alpha/dot only
    assert all(re.match(r'^[A-Za-z\.]+$', field) for field in safe_sql_fields), "Unknown or unsafe field requested"
    safe_sql_fields_joined: str = ", ".join(safe_sql_fields)
    from_clause = "ResourcesFullText LEFT JOIN Resources ON ResourcesFullText.Id = Resources.Id"
    where_clauses: list[str] = []
    params: dict[str, int | str] = {}
    if sites:
        # one named placeholder per site id keeps the IN clause parameterized
        placeholders: list[str] = [f":sites{i}" for i in range(len(sites))]
        where_clauses.append(f"ResourcesFullText.Project IN ({','.join(placeholders)})")
        params.update({f"sites{i}": id_val for i, id_val in enumerate(sites)})
    if parsed_query:
        fts_parts, fts_params = parser.to_sqlite_fts(parsed_query, swap_values)
        if fts_parts:
            fts_where = ""
            for part in fts_parts:
                if part in ["AND", "OR", "NOT"]:  # operator
                    fts_where += f" {part} "
                else:  # condition
                    fts_where += part
            # fts subquery as a single condition in parentheses
            if fts_where:
                where_clauses.append(f"({fts_where})")
            for param_name, param_value in fts_params.items():
                params[param_name] = param_value
    where_clause: str = f" WHERE {' AND '.join(where_clauses)}" if where_clauses else ""
    if sort in RESOURCES_DEFAULT_SORT_MAPPING:
        field, direction = RESOURCES_DEFAULT_SORT_MAPPING[sort]
        if direction == "RANDOM":
            order_clause: str = " ORDER BY RANDOM()"
        else:
            order_clause = f" ORDER BY {field} {direction}"
    else:
        order_clause = " ORDER BY ResourcesFullText.Url ASC"
    assert isinstance(limit, int), "limit must be an integer"
    assert isinstance(offset, int), "offset must be an integer"
    limit = min(max(1, limit), RESOURCES_LIMIT_MAX)
    params["limit"] = limit
    params["offset"] = offset
    limit_clause = " LIMIT :limit OFFSET :offset"
    statement: str = f"SELECT {safe_sql_fields_joined} FROM {from_clause}{where_clause}{order_clause}{limit_clause}"
    results: list[ResourceResult] = []
    total_count: int = 0
    try:
        with closing(connection.cursor()) as cursor:
            cursor.execute(statement, params)
            rows = cursor.fetchall()
            if rows:
                column_names = [description[0].lower() for description in cursor.description]
                for row in rows:
                    row_dict = {column_names[i]: row[i] for i in range(len(column_names))}
                    type_value = row_dict.get("type", "")
                    resource_type = ResourceResultType.UNDEFINED
                    # map the type string back to enum
                    for rt in ResourceResultType:
                        if rt.value == type_value:
                            resource_type = rt
                            break
                    # legacy rows may carry an integer type code instead of a string
                    if resource_type == ResourceResultType.UNDEFINED and isinstance(type_value, int):
                        if type_value in RESOURCES_ENUMERATED_TYPE_MAPPING:
                            resource_type = RESOURCES_ENUMERATED_TYPE_MAPPING[type_value]
                    result = ResourceResult(
                        id=row_dict.get("id"),
                        site=row_dict.get("project"),
                        created=from_isoformat_zulu(row_dict.get("created")),
                        modified=from_isoformat_zulu(row_dict.get("modified")),
                        url=row_dict.get("url", ""),
                        type=resource_type,
                        name=row_dict.get("name"),
                        headers=row_dict.get("headers"),
                        # content only returned when explicitly requested via fields
                        content=row_dict.get("content") if "content" in selected_fields else None,
                        status=row_dict.get("status"),
                        size=row_dict.get("size"),
                        time=row_dict.get("time"),
                        metadata=None,
                    )
                    results.append(result)
            # get total count: a short (non-full) page lets us infer the total
            # without a COUNT(*) round trip
            # NOTE(review): an empty page at a nonzero offset reports
            # total_count == offset -- confirm this heuristic is intended
            if len(results) < limit:
                total_count = offset + len(results)
            else:
                count_statement = f"SELECT COUNT(*) as total FROM {from_clause}{where_clause}"
                # extra named params (limit/offset) in the dict are ignored by sqlite3
                cursor.execute(count_statement, params)
                count_row = cursor.fetchone()
                total_count = count_row[0] if count_row else 0
    except sqlite3.Error as ex:
        logger.error(f"SQLite error in structured query: {ex}\n{statement}\n{traceback.format_exc()}")
        return null_result
    return results, total_count, connection_index_state
def _load_site_data(self, connection: sqlite3.Connection, site_path: Path,
                    site_id: int, index_state: IndexState = None) -> None:
    """
    Load site data into the database. To be implemented by subclasses.

    Args:
        connection: SQLite connection
        site_path: Path to the site data
        site_id: ID for the site
        index_state: IndexState object for tracking progress
    """
    raise NotImplementedError("Subclasses must implement _load_site_data")
def _determine_resource_type(self, content_type: str) -> ResourceResultType:
    """
    Determine resource type from content type string.

    Args:
        content_type: HTTP content type header value

    Returns:
        ResourceResultType enum value based on content type
    """
    normalized = content_type.lower()
    # first mapping entry whose pattern appears in the content type wins;
    # OTHER when nothing matches
    return next(
        (res_type for pattern, res_type in INDEXED_MIME_MAPPING.items() if pattern in normalized),
        ResourceResultType.OTHER,
    )
def _is_text_content(self, content_type: str) -> bool:
"""
Check if content should be stored as text. Filter out deadweight content in fts index.
Args:
content_type: HTTP content type header value
Returns:
True if content should be indexed as text, False otherwise
"""
content_type_lower = content_type.lower()
if content_type_lower.startswith("text/"):
return True
elif content_type_lower.startswith(("font/", "image/", "audio/", "video/", "application/octet-stream")):
return False
elif content_type_lower.startswith("application/"):
return content_type_lower in INDEXED_TEXT_APPLICATION_TYPES
else:
return True
```