This is page 4 of 33. Use http://codebase.md/pragmar/mcp_server_webcrawl/crawlers.html?page={x} to view the full context.
# Directory Structure
```
├── .gitignore
├── CONTRIBUTING.md
├── docs
│ ├── _images
│ │ ├── interactive.document.webp
│ │ ├── interactive.search.webp
│ │ └── mcpswc.svg
│ ├── _modules
│ │ ├── index.html
│ │ ├── mcp_server_webcrawl
│ │ │ ├── crawlers
│ │ │ │ ├── archivebox
│ │ │ │ │ ├── adapter.html
│ │ │ │ │ ├── crawler.html
│ │ │ │ │ └── tests.html
│ │ │ │ ├── base
│ │ │ │ │ ├── adapter.html
│ │ │ │ │ ├── api.html
│ │ │ │ │ ├── crawler.html
│ │ │ │ │ ├── indexed.html
│ │ │ │ │ └── tests.html
│ │ │ │ ├── httrack
│ │ │ │ │ ├── adapter.html
│ │ │ │ │ ├── crawler.html
│ │ │ │ │ └── tests.html
│ │ │ │ ├── interrobot
│ │ │ │ │ ├── adapter.html
│ │ │ │ │ ├── crawler.html
│ │ │ │ │ └── tests.html
│ │ │ │ ├── katana
│ │ │ │ │ ├── adapter.html
│ │ │ │ │ ├── crawler.html
│ │ │ │ │ └── tests.html
│ │ │ │ ├── siteone
│ │ │ │ │ ├── adapter.html
│ │ │ │ │ ├── crawler.html
│ │ │ │ │ └── tests.html
│ │ │ │ ├── warc
│ │ │ │ │ ├── adapter.html
│ │ │ │ │ ├── crawler.html
│ │ │ │ │ └── tests.html
│ │ │ │ └── wget
│ │ │ │ ├── adapter.html
│ │ │ │ ├── crawler.html
│ │ │ │ └── tests.html
│ │ │ ├── crawlers.html
│ │ │ ├── extras
│ │ │ │ ├── markdown.html
│ │ │ │ ├── regex.html
│ │ │ │ ├── snippets.html
│ │ │ │ ├── thumbnails.html
│ │ │ │ └── xpath.html
│ │ │ ├── interactive
│ │ │ │ ├── highlights.html
│ │ │ │ ├── search.html
│ │ │ │ ├── session.html
│ │ │ │ └── ui.html
│ │ │ ├── main.html
│ │ │ ├── models
│ │ │ │ ├── resources.html
│ │ │ │ └── sites.html
│ │ │ ├── templates
│ │ │ │ └── tests.html
│ │ │ ├── utils
│ │ │ │ ├── blobs.html
│ │ │ │ ├── cli.html
│ │ │ │ ├── logger.html
│ │ │ │ ├── querycache.html
│ │ │ │ ├── server.html
│ │ │ │ └── tools.html
│ │ │ └── utils.html
│ │ └── re.html
│ ├── _sources
│ │ ├── guides
│ │ │ ├── archivebox.rst.txt
│ │ │ ├── httrack.rst.txt
│ │ │ ├── interrobot.rst.txt
│ │ │ ├── katana.rst.txt
│ │ │ ├── siteone.rst.txt
│ │ │ ├── warc.rst.txt
│ │ │ └── wget.rst.txt
│ │ ├── guides.rst.txt
│ │ ├── index.rst.txt
│ │ ├── installation.rst.txt
│ │ ├── interactive.rst.txt
│ │ ├── mcp_server_webcrawl.crawlers.archivebox.rst.txt
│ │ ├── mcp_server_webcrawl.crawlers.base.rst.txt
│ │ ├── mcp_server_webcrawl.crawlers.httrack.rst.txt
│ │ ├── mcp_server_webcrawl.crawlers.interrobot.rst.txt
│ │ ├── mcp_server_webcrawl.crawlers.katana.rst.txt
│ │ ├── mcp_server_webcrawl.crawlers.rst.txt
│ │ ├── mcp_server_webcrawl.crawlers.siteone.rst.txt
│ │ ├── mcp_server_webcrawl.crawlers.warc.rst.txt
│ │ ├── mcp_server_webcrawl.crawlers.wget.rst.txt
│ │ ├── mcp_server_webcrawl.extras.rst.txt
│ │ ├── mcp_server_webcrawl.interactive.rst.txt
│ │ ├── mcp_server_webcrawl.models.rst.txt
│ │ ├── mcp_server_webcrawl.rst.txt
│ │ ├── mcp_server_webcrawl.templates.rst.txt
│ │ ├── mcp_server_webcrawl.utils.rst.txt
│ │ ├── modules.rst.txt
│ │ ├── prompts.rst.txt
│ │ └── usage.rst.txt
│ ├── _static
│ │ ├── _sphinx_javascript_frameworks_compat.js
│ │ ├── basic.css
│ │ ├── css
│ │ │ ├── badge_only.css
│ │ │ ├── fonts
│ │ │ │ ├── fontawesome-webfont.eot
│ │ │ │ ├── fontawesome-webfont.svg
│ │ │ │ ├── fontawesome-webfont.ttf
│ │ │ │ ├── fontawesome-webfont.woff
│ │ │ │ ├── fontawesome-webfont.woff2
│ │ │ │ ├── lato-bold-italic.woff
│ │ │ │ ├── lato-bold-italic.woff2
│ │ │ │ ├── lato-bold.woff
│ │ │ │ ├── lato-bold.woff2
│ │ │ │ ├── lato-normal-italic.woff
│ │ │ │ ├── lato-normal-italic.woff2
│ │ │ │ ├── lato-normal.woff
│ │ │ │ ├── lato-normal.woff2
│ │ │ │ ├── Roboto-Slab-Bold.woff
│ │ │ │ ├── Roboto-Slab-Bold.woff2
│ │ │ │ ├── Roboto-Slab-Regular.woff
│ │ │ │ └── Roboto-Slab-Regular.woff2
│ │ │ └── theme.css
│ │ ├── doctools.js
│ │ ├── documentation_options.js
│ │ ├── file.png
│ │ ├── fonts
│ │ │ ├── Lato
│ │ │ │ ├── lato-bold.eot
│ │ │ │ ├── lato-bold.ttf
│ │ │ │ ├── lato-bold.woff
│ │ │ │ ├── lato-bold.woff2
│ │ │ │ ├── lato-bolditalic.eot
│ │ │ │ ├── lato-bolditalic.ttf
│ │ │ │ ├── lato-bolditalic.woff
│ │ │ │ ├── lato-bolditalic.woff2
│ │ │ │ ├── lato-italic.eot
│ │ │ │ ├── lato-italic.ttf
│ │ │ │ ├── lato-italic.woff
│ │ │ │ ├── lato-italic.woff2
│ │ │ │ ├── lato-regular.eot
│ │ │ │ ├── lato-regular.ttf
│ │ │ │ ├── lato-regular.woff
│ │ │ │ └── lato-regular.woff2
│ │ │ └── RobotoSlab
│ │ │ ├── roboto-slab-v7-bold.eot
│ │ │ ├── roboto-slab-v7-bold.ttf
│ │ │ ├── roboto-slab-v7-bold.woff
│ │ │ ├── roboto-slab-v7-bold.woff2
│ │ │ ├── roboto-slab-v7-regular.eot
│ │ │ ├── roboto-slab-v7-regular.ttf
│ │ │ ├── roboto-slab-v7-regular.woff
│ │ │ └── roboto-slab-v7-regular.woff2
│ │ ├── images
│ │ │ ├── interactive.document.png
│ │ │ ├── interactive.document.webp
│ │ │ ├── interactive.search.png
│ │ │ ├── interactive.search.webp
│ │ │ └── mcpswc.svg
│ │ ├── jquery.js
│ │ ├── js
│ │ │ ├── badge_only.js
│ │ │ ├── theme.js
│ │ │ └── versions.js
│ │ ├── language_data.js
│ │ ├── minus.png
│ │ ├── plus.png
│ │ ├── pygments.css
│ │ ├── searchtools.js
│ │ └── sphinx_highlight.js
│ ├── .buildinfo
│ ├── .nojekyll
│ ├── genindex.html
│ ├── guides
│ │ ├── archivebox.html
│ │ ├── httrack.html
│ │ ├── interrobot.html
│ │ ├── katana.html
│ │ ├── siteone.html
│ │ ├── warc.html
│ │ └── wget.html
│ ├── guides.html
│ ├── index.html
│ ├── installation.html
│ ├── interactive.html
│ ├── mcp_server_webcrawl.crawlers.archivebox.html
│ ├── mcp_server_webcrawl.crawlers.base.html
│ ├── mcp_server_webcrawl.crawlers.html
│ ├── mcp_server_webcrawl.crawlers.httrack.html
│ ├── mcp_server_webcrawl.crawlers.interrobot.html
│ ├── mcp_server_webcrawl.crawlers.katana.html
│ ├── mcp_server_webcrawl.crawlers.siteone.html
│ ├── mcp_server_webcrawl.crawlers.warc.html
│ ├── mcp_server_webcrawl.crawlers.wget.html
│ ├── mcp_server_webcrawl.extras.html
│ ├── mcp_server_webcrawl.html
│ ├── mcp_server_webcrawl.interactive.html
│ ├── mcp_server_webcrawl.models.html
│ ├── mcp_server_webcrawl.templates.html
│ ├── mcp_server_webcrawl.utils.html
│ ├── modules.html
│ ├── objects.inv
│ ├── prompts.html
│ ├── py-modindex.html
│ ├── search.html
│ ├── searchindex.js
│ └── usage.html
├── LICENSE
├── MANIFEST.in
├── prompts
│ ├── audit404.md
│ ├── auditfiles.md
│ ├── auditperf.md
│ ├── auditseo.md
│ ├── gopher.md
│ ├── README.md
│ └── testsearch.md
├── pyproject.toml
├── README.md
├── setup.py
├── sphinx
│ ├── _static
│ │ └── images
│ │ ├── interactive.document.png
│ │ ├── interactive.document.webp
│ │ ├── interactive.search.png
│ │ ├── interactive.search.webp
│ │ └── mcpswc.svg
│ ├── _templates
│ │ └── layout.html
│ ├── conf.py
│ ├── guides
│ │ ├── archivebox.rst
│ │ ├── httrack.rst
│ │ ├── interrobot.rst
│ │ ├── katana.rst
│ │ ├── siteone.rst
│ │ ├── warc.rst
│ │ └── wget.rst
│ ├── guides.rst
│ ├── index.rst
│ ├── installation.rst
│ ├── interactive.rst
│ ├── make.bat
│ ├── Makefile
│ ├── mcp_server_webcrawl.crawlers.archivebox.rst
│ ├── mcp_server_webcrawl.crawlers.base.rst
│ ├── mcp_server_webcrawl.crawlers.httrack.rst
│ ├── mcp_server_webcrawl.crawlers.interrobot.rst
│ ├── mcp_server_webcrawl.crawlers.katana.rst
│ ├── mcp_server_webcrawl.crawlers.rst
│ ├── mcp_server_webcrawl.crawlers.siteone.rst
│ ├── mcp_server_webcrawl.crawlers.warc.rst
│ ├── mcp_server_webcrawl.crawlers.wget.rst
│ ├── mcp_server_webcrawl.extras.rst
│ ├── mcp_server_webcrawl.interactive.rst
│ ├── mcp_server_webcrawl.models.rst
│ ├── mcp_server_webcrawl.rst
│ ├── mcp_server_webcrawl.templates.rst
│ ├── mcp_server_webcrawl.utils.rst
│ ├── modules.rst
│ ├── prompts.rst
│ ├── readme.txt
│ └── usage.rst
└── src
└── mcp_server_webcrawl
├── __init__.py
├── crawlers
│ ├── __init__.py
│ ├── archivebox
│ │ ├── __init__.py
│ │ ├── adapter.py
│ │ ├── crawler.py
│ │ └── tests.py
│ ├── base
│ │ ├── __init__.py
│ │ ├── adapter.py
│ │ ├── api.py
│ │ ├── crawler.py
│ │ ├── indexed.py
│ │ └── tests.py
│ ├── httrack
│ │ ├── __init__.py
│ │ ├── adapter.py
│ │ ├── crawler.py
│ │ └── tests.py
│ ├── interrobot
│ │ ├── __init__.py
│ │ ├── adapter.py
│ │ ├── crawler.py
│ │ └── tests.py
│ ├── katana
│ │ ├── __init__.py
│ │ ├── adapter.py
│ │ ├── crawler.py
│ │ └── tests.py
│ ├── siteone
│ │ ├── __init__.py
│ │ ├── adapter.py
│ │ ├── crawler.py
│ │ └── tests.py
│ ├── warc
│ │ ├── __init__.py
│ │ ├── adapter.py
│ │ ├── crawler.py
│ │ └── tests.py
│ └── wget
│ ├── __init__.py
│ ├── adapter.py
│ ├── crawler.py
│ └── tests.py
├── extras
│ ├── __init__.py
│ ├── markdown.py
│ ├── regex.py
│ ├── snippets.py
│ ├── thumbnails.py
│ └── xpath.py
├── interactive
│ ├── __init__.py
│ ├── highlights.py
│ ├── search.py
│ ├── session.py
│ ├── ui.py
│ └── views
│ ├── base.py
│ ├── document.py
│ ├── help.py
│ ├── requirements.py
│ ├── results.py
│ └── searchform.py
├── main.py
├── models
│ ├── __init__.py
│ ├── base.py
│ ├── resources.py
│ └── sites.py
├── settings.py
├── templates
│ ├── __init__.py
│ ├── markdown.xslt
│ ├── tests_core.html
│ └── tests.py
└── utils
├── __init__.py
├── cli.py
├── logger.py
├── parser.py
├── parsetab.py
├── search.py
├── server.py
├── tests.py
└── tools.py
```
# Files
--------------------------------------------------------------------------------
/docs/_modules/mcp_server_webcrawl/extras/markdown.html:
--------------------------------------------------------------------------------
```html
<!DOCTYPE html>
<html class="writer-html5" lang="en" data-content_root="../../../">
<head>
<meta charset="utf-8" />
<meta name="viewport" content="width=device-width, initial-scale=1.0" />
<title>mcp_server_webcrawl.extras.markdown — mcp-server-webcrawl documentation</title>
<link rel="stylesheet" type="text/css" href="../../../_static/pygments.css?v=80d5e7a1" />
<link rel="stylesheet" type="text/css" href="../../../_static/css/theme.css?v=e59714d7" />
<script src="../../../_static/jquery.js?v=5d32c60e"></script>
<script src="../../../_static/_sphinx_javascript_frameworks_compat.js?v=2cd50e6c"></script>
<script src="../../../_static/documentation_options.js?v=5929fcd5"></script>
<script src="../../../_static/doctools.js?v=888ff710"></script>
<script src="../../../_static/sphinx_highlight.js?v=dc90522c"></script>
<script src="../../../_static/js/theme.js"></script>
<link rel="index" title="Index" href="../../../genindex.html" />
<link rel="search" title="Search" href="../../../search.html" />
</head>
<body class="wy-body-for-nav">
<div class="wy-grid-for-nav">
<nav data-toggle="wy-nav-shift" class="wy-nav-side">
<div class="wy-side-scroll">
<div class="wy-side-nav-search" >
<a href="../../../index.html" class="icon icon-home">
mcp-server-webcrawl
</a>
<div role="search">
<form id="rtd-search-form" class="wy-form" action="../../../search.html" method="get">
<input type="text" name="q" placeholder="Search docs" aria-label="Search docs" />
<input type="hidden" name="check_keywords" value="yes" />
<input type="hidden" name="area" value="default" />
</form>
</div>
</div><div class="wy-menu wy-menu-vertical" data-spy="affix" role="navigation" aria-label="Navigation menu">
<p class="caption" role="heading"><span class="caption-text">Contents:</span></p>
<ul>
<li class="toctree-l1"><a class="reference internal" href="../../../installation.html">Installation</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../guides.html">Setup Guides</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../usage.html">Usage</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../prompts.html">Prompt Routines</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../modules.html">mcp_server_webcrawl</a></li>
</ul>
</div>
</div>
</nav>
<section data-toggle="wy-nav-shift" class="wy-nav-content-wrap"><nav class="wy-nav-top" aria-label="Mobile navigation menu" >
<i data-toggle="wy-nav-top" class="fa fa-bars"></i>
<a href="../../../index.html">mcp-server-webcrawl</a>
</nav>
<div class="wy-nav-content">
<div class="rst-content">
<div role="navigation" aria-label="Page navigation">
<ul class="wy-breadcrumbs">
<li><a href="../../../index.html" class="icon icon-home" aria-label="Home"></a></li>
<li class="breadcrumb-item"><a href="../../index.html">Module code</a></li>
<li class="breadcrumb-item active">mcp_server_webcrawl.extras.markdown</li>
<li class="wy-breadcrumbs-aside">
</li>
</ul>
<hr/>
</div>
<div role="main" class="document" itemscope="itemscope" itemtype="http://schema.org/Article">
<div itemprop="articleBody">
<h1>Source code for mcp_server_webcrawl.extras.markdown</h1><div class="highlight"><pre>
<span></span><span class="kn">import</span> <span class="nn">re</span>
<span class="kn">from</span> <span class="nn">importlib</span> <span class="kn">import</span> <span class="n">resources</span>
<span class="kn">from</span> <span class="nn">typing</span> <span class="kn">import</span> <span class="n">Final</span><span class="p">,</span> <span class="n">Any</span>
<span class="kn">from</span> <span class="nn">lxml</span> <span class="kn">import</span> <span class="n">etree</span><span class="p">,</span> <span class="n">html</span>
<span class="kn">from</span> <span class="nn">lxml.etree</span> <span class="kn">import</span> <span class="n">ParserError</span>
<span class="kn">from</span> <span class="nn">logging</span> <span class="kn">import</span> <span class="n">Logger</span>
<span class="kn">from</span> <span class="nn">mcp_server_webcrawl.utils.logger</span> <span class="kn">import</span> <span class="n">get_logger</span>
<span class="n">__XSLT_RESULT_CLEANER</span><span class="p">:</span> <span class="n">Final</span><span class="p">[</span><span class="n">re</span><span class="o">.</span><span class="n">Pattern</span><span class="p">]</span> <span class="o">=</span> <span class="n">re</span><span class="o">.</span><span class="n">compile</span><span class="p">(</span><span class="sa">r</span><span class="s2">"(?:\n\s*-\s*\n|\n\s*\n)+"</span><span class="p">)</span>
<span class="n">__RE_HTML</span><span class="p">:</span> <span class="n">Final</span><span class="p">[</span><span class="n">re</span><span class="o">.</span><span class="n">Pattern</span><span class="p">]</span> <span class="o">=</span> <span class="n">re</span><span class="o">.</span><span class="n">compile</span><span class="p">(</span><span class="sa">r</span><span class="s2">"<[a-zA-Z]+[^>]*>"</span><span class="p">)</span>
<span class="n">logger</span><span class="p">:</span> <span class="n">Logger</span> <span class="o">=</span> <span class="n">get_logger</span><span class="p">()</span>
<div class="viewcode-block" id="MarkdownTransformer">
<a class="viewcode-back" href="../../../mcp_server_webcrawl.extras.html#mcp_server_webcrawl.extras.markdown.MarkdownTransformer">[docs]</a>
<span class="k">class</span> <span class="nc">MarkdownTransformer</span><span class="p">:</span>
<span class="w"> </span><span class="sd">"""</span>
<span class="sd"> Memoizes the XSLT transformer</span>
<span class="sd"> """</span>
<span class="n">_xslt_transform</span> <span class="o">=</span> <span class="kc">None</span>
<div class="viewcode-block" id="MarkdownTransformer.get_xslt_transform">
<a class="viewcode-back" href="../../../mcp_server_webcrawl.extras.html#mcp_server_webcrawl.extras.markdown.MarkdownTransformer.get_xslt_transform">[docs]</a>
<span class="nd">@classmethod</span>
<span class="k">def</span> <span class="nf">get_xslt_transform</span><span class="p">(</span><span class="bp">cls</span><span class="p">):</span>
<span class="w"> </span><span class="sd">"""</span>
<span class="sd"> Get the HTML to text markdown XSLT transformer</span>
<span class="sd"> """</span>
<span class="k">if</span> <span class="bp">cls</span><span class="o">.</span><span class="n">_xslt_transform</span> <span class="ow">is</span> <span class="kc">None</span><span class="p">:</span>
<span class="n">xslt_string</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="n">resources</span><span class="o">.</span><span class="n">read_text</span><span class="p">(</span><span class="s2">"mcp_server_webcrawl.templates"</span><span class="p">,</span> <span class="s2">"markdown.xslt"</span><span class="p">)</span><span class="o">.</span><span class="n">encode</span><span class="p">(</span><span class="s2">"utf-8"</span><span class="p">)</span>
<span class="n">xslt_doc</span> <span class="o">=</span> <span class="n">etree</span><span class="o">.</span><span class="n">fromstring</span><span class="p">(</span><span class="n">xslt_string</span><span class="p">)</span>
<span class="bp">cls</span><span class="o">.</span><span class="n">_xslt_transform</span> <span class="o">=</span> <span class="n">etree</span><span class="o">.</span><span class="n">XSLT</span><span class="p">(</span><span class="n">xslt_doc</span><span class="p">)</span>
<span class="k">return</span> <span class="bp">cls</span><span class="o">.</span><span class="n">_xslt_transform</span></div>
</div>
<div class="viewcode-block" id="get_markdown">
<a class="viewcode-back" href="../../../mcp_server_webcrawl.extras.html#mcp_server_webcrawl.extras.markdown.get_markdown">[docs]</a>
<span class="k">def</span> <span class="nf">get_markdown</span><span class="p">(</span><span class="n">content</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-></span> <span class="nb">str</span> <span class="o">|</span> <span class="kc">None</span><span class="p">:</span>
<span class="w"> </span><span class="sd">"""</span>
<span class="sd"> Transform HTML content to Markdown using XSLT.</span>
<span class="sd"> Args:</span>
<span class="sd"> content (str): The HTML content to transform.</span>
<span class="sd"> Returns:</span>
<span class="sd"> str | None: The transformed Markdown string, or None if the input is empty</span>
<span class="sd"> or if transformation fails (e.g., due to invalid HTML or XSLT errors).</span>
<span class="sd"> """</span>
<span class="n">transformer</span> <span class="o">=</span> <span class="n">MarkdownTransformer</span><span class="o">.</span><span class="n">get_xslt_transform</span><span class="p">()</span>
<span class="n">content</span><span class="p">:</span><span class="nb">str</span> <span class="o">=</span> <span class="n">content</span> <span class="ow">or</span> <span class="s2">""</span>
<span class="k">assert</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">content</span><span class="p">,</span> <span class="nb">str</span><span class="p">),</span> <span class="s2">"String (HTML) required for transformer"</span>
<span class="k">assert</span> <span class="n">transformer</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span>
<span class="k">if</span> <span class="n">content</span> <span class="o">==</span> <span class="s2">""</span> <span class="ow">or</span> <span class="ow">not</span> <span class="n">__RE_HTML</span><span class="o">.</span><span class="n">search</span><span class="p">(</span><span class="n">content</span><span class="p">):</span>
<span class="k">return</span> <span class="kc">None</span>
<span class="k">try</span><span class="p">:</span>
<span class="n">doc</span> <span class="o">=</span> <span class="n">html</span><span class="o">.</span><span class="n">fromstring</span><span class="p">(</span><span class="n">content</span><span class="p">)</span>
<span class="n">result</span> <span class="o">=</span> <span class="nb">str</span><span class="p">(</span><span class="n">transformer</span><span class="p">(</span><span class="n">doc</span><span class="p">))</span>
<span class="n">result</span> <span class="o">=</span> <span class="n">__XSLT_RESULT_CLEANER</span><span class="o">.</span><span class="n">sub</span><span class="p">(</span><span class="s2">"</span><span class="se">\n\n</span><span class="s2">"</span><span class="p">,</span> <span class="n">result</span><span class="p">)</span><span class="o">.</span><span class="n">strip</span><span class="p">()</span>
<span class="k">return</span> <span class="n">result</span>
<span class="k">except</span> <span class="ne">Exception</span> <span class="k">as</span> <span class="n">ex</span><span class="p">:</span>
<span class="n">logger</span><span class="o">.</span><span class="n">warning</span><span class="p">(</span><span class="sa">f</span><span class="s2">"XSLT transform error: </span><span class="si">{</span><span class="nb">type</span><span class="p">(</span><span class="n">ex</span><span class="p">)</span><span class="o">.</span><span class="vm">__name__</span><span class="si">}</span><span class="se">\n</span><span class="si">{</span><span class="n">ex</span><span class="si">}</span><span class="s2">"</span><span class="p">)</span>
<span class="k">return</span> <span class="kc">None</span></div>
</pre></div>
</div>
</div>
<footer>
<hr/>
<div role="contentinfo">
<p>© Copyright 2025, pragmar.</p>
</div>
Built with <a href="https://www.sphinx-doc.org/">Sphinx</a> using a
<a href="https://github.com/readthedocs/sphinx_rtd_theme">theme</a>
provided by <a href="https://readthedocs.org">Read the Docs</a>.
</footer>
</div>
</div>
</section>
</div>
<script>
jQuery(function () {
SphinxRtdTheme.Navigation.enable(true);
});
</script>
</body>
</html>
```
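The `get_markdown` function rendered above converts crawled HTML into Markdown by running it through a memoized XSLT transform, returning `None` for empty input, for strings with no HTML tags, or when the transform fails. A minimal usage sketch, assuming the package and its `lxml` dependency are installed; the sample HTML string is illustrative only:
```python
# Minimal usage sketch for mcp_server_webcrawl.extras.markdown.get_markdown,
# based on the source rendered above; the sample HTML is illustrative only.
from mcp_server_webcrawl.extras.markdown import get_markdown

html_content = "<html><body><h1>Title</h1><p>Some <strong>bold</strong> text.</p></body></html>"
markdown = get_markdown(html_content)

# get_markdown returns None when the input is empty, contains no HTML tags,
# or the XSLT transform fails, so guard before using the result
if markdown is not None:
    print(markdown)
```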
--------------------------------------------------------------------------------
/docs/mcp_server_webcrawl.html:
--------------------------------------------------------------------------------
```html
<!DOCTYPE html>
<html class="writer-html5" lang="en" data-content_root="./">
<head>
<meta charset="utf-8" /><meta name="viewport" content="width=device-width, initial-scale=1" />
<meta name="viewport" content="width=device-width, initial-scale=1.0" />
<title>mcp_server_webcrawl package — mcp-server-webcrawl documentation</title>
<link rel="stylesheet" type="text/css" href="_static/pygments.css?v=80d5e7a1" />
<link rel="stylesheet" type="text/css" href="_static/css/theme.css?v=e59714d7" />
<script src="_static/jquery.js?v=5d32c60e"></script>
<script src="_static/_sphinx_javascript_frameworks_compat.js?v=2cd50e6c"></script>
<script src="_static/documentation_options.js?v=5929fcd5"></script>
<script src="_static/doctools.js?v=888ff710"></script>
<script src="_static/sphinx_highlight.js?v=dc90522c"></script>
<script src="_static/js/theme.js"></script>
<link rel="index" title="Index" href="genindex.html" />
<link rel="search" title="Search" href="search.html" />
<link rel="next" title="mcp_server_webcrawl.crawlers package" href="mcp_server_webcrawl.crawlers.html" />
<link rel="prev" title="mcp_server_webcrawl" href="modules.html" />
</head>
<body class="wy-body-for-nav">
<div class="wy-grid-for-nav">
<nav data-toggle="wy-nav-shift" class="wy-nav-side">
<div class="wy-side-scroll">
<div class="wy-side-nav-search" >
<a href="index.html" class="icon icon-home">
mcp-server-webcrawl
</a>
<div role="search">
<form id="rtd-search-form" class="wy-form" action="search.html" method="get">
<input type="text" name="q" placeholder="Search docs" aria-label="Search docs" />
<input type="hidden" name="check_keywords" value="yes" />
<input type="hidden" name="area" value="default" />
</form>
</div>
</div><div class="wy-menu wy-menu-vertical" data-spy="affix" role="navigation" aria-label="Navigation menu">
<p class="caption" role="heading"><span class="caption-text">Contents:</span></p>
<ul class="current">
<li class="toctree-l1"><a class="reference internal" href="installation.html">Installation</a></li>
<li class="toctree-l1"><a class="reference internal" href="guides.html">Setup Guides</a></li>
<li class="toctree-l1"><a class="reference internal" href="usage.html">Usage</a></li>
<li class="toctree-l1"><a class="reference internal" href="prompts.html">Prompt Routines</a></li>
<li class="toctree-l1"><a class="reference internal" href="interactive.html">Interactive Mode</a></li>
<li class="toctree-l1 current"><a class="reference internal" href="modules.html">mcp_server_webcrawl</a><ul class="current">
<li class="toctree-l2 current"><a class="current reference internal" href="#">mcp_server_webcrawl package</a></li>
</ul>
</li>
</ul>
</div>
</div>
</nav>
<section data-toggle="wy-nav-shift" class="wy-nav-content-wrap"><nav class="wy-nav-top" aria-label="Mobile navigation menu" >
<i data-toggle="wy-nav-top" class="fa fa-bars"></i>
<a href="index.html">mcp-server-webcrawl</a>
</nav>
<div class="wy-nav-content">
<div class="rst-content">
<div role="navigation" aria-label="Page navigation">
<ul class="wy-breadcrumbs">
<li><a href="index.html" class="icon icon-home" aria-label="Home"></a></li>
<li class="breadcrumb-item"><a href="modules.html">mcp_server_webcrawl</a></li>
<li class="breadcrumb-item active">mcp_server_webcrawl package</li>
<li class="wy-breadcrumbs-aside">
<a href="_sources/mcp_server_webcrawl.rst.txt" rel="nofollow"> View page source</a>
</li>
</ul>
<hr/>
</div>
<div role="main" class="document" itemscope="itemscope" itemtype="http://schema.org/Article">
<div itemprop="articleBody">
<section id="mcp-server-webcrawl-package">
<h1>mcp_server_webcrawl package<a class="headerlink" href="#mcp-server-webcrawl-package" title="Link to this heading"></a></h1>
<section id="subpackages">
<h2>Subpackages<a class="headerlink" href="#subpackages" title="Link to this heading"></a></h2>
<div class="toctree-wrapper compound">
<ul>
<li class="toctree-l1"><a class="reference internal" href="mcp_server_webcrawl.crawlers.html">mcp_server_webcrawl.crawlers package</a><ul>
<li class="toctree-l2"><a class="reference internal" href="mcp_server_webcrawl.crawlers.html#subpackages">Subpackages</a></li>
<li class="toctree-l2"><a class="reference internal" href="mcp_server_webcrawl.crawlers.html#module-mcp_server_webcrawl.crawlers">Module contents</a></li>
</ul>
</li>
<li class="toctree-l1"><a class="reference internal" href="mcp_server_webcrawl.extras.html">mcp_server_webcrawl.extras package</a><ul>
<li class="toctree-l2"><a class="reference internal" href="mcp_server_webcrawl.extras.html#submodules">Submodules</a></li>
<li class="toctree-l2"><a class="reference internal" href="mcp_server_webcrawl.extras.html#module-mcp_server_webcrawl.extras.markdown">mcp_server_webcrawl.extras.markdown module</a></li>
<li class="toctree-l2"><a class="reference internal" href="mcp_server_webcrawl.extras.html#module-mcp_server_webcrawl.extras.regex">mcp_server_webcrawl.extras.regex module</a></li>
<li class="toctree-l2"><a class="reference internal" href="mcp_server_webcrawl.extras.html#module-mcp_server_webcrawl.extras.snippets">mcp_server_webcrawl.extras.snippets module</a></li>
<li class="toctree-l2"><a class="reference internal" href="mcp_server_webcrawl.extras.html#module-mcp_server_webcrawl.extras.thumbnails">mcp_server_webcrawl.extras.thumbnails module</a></li>
<li class="toctree-l2"><a class="reference internal" href="mcp_server_webcrawl.extras.html#module-mcp_server_webcrawl.extras.xpath">mcp_server_webcrawl.extras.xpath module</a></li>
<li class="toctree-l2"><a class="reference internal" href="mcp_server_webcrawl.extras.html#module-mcp_server_webcrawl.extras">Module contents</a></li>
</ul>
</li>
<li class="toctree-l1"><a class="reference internal" href="mcp_server_webcrawl.interactive.html">mcp_server_webcrawl.interactive package</a><ul>
<li class="toctree-l2"><a class="reference internal" href="mcp_server_webcrawl.interactive.html#submodules">Submodules</a></li>
<li class="toctree-l2"><a class="reference internal" href="mcp_server_webcrawl.interactive.html#module-mcp_server_webcrawl.interactive.highlights">mcp_server_webcrawl.interactive.highlights module</a></li>
<li class="toctree-l2"><a class="reference internal" href="mcp_server_webcrawl.interactive.html#module-mcp_server_webcrawl.interactive.search">mcp_server_webcrawl.interactive.search module</a></li>
<li class="toctree-l2"><a class="reference internal" href="mcp_server_webcrawl.interactive.html#module-mcp_server_webcrawl.interactive.session">mcp_server_webcrawl.interactive.session module</a></li>
<li class="toctree-l2"><a class="reference internal" href="mcp_server_webcrawl.interactive.html#module-mcp_server_webcrawl.interactive.ui">mcp_server_webcrawl.interactive.ui module</a></li>
<li class="toctree-l2"><a class="reference internal" href="mcp_server_webcrawl.interactive.html#module-mcp_server_webcrawl.interactive">Module contents</a></li>
</ul>
</li>
<li class="toctree-l1"><a class="reference internal" href="mcp_server_webcrawl.models.html">mcp_server_webcrawl.models package</a><ul>
<li class="toctree-l2"><a class="reference internal" href="mcp_server_webcrawl.models.html#submodules">Submodules</a></li>
<li class="toctree-l2"><a class="reference internal" href="mcp_server_webcrawl.models.html#module-mcp_server_webcrawl.models.resources">mcp_server_webcrawl.models.resources module</a></li>
<li class="toctree-l2"><a class="reference internal" href="mcp_server_webcrawl.models.html#module-mcp_server_webcrawl.models.sites">mcp_server_webcrawl.models.sites module</a></li>
<li class="toctree-l2"><a class="reference internal" href="mcp_server_webcrawl.models.html#module-mcp_server_webcrawl.models">Module contents</a></li>
</ul>
</li>
<li class="toctree-l1"><a class="reference internal" href="mcp_server_webcrawl.templates.html">mcp_server_webcrawl.templates package</a><ul>
<li class="toctree-l2"><a class="reference internal" href="mcp_server_webcrawl.templates.html#submodules">Submodules</a></li>
<li class="toctree-l2"><a class="reference internal" href="mcp_server_webcrawl.templates.html#module-mcp_server_webcrawl.templates.tests">mcp_server_webcrawl.templates.tests module</a></li>
<li class="toctree-l2"><a class="reference internal" href="mcp_server_webcrawl.templates.html#module-mcp_server_webcrawl.templates">Module contents</a></li>
</ul>
</li>
<li class="toctree-l1"><a class="reference internal" href="mcp_server_webcrawl.utils.html">mcp_server_webcrawl.utils package</a><ul>
<li class="toctree-l2"><a class="reference internal" href="mcp_server_webcrawl.utils.html#submodules">Submodules</a></li>
<li class="toctree-l2"><a class="reference internal" href="mcp_server_webcrawl.utils.html#module-mcp_server_webcrawl.utils.cli">mcp_server_webcrawl.utils.cli module</a></li>
<li class="toctree-l2"><a class="reference internal" href="mcp_server_webcrawl.utils.html#module-mcp_server_webcrawl.utils.logger">mcp_server_webcrawl.utils.logger module</a></li>
<li class="toctree-l2"><a class="reference internal" href="mcp_server_webcrawl.utils.html#module-mcp_server_webcrawl.utils.server">mcp_server_webcrawl.utils.server module</a></li>
<li class="toctree-l2"><a class="reference internal" href="mcp_server_webcrawl.utils.html#module-mcp_server_webcrawl.utils.tools">mcp_server_webcrawl.utils.tools module</a></li>
<li class="toctree-l2"><a class="reference internal" href="mcp_server_webcrawl.utils.html#module-mcp_server_webcrawl.utils">Module contents</a></li>
</ul>
</li>
</ul>
</div>
</section>
<section id="submodules">
<h2>Submodules<a class="headerlink" href="#submodules" title="Link to this heading"></a></h2>
</section>
<section id="mcp-server-webcrawl-main-module">
<h2>mcp_server_webcrawl.main module<a class="headerlink" href="#mcp-server-webcrawl-main-module" title="Link to this heading"></a></h2>
<dl class="py function">
<dt class="sig sig-object py">
<em class="property"><span class="k"><span class="pre">async</span></span><span class="w"> </span></em><span class="sig-name descname"><span class="pre">main</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">crawler</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">datasrc</span></span></em><span class="sig-paren">)</span><a class="reference internal" href="_modules/mcp_server_webcrawl/main.html#main"><span class="viewcode-link"><span class="pre">[source]</span></span></a></dt>
<dd><dl class="field-list simple">
<dt class="field-odd">Parameters<span class="colon">:</span></dt>
<dd class="field-odd"><ul class="simple">
<li><p><strong>crawler</strong> (<a class="reference internal" href="mcp_server_webcrawl.crawlers.base.html#mcp_server_webcrawl.crawlers.base.crawler.BaseCrawler" title="mcp_server_webcrawl.crawlers.base.crawler.BaseCrawler"><em>BaseCrawler</em></a>) – </p></li>
<li><p><strong>datasrc</strong> (<a class="reference external" href="https://docs.python.org/3/library/pathlib.html#pathlib.Path" title="(in Python v3.14)"><em>Path</em></a>) – </p></li>
</ul>
</dd>
</dl>
</dd></dl>
</section>
<section id="module-mcp_server_webcrawl.settings">
<span id="mcp-server-webcrawl-settings-module"></span><h2>mcp_server_webcrawl.settings module<a class="headerlink" href="#module-mcp_server_webcrawl.settings" title="Link to this heading"></a></h2>
</section>
<section id="module-mcp_server_webcrawl.settings_local">
<span id="mcp-server-webcrawl-settings-local-module"></span><h2>mcp_server_webcrawl.settings_local module<a class="headerlink" href="#module-mcp_server_webcrawl.settings_local" title="Link to this heading"></a></h2>
</section>
<section id="module-mcp_server_webcrawl">
<span id="module-contents"></span><h2>Module contents<a class="headerlink" href="#module-mcp_server_webcrawl" title="Link to this heading"></a></h2>
</section>
</section>
</div>
</div>
<footer><div class="rst-footer-buttons" role="navigation" aria-label="Footer">
<a href="modules.html" class="btn btn-neutral float-left" title="mcp_server_webcrawl" accesskey="p" rel="prev"><span class="fa fa-arrow-circle-left" aria-hidden="true"></span> Previous</a>
<a href="mcp_server_webcrawl.crawlers.html" class="btn btn-neutral float-right" title="mcp_server_webcrawl.crawlers package" accesskey="n" rel="next">Next <span class="fa fa-arrow-circle-right" aria-hidden="true"></span></a>
</div>
<hr/>
<div role="contentinfo">
<p>© Copyright 2025, pragmar.</p>
</div>
Built with <a href="https://www.sphinx-doc.org/">Sphinx</a> using a
<a href="https://github.com/readthedocs/sphinx_rtd_theme">theme</a>
provided by <a href="https://readthedocs.org">Read the Docs</a>.
</footer>
</div>
</div>
</section>
</div>
<script>
jQuery(function () {
SphinxRtdTheme.Navigation.enable(true);
});
</script>
</body>
</html>
```
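The package page above documents the async entry point `main(crawler, datasrc)`, taking a `BaseCrawler` and a `pathlib.Path`. A hedged sketch of what invoking it might look like; only the signature comes from this page, while the concrete crawler class name, import path, and constructor are hypothetical placeholders:
```python
# Hedged sketch of the async entry point documented above; only the
# main(crawler, datasrc) signature comes from this page. The crawler
# class name, import path, and constructor are hypothetical placeholders.
import asyncio
from pathlib import Path

from mcp_server_webcrawl.main import main
from mcp_server_webcrawl.crawlers.wget.crawler import WgetCrawler  # hypothetical name

datasrc = Path("/path/to/wget/archives")
asyncio.run(main(WgetCrawler(datasrc), datasrc))  # assumes the crawler takes datasrc
```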
--------------------------------------------------------------------------------
/src/mcp_server_webcrawl/crawlers/base/indexed.py:
--------------------------------------------------------------------------------
```python
import sqlite3
import traceback
from datetime import datetime
from contextlib import closing, contextmanager
from pathlib import Path
from typing import Callable
from mcp.types import Tool
from mcp_server_webcrawl.crawlers.base.adapter import (
BaseManager,
IndexState,
IndexStatus,
SitesGroup,
SitesStat,
INDEXED_MANAGER_CACHE_MAX,
INDEXED_RESOURCE_DEFAULT_PROTOCOL,
INDEXED_IGNORE_DIRECTORIES,
)
from mcp_server_webcrawl.crawlers.base.crawler import BaseCrawler
from mcp_server_webcrawl.models.resources import (
ResourceResult,
ResourceResultType,
RESOURCES_DEFAULT_FIELD_MAPPING,
)
from mcp_server_webcrawl.models.sites import (
SiteResult,
SiteType,
SITES_FIELDS_BASE,
SITES_FIELDS_DEFAULT,
)
from mcp_server_webcrawl.utils import to_isoformat_zulu
from mcp_server_webcrawl.utils.logger import get_logger
from mcp_server_webcrawl.utils.tools import get_crawler_tools
logger = get_logger()
class IndexedManager(BaseManager):
def __init__(self):
super().__init__()
self._db_cache: dict[frozenset, tuple[sqlite3.Connection, IndexState]] = {}
self._build_locks: dict[frozenset, tuple[datetime, str]] = {}
def get_connection(self, group: SitesGroup) -> tuple[sqlite3.Connection | None, IndexState]:
"""
Get database connection for sites in the group, creating if needed.
Args:
group: group of sites to connect to
Returns:
Tuple of (SQLite connection to in-memory database with data loaded or None if building,
IndexState associated with this database)
"""
if group.cache_key in self._build_locks:
build_time, status = self._build_locks[group.cache_key]
get_logger().info(f"Database for {group} is currently {status} (started at {build_time})")
return None, IndexState() # Return empty IndexState for building databases
if len(self._db_cache) >= INDEXED_MANAGER_CACHE_MAX:
logger.warning(f"Cache limit reached ({INDEXED_MANAGER_CACHE_MAX}), clearing all cached databases")
self._db_cache.clear()
is_cached: bool = group.cache_key in self._db_cache
self._stats.append(SitesStat(group, is_cached))
if not is_cached:
index_state = IndexState()
index_state.set_status(IndexStatus.INDEXING)
with self._building_lock(group):
connection: sqlite3.Connection = sqlite3.connect(":memory:", check_same_thread=False)
self._setup_database(connection)
for site_id, site_path in group.get_sites().items():
self._load_site_data(connection, Path(site_path), site_id, index_state=index_state)
if index_state.is_timeout():
index_state.set_status(IndexStatus.PARTIAL)
break
if index_state is not None and index_state.status == IndexStatus.INDEXING:
index_state.set_status(IndexStatus.COMPLETE)
self._db_cache[group.cache_key] = (connection, index_state)
# returns cached or newly created connection with IndexState
connection, index_state = self._db_cache[group.cache_key]
return connection, index_state
def get_sites_for_directories(
self,
datasrc: Path,
ids: list[int] | None = None,
fields: list[str] | None = None
) -> list[SiteResult]:
"""
List site directories in the datasrc directory as sites.
Args:
datasrc: path to the directory containing site subdirectories
ids: optional list of site IDs to filter by
fields: optional list of fields to include in the response
Returns:
List of SiteResult objects, one for each site directory
Notes:
Returns an empty list if the datasrc directory doesn't exist.
"""
assert datasrc is not None, f"datasrc not provided ({datasrc})"
if not datasrc.exists():
logger.error(f"Directory not found ({datasrc})")
return []
# determine which fields to include
select_fields: set[str] = set(SITES_FIELDS_BASE)
if fields:
valid_fields: set[str] = set(SITES_FIELDS_DEFAULT)
select_fields.update(f for f in fields if f in valid_fields)
else:
select_fields.update(SITES_FIELDS_DEFAULT)
results: list[SiteResult] = []
# get all directories that contain HTTP text files
site_directories = [d for d in datasrc.iterdir() if d.is_dir() and
not d.name.startswith(".") and d.name not in INDEXED_IGNORE_DIRECTORIES]
# map directory IDs to paths for filtering
site_directories_map: dict[int, Path] = {BaseManager.string_to_id(d.name): d for d in site_directories}
if ids:
site_directories_map = {id_val: path for id_val, path in site_directories_map.items() if id_val in ids}
# process each directory
for site_id, site_directory in sorted(site_directories_map.items()):
site_directory_stat = site_directory.stat()
created_time: datetime = datetime.fromtimestamp(site_directory_stat.st_ctime)
modified_time: datetime = datetime.fromtimestamp(site_directory_stat.st_mtime)
# check for robots.txt
robots_content = None
robots_files = list(site_directory.glob("*robots.txt*"))
if robots_files:
try:
with open(robots_files[0], "r", encoding="utf-8", errors="replace") as f:
# for robots.txt files in our format, extract only the content part
content = f.read()
parts = content.split("\n\n", 2)
if len(parts) >= 3:
response_parts = parts[2].split("\n\n", 1)
if len(response_parts) > 1:
robots_content = response_parts[1]
else:
robots_content = response_parts[0]
else:
robots_content = content
except Exception as ex:
logger.error(f"Error reading robots.txt")
site = SiteResult(
path=site_directory,
id=site_id,
name=site_directory.name, # NEW: directory name
type=SiteType.CRAWLED_URL, # NEW: always single-site crawls
urls=[f"{INDEXED_RESOURCE_DEFAULT_PROTOCOL}{site_directory.name}/"], # CHANGED: now a list
created=created_time if "created" in select_fields else None,
modified=modified_time if "modified" in select_fields else None,
robots=robots_content,
metadata=None,
)
results.append(site)
return results
@contextmanager
def _building_lock(self, group: SitesGroup):
"""
Context manager for database building operations.
Sets a lock during database building and releases it when done.
Args:
group: SitesGroup to set the build lock for
"""
try:
self._build_locks[group.cache_key] = (datetime.now(), "building")
yield
except Exception as ex:
self._build_locks[group.cache_key] = (self._build_locks[group.cache_key][0], f"failed: {ex}")
raise # re-raise
finally:
# clean up the lock
self._build_locks.pop(group.cache_key, None)
def _setup_database(self, connection: sqlite3.Connection) -> None:
"""
Create the database schema for storing resource data.
Args:
connection: SQLite connection to set up
"""
# store project/site (also) in fulltext; fts doesn't support >= <=,
# but pure fts search is much faster, so only introduce
# Resources table sql clauses when a field is explicitly queried
# (Status, Size, or Time)
with closing(connection.cursor()) as cursor:
connection.execute("PRAGMA encoding = \"UTF-8\"")
connection.execute("PRAGMA synchronous = OFF")
connection.execute("PRAGMA journal_mode = MEMORY")
cursor.execute("""
CREATE TABLE Resources (
Id INTEGER PRIMARY KEY,
Project INTEGER NOT NULL,
Created TEXT,
Modified TEXT,
Status INTEGER NOT NULL,
Size INTEGER NOT NULL,
Time INTEGER NOT NULL
)""")
cursor.execute("""
CREATE VIRTUAL TABLE ResourcesFullText USING fts5(
Id,
Project,
Url,
Type,
Headers,
Content,
tokenize="unicode61 remove_diacritics 0 tokenchars '-_'"
)""")
def _execute_batch_insert(self, connection: sqlite3.Connection, cursor: sqlite3.Cursor,
batch_records: list[ResourceResult]) -> None:
"""
Execute batch insert of records with transaction handling.
Inserts data into both ResourcesFullText and Resources tables.
Args:
connection: SQLite connection
cursor: SQLite cursor
batch_records: list of ResourceResult objects ready for insertion
"""
if not batch_records:
return
resources_base_records = []
resources_fts_records = []
for resource in batch_records:
resources_base_records.append((
resource.id,
resource.site,
to_isoformat_zulu(resource.created) if resource.created else None,
to_isoformat_zulu(resource.modified) if resource.modified else None,
resource.status,
resource.size if resource.size is not None else 0,
resource.time if resource.time is not None else 0,
))
resources_fts_records.append((
resource.id,
resource.site,
resource.url,
resource.type.value if resource.type else ResourceResultType.UNDEFINED.value,
resource.headers,
resource.content,
))
try:
connection.execute("BEGIN TRANSACTION")
cursor.executemany("""
INSERT INTO Resources (
Id, Project, Created, Modified, Status, Size, Time
) VALUES (?, ?, ?, ?, ?, ?, ?)
""", resources_base_records)
cursor.executemany("""
INSERT INTO ResourcesFullText (
Id, Project, Url, Type, Headers, Content
) VALUES (?, ?, ?, ?, ?, ?)
""", resources_fts_records)
connection.execute("COMMIT")
except Exception as ex:
connection.execute("ROLLBACK")
logger.error(f"Error during batch insert: {ex}\n{traceback.format_exc()}")
class IndexedCrawler(BaseCrawler):
"""
A crawler implementation for data sources that load into an in-memory SQLite database.
Shares commonality between specialized crawlers.
"""
def __init__(
self,
datasrc: Path,
get_sites_func: Callable,
get_resources_func: Callable,
resource_field_mapping: dict[str, str] = RESOURCES_DEFAULT_FIELD_MAPPING
) -> None:
"""
Initialize the IndexedCrawler with a data source path and required adapter functions.
Args:
datasrc: path to the data source
get_sites_func: function to retrieve sites from the data source
get_resources_func: function to retrieve resources from the data source
resource_field_mapping: mapping of resource field names to display names
"""
assert datasrc.is_dir(), f"{self.__class__.__name__} datasrc must be a directory"
super().__init__(datasrc, get_sites_func, get_resources_func, resource_field_mapping=resource_field_mapping)
async def mcp_list_tools(self) -> list[Tool]:
"""
List available tools for this crawler.
Returns:
List of Tool objects
"""
if self._adapter_get_sites is None:
logger.error(f"_adapter_get_sites not set (function required)")
return []
all_sites = self._adapter_get_sites(self._datasrc)
default_tools: list[Tool] = get_crawler_tools(sites=all_sites)
assert len(default_tools) == 2, "expected exactly 2 Tools: sites and resources"
default_sites_tool, default_resources_tool = default_tools
all_sites_display = ", ".join([f"{s.name} (site: {s.id})" for s in all_sites])
drt_props = default_resources_tool.inputSchema["properties"]
drt_props["sites"]["description"] = ("Optional "
"list of project ID to filter search results to a specific site. In 95% "
"of scenarios, you'd filter to only one site, but many site filtering is offered for "
f"advanced search scenarios. Available sites include {all_sites_display}.")
return [default_sites_tool, default_resources_tool]
```
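The schema created by `_setup_database` above keeps the numeric fields (`Status`, `Size`, `Time`) in the plain `Resources` table and the searchable text in the `ResourcesFullText` FTS5 virtual table, so most searches can stay in fast full-text matching. A small standalone sketch of that layout and an FTS5 `MATCH` query; the inserted row and the query are illustrative, not taken from the project's search code:
```python
# Standalone sketch of the FTS5 layout created by _setup_database above;
# the inserted row and the MATCH query are illustrative only.
import sqlite3

connection = sqlite3.connect(":memory:", check_same_thread=False)
cursor = connection.cursor()
cursor.execute("""
    CREATE VIRTUAL TABLE ResourcesFullText USING fts5(
        Id, Project, Url, Type, Headers, Content,
        tokenize="unicode61 remove_diacritics 0 tokenchars '-_'"
    )""")
cursor.execute(
    "INSERT INTO ResourcesFullText (Id, Project, Url, Type, Headers, Content) VALUES (?, ?, ?, ?, ?, ?)",
    (1, 100, "https://example.com/", "html", "content-type: text/html", "hello fulltext world"),
)

# MATCH searches all indexed columns; a column prefix (e.g. "Content:hello") narrows it
for row in cursor.execute(
    "SELECT Id, Url FROM ResourcesFullText WHERE ResourcesFullText MATCH ?", ("hello",)
):
    print(row)
```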
--------------------------------------------------------------------------------
/docs/guides/siteone.html:
--------------------------------------------------------------------------------
```html
<!DOCTYPE html>
<html class="writer-html5" lang="en" data-content_root="../">
<head>
<meta charset="utf-8" /><meta name="viewport" content="width=device-width, initial-scale=1" />
<meta name="viewport" content="width=device-width, initial-scale=1.0" />
<title>SiteOne MCP Setup Guide — mcp-server-webcrawl documentation</title>
<link rel="stylesheet" type="text/css" href="../_static/pygments.css?v=80d5e7a1" />
<link rel="stylesheet" type="text/css" href="../_static/css/theme.css?v=e59714d7" />
<script src="../_static/jquery.js?v=5d32c60e"></script>
<script src="../_static/_sphinx_javascript_frameworks_compat.js?v=2cd50e6c"></script>
<script src="../_static/documentation_options.js?v=5929fcd5"></script>
<script src="../_static/doctools.js?v=888ff710"></script>
<script src="../_static/sphinx_highlight.js?v=dc90522c"></script>
<script src="../_static/js/theme.js"></script>
<link rel="index" title="Index" href="../genindex.html" />
<link rel="search" title="Search" href="../search.html" />
<link rel="next" title="WARC MCP Setup Guide" href="warc.html" />
<link rel="prev" title="Katana MCP Setup Guide" href="katana.html" />
</head>
<body class="wy-body-for-nav">
<div class="wy-grid-for-nav">
<nav data-toggle="wy-nav-shift" class="wy-nav-side">
<div class="wy-side-scroll">
<div class="wy-side-nav-search" >
<a href="../index.html" class="icon icon-home">
mcp-server-webcrawl
</a>
<div role="search">
<form id="rtd-search-form" class="wy-form" action="../search.html" method="get">
<input type="text" name="q" placeholder="Search docs" aria-label="Search docs" />
<input type="hidden" name="check_keywords" value="yes" />
<input type="hidden" name="area" value="default" />
</form>
</div>
</div><div class="wy-menu wy-menu-vertical" data-spy="affix" role="navigation" aria-label="Navigation menu">
<p class="caption" role="heading"><span class="caption-text">Contents:</span></p>
<ul class="current">
<li class="toctree-l1"><a class="reference internal" href="../installation.html">Installation</a></li>
<li class="toctree-l1 current"><a class="reference internal" href="../guides.html">Setup Guides</a><ul class="current">
<li class="toctree-l2"><a class="reference internal" href="archivebox.html">ArchiveBox MCP Setup Guide</a></li>
<li class="toctree-l2"><a class="reference internal" href="httrack.html">HTTrack MCP Setup Guide</a></li>
<li class="toctree-l2"><a class="reference internal" href="interrobot.html">InterroBot MCP Setup Guide</a></li>
<li class="toctree-l2"><a class="reference internal" href="katana.html">Katana MCP Setup Guide</a></li>
<li class="toctree-l2 current"><a class="current reference internal" href="#">SiteOne MCP Setup Guide</a></li>
<li class="toctree-l2"><a class="reference internal" href="warc.html">WARC MCP Setup Guide</a></li>
<li class="toctree-l2"><a class="reference internal" href="wget.html">wget MCP Setup Guide</a></li>
</ul>
</li>
<li class="toctree-l1"><a class="reference internal" href="../usage.html">Usage</a></li>
<li class="toctree-l1"><a class="reference internal" href="../prompts.html">Prompt Routines</a></li>
<li class="toctree-l1"><a class="reference internal" href="../modules.html">mcp_server_webcrawl</a></li>
</ul>
</div>
</div>
</nav>
<section data-toggle="wy-nav-shift" class="wy-nav-content-wrap"><nav class="wy-nav-top" aria-label="Mobile navigation menu" >
<i data-toggle="wy-nav-top" class="fa fa-bars"></i>
<a href="../index.html">mcp-server-webcrawl</a>
</nav>
<div class="wy-nav-content">
<div class="rst-content">
<div role="navigation" aria-label="Page navigation">
<ul class="wy-breadcrumbs">
<li><a href="../index.html" class="icon icon-home" aria-label="Home"></a></li>
<li class="breadcrumb-item"><a href="../guides.html">Setup Guides</a></li>
<li class="breadcrumb-item active">SiteOne MCP Setup Guide</li>
<li class="wy-breadcrumbs-aside">
<a href="../_sources/guides/siteone.rst.txt" rel="nofollow"> View page source</a>
</li>
</ul>
<hr/>
</div>
<div role="main" class="document" itemscope="itemscope" itemtype="http://schema.org/Article">
<div itemprop="articleBody">
<section id="siteone-mcp-setup-guide">
<h1>SiteOne MCP Setup Guide<a class="headerlink" href="#siteone-mcp-setup-guide" title="Link to this heading"></a></h1>
<p>Instructions for setting up <a class="reference external" href="https://pragmar.com/mcp-server-webcrawl/">mcp-server-webcrawl</a> with SiteOne crawler.
This allows your LLM (e.g. Claude Desktop) to search content and metadata from websites you’ve crawled using SiteOne.</p>
<iframe width="560" height="315" src="https://www.youtube.com/embed/JOGRYbo6WwI" frameborder="0" allowfullscreen></iframe><p>Follow along with the video, or the step-action guide below.</p>
<section id="requirements">
<h2>Requirements<a class="headerlink" href="#requirements" title="Link to this heading"></a></h2>
<p>Before you begin, ensure you have:</p>
<ul class="simple">
<li><p><a class="reference external" href="https://claude.ai/download">Claude Desktop</a> installed</p></li>
<li><p><a class="reference external" href="https://python.org">Python</a> 3.10 or later installed</p></li>
<li><p><a class="reference external" href="https://crawler.siteone.io">SiteOne Crawler</a> installed</p></li>
<li><p>Basic familiarity with command line interfaces</p></li>
</ul>
</section>
<section id="what-is-siteone">
<h2>What is SiteOne?<a class="headerlink" href="#what-is-siteone" title="Link to this heading"></a></h2>
<p>SiteOne is a GUI crawler that offers:</p>
<ul class="simple">
<li><p>User-friendly desktop interface for setting up and managing crawls</p></li>
<li><p>Offline website generation capabilities</p></li>
<li><p>Comprehensive crawl reporting</p></li>
<li><p>Intuitive controls for non-technical users</p></li>
</ul>
</section>
<section id="installation-steps">
<h2>Installation Steps<a class="headerlink" href="#installation-steps" title="Link to this heading"></a></h2>
<section id="install-mcp-server-webcrawl">
<h3>1. Install mcp-server-webcrawl<a class="headerlink" href="#install-mcp-server-webcrawl" title="Link to this heading"></a></h3>
<p>Open your terminal or command line and install the package:</p>
<div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="n">pip</span> <span class="n">install</span> <span class="n">mcp</span><span class="o">-</span><span class="n">server</span><span class="o">-</span><span class="n">webcrawl</span>
</pre></div>
</div>
<p>Verify installation was successful:</p>
<div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="n">mcp</span><span class="o">-</span><span class="n">server</span><span class="o">-</span><span class="n">webcrawl</span> <span class="o">--</span><span class="n">help</span>
</pre></div>
</div>
</section>
<section id="create-crawls-with-siteone">
<h3>2. Create Crawls with SiteOne<a class="headerlink" href="#create-crawls-with-siteone" title="Link to this heading"></a></h3>
<ol class="arabic simple">
<li><p>Open SiteOne Crawler application</p></li>
<li><p>Enter a URL to crawl (e.g., example.com)</p></li>
<li><p><strong>Important</strong>: Check the “Generate offline website” option (this is required for MCP integration)</p></li>
<li><p>Click the start button to begin crawling</p></li>
<li><p>Repeat for additional sites as needed (e.g., pragmar.com)</p></li>
<li><p>Note the directory where SiteOne is storing the generated offline content (this is shown in the application)</p></li>
</ol>
</section>
<section id="configure-claude-desktop">
<h3>3. Configure Claude Desktop<a class="headerlink" href="#configure-claude-desktop" title="Link to this heading"></a></h3>
<ol class="arabic simple">
<li><p>Open Claude Desktop</p></li>
<li><p>Go to <strong>File → Settings → Developer → Edit Config</strong></p></li>
<li><p>Add the following configuration (modify paths as needed):</p></li>
</ol>
<div class="highlight-json notranslate"><div class="highlight"><pre><span></span><span class="p">{</span>
<span class="w"> </span><span class="nt">"mcpServers"</span><span class="p">:</span><span class="w"> </span><span class="p">{</span>
<span class="w"> </span><span class="nt">"webcrawl"</span><span class="p">:</span><span class="w"> </span><span class="p">{</span>
<span class="w"> </span><span class="nt">"command"</span><span class="p">:</span><span class="w"> </span><span class="s2">"/path/to/mcp-server-webcrawl"</span><span class="p">,</span>
<span class="w"> </span><span class="nt">"args"</span><span class="p">:</span><span class="w"> </span><span class="p">[</span><span class="s2">"--crawler"</span><span class="p">,</span><span class="w"> </span><span class="s2">"siteone"</span><span class="p">,</span><span class="w"> </span><span class="s2">"--datasrc"</span><span class="p">,</span>
<span class="w"> </span><span class="s2">"/path/to/siteone/archives/"</span><span class="p">]</span>
<span class="w"> </span><span class="p">}</span>
<span class="w"> </span><span class="p">}</span>
<span class="p">}</span>
</pre></div>
</div>
<div class="admonition note">
<p class="admonition-title">Note</p>
<ul class="simple">
<li><p>On Windows, use <code class="docutils literal notranslate"><span class="pre">"mcp-server-webcrawl"</span></code> as the command</p></li>
<li><p>On macOS, use the absolute path (output of <code class="docutils literal notranslate"><span class="pre">which</span> <span class="pre">mcp-server-webcrawl</span></code>)</p></li>
<li><p>Change <code class="docutils literal notranslate"><span class="pre">/path/to/siteone/archives/</span></code> to the actual path where SiteOne stores offline website content</p></li>
</ul>
</div>
<ol class="arabic simple" start="4">
<li><p>Save the file and <strong>completely exit</strong> Claude Desktop (not just close the window)</p></li>
<li><p>Restart Claude Desktop</p></li>
</ol>
</section>
<section id="verify-and-use">
<h3>4. Verify and Use<a class="headerlink" href="#verify-and-use" title="Link to this heading"></a></h3>
<ol class="arabic">
<li><p>In Claude Desktop, you should now see MCP tools available under Search and Tools</p></li>
<li><p>Ask Claude to list your crawled sites:</p>
<div class="highlight-default notranslate"><div class="highlight"><pre><span></span>Can you list the crawled sites available?
</pre></div>
</div>
</li>
<li><p>Try searching content from your crawls:</p>
<div class="highlight-default notranslate"><div class="highlight"><pre><span></span>Can you find information about [topic] on [crawled site]?
</pre></div>
</div>
</li>
<li><p>Explore specific topics on your crawled sites:</p>
<div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="n">I</span><span class="s1">'m interested in [keyword] in [crawled domain]. Can you tell me about it?</span>
</pre></div>
</div>
</li>
</ol>
</section>
</section>
<section id="troubleshooting">
<h2>Troubleshooting<a class="headerlink" href="#troubleshooting" title="Link to this heading"></a></h2>
<ul class="simple">
<li><p>If Claude doesn’t show MCP tools after restart, verify your configuration file is correctly formatted</p></li>
<li><p>Ensure Python and mcp-server-webcrawl are properly installed</p></li>
<li><p>Check that your SiteOne archives path in the configuration is correct</p></li>
<li><p>Make sure the “Generate offline website” option was checked when creating crawls</p></li>
<li><p>Verify that each crawl completed successfully and files were saved to the expected location</p></li>
<li><p>Remember that the first time you use a function, Claude will ask for permission</p></li>
</ul>
<p>For more details, including API documentation and other crawler options, visit the <a class="reference external" href="https://github.com/pragmar/mcp-server-webcrawl">mcp-server-webcrawl documentation</a>.</p>
</section>
</section>
</div>
</div>
<footer><div class="rst-footer-buttons" role="navigation" aria-label="Footer">
<a href="katana.html" class="btn btn-neutral float-left" title="Katana MCP Setup Guide" accesskey="p" rel="prev"><span class="fa fa-arrow-circle-left" aria-hidden="true"></span> Previous</a>
<a href="warc.html" class="btn btn-neutral float-right" title="WARC MCP Setup Guide" accesskey="n" rel="next">Next <span class="fa fa-arrow-circle-right" aria-hidden="true"></span></a>
</div>
<hr/>
<div role="contentinfo">
<p>© Copyright 2025, pragmar.</p>
</div>
Built with <a href="https://www.sphinx-doc.org/">Sphinx</a> using a
<a href="https://github.com/readthedocs/sphinx_rtd_theme">theme</a>
provided by <a href="https://readthedocs.org">Read the Docs</a>.
</footer>
</div>
</div>
</section>
</div>
<script>
jQuery(function () {
SphinxRtdTheme.Navigation.enable(true);
});
</script>
</body>
</html>
```
--------------------------------------------------------------------------------
/docs/guides/warc.html:
--------------------------------------------------------------------------------
```html
<!DOCTYPE html>
<html class="writer-html5" lang="en" data-content_root="../">
<head>
<meta charset="utf-8" /><meta name="viewport" content="width=device-width, initial-scale=1" />
<meta name="viewport" content="width=device-width, initial-scale=1.0" />
<title>WARC MCP Setup Guide — mcp-server-webcrawl documentation</title>
<link rel="stylesheet" type="text/css" href="../_static/pygments.css?v=80d5e7a1" />
<link rel="stylesheet" type="text/css" href="../_static/css/theme.css?v=e59714d7" />
<script src="../_static/jquery.js?v=5d32c60e"></script>
<script src="../_static/_sphinx_javascript_frameworks_compat.js?v=2cd50e6c"></script>
<script src="../_static/documentation_options.js?v=5929fcd5"></script>
<script src="../_static/doctools.js?v=888ff710"></script>
<script src="../_static/sphinx_highlight.js?v=dc90522c"></script>
<script src="../_static/js/theme.js"></script>
<link rel="index" title="Index" href="../genindex.html" />
<link rel="search" title="Search" href="../search.html" />
<link rel="next" title="wget MCP Setup Guide" href="wget.html" />
<link rel="prev" title="SiteOne MCP Setup Guide" href="siteone.html" />
</head>
<body class="wy-body-for-nav">
<div class="wy-grid-for-nav">
<nav data-toggle="wy-nav-shift" class="wy-nav-side">
<div class="wy-side-scroll">
<div class="wy-side-nav-search" >
<a href="../index.html" class="icon icon-home">
mcp-server-webcrawl
</a>
<div role="search">
<form id="rtd-search-form" class="wy-form" action="../search.html" method="get">
<input type="text" name="q" placeholder="Search docs" aria-label="Search docs" />
<input type="hidden" name="check_keywords" value="yes" />
<input type="hidden" name="area" value="default" />
</form>
</div>
</div><div class="wy-menu wy-menu-vertical" data-spy="affix" role="navigation" aria-label="Navigation menu">
<p class="caption" role="heading"><span class="caption-text">Contents:</span></p>
<ul class="current">
<li class="toctree-l1"><a class="reference internal" href="../installation.html">Installation</a></li>
<li class="toctree-l1 current"><a class="reference internal" href="../guides.html">Setup Guides</a><ul class="current">
<li class="toctree-l2"><a class="reference internal" href="archivebox.html">ArchiveBox MCP Setup Guide</a></li>
<li class="toctree-l2"><a class="reference internal" href="httrack.html">HTTrack MCP Setup Guide</a></li>
<li class="toctree-l2"><a class="reference internal" href="interrobot.html">InterroBot MCP Setup Guide</a></li>
<li class="toctree-l2"><a class="reference internal" href="katana.html">Katana MCP Setup Guide</a></li>
<li class="toctree-l2"><a class="reference internal" href="siteone.html">SiteOne MCP Setup Guide</a></li>
<li class="toctree-l2 current"><a class="current reference internal" href="#">WARC MCP Setup Guide</a></li>
<li class="toctree-l2"><a class="reference internal" href="wget.html">wget MCP Setup Guide</a></li>
</ul>
</li>
<li class="toctree-l1"><a class="reference internal" href="../usage.html">Usage</a></li>
<li class="toctree-l1"><a class="reference internal" href="../prompts.html">Prompt Routines</a></li>
<li class="toctree-l1"><a class="reference internal" href="../modules.html">mcp_server_webcrawl</a></li>
</ul>
</div>
</div>
</nav>
<section data-toggle="wy-nav-shift" class="wy-nav-content-wrap"><nav class="wy-nav-top" aria-label="Mobile navigation menu" >
<i data-toggle="wy-nav-top" class="fa fa-bars"></i>
<a href="../index.html">mcp-server-webcrawl</a>
</nav>
<div class="wy-nav-content">
<div class="rst-content">
<div role="navigation" aria-label="Page navigation">
<ul class="wy-breadcrumbs">
<li><a href="../index.html" class="icon icon-home" aria-label="Home"></a></li>
<li class="breadcrumb-item"><a href="../guides.html">Setup Guides</a></li>
<li class="breadcrumb-item active">WARC MCP Setup Guide</li>
<li class="wy-breadcrumbs-aside">
<a href="../_sources/guides/warc.rst.txt" rel="nofollow"> View page source</a>
</li>
</ul>
<hr/>
</div>
<div role="main" class="document" itemscope="itemscope" itemtype="http://schema.org/Article">
<div itemprop="articleBody">
<section id="warc-mcp-setup-guide">
<h1>WARC MCP Setup Guide<a class="headerlink" href="#warc-mcp-setup-guide" title="Link to this heading"></a></h1>
<p>Instructions for setting up <a class="reference external" href="https://pragmar.com/mcp-server-webcrawl/">mcp-server-webcrawl</a> with
<a class="reference external" href="https://en.wikipedia.org/wiki/WARC_(file_format)">WARC</a> files to allow your LLM (e.g.
Claude Desktop) to search content and metadata from websites you’ve archived in WARC format.</p>
<iframe width="560" height="315" src="https://www.youtube.com/embed/fx-4WZu-UT8" frameborder="0" allowfullscreen></iframe><p>Follow along with the video, or the step-action guide below.</p>
<section id="requirements">
<h2>Requirements<a class="headerlink" href="#requirements" title="Link to this heading"></a></h2>
<p>Before you begin, ensure you have:</p>
<ul class="simple">
<li><p><a class="reference external" href="https://claude.ai/download">Claude Desktop</a> installed</p></li>
<li><p><a class="reference external" href="https://python.org">Python</a> 3.10 or later installed</p></li>
<li><p>Basic familiarity with command line interfaces</p></li>
<li><p>wget installed (macOS users can install via Homebrew, Windows users need WSL/Ubuntu)</p></li>
</ul>
</section>
<section id="what-are-warc-files">
<h2>What are WARC Files?<a class="headerlink" href="#what-are-warc-files" title="Link to this heading"></a></h2>
<p>WARC files are single-file archives that store complete crawl data including:</p>
<ul class="simple">
<li><p>HTTP status codes</p></li>
<li><p>HTTP headers</p></li>
<li><p>Response content</p></li>
</ul>
<p>Compared to wget running in mirror mode:</p>
<ul class="simple">
<li><p><strong>WARC</strong>: More comprehensive (preserves status codes and headers) but slower crawling</p></li>
<li><p><strong>wget mirror</strong>: Faster crawling but doesn’t preserve status codes or headers</p></li>
</ul>
</section>
<section id="installation-steps">
<h2>Installation Steps<a class="headerlink" href="#installation-steps" title="Link to this heading"></a></h2>
<section id="install-mcp-server-webcrawl">
<h3>1. Install mcp-server-webcrawl<a class="headerlink" href="#install-mcp-server-webcrawl" title="Link to this heading"></a></h3>
<p>Open your terminal or command line and install the package:</p>
<div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="n">pip</span> <span class="n">install</span> <span class="n">mcp</span><span class="o">-</span><span class="n">server</span><span class="o">-</span><span class="n">webcrawl</span>
</pre></div>
</div>
<p>Verify installation was successful:</p>
<div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="n">mcp</span><span class="o">-</span><span class="n">server</span><span class="o">-</span><span class="n">webcrawl</span> <span class="o">--</span><span class="n">help</span>
</pre></div>
</div>
</section>
<section id="configure-claude-desktop">
<h3>2. Configure Claude Desktop<a class="headerlink" href="#configure-claude-desktop" title="Link to this heading"></a></h3>
<ol class="arabic simple">
<li><p>Open Claude Desktop</p></li>
<li><p>Go to <strong>File → Settings → Developer → Edit Config</strong></p></li>
<li><p>Add the following configuration (modify paths as needed):</p></li>
</ol>
<div class="highlight-json notranslate"><div class="highlight"><pre><span></span><span class="p">{</span>
<span class="w"> </span><span class="nt">"mcpServers"</span><span class="p">:</span><span class="w"> </span><span class="p">{</span>
<span class="w"> </span><span class="nt">"webcrawl"</span><span class="p">:</span><span class="w"> </span><span class="p">{</span>
<span class="w"> </span><span class="nt">"command"</span><span class="p">:</span><span class="w"> </span><span class="s2">"/path/to/mcp-server-webcrawl"</span><span class="p">,</span>
<span class="w"> </span><span class="nt">"args"</span><span class="p">:</span><span class="w"> </span><span class="p">[</span><span class="s2">"--crawler"</span><span class="p">,</span><span class="w"> </span><span class="s2">"warc"</span><span class="p">,</span><span class="w"> </span><span class="s2">"--datasrc"</span><span class="p">,</span>
<span class="w"> </span><span class="s2">"/path/to/warc/archives/"</span><span class="p">]</span>
<span class="w"> </span><span class="p">}</span>
<span class="w"> </span><span class="p">}</span>
<span class="p">}</span>
</pre></div>
</div>
<div class="admonition note">
<p class="admonition-title">Note</p>
<ul class="simple">
<li><p>On Windows, use <code class="docutils literal notranslate"><span class="pre">"mcp-server-webcrawl"</span></code> as the command</p></li>
<li><p>On macOS, use the absolute path (output of <code class="docutils literal notranslate"><span class="pre">which</span> <span class="pre">mcp-server-webcrawl</span></code>)</p></li>
<li><p>Change <code class="docutils literal notranslate"><span class="pre">/path/to/warc/archives/</span></code> to your actual directory path where WARC files are stored</p></li>
</ul>
</div>
<ol class="arabic simple" start="4">
<li><p>Save the file and <strong>completely exit</strong> Claude Desktop (not just close the window)</p></li>
<li><p>Restart Claude Desktop</p></li>
</ol>
</section>
<section id="create-warc-files-with-wget">
<h3>3. Create WARC Files with Wget<a class="headerlink" href="#create-warc-files-with-wget" title="Link to this heading"></a></h3>
<ol class="arabic simple">
<li><p>Open Terminal (macOS) or Ubuntu/WSL (Windows)</p></li>
<li><p>Navigate to your target directory for storing WARC files</p></li>
<li><p>Run wget with WARC options:</p></li>
</ol>
<div class="highlight-bash notranslate"><div class="highlight"><pre><span></span><span class="c1"># basic WARC capture</span>
wget<span class="w"> </span>--warc-file<span class="o">=</span>example<span class="w"> </span>--recursive<span class="w"> </span>https://example.com
<span class="c1"># more comprehensive capture with page requirements (CSS, images, etc.)</span>
wget<span class="w"> </span>--warc-file<span class="o">=</span>example<span class="w"> </span>--recursive<span class="w"> </span>--page-requisites<span class="w"> </span>https://example.com
</pre></div>
</div>
<p>Your WARC files will be created with a .warc.gz extension in your current directory.</p>
</section>
<section id="verify-and-use">
<h3>4. Verify and Use<a class="headerlink" href="#verify-and-use" title="Link to this heading"></a></h3>
<ol class="arabic">
<li><p>In Claude Desktop, you should now see MCP tools available under Search and Tools</p></li>
<li><p>Ask Claude to list your crawled sites:</p>
<div class="highlight-default notranslate"><div class="highlight"><pre><span></span>Can you list the crawled sites available?
</pre></div>
</div>
</li>
<li><p>Try searching content from your crawls:</p>
<div class="highlight-default notranslate"><div class="highlight"><pre><span></span>Can you find information about [topic] on [crawled site]?
</pre></div>
</div>
</li>
</ol>
</section>
</section>
<section id="troubleshooting">
<h2>Troubleshooting<a class="headerlink" href="#troubleshooting" title="Link to this heading"></a></h2>
<ul class="simple">
<li><p>If Claude doesn’t show MCP tools after restart, verify your configuration file is correctly formatted</p></li>
<li><p>Ensure Python and mcp-server-webcrawl are properly installed</p></li>
<li><p>Check that your WARC directory path in the configuration is correct</p></li>
<li><p>Make sure your WARC files have the correct extension (typically .warc.gz)</p></li>
<li><p>Remember that the first time you use each function, Claude will ask for permission</p></li>
<li><p>For large WARC files, initial indexing may take some time</p></li>
</ul>
<p>For more details, including API documentation and other crawler options, visit the <a class="reference external" href="https://github.com/pragmar/mcp-server-webcrawl">mcp-server-webcrawl documentation</a>.</p>
</section>
</section>
</div>
</div>
<footer><div class="rst-footer-buttons" role="navigation" aria-label="Footer">
<a href="siteone.html" class="btn btn-neutral float-left" title="SiteOne MCP Setup Guide" accesskey="p" rel="prev"><span class="fa fa-arrow-circle-left" aria-hidden="true"></span> Previous</a>
<a href="wget.html" class="btn btn-neutral float-right" title="wget MCP Setup Guide" accesskey="n" rel="next">Next <span class="fa fa-arrow-circle-right" aria-hidden="true"></span></a>
</div>
<hr/>
<div role="contentinfo">
<p>© Copyright 2025, pragmar.</p>
</div>
Built with <a href="https://www.sphinx-doc.org/">Sphinx</a> using a
<a href="https://github.com/readthedocs/sphinx_rtd_theme">theme</a>
provided by <a href="https://readthedocs.org">Read the Docs</a>.
</footer>
</div>
</div>
</section>
</div>
<script>
jQuery(function () {
SphinxRtdTheme.Navigation.enable(true);
});
</script>
</body>
</html>
```
--------------------------------------------------------------------------------
/docs/guides/interrobot.html:
--------------------------------------------------------------------------------
```html
<!DOCTYPE html>
<html class="writer-html5" lang="en" data-content_root="../">
<head>
<meta charset="utf-8" /><meta name="viewport" content="width=device-width, initial-scale=1" />
<meta name="viewport" content="width=device-width, initial-scale=1.0" />
<title>InterroBot MCP Setup Guide — mcp-server-webcrawl documentation</title>
<link rel="stylesheet" type="text/css" href="../_static/pygments.css?v=80d5e7a1" />
<link rel="stylesheet" type="text/css" href="../_static/css/theme.css?v=e59714d7" />
<script src="../_static/jquery.js?v=5d32c60e"></script>
<script src="../_static/_sphinx_javascript_frameworks_compat.js?v=2cd50e6c"></script>
<script src="../_static/documentation_options.js?v=5929fcd5"></script>
<script src="../_static/doctools.js?v=888ff710"></script>
<script src="../_static/sphinx_highlight.js?v=dc90522c"></script>
<script src="../_static/js/theme.js"></script>
<link rel="index" title="Index" href="../genindex.html" />
<link rel="search" title="Search" href="../search.html" />
<link rel="next" title="Katana MCP Setup Guide" href="katana.html" />
<link rel="prev" title="HTTrack MCP Setup Guide" href="httrack.html" />
</head>
<body class="wy-body-for-nav">
<div class="wy-grid-for-nav">
<nav data-toggle="wy-nav-shift" class="wy-nav-side">
<div class="wy-side-scroll">
<div class="wy-side-nav-search" >
<a href="../index.html" class="icon icon-home">
mcp-server-webcrawl
</a>
<div role="search">
<form id="rtd-search-form" class="wy-form" action="../search.html" method="get">
<input type="text" name="q" placeholder="Search docs" aria-label="Search docs" />
<input type="hidden" name="check_keywords" value="yes" />
<input type="hidden" name="area" value="default" />
</form>
</div>
</div><div class="wy-menu wy-menu-vertical" data-spy="affix" role="navigation" aria-label="Navigation menu">
<p class="caption" role="heading"><span class="caption-text">Contents:</span></p>
<ul class="current">
<li class="toctree-l1"><a class="reference internal" href="../installation.html">Installation</a></li>
<li class="toctree-l1 current"><a class="reference internal" href="../guides.html">Setup Guides</a><ul class="current">
<li class="toctree-l2"><a class="reference internal" href="archivebox.html">ArchiveBox MCP Setup Guide</a></li>
<li class="toctree-l2"><a class="reference internal" href="httrack.html">HTTrack MCP Setup Guide</a></li>
<li class="toctree-l2 current"><a class="current reference internal" href="#">InterroBot MCP Setup Guide</a></li>
<li class="toctree-l2"><a class="reference internal" href="katana.html">Katana MCP Setup Guide</a></li>
<li class="toctree-l2"><a class="reference internal" href="siteone.html">SiteOne MCP Setup Guide</a></li>
<li class="toctree-l2"><a class="reference internal" href="warc.html">WARC MCP Setup Guide</a></li>
<li class="toctree-l2"><a class="reference internal" href="wget.html">wget MCP Setup Guide</a></li>
</ul>
</li>
<li class="toctree-l1"><a class="reference internal" href="../usage.html">Usage</a></li>
<li class="toctree-l1"><a class="reference internal" href="../prompts.html">Prompt Routines</a></li>
<li class="toctree-l1"><a class="reference internal" href="../modules.html">mcp_server_webcrawl</a></li>
</ul>
</div>
</div>
</nav>
<section data-toggle="wy-nav-shift" class="wy-nav-content-wrap"><nav class="wy-nav-top" aria-label="Mobile navigation menu" >
<i data-toggle="wy-nav-top" class="fa fa-bars"></i>
<a href="../index.html">mcp-server-webcrawl</a>
</nav>
<div class="wy-nav-content">
<div class="rst-content">
<div role="navigation" aria-label="Page navigation">
<ul class="wy-breadcrumbs">
<li><a href="../index.html" class="icon icon-home" aria-label="Home"></a></li>
<li class="breadcrumb-item"><a href="../guides.html">Setup Guides</a></li>
<li class="breadcrumb-item active">InterroBot MCP Setup Guide</li>
<li class="wy-breadcrumbs-aside">
<a href="../_sources/guides/interrobot.rst.txt" rel="nofollow"> View page source</a>
</li>
</ul>
<hr/>
</div>
<div role="main" class="document" itemscope="itemscope" itemtype="http://schema.org/Article">
<div itemprop="articleBody">
<section id="interrobot-mcp-setup-guide">
<h1>InterroBot MCP Setup Guide<a class="headerlink" href="#interrobot-mcp-setup-guide" title="Link to this heading"></a></h1>
<p>Instructions for setting up <a class="reference external" href="https://pragmar.com/mcp-server-webcrawl/">mcp-server-webcrawl</a> with InterroBot.
This allows your LLM (e.g. Claude Desktop) to search content and metadata from websites you’ve crawled with InterroBot.</p>
<iframe width="560" height="315" src="https://www.youtube.com/embed/55y8oKWXJLs" frameborder="0" allowfullscreen></iframe><p>Follow along with the video, or the step-action guide below.</p>
<section id="requirements">
<h2>Requirements<a class="headerlink" href="#requirements" title="Link to this heading"></a></h2>
<p>Before you begin, ensure you have:</p>
<ul class="simple">
<li><p><a class="reference external" href="https://claude.ai/download">Claude Desktop</a> installed</p></li>
<li><p><a class="reference external" href="https://python.org">Python</a> 3.10 or later installed</p></li>
<li><p><a class="reference external" href="https://interro.bot">InterroBot</a> installed</p></li>
<li><p>Basic familiarity with command line interfaces</p></li>
</ul>
</section>
<section id="what-is-interrobot">
<h2>What is InterroBot?<a class="headerlink" href="#what-is-interrobot" title="Link to this heading"></a></h2>
<p>InterroBot is a commercial web crawler and analyzer that works seamlessly with mcp-server-webcrawl, providing several advantages:</p>
<ul class="simple">
<li><p>User-friendly graphical interface for managing crawls</p></li>
<li><p>Comprehensive data collection including page content and metadata</p></li>
<li><p>Natively indexed, with no index-build lag on first search</p></li>
<li><p>Cross-platform (Windows, macOS, Android)</p></li>
</ul>
</section>
<section id="installation-steps">
<h2>Installation Steps<a class="headerlink" href="#installation-steps" title="Link to this heading"></a></h2>
<section id="install-mcp-server-webcrawl">
<h3>1. Install mcp-server-webcrawl<a class="headerlink" href="#install-mcp-server-webcrawl" title="Link to this heading"></a></h3>
<p>Open your terminal or command line and install the package:</p>
<div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="n">pip</span> <span class="n">install</span> <span class="n">mcp</span><span class="o">-</span><span class="n">server</span><span class="o">-</span><span class="n">webcrawl</span>
</pre></div>
</div>
<p>Verify installation was successful:</p>
<div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="n">mcp</span><span class="o">-</span><span class="n">server</span><span class="o">-</span><span class="n">webcrawl</span> <span class="o">--</span><span class="n">help</span>
</pre></div>
</div>
</section>
<section id="create-crawls-with-interrobot">
<h3>2. Create Crawls with InterroBot<a class="headerlink" href="#create-crawls-with-interrobot" title="Link to this heading"></a></h3>
<ol class="arabic simple">
<li><p>Open InterroBot</p></li>
<li><p>For a new project, you’ll see an empty project screen</p></li>
<li><p>Add websites to crawl by entering URLs (e.g., example.com, pragmar.com)</p></li>
<li><p>Wait for the crawling to complete (typically takes a few seconds to minutes depending on site size)</p></li>
<li><p>Note the location of your InterroBot database file, which will be needed for configuration. It is available in InterroBot options, under the Advanced section:
- On Windows: Typically in <code class="docutils literal notranslate"><span class="pre">[homedir]/Documents/InterroBot/interrobot.v2.db</span></code>
- On macOS: Path can be found in InterroBot settings page</p></li>
</ol>
</section>
<section id="configure-claude-desktop">
<h3>3. Configure Claude Desktop<a class="headerlink" href="#configure-claude-desktop" title="Link to this heading"></a></h3>
<ol class="arabic simple">
<li><p>Open Claude Desktop</p></li>
<li><p>Go to <strong>File → Settings → Developer → Edit Config</strong></p></li>
<li><p>Add the following configuration (modify paths as needed):</p></li>
</ol>
<div class="highlight-json notranslate"><div class="highlight"><pre><span></span><span class="p">{</span>
<span class="w"> </span><span class="nt">"mcpServers"</span><span class="p">:</span><span class="w"> </span><span class="p">{</span>
<span class="w"> </span><span class="nt">"webcrawl"</span><span class="p">:</span><span class="w"> </span><span class="p">{</span>
<span class="w"> </span><span class="nt">"command"</span><span class="p">:</span><span class="w"> </span><span class="s2">"/path/to/mcp-server-webcrawl"</span><span class="p">,</span>
<span class="w"> </span><span class="nt">"args"</span><span class="p">:</span><span class="w"> </span><span class="p">[</span><span class="s2">"--crawler"</span><span class="p">,</span><span class="w"> </span><span class="s2">"interrobot"</span><span class="p">,</span><span class="w"> </span><span class="s2">"--datasrc"</span><span class="p">,</span>
<span class="w"> </span><span class="s2">"[homedir]/Documents/InterroBot/interrobot.v2.db"</span><span class="p">]</span>
<span class="w"> </span><span class="p">}</span>
<span class="w"> </span><span class="p">}</span>
<span class="p">}</span>
</pre></div>
</div>
<div class="admonition note">
<p class="admonition-title">Note</p>
<ul class="simple">
<li><p>On Windows, use <code class="docutils literal notranslate"><span class="pre">"mcp-server-webcrawl"</span></code> as the command</p></li>
<li><p>On macOS, use the absolute path (output of <code class="docutils literal notranslate"><span class="pre">which</span> <span class="pre">mcp-server-webcrawl</span></code>)</p></li>
<li><p>Replace <code class="docutils literal notranslate"><span class="pre">[homedir]/Documents/InterroBot/interrobot.v2.db</span></code> with the actual path to your InterroBot database file, available in InterroBot options</p></li>
</ul>
</div>
<ol class="arabic simple" start="4">
<li><p>Save the file and <strong>completely exit</strong> Claude Desktop (not just close the window)</p></li>
<li><p>Restart Claude Desktop</p></li>
</ol>
</section>
<section id="verify-and-use">
<h3>4. Verify and Use<a class="headerlink" href="#verify-and-use" title="Link to this heading"></a></h3>
<ol class="arabic">
<li><p>In Claude Desktop, you should now see MCP tools available under Search and Tools</p></li>
<li><p>Ask Claude to list your crawled sites:</p>
<div class="highlight-default notranslate"><div class="highlight"><pre><span></span>Can you list the crawled sites available?
</pre></div>
</div>
</li>
<li><p>Try searching content from your crawls:</p>
<div class="highlight-default notranslate"><div class="highlight"><pre><span></span>Can you find information about [topic] on [crawled site]?
</pre></div>
</div>
</li>
<li><p>Explore specific capabilities, such as generating site reports:</p>
<div class="highlight-default notranslate"><div class="highlight"><pre><span></span>Can you give me a file type summary for [crawled site]? Which types of files are there, page count, etc.
</pre></div>
</div>
</li>
</ol>
</section>
</section>
<section id="troubleshooting">
<h2>Troubleshooting<a class="headerlink" href="#troubleshooting" title="Link to this heading"></a></h2>
<ul class="simple">
<li><p>If Claude doesn’t show MCP tools after restart, verify your configuration file is correctly formatted</p></li>
<li><p>Ensure Python and mcp-server-webcrawl are properly installed</p></li>
<li><p>Check that your InterroBot database path in the configuration is correct</p></li>
<li><p>Make sure InterroBot has successfully completed crawling the websites</p></li>
<li><p>Remember that the first time you use a function, Claude will ask for permission</p></li>
<li><p>For large websites with many pages, search queries might take longer to process initially</p></li>
</ul>
<p>For more details, including API documentation and other crawler options, visit the <a class="reference external" href="https://github.com/pragmar/mcp-server-webcrawl">mcp-server-webcrawl documentation</a>.</p>
</section>
</section>
</div>
</div>
<footer><div class="rst-footer-buttons" role="navigation" aria-label="Footer">
<a href="httrack.html" class="btn btn-neutral float-left" title="HTTrack MCP Setup Guide" accesskey="p" rel="prev"><span class="fa fa-arrow-circle-left" aria-hidden="true"></span> Previous</a>
<a href="katana.html" class="btn btn-neutral float-right" title="Katana MCP Setup Guide" accesskey="n" rel="next">Next <span class="fa fa-arrow-circle-right" aria-hidden="true"></span></a>
</div>
<hr/>
<div role="contentinfo">
<p>© Copyright 2025, pragmar.</p>
</div>
Built with <a href="https://www.sphinx-doc.org/">Sphinx</a> using a
<a href="https://github.com/readthedocs/sphinx_rtd_theme">theme</a>
provided by <a href="https://readthedocs.org">Read the Docs</a>.
</footer>
</div>
</div>
</section>
</div>
<script>
jQuery(function () {
SphinxRtdTheme.Navigation.enable(true);
});
</script>
</body>
</html>
```
--------------------------------------------------------------------------------
/docs/_sources/usage.rst.txt:
--------------------------------------------------------------------------------
```
.. raw:: html
<style>
.wy-table-responsive table td, .wy-table-responsive table th {
white-space: normal !important;
}
table th.head {
font-size: 80%;
word-break: break-word;
}
table td, table th {
text-align: center;
vertical-align: middle;
}
table td:first-of-type, table th:first-of-type {
text-align: left;
white-space: nowrap !important;
}
</style>
Usage
=====
Once installed, **mcp-server-webcrawl** provides search and retrieval over
your website crawl data on demand, with advanced filtering. Use it to help manage your website,
as an on-demand resource database (marketing, SEO, etc.), or anything else.
The truth of the matter is, you don't need to know the API behind the MCP server, as it is
designed to be consumed by the LLM. It is, however, useful to understand for advanced use cases.
Available Tools
---------------
The API is *supposed* to stay out of your way, and to a large degree
it can be navigated autonomously by your MCP client. Sometimes
you may need to nudge the LLM toward the correct field or search strategy. The
following is the current API interface for your reference.
webcrawl_sites
~~~~~~~~~~~~~~
This tool retrieves a list of sites (project websites or crawl directories).
.. list-table::
:header-rows: 1
:widths: 15 15 15 55
* - Parameter
- Type
- Required
- Description
* - ids
- array<int>
- No
- List of project IDs to retrieve. Leave empty for all projects.
* - fields
- array<string>
- No
- List of additional fields to include beyond defaults (id, type, name, urls). Empty list means default fields only. Options include created (ISO 8601), modified (ISO 8601).
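For orientation, a ``webcrawl_sites`` request needs very little. The arguments below are
an illustrative sketch (the site IDs are hypothetical); in practice your MCP client
composes the call for you.

.. code-block:: json

    {
      "ids": [1, 2],
      "fields": ["created", "modified"]
    }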
webcrawl_search
~~~~~~~~~~~~~~~
This tool searches for resources (webpages, CSS, images, etc.) across projects and retrieves specified fields.
.. list-table::
:header-rows: 1
:widths: 15 15 15 55
* - Parameter
- Type
- Required
- Description
* - sites
- array<int>
- No
- Optional list of project IDs to filter search results to specific sites. In most scenarios, you'd filter to only one site.
* - query
- string
- No
- Fulltext search query string. Leave empty to return all resources when filtering on other fields for better precision. Supports fulltext and boolean operators (AND, OR, NOT), quoted phrases, and suffix wildcards, but not prefix wildcards. See below for complete boolean and field search capabilities.
* - fields
- array<string>
- No
- List of additional fields to include beyond defaults (modified, created). Empty list means default fields only. The content field can lead to large results and should be used with LIMIT.
* - sort
- string
- No
- Sort order for results. Prefixed with + for ascending, - for descending. ? is a special option for random sort, useful in statistical sampling. Options include: +id, -id, +url, -url, +status, -status, ?.
* - limit
- integer
- No
- Maximum number of results to return. Default is 20, max is 100.
* - offset
- integer
- No
- Number of results to skip for pagination. Default is 0.
* - extras
- array<string>
- No
- Array of extra features to include in results. Options include markdown, snippets, thumbnails, regex, and xpath. (see extras table)
* - extrasRegex
- array<string>
- No
- Array of regular expression patterns to extract content. One or more regex patterns can be requested. Only used when 'regex' is included in the extras array.
* - extrasXpath
- array<string>
- No
- Array of XPath expressions to extract specific content from HTML resources. One or more XPath selectors can be requested. Only used when 'xpath' is included in the extras array.
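As a sketch of how these parameters combine, the hypothetical ``webcrawl_search`` arguments
below restrict the search to a single site (the ID is a placeholder), filter to successful
HTML pages mentioning "privacy", and request snippets to keep token usage low.

.. code-block:: json

    {
      "sites": [3],
      "query": "type: html AND status: 200 AND privacy",
      "sort": "+url",
      "limit": 10,
      "extras": ["snippets"]
    }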
Crawler Features Support
~~~~~~~~~~~~~~~~~~~~~~~~
API support, by parameter, across crawler type.
.. list-table::
:header-rows: 1
:widths: 13 12 12 13 12 12 13 13
:class: featuresgrid
* - Parameter
- ArchiveBox
- HTTrack
- InterroBot
- Katana
- SiteOne
- WARC
- wget
* - Sites/ids
- ✔
- ✔
- ✔
- ✔
- ✔
- ✔
- ✔
* - Sites/fields
- ✔
- ✔
- ✔
- ✔
- ✔
- ✔
- ✔
* - Search/ids
- ✔
- ✔
- ✔
- ✔
- ✔
- ✔
- ✔
* - Search/sites
- ✔
- ✔
- ✔
- ✔
- ✔
- ✔
- ✔
* - Search/query
- ✔
- ✔
- ✔
- ✔
- ①
- ✔
- ①
* - Search/fields
- ✔
- ✔
- ✔
- ✔
- ②
- ✔
- ②
* - Search/sort
- ✔
- ✔
- ✔
- ✔
- ✔
- ✔
- ✔
* - Search/limit
- ✔
- ✔
- ✔
- ✔
- ✔
- ✔
- ✔
* - Search/offset
- ✔
- ✔
- ✔
- ✔
- ✔
- ✔
- ✔
* - Search/extras
- ✔
- ✔
- ✔
- ✔
- ✔
- ✔
- ✔
Crawler Field Support
~~~~~~~~~~~~~~~~~~~~~
API support, by field, across crawler type.
.. list-table::
:header-rows: 1
:widths: 13 12 12 13 12 12 13 13
:class: featuresgrid
* - Parameter
- ArchiveBox
- HTTrack
- InterroBot
- Katana
- SiteOne
- WARC
- wget
* - site.id
- ✔
- ✔
- ✔
- ✔
- ✔
- ✔
- ✔
* - site.name
- ✔
- ✔
- ✔
- ✔
- ✔
- ✔
- ✔
* - site.type
- ✔
- ✔
- ✔
- ✔
- ✔
- ✔
- ✔
* - site.urls
- ✔
- ✔
- ✔
- ✔
- ✔
- ✔
- ✔
* - resource.id
- ✔
- ✔
- ✔
- ✔
- ✔
- ✔
- ✔
* - resource.url
- ✔
- ✔
- ✔
- ✔
- ✔
- ✔
- ✔
* - resource.type
- ✔
- ✔
- ✔
- ✔
- ✔
- ✔
- ✔
* - resource.status
- ✔
- ✔
- ✔
- ✔
- ✔
- ✔
- ③
* - resource.size
- ✔
- ✔
- ✔
- ✔
- ✔
- ✔
- ✔
* - resource.headers
- ✔
- ✔
- ✔
- ✔
-
- ✔
-
* - resource.content
- ✔
- ✔
- ✔
- ✔
- ✔
- ✔
- ✔
①②③ wget (--mirror) does not index HTTP status beyond 200 OK (HTTP errors are not saved to disk).
wget and SiteOne crawler implementations do not support field-searchable HTTP headers. When used in
WARC mode (as opposed to simple mirror), wget is capable of collecting HTTP headers
and status.
Crawlers all have strengths and weaknesses; judge them on how well they
fit your needs, and don't be overly concerned about headers field support. They all
support fulltext boolean search across the crawl data.
Boolean Search Syntax
~~~~~~~~~~~~~~~~~~~~~
The query engine supports field-specific (``field: value``) searches and complex boolean
expressions. Fulltext is supported as a combination of the url, content, and headers fields.
While the API interface is designed to be consumed by the LLM directly, it can be helpful
to familiarize yourself with the search syntax. Searches generated by the LLM are
inspectable, but generally collapsed in the UI. If you need to see the query, expand
the MCP collapsible.
.. list-table::
:header-rows: 1
:widths: 30 70
* - Query Example
- Description
* - privacy
- fulltext single keyword match
* - "privacy policy"
- fulltext match exact phrase
* - boundar*
- fulltext wildcard matches results starting with *boundar* (boundary, boundaries)
* - id: 12345
- id field matches a specific resource by ID
* - url: example.com/*
- url field matches results with URL containing example.com/
* - type: html
- type field matches for HTML pages only
* - status: 200
- status field matches specific HTTP status codes (equal to 200)
* - status: >=400
- status field matches specific HTTP status code (greater than or equal to 400)
* - content: h1
- content field matches content (HTTP response body, often, but not always HTML)
* - headers: text/xml
- headers field matches HTTP response headers
* - privacy AND policy
- fulltext matches both
* - privacy OR policy
- fulltext matches either
* - policy NOT privacy
- fulltext matches policies not containing privacy
* - (login OR signin) AND form
- fulltext matches either login or signin, combined with form
* - type: html AND status: 200
- fulltext matches only HTML pages with HTTP success
Field Search Definitions
~~~~~~~~~~~~~~~~~~~~~~~~
Field search provides search precision, allowing you to specify which columns of the search index to filter.
Rather than searching the entire content, you can restrict your query to specific attributes like URLs,
headers, or content body. This approach improves efficiency when looking for
specific attributes or patterns within crawl data.
.. list-table::
:header-rows: 1
:widths: 30 70
* - Field
- Description
* - id
- resource database ID
* - url
- resource URL
* - type
- enumerated list of types (see types table)
* - size
- resource size in bytes
* - status
- HTTP response codes
* - headers
- HTTP response headers
* - content
- HTTP body—HTML, CSS, JS, and more
Field Content
~~~~~~~~~~~~~
A subset of fields can be independently requested with results, while core fields are always on. Use of headers and content can consume tokens quickly. Use judiciously, or use extras to crunch more results into the context window. Fields are a top level argument, independent of any field searching taking place in the query.
.. list-table::
:header-rows: 1
:widths: 30 70
* - Field
- Description
* - id
- always available
* - url
- always available
* - type
- always available
* - status
- always available
* - created
- on request
* - modified
- on request
* - size
- on request
* - headers
- on request
* - content
- on request
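To make the distinction concrete, the sketch below requests the on-demand ``headers`` and
``content`` fields as a top-level argument, while the query itself filters on the ``url``
field. The site ID and URL are placeholders.

.. code-block:: json

    {
      "sites": [3],
      "query": "url: example.com/*",
      "fields": ["headers", "content"],
      "limit": 5
    }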
Content Types
~~~~~~~~~~~~~
Crawls contain a multitude of resource types beyond HTML pages. The ``type:`` field search
allows filtering by broad content type groups, particularly useful when filtering images without complex extension queries.
For example, you might search for ``type: html NOT content: login``
to find pages without "login," or ``type: img`` to analyze image resources. The table below lists all
supported content types in the search system.
.. list-table::
:header-rows: 1
:widths: 30 70
* - Type
- Description
* - html
- webpages
* - iframe
- iframes
* - img
- web images
* - audio
- web audio files
* - video
- web video files
* - font
- web font files
* - style
- CSS stylesheets
* - script
- JavaScript files
* - rss
- RSS syndication feeds
* - text
- plain text content
* - pdf
- PDF files
* - doc
- MS Word documents
* - other
- uncategorized
Extras
~~~~~~
The ``extras`` parameter provides additional processing options, transforming result data (markdown, snippets), or connecting the LLM to external data (thumbnails). These options can be combined as needed to achieve the desired result format.
.. list-table::
:header-rows: 1
:widths: 20 80
* - Extra
- Description
* - thumbnails
- Generates base64 encoded images to be viewed and analyzed by AI models. Enables image description, content analysis, and visual understanding while keeping token output minimal. Works with images, which can be filtered using ``type: img`` in queries. SVG is not supported.
* - markdown
- Provides the HTML content field as concise Markdown, reducing token usage and improving readability for LLMs. Works with HTML, which can be filtered using ``type: html`` in queries.
* - snippets
- Matches fulltext queries to contextual keyword usage within the content. When used without requesting the content field (or markdown extra), it can provide an efficient means of refining a search without pulling down the complete page contents. Also great for rendering old school hit-highlighted results as a list, like Google search in 1999. Works with HTML, CSS, JS, or any text-based, crawled file.
* - regex
- Extracts regular expression matches from crawled files such as HTML, CSS, JavaScript, etc. Not as precise a tool as XPath for HTML, but supports any text file as a data source. One or more regex patterns can be requested, using the ``extrasRegex`` argument.
* - xpath
- Extracts XPath selector data, used in scraping HTML content. Use XPath's text() selector for text-only results; element selectors return outerHTML. Only supported with ``type: html``; other types will be ignored. One or more XPath selectors (//h1, count(//h1), etc.) can be requested, using the ``extrasXpath`` argument.
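For illustration, a hypothetical request combining the ``xpath`` extra with two selectors
(the site ID is a placeholder) might look like the following, so that results include the
extracted matches alongside the usual fields.

.. code-block:: json

    {
      "sites": [3],
      "query": "type: html",
      "extras": ["xpath"],
      "extrasXpath": ["//title/text()", "count(//h1)"],
      "limit": 5
    }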
Extras provide a means of producing token-efficient HTTP content responses. Markdown produces roughly 1/3 the bytes of the source HTML, snippets are generally 500 or so bytes per result, and XPath can be as specific or broad as you choose. The more focused your requests, the more results you can fit into your LLM session.
The idea, of course, is that the LLM takes care of this for you. If you notice your LLM developing an affinity to the "content" field (full HTML), a nudge in chat to budget tokens using the extras feature should be all that is needed.
```
--------------------------------------------------------------------------------
/src/mcp_server_webcrawl/crawlers/httrack/adapter.py:
--------------------------------------------------------------------------------
```python
import os
import re
import sqlite3
import traceback
from contextlib import closing
from datetime import datetime, timezone
from pathlib import Path
from mcp_server_webcrawl.crawlers.base.adapter import (
BaseManager,
IndexState,
IndexStatus,
SitesGroup,
INDEXED_BATCH_SIZE,
INDEXED_RESOURCE_DEFAULT_PROTOCOL,
INDEXED_TYPE_MAPPING
)
from mcp_server_webcrawl.crawlers.base.indexed import IndexedManager
from mcp_server_webcrawl.models.resources import (
ResourceResult,
ResourceResultType,
RESOURCES_LIMIT_DEFAULT,
)
from mcp_server_webcrawl.models.sites import (
SiteResult,
SiteType,
)
from mcp_server_webcrawl.utils.logger import get_logger
HTTRACK_REGEX_LAUNCH_URL = re.compile(r"launched on .+ at (https?://[^\s]+)")
HTTRACK_REGEX_REDIRECT = re.compile(r"File has moved from (https?://[^\s]+) to (.+)")
HTTRACK_REGEX_ERROR = re.compile(r'"([^"]+)" \((\d+)\) at link (https?://[^\s]+)')
HTTRACK_REGEX_DOMAIN = re.compile(r'^[a-zA-Z0-9]([a-zA-Z0-9\-]{0,61}[a-zA-Z0-9])?(\.[a-zA-Z0-9]([a-zA-Z0-9\-]{0,61}[a-zA-Z0-9])?)*$')
HTTRACK_REGEX_INDEX_HTML = re.compile(r"/index\.html($|\?)")
logger = get_logger()
class HtTrackManager(IndexedManager):
"""
Manages HTTrack project data in in-memory SQLite databases.
"""
def __init__(self) -> None:
"""
Initialize the HTTrack manager with empty cache and statistics.
"""
super().__init__()
def _load_site_data(self, connection: sqlite3.Connection, project_directory: Path,
site_id: int, index_state: IndexState = None) -> None:
"""
Load an HTTrack project directory into the database.
Args:
connection: SQLite connection
project_directory: path to the HTTrack project directory
site_id: ID for the site
index_state: IndexState object for tracking progress
"""
if not project_directory.exists() or not project_directory.is_dir():
logger.error(f"Directory not found or not a directory: {project_directory}")
return
if index_state is not None:
index_state.set_status(IndexStatus.INDEXING)
# metadata from hts-log.txt
project_metadata = self._get_project_metadata(project_directory)
# domain directories discovery
domain_directories = self._get_content_directories(project_directory)
if not domain_directories:
logger.warning(f"No domain directories found in HTTrack project: {project_directory}")
return
httrack_skip_files_lower = ["hts-log.txt", "index.html"]
with closing(connection.cursor()) as cursor:
for domain_directory in domain_directories:
base_url = self._get_base_url(domain_directory, project_metadata)
file_paths = []
for root, _, files in os.walk(domain_directory):
for filename in files:
file_path = Path(root) / filename
if filename.lower() in httrack_skip_files_lower and file_path.parent == project_directory:
continue
file_paths.append(file_path)
# batch process
for i in range(0, len(file_paths), INDEXED_BATCH_SIZE):
if index_state is not None and index_state.is_timeout():
index_state.set_status(IndexStatus.PARTIAL)
return
batch_file_paths = file_paths[i:i+INDEXED_BATCH_SIZE]
batch_file_contents = BaseManager.read_files(batch_file_paths)
batch_insert_resource_results = []
for file_path in batch_file_paths:
content = batch_file_contents.get(file_path)
try:
result = self._create_resource(
file_path, site_id, domain_directory, base_url,
project_metadata, content
)
if result:
batch_insert_resource_results.append(result)
if index_state is not None:
index_state.increment_processed()
except Exception as ex:
logger.error(f"Error processing file {file_path}: {ex}")
self._execute_batch_insert(connection, cursor, batch_insert_resource_results)
if index_state is not None and index_state.status == IndexStatus.INDEXING:
index_state.set_status(IndexStatus.COMPLETE)
def _create_resource(self, file_path: Path, site_id: int, domain_directory: Path,
base_url: str, project_metadata: dict, content: str = None) -> ResourceResult | None:
"""
Create ResourceResult for an HTTrack file.
Args:
file_path: path to the file
site_id: ID for the site
domain_directory: path to the domain directory
base_url: reconstructed base URL for the domain
project_metadata: extracted project metadata
content: optional pre-loaded file content
Returns:
ResourceResult object ready for insertion, or None if processing fails
"""
try:
relative_path: Path = file_path.relative_to(domain_directory)
url = base_url + str(relative_path).replace(os.sep, "/")
# Handle homepage index.html like wget does
url = HTTRACK_REGEX_INDEX_HTML.sub(r"/\1", url)
# Determine resource type from file extension
extension = file_path.suffix.lower()
resource_type = INDEXED_TYPE_MAPPING.get(extension, ResourceResultType.OTHER)
# Get file metadata
if file_path.is_file():
file_stat = file_path.stat()
file_size = file_stat.st_size
file_created = datetime.fromtimestamp(file_stat.st_ctime, tz=timezone.utc)
file_modified = datetime.fromtimestamp(file_stat.st_mtime, tz=timezone.utc)
else:
file_created = None
file_modified = None
file_size = 0
status_code = 200 # Default for files that exist
errors = project_metadata.get("errors", {})
redirects = project_metadata.get("redirects", {})
if url in errors:
status_code = errors[url]
elif url in redirects:
status_code = 302 # Assume redirect
# pre-loaded content if available
file_content = content
if file_content is None:
file_content = BaseManager.read_file_contents(file_path, resource_type)
return ResourceResult(
id=BaseManager.string_to_id(url),
site=site_id,
created=file_created,
modified=file_modified,
url=url,
type=resource_type,
status=status_code,
headers=BaseManager.get_basic_headers(file_size, resource_type, file_path),
content=file_content,
size=file_size,
time=0 # data unavailable (HTTrack)
)
except Exception as ex:
logger.error(f"Error creating resource for file {file_path}: {ex}\n{traceback.format_exc()}")
return None
def _get_project_metadata(self, project_directory: Path) -> dict[str, str]:
"""
Get metadata from HTTrack hts-log.txt file.
Args:
project_directory: path to the HTTrack project directory
Returns:
Dictionary containing extracted metadata (urls, launch_time, etc.)
"""
metadata: dict = {}
hts_log_path: Path = project_directory / "hts-log.txt"
if not hts_log_path.exists():
logger.warning(f"No hts-log.txt found in {project_directory}")
return metadata
# into fragile territory, if in doubt follow latest official HTTrack
try:
with open(hts_log_path, "r", encoding="utf-8", errors="replace") as f:
content = f.read()
# extract primary network domain (http) from first line
launch_match = HTTRACK_REGEX_LAUNCH_URL.search(content)
if launch_match:
metadata["launch_url"] = launch_match.group(1)
redirects = {}
errors = {}
for line in content.split("\n"):
line = line.strip()
# redirects - file has moved from X to Y
redirect_match = HTTRACK_REGEX_REDIRECT.search(line)
if redirect_match:
redirects[redirect_match.group(1)] = redirect_match.group(2)
# errors - Not Found (404) at link X
error_match = HTTRACK_REGEX_ERROR.search(line)
if error_match:
error_text, status_code, url = error_match.groups()
errors[url] = int(status_code)
metadata["redirects"] = redirects
metadata["errors"] = errors
except (FileNotFoundError, PermissionError, UnicodeDecodeError) as ex:
logger.warning(f"Could not read hts-log.txt from {project_directory}: {ex}")
except Exception as ex:
logger.error(f"Error parsing hts-log.txt from {project_directory}: {ex}")
return metadata
def _get_content_directories(self, project_directory: Path) -> list[Path]:
"""
Get domain directories within an HTTrack project.
Args:
project_directory: path to the HTTrack project directory
Returns:
List of domain directory paths
"""
content_directories: list[Path] = []
for item in project_directory.iterdir():
if (item.is_dir() and
not item.name.startswith(".") and
item.name not in ["hts-cache", "hts-tmp"] and
not item.name.startswith("hts-")):
# if directory contains web content (has HTML, CSS, JS, or image files)
has_web_content = any(
file_path.suffix.lower() in [".html", ".htm", ".css", ".js", ".png", ".jpg", ".gif"]
for file_path in item.rglob("*") if file_path.is_file()
)
if has_web_content:
content_directories.append(item)
return content_directories
def _get_base_url(self, domain_directory: Path, project_metadata: dict) -> str:
"""
Get the base URL for a domain directory.
Args:
domain_directory: path to the domain directory
project_metadata: extracted project metadata
Returns:
Reconstructed base URL
"""
# use launch URL if match
if "launch_url" in project_metadata:
launch_url = project_metadata["launch_url"]
try:
from urllib.parse import urlparse
parsed = urlparse(launch_url)
if parsed.netloc.replace("www.", "") == domain_directory.name.replace("www.", ""):
return f"{parsed.scheme}://{parsed.netloc}/"
except Exception:
pass
# if domain_directory name looks like a domain
if HTTRACK_REGEX_DOMAIN.match(domain_directory.name):
return f"{INDEXED_RESOURCE_DEFAULT_PROTOCOL}{domain_directory.name}/"
# fallback
project_name = domain_directory.parent.name
logger.warning(f"Could not determine domain for {domain_directory}, using fallback: {project_name}")
return f"{INDEXED_RESOURCE_DEFAULT_PROTOCOL}{project_name}.local/{domain_directory.name}/"
manager: HtTrackManager = HtTrackManager()
def get_sites(
datasrc: Path,
ids: list[int] | None = None,
fields: list[str] | None = None
) -> list[SiteResult]:
"""
List HTTrack project directories as sites.
Args:
datasrc: path to the directory containing HTTrack projects
ids: optional list of site IDs to filter by
fields: optional list of fields to include in the response
Returns:
List of SiteResult objects, one for each HTTrack project
"""
return manager.get_sites_for_directories(datasrc, ids, fields)
def get_resources(
datasrc: Path,
sites: list[int] | None = None,
query: str = "",
fields: list[str] | None = None,
sort: str | None = None,
limit: int = RESOURCES_LIMIT_DEFAULT,
offset: int = 0,
) -> tuple[list[ResourceResult], int, IndexState]:
"""
Get resources from HTTrack project directories using in-memory SQLite.
Args:
datasrc: path to the directory containing HTTrack projects
sites: optional list of site IDs to filter by
query: search query string
fields: optional list of fields to include in response
sort: sort order for results
limit: maximum number of results to return
offset: number of results to skip for pagination
Returns:
Tuple of (list of ResourceResult objects, total count, IndexState)
"""
sites_results: list[SiteResult] = get_sites(datasrc=datasrc, ids=sites)
assert sites_results, "At least one site is required to search"
site_paths = [site.path for site in sites_results]
sites_group = SitesGroup(datasrc, sites, site_paths)
return manager.get_resources_for_sites_group(sites_group, query, fields, sort, limit, offset)
```
--------------------------------------------------------------------------------
/docs/guides/httrack.html:
--------------------------------------------------------------------------------
```html
<!DOCTYPE html>
<html class="writer-html5" lang="en" data-content_root="../">
<head>
<meta charset="utf-8" /><meta name="viewport" content="width=device-width, initial-scale=1" />
<meta name="viewport" content="width=device-width, initial-scale=1.0" />
<title>HTTrack MCP Setup Guide — mcp-server-webcrawl documentation</title>
<link rel="stylesheet" type="text/css" href="../_static/pygments.css?v=80d5e7a1" />
<link rel="stylesheet" type="text/css" href="../_static/css/theme.css?v=e59714d7" />
<script src="../_static/jquery.js?v=5d32c60e"></script>
<script src="../_static/_sphinx_javascript_frameworks_compat.js?v=2cd50e6c"></script>
<script src="../_static/documentation_options.js?v=5929fcd5"></script>
<script src="../_static/doctools.js?v=888ff710"></script>
<script src="../_static/sphinx_highlight.js?v=dc90522c"></script>
<script src="../_static/js/theme.js"></script>
<link rel="index" title="Index" href="../genindex.html" />
<link rel="search" title="Search" href="../search.html" />
<link rel="next" title="InterroBot MCP Setup Guide" href="interrobot.html" />
<link rel="prev" title="ArchiveBox MCP Setup Guide" href="archivebox.html" />
</head>
<body class="wy-body-for-nav">
<div class="wy-grid-for-nav">
<nav data-toggle="wy-nav-shift" class="wy-nav-side">
<div class="wy-side-scroll">
<div class="wy-side-nav-search" >
<a href="../index.html" class="icon icon-home">
mcp-server-webcrawl
</a>
<div role="search">
<form id="rtd-search-form" class="wy-form" action="../search.html" method="get">
<input type="text" name="q" placeholder="Search docs" aria-label="Search docs" />
<input type="hidden" name="check_keywords" value="yes" />
<input type="hidden" name="area" value="default" />
</form>
</div>
</div><div class="wy-menu wy-menu-vertical" data-spy="affix" role="navigation" aria-label="Navigation menu">
<p class="caption" role="heading"><span class="caption-text">Contents:</span></p>
<ul class="current">
<li class="toctree-l1"><a class="reference internal" href="../installation.html">Installation</a></li>
<li class="toctree-l1 current"><a class="reference internal" href="../guides.html">Setup Guides</a><ul class="current">
<li class="toctree-l2"><a class="reference internal" href="archivebox.html">ArchiveBox MCP Setup Guide</a></li>
<li class="toctree-l2 current"><a class="current reference internal" href="#">HTTrack MCP Setup Guide</a></li>
<li class="toctree-l2"><a class="reference internal" href="interrobot.html">InterroBot MCP Setup Guide</a></li>
<li class="toctree-l2"><a class="reference internal" href="katana.html">Katana MCP Setup Guide</a></li>
<li class="toctree-l2"><a class="reference internal" href="siteone.html">SiteOne MCP Setup Guide</a></li>
<li class="toctree-l2"><a class="reference internal" href="warc.html">WARC MCP Setup Guide</a></li>
<li class="toctree-l2"><a class="reference internal" href="wget.html">wget MCP Setup Guide</a></li>
</ul>
</li>
<li class="toctree-l1"><a class="reference internal" href="../usage.html">Usage</a></li>
<li class="toctree-l1"><a class="reference internal" href="../prompts.html">Prompt Routines</a></li>
<li class="toctree-l1"><a class="reference internal" href="../modules.html">mcp_server_webcrawl</a></li>
</ul>
</div>
</div>
</nav>
<section data-toggle="wy-nav-shift" class="wy-nav-content-wrap"><nav class="wy-nav-top" aria-label="Mobile navigation menu" >
<i data-toggle="wy-nav-top" class="fa fa-bars"></i>
<a href="../index.html">mcp-server-webcrawl</a>
</nav>
<div class="wy-nav-content">
<div class="rst-content">
<div role="navigation" aria-label="Page navigation">
<ul class="wy-breadcrumbs">
<li><a href="../index.html" class="icon icon-home" aria-label="Home"></a></li>
<li class="breadcrumb-item"><a href="../guides.html">Setup Guides</a></li>
<li class="breadcrumb-item active">HTTrack MCP Setup Guide</li>
<li class="wy-breadcrumbs-aside">
<a href="../_sources/guides/httrack.rst.txt" rel="nofollow"> View page source</a>
</li>
</ul>
<hr/>
</div>
<div role="main" class="document" itemscope="itemscope" itemtype="http://schema.org/Article">
<div itemprop="articleBody">
<section id="httrack-mcp-setup-guide">
<h1>HTTrack MCP Setup Guide<a class="headerlink" href="#httrack-mcp-setup-guide" title="Link to this heading"></a></h1>
<p>Instructions for setting up <a class="reference external" href="https://pragmar.com/mcp-server-webcrawl/">mcp-server-webcrawl</a> with <a class="reference external" href="https://www.httrack.com/">HTTrack Website Copier</a>.
This allows your LLM (e.g. Claude Desktop) to search content and metadata from websites you’ve mirrored using HTTrack.</p>
<iframe width="560" height="315" src="https://www.youtube.com/embed/HAVfvmrZjRk" frameborder="0" allowfullscreen></iframe><p>Follow along with the video, or the step-action guide below.</p>
<section id="requirements">
<h2>Requirements<a class="headerlink" href="#requirements" title="Link to this heading"></a></h2>
<p>Before you begin, ensure you have:</p>
<ul class="simple">
<li><p><a class="reference external" href="https://claude.ai/download">Claude Desktop</a> installed</p></li>
<li><p><a class="reference external" href="https://python.org">Python</a> 3.10 or later installed</p></li>
<li><p><a class="reference external" href="https://www.httrack.com/">HTTrack Website Copier</a> installed</p></li>
<li><p>Basic familiarity with command line interfaces</p></li>
</ul>
</section>
<section id="what-is-httrack">
<h2>What is HTTrack?<a class="headerlink" href="#what-is-httrack" title="Link to this heading"></a></h2>
<p>HTTrack is a well-established open source website mirror tool that offers:</p>
<ul class="simple">
<li><p>Complete website mirroring with organized project directories</p></li>
<li><p>User-friendly wizard-style interface for setup</p></li>
<li><p>Comprehensive content capture including HTML, CSS, images, and other assets</p></li>
<li><p>Ability to manage multiple site mirrors efficiently</p></li>
<li><p>Cross-platform support (Windows, macOS, Linux)</p></li>
</ul>
</section>
<section id="installation-steps">
<h2>Installation Steps<a class="headerlink" href="#installation-steps" title="Link to this heading"></a></h2>
<section id="install-mcp-server-webcrawl">
<h3>1. Install mcp-server-webcrawl<a class="headerlink" href="#install-mcp-server-webcrawl" title="Link to this heading"></a></h3>
<p>Open your terminal or command line and install the package:</p>
<div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="n">pip</span> <span class="n">install</span> <span class="n">mcp</span><span class="o">-</span><span class="n">server</span><span class="o">-</span><span class="n">webcrawl</span>
</pre></div>
</div>
<p>Verify installation was successful:</p>
<div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="n">mcp</span><span class="o">-</span><span class="n">server</span><span class="o">-</span><span class="n">webcrawl</span> <span class="o">--</span><span class="n">help</span>
</pre></div>
</div>
</section>
<section id="create-website-mirrors-with-httrack">
<h3>2. Create Website Mirrors with HTTrack<a class="headerlink" href="#create-website-mirrors-with-httrack" title="Link to this heading"></a></h3>
<ol class="arabic simple">
<li><p>Open HTTrack Website Copier application</p></li>
<li><p>Create a new project (e.g., “example”) and specify where to save it</p></li>
<li><p>Add the URL you want to mirror (e.g., <a class="reference external" href="https://example.com">https://example.com</a>)</p></li>
<li><p>Use the wizard interface to configure your crawling options</p></li>
<li><p>Start the mirroring process and wait for completion</p></li>
<li><p>Repeat for additional sites as needed (e.g., create another project for pragmar.com)</p></li>
</ol>
<p>HTTrack will create organized project directories under your specified location (typically “My Web Sites” on Windows or “websites” on macOS/Linux). Each project contains the complete website mirror with all HTML files, images, CSS, and other assets properly organized.</p>
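<p>As a rough illustration only (project names and the parent folder depend on your setup), a finished HTTrack workspace might look like this:</p>
<div class="highlight-default notranslate"><div class="highlight"><pre><span></span>My Web Sites/
├── example/
│   ├── hts-cache/
│   ├── index.html
│   └── example.com/
└── pragmar/
    └── pragmar.com/
</pre></div>
</div>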
</section>
<section id="configure-claude-desktop">
<h3>3. Configure Claude Desktop<a class="headerlink" href="#configure-claude-desktop" title="Link to this heading"></a></h3>
<ol class="arabic simple">
<li><p>Open Claude Desktop</p></li>
<li><p>Go to <strong>File → Settings → Developer → Edit Config</strong></p></li>
<li><p>Add the following configuration (modify paths as needed):</p></li>
</ol>
<div class="highlight-json notranslate"><div class="highlight"><pre><span></span><span class="p">{</span>
<span class="w"> </span><span class="nt">"mcpServers"</span><span class="p">:</span><span class="w"> </span><span class="p">{</span>
<span class="w"> </span><span class="nt">"webcrawl"</span><span class="p">:</span><span class="w"> </span><span class="p">{</span>
<span class="w"> </span><span class="nt">"command"</span><span class="p">:</span><span class="w"> </span><span class="s2">"/path/to/mcp-server-webcrawl"</span><span class="p">,</span>
<span class="w"> </span><span class="nt">"args"</span><span class="p">:</span><span class="w"> </span><span class="p">[</span><span class="s2">"--crawler"</span><span class="p">,</span><span class="w"> </span><span class="s2">"httrack"</span><span class="p">,</span><span class="w"> </span><span class="s2">"--datasrc"</span><span class="p">,</span>
<span class="w"> </span><span class="s2">"/path/to/httrack/projects/"</span><span class="p">]</span>
<span class="w"> </span><span class="p">}</span>
<span class="w"> </span><span class="p">}</span>
<span class="p">}</span>
</pre></div>
</div>
<div class="admonition note">
<p class="admonition-title">Note</p>
<ul class="simple">
<li><p>On macOS/Linux, use the absolute path (output of <code class="docutils literal notranslate"><span class="pre">which</span> <span class="pre">mcp-server-webcrawl</span></code>), and the default path is typically <code class="docutils literal notranslate"><span class="pre">"~/websites"</span></code></p></li>
<li><p>The datasrc path should point to your HTTrack project directory containing all your mirrored sites</p></li>
</ul>
</div>
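<p>On macOS or Linux, you can confirm the absolute command path before editing the config. The output below is only a typical example and will vary with how Python was installed (a venv or pyenv path works equally well):</p>
<div class="highlight-default notranslate"><div class="highlight"><pre><span></span>which mcp-server-webcrawl
/usr/local/bin/mcp-server-webcrawl
</pre></div>
</div>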
<ol class="arabic simple" start="4">
<li><p>Save the file and <strong>completely exit</strong> Claude Desktop (not just close the window)</p></li>
<li><p>Restart Claude Desktop</p></li>
</ol>
</section>
<section id="verify-and-use">
<h3>4. Verify and Use<a class="headerlink" href="#verify-and-use" title="Link to this heading"></a></h3>
<ol class="arabic">
<li><p>In Claude Desktop, you should now see MCP tools available under Search and Tools</p></li>
<li><p>Ask Claude to list your crawled sites:</p>
<div class="highlight-default notranslate"><div class="highlight"><pre><span></span>Can you list the crawled sites available?
</pre></div>
</div>
</li>
<li><p>Try searching content from your crawls:</p>
<div class="highlight-default notranslate"><div class="highlight"><pre><span></span>Can you find information about [topic] on [crawled site]?
</pre></div>
</div>
</li>
<li><p>Conduct content audits and SEO analysis:</p>
<div class="highlight-default notranslate"><div class="highlight"><pre><span></span>Can you analyze the content structure and SEO elements for [crawled site]?
</pre></div>
</div>
</li>
</ol>
</section>
</section>
<section id="troubleshooting">
<h2>Troubleshooting<a class="headerlink" href="#troubleshooting" title="Link to this heading"></a></h2>
<ul class="simple">
<li><p>If Claude doesn’t show MCP tools after restart, verify your configuration file is correctly formatted</p></li>
<li><p>Ensure Python and mcp-server-webcrawl are properly installed</p></li>
<li><p>Check that your HTTrack project directory path in the configuration is correct</p></li>
<li><p>Make sure HTTrack has successfully completed mirroring the websites and created the project directories</p></li>
<li><p>Remember that the first time you use a function, Claude will ask for permission</p></li>
<li><p>For large websites, initial indexing may take some time during the first search</p></li>
</ul>
<p>HTTrack’s project structure makes it easy to manage multiple site mirrors, and when combined with mcp-server-webcrawl, it provides a foundation for content analysis, SEO audits, and searchable archives.</p>
<p>For more details, including API documentation and other crawler options, visit the <a class="reference external" href="https://github.com/pragmar/mcp-server-webcrawl">mcp-server-webcrawl documentation</a>.</p>
</section>
</section>
</div>
</div>
<footer><div class="rst-footer-buttons" role="navigation" aria-label="Footer">
<a href="archivebox.html" class="btn btn-neutral float-left" title="ArchiveBox MCP Setup Guide" accesskey="p" rel="prev"><span class="fa fa-arrow-circle-left" aria-hidden="true"></span> Previous</a>
<a href="interrobot.html" class="btn btn-neutral float-right" title="InterroBot MCP Setup Guide" accesskey="n" rel="next">Next <span class="fa fa-arrow-circle-right" aria-hidden="true"></span></a>
</div>
<hr/>
<div role="contentinfo">
<p>© Copyright 2025, pragmar.</p>
</div>
Built with <a href="https://www.sphinx-doc.org/">Sphinx</a> using a
<a href="https://github.com/readthedocs/sphinx_rtd_theme">theme</a>
provided by <a href="https://readthedocs.org">Read the Docs</a>.
</footer>
</div>
</div>
</section>
</div>
<script>
jQuery(function () {
SphinxRtdTheme.Navigation.enable(true);
});
</script>
</body>
</html>
```
--------------------------------------------------------------------------------
/docs/_modules/mcp_server_webcrawl/extras/xpath.html:
--------------------------------------------------------------------------------
```html
<!DOCTYPE html>
<html class="writer-html5" lang="en" data-content_root="../../../">
<head>
<meta charset="utf-8" />
<meta name="viewport" content="width=device-width, initial-scale=1.0" />
<title>mcp_server_webcrawl.extras.xpath — mcp-server-webcrawl documentation</title>
<link rel="stylesheet" type="text/css" href="../../../_static/pygments.css?v=80d5e7a1" />
<link rel="stylesheet" type="text/css" href="../../../_static/css/theme.css?v=e59714d7" />
<script src="../../../_static/jquery.js?v=5d32c60e"></script>
<script src="../../../_static/_sphinx_javascript_frameworks_compat.js?v=2cd50e6c"></script>
<script src="../../../_static/documentation_options.js?v=5929fcd5"></script>
<script src="../../../_static/doctools.js?v=888ff710"></script>
<script src="../../../_static/sphinx_highlight.js?v=dc90522c"></script>
<script src="../../../_static/js/theme.js"></script>
<link rel="index" title="Index" href="../../../genindex.html" />
<link rel="search" title="Search" href="../../../search.html" />
</head>
<body class="wy-body-for-nav">
<div class="wy-grid-for-nav">
<nav data-toggle="wy-nav-shift" class="wy-nav-side">
<div class="wy-side-scroll">
<div class="wy-side-nav-search" >
<a href="../../../index.html" class="icon icon-home">
mcp-server-webcrawl
</a>
<div role="search">
<form id="rtd-search-form" class="wy-form" action="../../../search.html" method="get">
<input type="text" name="q" placeholder="Search docs" aria-label="Search docs" />
<input type="hidden" name="check_keywords" value="yes" />
<input type="hidden" name="area" value="default" />
</form>
</div>
</div><div class="wy-menu wy-menu-vertical" data-spy="affix" role="navigation" aria-label="Navigation menu">
<p class="caption" role="heading"><span class="caption-text">Contents:</span></p>
<ul>
<li class="toctree-l1"><a class="reference internal" href="../../../installation.html">Installation</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../guides.html">Setup Guides</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../usage.html">Usage</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../prompts.html">Prompt Routines</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../modules.html">mcp_server_webcrawl</a></li>
</ul>
</div>
</div>
</nav>
<section data-toggle="wy-nav-shift" class="wy-nav-content-wrap"><nav class="wy-nav-top" aria-label="Mobile navigation menu" >
<i data-toggle="wy-nav-top" class="fa fa-bars"></i>
<a href="../../../index.html">mcp-server-webcrawl</a>
</nav>
<div class="wy-nav-content">
<div class="rst-content">
<div role="navigation" aria-label="Page navigation">
<ul class="wy-breadcrumbs">
<li><a href="../../../index.html" class="icon icon-home" aria-label="Home"></a></li>
<li class="breadcrumb-item"><a href="../../index.html">Module code</a></li>
<li class="breadcrumb-item active">mcp_server_webcrawl.extras.xpath</li>
<li class="wy-breadcrumbs-aside">
</li>
</ul>
<hr/>
</div>
<div role="main" class="document" itemscope="itemscope" itemtype="http://schema.org/Article">
<div itemprop="articleBody">
<h1>Source code for mcp_server_webcrawl.extras.xpath</h1><div class="highlight"><pre>
<span></span><span class="kn">import</span> <span class="nn">lxml.html</span>
<span class="kn">from</span> <span class="nn">lxml</span> <span class="kn">import</span> <span class="n">etree</span>
<span class="kn">from</span> <span class="nn">lxml.etree</span> <span class="kn">import</span> <span class="n">ParserError</span><span class="p">,</span> <span class="n">XPathEvalError</span><span class="p">,</span> <span class="n">XPathSyntaxError</span>
<span class="kn">from</span> <span class="nn">logging</span> <span class="kn">import</span> <span class="n">Logger</span>
<span class="kn">from</span> <span class="nn">mcp_server_webcrawl.utils.logger</span> <span class="kn">import</span> <span class="n">get_logger</span>
<span class="n">logger</span><span class="p">:</span> <span class="n">Logger</span> <span class="o">=</span> <span class="n">get_logger</span><span class="p">()</span>
<div class="viewcode-block" id="get_xpath">
<a class="viewcode-back" href="../../../mcp_server_webcrawl.extras.html#mcp_server_webcrawl.extras.xpath.get_xpath">[docs]</a>
<span class="k">def</span> <span class="nf">get_xpath</span><span class="p">(</span><span class="n">content</span><span class="p">:</span> <span class="nb">str</span><span class="p">,</span> <span class="n">xpaths</span><span class="p">:</span> <span class="nb">list</span><span class="p">[</span><span class="nb">str</span><span class="p">])</span> <span class="o">-></span> <span class="nb">list</span><span class="p">[</span><span class="nb">dict</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="nb">str</span> <span class="o">|</span> <span class="nb">int</span> <span class="o">|</span> <span class="nb">float</span><span class="p">]]:</span>
<span class="w"> </span><span class="sd">"""</span>
<span class="sd"> Takes content and gets xpath hits</span>
<span class="sd"> Arguments:</span>
<span class="sd"> content: The HTML source</span>
<span class="sd"> xpaths: The xpath selectors</span>
<span class="sd"> Returns:</span>
<span class="sd"> A list of dicts, with selector and value</span>
<span class="sd"> """</span>
<span class="k">if</span> <span class="ow">not</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">content</span><span class="p">,</span> <span class="nb">str</span><span class="p">):</span>
<span class="k">return</span> <span class="p">[]</span>
<span class="k">if</span> <span class="ow">not</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">xpaths</span><span class="p">,</span> <span class="nb">list</span><span class="p">)</span> <span class="ow">or</span> <span class="ow">not</span> <span class="nb">all</span><span class="p">(</span><span class="nb">isinstance</span><span class="p">(</span><span class="n">item</span><span class="p">,</span> <span class="nb">str</span><span class="p">)</span> <span class="k">for</span> <span class="n">item</span> <span class="ow">in</span> <span class="n">xpaths</span><span class="p">):</span>
<span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span><span class="s2">"xpaths must be a list of strings"</span><span class="p">)</span>
<span class="n">results</span> <span class="o">=</span> <span class="p">[]</span>
<span class="k">if</span> <span class="n">content</span> <span class="o">==</span> <span class="s2">""</span><span class="p">:</span>
<span class="k">return</span> <span class="n">results</span>
<span class="k">try</span><span class="p">:</span>
<span class="n">doc</span><span class="p">:</span> <span class="n">lxml</span><span class="o">.</span><span class="n">html</span><span class="o">.</span><span class="n">HtmlElement</span> <span class="o">=</span> <span class="n">lxml</span><span class="o">.</span><span class="n">html</span><span class="o">.</span><span class="n">fromstring</span><span class="p">(</span><span class="n">content</span><span class="o">.</span><span class="n">encode</span><span class="p">(</span><span class="s2">"utf-8"</span><span class="p">))</span>
<span class="k">except</span> <span class="n">ParserError</span><span class="p">:</span>
<span class="k">return</span> <span class="n">results</span>
<span class="k">for</span> <span class="n">xpath</span> <span class="ow">in</span> <span class="n">xpaths</span><span class="p">:</span>
<span class="k">try</span><span class="p">:</span>
<span class="n">selector_result</span> <span class="o">=</span> <span class="n">doc</span><span class="o">.</span><span class="n">xpath</span><span class="p">(</span><span class="n">xpath</span><span class="p">)</span>
<span class="k">except</span> <span class="p">(</span><span class="n">XPathEvalError</span><span class="p">,</span> <span class="n">XPathSyntaxError</span><span class="p">)</span> <span class="k">as</span> <span class="n">ex</span><span class="p">:</span>
<span class="n">logger</span><span class="o">.</span><span class="n">warning</span><span class="p">(</span><span class="sa">f</span><span class="s2">"Invalid xpath '</span><span class="si">{</span><span class="n">xpath</span><span class="si">}</span><span class="s2">': </span><span class="si">{</span><span class="n">ex</span><span class="si">}</span><span class="s2">"</span><span class="p">)</span>
<span class="k">continue</span>
<span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">selector_result</span><span class="p">,</span> <span class="p">(</span><span class="nb">list</span><span class="p">,</span> <span class="nb">tuple</span><span class="p">)):</span>
<span class="c1"># normal xpath query returns a list</span>
<span class="k">for</span> <span class="n">result</span> <span class="ow">in</span> <span class="n">selector_result</span><span class="p">:</span>
<span class="c1"># a new dict for each result</span>
<span class="n">xpath_hit</span><span class="p">:</span> <span class="nb">dict</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="nb">str</span> <span class="o">|</span> <span class="nb">int</span> <span class="o">|</span> <span class="nb">float</span><span class="p">]</span> <span class="o">=</span> <span class="p">{</span><span class="s2">"selector"</span><span class="p">:</span> <span class="n">xpath</span><span class="p">}</span>
<span class="k">if</span> <span class="nb">hasattr</span><span class="p">(</span><span class="n">result</span><span class="p">,</span> <span class="s2">"tag"</span><span class="p">):</span>
<span class="n">html_string</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="n">etree</span><span class="o">.</span><span class="n">tostring</span><span class="p">(</span><span class="n">result</span><span class="p">,</span> <span class="n">encoding</span><span class="o">=</span><span class="s2">"unicode"</span><span class="p">,</span> <span class="n">method</span><span class="o">=</span><span class="s2">"html"</span><span class="p">)</span>
<span class="n">xpath_hit</span><span class="p">[</span><span class="s2">"value"</span><span class="p">]</span> <span class="o">=</span> <span class="n">html_string</span><span class="o">.</span><span class="n">strip</span><span class="p">()</span>
<span class="k">else</span><span class="p">:</span>
<span class="n">xpath_hit</span><span class="p">[</span><span class="s2">"value"</span><span class="p">]</span> <span class="o">=</span> <span class="nb">str</span><span class="p">(</span><span class="n">result</span><span class="p">)</span><span class="o">.</span><span class="n">strip</span><span class="p">()</span>
<span class="n">results</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="n">xpath_hit</span><span class="p">)</span>
<span class="k">else</span><span class="p">:</span>
<span class="c1"># single value case (count(//h1), sum(), etc.) is also valid xpath</span>
<span class="n">xpath_hit</span><span class="p">:</span> <span class="nb">dict</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="nb">str</span> <span class="o">|</span> <span class="nb">int</span> <span class="o">|</span> <span class="nb">float</span><span class="p">]</span> <span class="o">=</span> <span class="p">{</span><span class="s2">"selector"</span><span class="p">:</span> <span class="n">xpath</span><span class="p">}</span>
<span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">selector_result</span><span class="p">,</span> <span class="p">(</span><span class="nb">int</span><span class="p">,</span> <span class="nb">float</span><span class="p">)):</span>
<span class="n">xpath_hit</span><span class="p">[</span><span class="s2">"value"</span><span class="p">]</span> <span class="o">=</span> <span class="n">selector_result</span>
<span class="k">else</span><span class="p">:</span>
<span class="n">xpath_hit</span><span class="p">[</span><span class="s2">"value"</span><span class="p">]</span> <span class="o">=</span> <span class="nb">str</span><span class="p">(</span><span class="n">selector_result</span><span class="p">)</span><span class="o">.</span><span class="n">strip</span><span class="p">()</span>
<span class="n">results</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="n">xpath_hit</span><span class="p">)</span>
<span class="k">return</span> <span class="n">results</span></div>
</pre></div>
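<p>A minimal usage sketch (the HTML snippet and selectors below are illustrative, not part of the module):</p>
<div class="highlight"><pre>
from mcp_server_webcrawl.extras.xpath import get_xpath

html = "&lt;html&gt;&lt;body&gt;&lt;h1&gt;Title&lt;/h1&gt;&lt;h1&gt;Another&lt;/h1&gt;&lt;/body&gt;&lt;/html&gt;"
# element matches return serialized HTML or text; numeric results (count, sum) pass through as numbers
for hit in get_xpath(html, ["//h1/text()", "count(//h1)"]):
    print(hit["selector"], "-&gt;", hit["value"])
</pre></div>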
</div>
</div>
<footer>
<hr/>
<div role="contentinfo">
<p>© Copyright 2025, pragmar.</p>
</div>
Built with <a href="https://www.sphinx-doc.org/">Sphinx</a> using a
<a href="https://github.com/readthedocs/sphinx_rtd_theme">theme</a>
provided by <a href="https://readthedocs.org">Read the Docs</a>.
</footer>
</div>
</div>
</section>
</div>
<script>
jQuery(function () {
SphinxRtdTheme.Navigation.enable(true);
});
</script>
</body>
</html>
```
--------------------------------------------------------------------------------
/prompts/testsearch.md:
--------------------------------------------------------------------------------
```markdown
# mcp-server-webcrawl Boolean Search Self-Test Instructions
## Query Sequence
### 1. Identify Target Domain & Homepage
**FIRST:** Get available sites and let user choose:
```
webcrawl_sites() - get all available domains
```
**THEN:** Find homepage with sorted URL approach:
```
query: type: html AND url: [target_site_domain]
limit: 1
sites: [target_site_id]
fields: []
extras: ["markdown"]
sort: +url
```
**Extract exact domain** from homepage URL for filtering (e.g., `example.com`)
### 2. Extract Boolean Test Terms from Homepage
**Scan homepage markdown to identify:**
- **High-frequency keywords:** Terms appearing multiple times (good for large result sets)
- **Unique/distinctive terms:** Terms likely appearing on fewer pages (good for small result sets)
- **Exact phrases:** Multi-word phrases in quotes (good for phrase matching tests)
- **Technical terms:** Domain-specific vocabulary that should appear consistently
**Select test term strategy:**
- **Term A (Common):** High-frequency keyword likely appearing on 10+ pages
- **Term B (Specific):** Lower-frequency keyword likely appearing on 3-8 pages
- **Phrase C:** Exact phrase in quotes for phrase matching validation
- **Term D (Rare):** Very specific term likely appearing on 1-3 pages
*When selecting test terms, avoid combinations where a term is a subset of a phrase, as this will lead to suspicious counts during AND operation testing. Also avoid terms that appear in the global navigation or header/footer of every page; the result counts will all look the same.*
### 3. Establish Baseline Counts
**Test each term individually to establish baseline sets:**
```
query: [term_a]
limit: 1
sites: [target_site_id]
fields: []
extras: ["markdown"]
sort: +url
```
```
query: [term_b]
limit: 1
sites: [target_site_id]
fields: []
extras: ["markdown"]
sort: +url
```
```
query: "[phrase_c]"
limit: 1
sites: [target_site_id]
fields: []
extras: ["markdown"]
sort: +url
```
```
query: [term_d]
limit: 1
sites: [target_site_id]
fields: []
extras: ["markdown"]
sort: +url
```
**Record baseline totals and document which pages contain each term for mathematical validation.**
If an expected keyword is missing from the markdown, or an unanticipated keyword appears in it, verify against
the complete picture (HTTP headers and content) of the same document. The default fulltext search MATCHes URL,
headers, and content, so it can produce false positives relative to the markdown alone. Generally, the markdown
strategy works fine and saves tokens and time.
```
query: id: [document_id]
limit: 1
sites: [target_site_id]
fields: ["content", "headers"]
extras: ["markdown"]
sort: +url
```
### 4. Boolean Logic Validation Tests
**Execute tests in this specific order for mathematical verification:**
#### 4.1: AND Operations (Intersection Tests)
```
query: [term_a] AND [term_b]
limit: 1
sites: [target_site_id]
fields: []
extras: ["markdown"]
sort: +url
```
```
query: [term_a] AND "[phrase_c]"
limit: 1
sites: [target_site_id]
fields: []
extras: ["markdown"]
sort: +url
```
```
query: [term_b] AND [term_d]
limit: 1
sites: [target_site_id]
fields: []
extras: ["markdown"]
sort: +url
```
**Validation:** AND results must be ≤ smallest individual term count. Verify content contains both terms.
#### 4.2: OR Operations (Union Tests)
```
query: [term_a] OR [term_b]
limit: 1
sites: [target_site_id]
fields: []
extras: ["markdown"]
sort: +url
```
```
query: [term_b] OR [term_d]
limit: 1
sites: [target_site_id]
fields: []
extras: ["markdown"]
sort: +url
```
```
query: "[phrase_c]" OR [term_d]
limit: 1
sites: [target_site_id]
sort: +url
```
**Validation:** OR results must be ≥ largest individual term count. Verify content contains at least one term.
#### 4.3: NOT Operations (Difference Tests)
```
query: [term_a] NOT [term_b]
limit: 1
sites: [target_site_id]
fields: []
extras: ["markdown"]
sort: +url
```
```
query: [term_b] NOT [term_a]
limit: 1
sites: [target_site_id]
fields: []
extras: ["markdown"]
sort: +url
```
```
query: [term_a] NOT "[phrase_c]"
limit: 1
sites: [target_site_id]
fields: []
extras: ["markdown"]
sort: +url
```
**Validation:** NOT results = (Term1 count) - (Term1 AND Term2 count). Verify content contains first term but not second.
#### 4.4: Field/NOT Operations (Dynamic Field Exclusion Tests)
**Test field/NOT syntax using established terms across different field types:**
```
query: type: html NOT content: [term_b]
limit: 1
sites: [target_site_id]
fields: []
extras: ["markdown"]
sort: +url
```
```
query: [term_a] NOT type: img
limit: 1
sites: [target_site_id]
fields: []
extras: ["markdown"]
sort: +url
```
```
query: status: 200 NOT type: script
limit: 1
sites: [target_site_id]
fields: []
extras: ["markdown"]
sort: +url
```
**Validation:** Field/NOT must exclude based on field-specific content, not fulltext matching. Critical parser test distinguishing `content` (fulltext) vs `content: value` (field search).
**Mathematical Check:**
- A NOT B = A - (A AND B)
- Verify field-specific exclusions follow set theory rules
- Sample results to confirm field syntax working, not fulltext fallback
### 5. Complex Boolean Expression Tests
**Test operator precedence and grouping:**
```
query: [term_a] OR [term_b] AND [term_d]
limit: 1
sites: [target_site_id]
fields: []
extras: ["markdown"]
sort: +url
```
```
query: ([term_a] OR [term_b]) AND [term_d]
limit: 1
sites: [target_site_id]
fields: []
extras: ["markdown"]
sort: +url
```
```
query: [term_a] AND ([term_b] OR [term_d])
limit: 1
sites: [target_site_id]
fields: []
extras: ["markdown"]
sort: +url
```
**Validation:** Verify that operator precedence follows the standard search engine convention
(AND before OR) and that parentheses override it correctly.
Before declaring precedence failure, verify the mathematics:
For query: [term_a] OR [term_b] AND [term_d]
Expected parsing: [term_a] OR ([term_b] AND [term_d])
1. Calculate: [term_b] AND [term_d] = X results
2. Calculate: [term_a] OR X should ≈ [term_a] baseline (if [term_a] >> X)
3. If actual results ≈ [term_a] baseline, precedence is CORRECT
4. Only flag as error if mathematics don't match expected precedence
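A quick worked example with hypothetical counts: suppose [term_a] = 40, [term_b] = 8, [term_d] = 3, and [term_b] AND [term_d] = 2. Then:
```
[term_a] OR [term_b] AND [term_d]     parsed as [term_a] OR ([term_b] AND [term_d])  ->  approximately 40 results
([term_a] OR [term_b]) AND [term_d]   bounded by the [term_d] baseline               ->  at most 3 results
```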
### 6. Content Verification Sampling
**For critical tests, verify content accuracy by sampling full HTTP results:**
Content can be large, so fetch a single document at a time and request only the fields you need:
```
query: id: [document_id]
fields: ["content", "headers"]
sites: [target_site_id]
limit: 1
```
**Check 2-3 results from each boolean operation to ensure:**
- AND results actually contain both terms
- OR results contain at least one term
- NOT results contain first term but exclude second term
### 7. Mathematical Consistency Validation
**For each test combination, verify set theory compliance:**
| Operation | Formula | Expected Result |
|-----------|---------|----------------|
| A AND B | Intersection | ≤ min(A, B) |
| A OR B | Union | ≥ max(A, B), ≤ A + B |
| A NOT B | Difference | A - (A AND B) |
| NOT (A AND B) | De Morgan's Law | (NOT A) OR (NOT B) |
| NOT (A OR B) | De Morgan's Law | (NOT A) AND (NOT B) |
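The table above reduces to a handful of arithmetic checks. A minimal verification sketch, assuming the counts were recorded in steps 3 and 4 (the numbers below are placeholders, not real results):
```
# placeholder counts from baseline and boolean queries
count_a, count_b = 14, 6                     # baselines for term_a and term_b
count_and, count_or, count_not = 5, 15, 9    # A AND B, A OR B, A NOT B

assert count_and <= min(count_a, count_b)                        # intersection bound
assert max(count_a, count_b) <= count_or <= count_a + count_b    # union bounds
assert count_not == count_a - count_and                          # difference identity
```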
### 8. Offer Advanced Analysis or Tool Research
After completing the main boolean audit, offer the user two additional options:
- **Detailed Analysis:** More comprehensive investigation of search performance, edge cases, or complex query patterns
- **Tool Research:** Research and recommend specific tools for search optimization, query debugging, or search analytics
## Boolean Test Methodology
### Term Selection Strategy
#### High-Value Test Terms
- **Common terms (10+ pages):** Good for testing large set operations and performance
- **Specific terms (3-8 pages):** Ideal for precise mathematical validation
- **Rare terms (1-3 pages):** Perfect for edge case testing and NOT operations
- **Exact phrases:** Critical for phrase matching and quote handling validation
- **Avoid these terms:** keywords that appear in the URL, and words associated with common HTTP headers (application/, etc.)
#### Mathematical Rigor Requirements
- **Intersection tests:** Verify A AND B ≤ min(A, B)
- **Union tests:** Verify max(A, B) ≤ A OR B ≤ A + B
- **Difference tests:** Verify A NOT B = A - (A AND B)
- **Content validation:** Sample results to confirm logical operators work on actual content
### Test Execution Order
#### Phase 1: Baseline Establishment
1. Extract test terms from homepage content analysis
2. Execute individual term searches to establish baseline counts
3. Document which pages contain which terms for cross-reference
#### Phase 2: Core Boolean Logic
1. Test AND operations (intersection logic)
2. Test OR operations (union logic)
3. Test NOT operations (difference logic)
4. Verify mathematical relationships for each operation
#### Phase 3: Complex Expression Validation
1. Test operator precedence without parentheses
2. Test explicit parentheses grouping
3. Test nested boolean expressions
4. Verify complex query parsing accuracy
#### Phase 4: Content Verification
1. Sample results from each boolean operation type
2. Verify content actually matches boolean logic expectations
3. Test edge cases and boundary conditions
4. Confirm search index accuracy
## Common Boolean Logic Issues
### High Priority Issues
1. **Incorrect AND logic:** Results contain only one term instead of both
2. **Broken NOT logic:** Results include excluded terms or miss included terms
3. **Mathematical inconsistency:** Set operations don't follow mathematical rules
4. **Phrase matching failures:** Quoted phrases not treated as exact matches
5. **Operator precedence errors:** Complex queries parsed incorrectly
### Medium Priority Issues
1. **Performance degradation:** Complex boolean queries significantly slower
2. **Case sensitivity problems:** Inconsistent handling of term capitalization
3. **Partial word matching:** "test" matching "testing" when exact match expected
4. **Whitespace handling:** Extra spaces breaking phrase matches
5. **Special character issues:** Boolean operators in content causing conflicts
### Low Priority Issues
1. **Optimization opportunities:** Redundant query patterns that could be simplified
2. **Result ordering consistency:** Same logical query returning different sort orders
3. **Marginal performance improvements:** Small optimizations for complex queries
## Reporting Template
### 📊 Boolean Search Logic Summary
| Test Category | Tests Executed | Passed | Failed | Critical Issues |
|---------------|----------------|--------|--------|-----------------|
| **Baseline Terms** | 4 | X | Y | Missing/incorrect baselines |
| **AND Operations** | 3 | X | Y | Intersection failures |
| **OR Operations** | 3 | X | Y | Union calculation errors |
| **NOT Operations** | 3 | X | Y | Difference logic broken |
| **Complex Expressions** | 3 | X | Y | Precedence/grouping issues |
| **Content Verification** | 3 | X | Y | Logic vs content mismatch |
### 🔍 Test Term Analysis
| Term | Type | Baseline Count | Pages Sampled | Content Accuracy |
|------|------|---------------|---------------|------------------|
| [term_a] | Common | X pages | Y pages | ✅ Accurate |
| [term_b] | Specific | X pages | Y pages | ✅ Accurate |
| "[phrase_c]" | Exact Phrase | X pages | Y pages | ⚠️ Partial matches |
| [term_d] | Rare | X pages | Y pages | ❌ Missing content |
### ⚡ Boolean Logic Validation Matrix
| Operation | Query | Expected | Actual | Mathematical Check | Content Check | Status |
|-----------|-------|----------|--------|-------------------|---------------|--------|
| AND | [term_a] AND [term_b] | ≤ min(X,Y) | Z | ✅ Valid | ✅ Accurate | Pass |
| OR | [term_a] OR [term_b] | ≥ max(X,Y) | Z | ✅ Valid | ✅ Accurate | Pass |
| NOT | [term_a] NOT [term_b] | X - (A∩B) | Z | ❌ Invalid | ⚠️ Partial | Fail |
### 🧮 Mathematical Consistency Analysis
**Set Theory Validation:**
- **Intersection (AND):** All results ≤ smallest baseline ✅
- **Union (OR):** All results ≥ largest baseline ✅
- **Difference (NOT):** Calculations match A - (A∩B) formula ❌
- **Complex expressions:** Parentheses and precedence working ⚠️
**Critical Formula Checks:**
```
Term A: X pages
Term B: Y pages
A AND B: Z pages (Expected: ≤ min(X,Y)) [✅/❌]
A OR B: W pages (Expected: ≥ max(X,Y), ≤ X+Y) [✅/❌]
A NOT B: V pages (Expected: X - Z) [✅/❌]
```
### 📋 Content Verification Results
| Boolean Type | Sample Size | Content Accuracy | Common Issues |
|--------------|-------------|------------------|---------------|
| **AND Results** | 3 pages | 100% | None detected |
| **OR Results** | 3 pages | 67% | Missing term in 1 result |
| **NOT Results** | 3 pages | 33% | Excluded term found in 2 results |
### 🎯 Priority Fix Recommendations
| Priority | Issue | Impact | Fix Complexity |
|----------|-------|--------|----------------|
| **🚨 Critical** | NOT logic returns incorrect results | Search reliability | High - Core logic |
| **🔴 High** | AND missing term in results | User trust | Medium - Index update |
| **🟡 Medium** | Phrase matching inconsistent | Search precision | Low - Config change |
| **🟢 Low** | Performance optimization | User experience | Low - Query tuning |
## Methodology
You will review this search system from the perspective of an accomplished but patient web developer. You know that boolean logic is where the serious users separate the tools from the toys. You recognize that pure nested Boolean can't always be mapped one-to-one with sqlite FTS5 MATCH rules defining one MATCH per column. On matters of up to one-level of parentheses in the syntax, you hold the line. Boolean truth must prevail.
Your analysis will highlight both mathematical accuracy and practical usability. When boolean logic fails, you'll present the issues constructively, focusing on what failed and on potential solutions. Don't force solutions when the cause is not clear. The goal is to test Boolean support rigorously, and to highlight discrepancies when found.
Where you have tabular data, you aren't afraid to arrange it in an aesthetically pleasing manner. You will prefer tables above unordered lists. Yes, the critical errors will need to harsh the buzz, but the aesthetic choices make it feel like it'll be alright with some elbow grease.
```
--------------------------------------------------------------------------------
/src/mcp_server_webcrawl/crawlers/siteone/adapter.py:
--------------------------------------------------------------------------------
```python
import os
import re
import sqlite3
from contextlib import closing
from datetime import datetime, timezone
from pathlib import Path
from mcp_server_webcrawl.crawlers.base.adapter import (
BaseManager,
IndexState,
IndexStatus,
SitesGroup,
INDEXED_BATCH_SIZE,
INDEXED_BYTE_MULTIPLIER,
INDEXED_RESOURCE_DEFAULT_PROTOCOL,
INDEXED_TYPE_MAPPING,
)
from mcp_server_webcrawl.crawlers.base.indexed import IndexedManager
from mcp_server_webcrawl.utils.logger import get_logger
from mcp_server_webcrawl.models.resources import (
ResourceResult,
ResourceResultType,
RESOURCES_LIMIT_DEFAULT,
)
from mcp_server_webcrawl.models.sites import (
SiteResult,
)
SITEONE_LOG_TYPE_MAPPING = {
"html": ResourceResultType.PAGE,
"redirect": ResourceResultType.PAGE,
"image": ResourceResultType.IMAGE,
"js": ResourceResultType.SCRIPT,
"css": ResourceResultType.CSS,
"video": ResourceResultType.VIDEO,
"audio": ResourceResultType.AUDIO,
"pdf": ResourceResultType.PDF,
"other": ResourceResultType.OTHER,
"font": ResourceResultType.OTHER,
}
logger = get_logger()
class SiteOneManager(IndexedManager):
"""
Manages SiteOne directory data in in-memory SQLite databases.
    Wraps the wget archive format (shared by SiteOne and wget).
Provides connection pooling and caching for efficient access.
"""
def __init__(self) -> None:
"""Initialize the SiteOne manager with empty cache and statistics."""
super().__init__()
def _extract_log_metadata(self, directory: Path) -> tuple[dict, dict]:
"""
Extract metadata from SiteOne log files.
Args:
directory: path to the site directory
Returns:
Tuple of (success log data, error log data) dictionaries
"""
directory_name: str = directory.name
log_data = {}
log_http_error_data = {}
log_pattern: str = f"output.{directory_name}.*.txt"
log_files = list(Path(directory.parent).glob(log_pattern))
if not log_files:
return log_data, log_http_error_data
log_latest = max(log_files, key=lambda p: p.stat().st_mtime)
try:
with open(log_latest, "r", encoding="utf-8") as log_file:
for line in log_file:
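                    # log rows are pipe-delimited; rows with 10 fields carry the request data
                    # parsed below: parts[3] path, parts[4] status, parts[5] type,
                    # parts[6] time, parts[7] size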
parts = [part.strip() for part in line.split("|")]
if len(parts) == 10:
parts_path = parts[3].split("?")[0]
try:
status = int(parts[4])
url = f"{INDEXED_RESOURCE_DEFAULT_PROTOCOL}{directory_name}{parts_path}"
time_str = parts[6].split()[0]
time = int(float(time_str) * (1000 if "s" in parts[6] else 1))
# size collected for errors, os stat preferred
size_str = parts[7].strip()
size = 0
if size_str:
size_value = float(size_str.split()[0])
size_unit = size_str.split()[1].lower() if len(size_str.split()) > 1 else "b"
multiplier = INDEXED_BYTE_MULTIPLIER.get(size_unit, 1)
size = int(size_value * multiplier)
if 400 <= status < 600:
log_http_error_data[url] = {
"status": status,
"type": parts[5].lower(),
"time": time,
"size": size,
}
else:
log_data[url] = {
"status": status,
"type": parts[5].lower(),
"time": time,
"size": size,
}
except (ValueError, IndexError, UnicodeDecodeError, KeyError):
continue
elif line.strip() == "Redirected URLs":
                        # stop processing; we're through the HTTP requests
break
except Exception as ex:
logger.error(f"Error processing log file {log_latest}: {ex}")
return log_data, log_http_error_data
def _load_site_data(self, connection: sqlite3.Connection, directory: Path,
site_id: int, index_state: IndexState = None) -> None:
"""
Load a SiteOne directory into the database with parallel processing and batch insertions.
Args:
connection: SQLite connection
directory: path to the SiteOne directory
site_id: ID for the site
index_state: IndexState object for tracking progress
"""
if not directory.exists() or not directory.is_dir():
logger.error(f"Directory not found or not a directory: {directory}")
return
if index_state is not None:
index_state.set_status(IndexStatus.INDEXING)
log_data, log_http_error_data = self._extract_log_metadata(directory)
file_paths = []
for root, _, files in os.walk(directory):
for filename in files:
if filename == "robots.txt" or (filename.startswith("output.") and filename.endswith(".txt")):
continue
file_paths.append(Path(root) / filename)
processed_urls = set()
with closing(connection.cursor()) as cursor:
for i in range(0, len(file_paths), INDEXED_BATCH_SIZE):
if index_state is not None and index_state.is_timeout():
index_state.set_status(IndexStatus.PARTIAL)
return
batch_paths = file_paths[i:i+INDEXED_BATCH_SIZE]
batch_insert_crawled: list[ResourceResult] = []
file_contents = BaseManager.read_files(batch_paths)
for file_path in batch_paths:
try:
result: ResourceResult | None = self._prepare_siteone_record(file_path,
site_id, directory, log_data, file_contents.get(file_path))
if result and result.url not in processed_urls:
batch_insert_crawled.append(result)
processed_urls.add(result.url)
if index_state is not None:
index_state.increment_processed()
except Exception as ex:
logger.error(f"Error processing file {file_path}: {ex}")
self._execute_batch_insert(connection, cursor, batch_insert_crawled)
# HTTP errors not already processed
batch_insert_errors: list[ResourceResult] = []
for url, meta in log_http_error_data.items():
if url not in processed_urls:
size = meta.get("size", 0)
result = ResourceResult(
id=BaseManager.string_to_id(url),
site=site_id,
url=url,
type=ResourceResultType.OTHER,
status=meta["status"],
headers=BaseManager.get_basic_headers(size, ResourceResultType.OTHER, file_path),
content="", # no content
size=size, # size from log
time=meta["time"]
)
batch_insert_errors.append(result)
if index_state is not None:
index_state.increment_processed()
# errors in batches too
                    if len(batch_insert_errors) >= INDEXED_BATCH_SIZE:
                        self._execute_batch_insert(connection, cursor, batch_insert_errors)
                        batch_insert_errors = []
# insert any remaining error records
if batch_insert_errors:
self._execute_batch_insert(connection, cursor, batch_insert_errors)
if index_state is not None and index_state.status == IndexStatus.INDEXING:
index_state.set_status(IndexStatus.COMPLETE)
def _prepare_siteone_record(self, file_path: Path, site_id: int, base_dir: Path,
log_data: dict, content: str = None) -> ResourceResult | None:
"""
Prepare a record for batch insertion from a SiteOne file.
Args:
file_path: path to the file
site_id: id for the site
base_dir: base directory for the capture
log_data: dictionary of metadata from logs keyed by URL
content: optional pre-loaded file content
Returns:
            ResourceResult or None if processing fails
"""
try:
# generate relative url path from file path (similar to wget)
relative_path = file_path.relative_to(base_dir)
url = f"{INDEXED_RESOURCE_DEFAULT_PROTOCOL}{base_dir.name}/{str(relative_path).replace(os.sep, '/')}"
if file_path.is_file():
file_stat = file_path.stat()
file_size = file_stat.st_size
file_created = datetime.fromtimestamp(file_stat.st_ctime, tz=timezone.utc)
file_modified = datetime.fromtimestamp(file_stat.st_mtime, tz=timezone.utc)
else:
file_created = None
file_modified = None
file_size = 0
decruftified_path = BaseManager.decruft_path(str(file_path))
extension = Path(decruftified_path).suffix.lower()
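            # wget-style content hashes in filenames (a dot, 8+ hex chars, another dot,
            # e.g. style.0123abcd.css) are stripped when matching against log URLs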
wget_static_pattern = re.compile(r"\.[0-9a-f]{8,}\.")
# look up metadata from log if available, otherwise use defaults
metadata = None
wget_aliases = list(set([
url, # exact match first
re.sub(wget_static_pattern, ".", url), # static pattern
url.replace(".html", ""), # file without extension (redirects)
url.replace(".html", "/"), # directory style (targets)
url.replace("index.html", ""), # index removal
]))
for wget_alias in wget_aliases:
metadata = log_data.get(wget_alias, None)
if metadata is not None:
break
if metadata is not None:
# preventing duplicate html pages ./appstat.html and ./appstat/index.html
# prefer index.html (actual content) over redirect stubs
canonical_url = None
# Sort aliases to prefer index.html files over redirect stubs
sorted_aliases = sorted([alias for alias in wget_aliases if log_data.get(alias) == metadata],
key=lambda x: (not x.endswith('index.html'), x))
if sorted_aliases:
canonical_url = sorted_aliases[0] # Take the preferred one
url = canonical_url
else:
metadata = {}
status_code = metadata.get("status", 200)
response_time = metadata.get("time", 0)
log_type = metadata.get("type", "").lower()
if log_type:
# no type for redirects, but more often than not
# redirection to another page
resource_type = SITEONE_LOG_TYPE_MAPPING.get(log_type,
INDEXED_TYPE_MAPPING.get(extension, ResourceResultType.OTHER))
else:
# fallback to extension-based mapping
resource_type = INDEXED_TYPE_MAPPING.get(extension, ResourceResultType.OTHER)
file_content = content
if file_content is None:
file_content = BaseManager.read_file_contents(file_path, resource_type)
# skip redirect stub files left in SiteOne archive (duplicate, wait for real content)
if status_code == 200 and file_content and '<meta http-equiv="refresh" content="0' in file_content:
return None
record = ResourceResult(
id=BaseManager.string_to_id(url),
site=site_id,
created=file_created,
modified=file_modified,
url=url,
type=resource_type,
status=status_code,
headers=BaseManager.get_basic_headers(file_size, resource_type, file_path),
content=file_content,
size=file_size,
time=response_time # possibly from log
)
return record
except Exception as ex:
logger.error(f"Error preparing record for file {file_path}: {ex}")
return None
manager: SiteOneManager = SiteOneManager()
def get_sites(
datasrc: Path,
ids: list[int] | None = None,
fields: list[str] | None = None
) -> list[SiteResult]:
"""
List site directories in the datasrc directory as sites.
Args:
datasrc: path to the directory containing site subdirectories
ids: optional list of site IDs to filter by
fields: optional list of fields to include in the response
Returns:
List of SiteResult objects, one for each site directory
Notes:
Returns an empty list if the datasrc directory doesn't exist.
"""
return manager.get_sites_for_directories(datasrc, ids, fields)
def get_resources(
datasrc: Path,
sites: list[int] | None = None,
query: str = "",
fields: list[str] | None = None,
sort: str | None = None,
limit: int = RESOURCES_LIMIT_DEFAULT,
offset: int = 0,
) -> tuple[list[ResourceResult], int, IndexState]:
"""
    Get resources from SiteOne (wget-format) directories using in-memory SQLite.
Args:
        datasrc: path to the directory containing SiteOne captures
sites: optional list of site IDs to filter by
query: search query string
fields: optional list of fields to include in response
sort: sort order for results
limit: maximum number of results to return
offset: number of results to skip for pagination
Returns:
        Tuple of (list of ResourceResult objects, total count, index state)
"""
sites_results: list[SiteResult] = get_sites(datasrc=datasrc, ids=sites)
assert sites_results, "At least one site is required to search"
site_paths = [site.path for site in sites_results]
sites_group = SitesGroup(datasrc, sites, site_paths)
return manager.get_resources_for_sites_group(sites_group, query, fields, sort, limit, offset)
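# illustrative usage only; the path and query below are hypothetical:
#   sites = get_sites(Path("/path/to/siteone/archives"))
#   results, total, index_state = get_resources(
#       Path("/path/to/siteone/archives"), sites=[s.id for s in sites], query="contact")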
```