This is page 7 of 35. Use http://codebase.md/pragmar/mcp_server_webcrawl?lines=true&page={x} to view the full context.
# Directory Structure
```
├── .gitignore
├── CONTRIBUTING.md
├── docs
│ ├── _images
│ │ ├── interactive.document.webp
│ │ ├── interactive.search.webp
│ │ └── mcpswc.svg
│ ├── _modules
│ │ ├── index.html
│ │ ├── mcp_server_webcrawl
│ │ │ ├── crawlers
│ │ │ │ ├── archivebox
│ │ │ │ │ ├── adapter.html
│ │ │ │ │ ├── crawler.html
│ │ │ │ │ └── tests.html
│ │ │ │ ├── base
│ │ │ │ │ ├── adapter.html
│ │ │ │ │ ├── api.html
│ │ │ │ │ ├── crawler.html
│ │ │ │ │ ├── indexed.html
│ │ │ │ │ └── tests.html
│ │ │ │ ├── httrack
│ │ │ │ │ ├── adapter.html
│ │ │ │ │ ├── crawler.html
│ │ │ │ │ └── tests.html
│ │ │ │ ├── interrobot
│ │ │ │ │ ├── adapter.html
│ │ │ │ │ ├── crawler.html
│ │ │ │ │ └── tests.html
│ │ │ │ ├── katana
│ │ │ │ │ ├── adapter.html
│ │ │ │ │ ├── crawler.html
│ │ │ │ │ └── tests.html
│ │ │ │ ├── siteone
│ │ │ │ │ ├── adapter.html
│ │ │ │ │ ├── crawler.html
│ │ │ │ │ └── tests.html
│ │ │ │ ├── warc
│ │ │ │ │ ├── adapter.html
│ │ │ │ │ ├── crawler.html
│ │ │ │ │ └── tests.html
│ │ │ │ └── wget
│ │ │ │ ├── adapter.html
│ │ │ │ ├── crawler.html
│ │ │ │ └── tests.html
│ │ │ ├── crawlers.html
│ │ │ ├── extras
│ │ │ │ ├── markdown.html
│ │ │ │ ├── regex.html
│ │ │ │ ├── snippets.html
│ │ │ │ ├── thumbnails.html
│ │ │ │ └── xpath.html
│ │ │ ├── interactive
│ │ │ │ ├── highlights.html
│ │ │ │ ├── search.html
│ │ │ │ ├── session.html
│ │ │ │ └── ui.html
│ │ │ ├── main.html
│ │ │ ├── models
│ │ │ │ ├── resources.html
│ │ │ │ └── sites.html
│ │ │ ├── templates
│ │ │ │ └── tests.html
│ │ │ ├── utils
│ │ │ │ ├── blobs.html
│ │ │ │ ├── cli.html
│ │ │ │ ├── logger.html
│ │ │ │ ├── querycache.html
│ │ │ │ ├── server.html
│ │ │ │ └── tools.html
│ │ │ └── utils.html
│ │ └── re.html
│ ├── _sources
│ │ ├── guides
│ │ │ ├── archivebox.rst.txt
│ │ │ ├── httrack.rst.txt
│ │ │ ├── interrobot.rst.txt
│ │ │ ├── katana.rst.txt
│ │ │ ├── siteone.rst.txt
│ │ │ ├── warc.rst.txt
│ │ │ └── wget.rst.txt
│ │ ├── guides.rst.txt
│ │ ├── index.rst.txt
│ │ ├── installation.rst.txt
│ │ ├── interactive.rst.txt
│ │ ├── mcp_server_webcrawl.crawlers.archivebox.rst.txt
│ │ ├── mcp_server_webcrawl.crawlers.base.rst.txt
│ │ ├── mcp_server_webcrawl.crawlers.httrack.rst.txt
│ │ ├── mcp_server_webcrawl.crawlers.interrobot.rst.txt
│ │ ├── mcp_server_webcrawl.crawlers.katana.rst.txt
│ │ ├── mcp_server_webcrawl.crawlers.rst.txt
│ │ ├── mcp_server_webcrawl.crawlers.siteone.rst.txt
│ │ ├── mcp_server_webcrawl.crawlers.warc.rst.txt
│ │ ├── mcp_server_webcrawl.crawlers.wget.rst.txt
│ │ ├── mcp_server_webcrawl.extras.rst.txt
│ │ ├── mcp_server_webcrawl.interactive.rst.txt
│ │ ├── mcp_server_webcrawl.models.rst.txt
│ │ ├── mcp_server_webcrawl.rst.txt
│ │ ├── mcp_server_webcrawl.templates.rst.txt
│ │ ├── mcp_server_webcrawl.utils.rst.txt
│ │ ├── modules.rst.txt
│ │ ├── prompts.rst.txt
│ │ └── usage.rst.txt
│ ├── _static
│ │ ├── _sphinx_javascript_frameworks_compat.js
│ │ ├── basic.css
│ │ ├── css
│ │ │ ├── badge_only.css
│ │ │ ├── fonts
│ │ │ │ ├── fontawesome-webfont.eot
│ │ │ │ ├── fontawesome-webfont.svg
│ │ │ │ ├── fontawesome-webfont.ttf
│ │ │ │ ├── fontawesome-webfont.woff
│ │ │ │ ├── fontawesome-webfont.woff2
│ │ │ │ ├── lato-bold-italic.woff
│ │ │ │ ├── lato-bold-italic.woff2
│ │ │ │ ├── lato-bold.woff
│ │ │ │ ├── lato-bold.woff2
│ │ │ │ ├── lato-normal-italic.woff
│ │ │ │ ├── lato-normal-italic.woff2
│ │ │ │ ├── lato-normal.woff
│ │ │ │ ├── lato-normal.woff2
│ │ │ │ ├── Roboto-Slab-Bold.woff
│ │ │ │ ├── Roboto-Slab-Bold.woff2
│ │ │ │ ├── Roboto-Slab-Regular.woff
│ │ │ │ └── Roboto-Slab-Regular.woff2
│ │ │ └── theme.css
│ │ ├── doctools.js
│ │ ├── documentation_options.js
│ │ ├── file.png
│ │ ├── fonts
│ │ │ ├── Lato
│ │ │ │ ├── lato-bold.eot
│ │ │ │ ├── lato-bold.ttf
│ │ │ │ ├── lato-bold.woff
│ │ │ │ ├── lato-bold.woff2
│ │ │ │ ├── lato-bolditalic.eot
│ │ │ │ ├── lato-bolditalic.ttf
│ │ │ │ ├── lato-bolditalic.woff
│ │ │ │ ├── lato-bolditalic.woff2
│ │ │ │ ├── lato-italic.eot
│ │ │ │ ├── lato-italic.ttf
│ │ │ │ ├── lato-italic.woff
│ │ │ │ ├── lato-italic.woff2
│ │ │ │ ├── lato-regular.eot
│ │ │ │ ├── lato-regular.ttf
│ │ │ │ ├── lato-regular.woff
│ │ │ │ └── lato-regular.woff2
│ │ │ └── RobotoSlab
│ │ │ ├── roboto-slab-v7-bold.eot
│ │ │ ├── roboto-slab-v7-bold.ttf
│ │ │ ├── roboto-slab-v7-bold.woff
│ │ │ ├── roboto-slab-v7-bold.woff2
│ │ │ ├── roboto-slab-v7-regular.eot
│ │ │ ├── roboto-slab-v7-regular.ttf
│ │ │ ├── roboto-slab-v7-regular.woff
│ │ │ └── roboto-slab-v7-regular.woff2
│ │ ├── images
│ │ │ ├── interactive.document.png
│ │ │ ├── interactive.document.webp
│ │ │ ├── interactive.search.png
│ │ │ ├── interactive.search.webp
│ │ │ └── mcpswc.svg
│ │ ├── jquery.js
│ │ ├── js
│ │ │ ├── badge_only.js
│ │ │ ├── theme.js
│ │ │ └── versions.js
│ │ ├── language_data.js
│ │ ├── minus.png
│ │ ├── plus.png
│ │ ├── pygments.css
│ │ ├── searchtools.js
│ │ └── sphinx_highlight.js
│ ├── .buildinfo
│ ├── .nojekyll
│ ├── genindex.html
│ ├── guides
│ │ ├── archivebox.html
│ │ ├── httrack.html
│ │ ├── interrobot.html
│ │ ├── katana.html
│ │ ├── siteone.html
│ │ ├── warc.html
│ │ └── wget.html
│ ├── guides.html
│ ├── index.html
│ ├── installation.html
│ ├── interactive.html
│ ├── mcp_server_webcrawl.crawlers.archivebox.html
│ ├── mcp_server_webcrawl.crawlers.base.html
│ ├── mcp_server_webcrawl.crawlers.html
│ ├── mcp_server_webcrawl.crawlers.httrack.html
│ ├── mcp_server_webcrawl.crawlers.interrobot.html
│ ├── mcp_server_webcrawl.crawlers.katana.html
│ ├── mcp_server_webcrawl.crawlers.siteone.html
│ ├── mcp_server_webcrawl.crawlers.warc.html
│ ├── mcp_server_webcrawl.crawlers.wget.html
│ ├── mcp_server_webcrawl.extras.html
│ ├── mcp_server_webcrawl.html
│ ├── mcp_server_webcrawl.interactive.html
│ ├── mcp_server_webcrawl.models.html
│ ├── mcp_server_webcrawl.templates.html
│ ├── mcp_server_webcrawl.utils.html
│ ├── modules.html
│ ├── objects.inv
│ ├── prompts.html
│ ├── py-modindex.html
│ ├── search.html
│ ├── searchindex.js
│ └── usage.html
├── LICENSE
├── MANIFEST.in
├── prompts
│ ├── audit404.md
│ ├── auditfiles.md
│ ├── auditperf.md
│ ├── auditseo.md
│ ├── gopher.md
│ ├── README.md
│ └── testsearch.md
├── pyproject.toml
├── README.md
├── setup.py
├── sphinx
│ ├── _static
│ │ └── images
│ │ ├── interactive.document.png
│ │ ├── interactive.document.webp
│ │ ├── interactive.search.png
│ │ ├── interactive.search.webp
│ │ └── mcpswc.svg
│ ├── _templates
│ │ └── layout.html
│ ├── conf.py
│ ├── guides
│ │ ├── archivebox.rst
│ │ ├── httrack.rst
│ │ ├── interrobot.rst
│ │ ├── katana.rst
│ │ ├── siteone.rst
│ │ ├── warc.rst
│ │ └── wget.rst
│ ├── guides.rst
│ ├── index.rst
│ ├── installation.rst
│ ├── interactive.rst
│ ├── make.bat
│ ├── Makefile
│ ├── mcp_server_webcrawl.crawlers.archivebox.rst
│ ├── mcp_server_webcrawl.crawlers.base.rst
│ ├── mcp_server_webcrawl.crawlers.httrack.rst
│ ├── mcp_server_webcrawl.crawlers.interrobot.rst
│ ├── mcp_server_webcrawl.crawlers.katana.rst
│ ├── mcp_server_webcrawl.crawlers.rst
│ ├── mcp_server_webcrawl.crawlers.siteone.rst
│ ├── mcp_server_webcrawl.crawlers.warc.rst
│ ├── mcp_server_webcrawl.crawlers.wget.rst
│ ├── mcp_server_webcrawl.extras.rst
│ ├── mcp_server_webcrawl.interactive.rst
│ ├── mcp_server_webcrawl.models.rst
│ ├── mcp_server_webcrawl.rst
│ ├── mcp_server_webcrawl.templates.rst
│ ├── mcp_server_webcrawl.utils.rst
│ ├── modules.rst
│ ├── prompts.rst
│ ├── readme.txt
│ └── usage.rst
└── src
└── mcp_server_webcrawl
├── __init__.py
├── crawlers
│ ├── __init__.py
│ ├── archivebox
│ │ ├── __init__.py
│ │ ├── adapter.py
│ │ ├── crawler.py
│ │ └── tests.py
│ ├── base
│ │ ├── __init__.py
│ │ ├── adapter.py
│ │ ├── api.py
│ │ ├── crawler.py
│ │ ├── indexed.py
│ │ └── tests.py
│ ├── httrack
│ │ ├── __init__.py
│ │ ├── adapter.py
│ │ ├── crawler.py
│ │ └── tests.py
│ ├── interrobot
│ │ ├── __init__.py
│ │ ├── adapter.py
│ │ ├── crawler.py
│ │ └── tests.py
│ ├── katana
│ │ ├── __init__.py
│ │ ├── adapter.py
│ │ ├── crawler.py
│ │ └── tests.py
│ ├── siteone
│ │ ├── __init__.py
│ │ ├── adapter.py
│ │ ├── crawler.py
│ │ └── tests.py
│ ├── warc
│ │ ├── __init__.py
│ │ ├── adapter.py
│ │ ├── crawler.py
│ │ └── tests.py
│ └── wget
│ ├── __init__.py
│ ├── adapter.py
│ ├── crawler.py
│ └── tests.py
├── extras
│ ├── __init__.py
│ ├── markdown.py
│ ├── regex.py
│ ├── snippets.py
│ ├── thumbnails.py
│ └── xpath.py
├── interactive
│ ├── __init__.py
│ ├── highlights.py
│ ├── search.py
│ ├── session.py
│ ├── ui.py
│ └── views
│ ├── base.py
│ ├── document.py
│ ├── help.py
│ ├── requirements.py
│ ├── results.py
│ └── searchform.py
├── main.py
├── models
│ ├── __init__.py
│ ├── base.py
│ ├── resources.py
│ └── sites.py
├── settings.py
├── templates
│ ├── __init__.py
│ ├── markdown.xslt
│ ├── tests_core.html
│ └── tests.py
└── utils
├── __init__.py
├── cli.py
├── logger.py
├── parser.py
├── parsetab.py
├── search.py
├── server.py
├── tests.py
└── tools.py
```
# Files
--------------------------------------------------------------------------------
/docs/guides/katana.html:
--------------------------------------------------------------------------------
```html
1 |
2 |
3 | <!DOCTYPE html>
4 | <html class="writer-html5" lang="en" data-content_root="../">
5 | <head>
6 | <meta charset="utf-8" /><meta name="viewport" content="width=device-width, initial-scale=1" />
7 |
8 | <meta name="viewport" content="width=device-width, initial-scale=1.0" />
9 | <title>Katana MCP Setup Guide — mcp-server-webcrawl documentation</title>
10 | <link rel="stylesheet" type="text/css" href="../_static/pygments.css?v=80d5e7a1" />
11 | <link rel="stylesheet" type="text/css" href="../_static/css/theme.css?v=e59714d7" />
12 |
13 |
14 | <script src="../_static/jquery.js?v=5d32c60e"></script>
15 | <script src="../_static/_sphinx_javascript_frameworks_compat.js?v=2cd50e6c"></script>
16 | <script src="../_static/documentation_options.js?v=5929fcd5"></script>
17 | <script src="../_static/doctools.js?v=888ff710"></script>
18 | <script src="../_static/sphinx_highlight.js?v=dc90522c"></script>
19 | <script src="../_static/js/theme.js"></script>
20 | <link rel="index" title="Index" href="../genindex.html" />
21 | <link rel="search" title="Search" href="../search.html" />
22 | <link rel="next" title="SiteOne MCP Setup Guide" href="siteone.html" />
23 | <link rel="prev" title="InterroBot MCP Setup Guide" href="interrobot.html" />
24 | </head>
25 |
26 | <body class="wy-body-for-nav">
27 | <div class="wy-grid-for-nav">
28 | <nav data-toggle="wy-nav-shift" class="wy-nav-side">
29 | <div class="wy-side-scroll">
30 | <div class="wy-side-nav-search" >
31 |
32 |
33 |
34 | <a href="../index.html" class="icon icon-home">
35 | mcp-server-webcrawl
36 | </a>
37 | <div role="search">
38 | <form id="rtd-search-form" class="wy-form" action="../search.html" method="get">
39 | <input type="text" name="q" placeholder="Search docs" aria-label="Search docs" />
40 | <input type="hidden" name="check_keywords" value="yes" />
41 | <input type="hidden" name="area" value="default" />
42 | </form>
43 | </div>
44 | </div><div class="wy-menu wy-menu-vertical" data-spy="affix" role="navigation" aria-label="Navigation menu">
45 | <p class="caption" role="heading"><span class="caption-text">Contents:</span></p>
46 | <ul class="current">
47 | <li class="toctree-l1"><a class="reference internal" href="../installation.html">Installation</a></li>
48 | <li class="toctree-l1 current"><a class="reference internal" href="../guides.html">Setup Guides</a><ul class="current">
49 | <li class="toctree-l2"><a class="reference internal" href="archivebox.html">ArchiveBox MCP Setup Guide</a></li>
50 | <li class="toctree-l2"><a class="reference internal" href="httrack.html">HTTrack MCP Setup Guide</a></li>
51 | <li class="toctree-l2"><a class="reference internal" href="interrobot.html">InterroBot MCP Setup Guide</a></li>
52 | <li class="toctree-l2 current"><a class="current reference internal" href="#">Katana MCP Setup Guide</a></li>
53 | <li class="toctree-l2"><a class="reference internal" href="siteone.html">SiteOne MCP Setup Guide</a></li>
54 | <li class="toctree-l2"><a class="reference internal" href="warc.html">WARC MCP Setup Guide</a></li>
55 | <li class="toctree-l2"><a class="reference internal" href="wget.html">wget MCP Setup Guide</a></li>
56 | </ul>
57 | </li>
58 | <li class="toctree-l1"><a class="reference internal" href="../usage.html">Usage</a></li>
59 | <li class="toctree-l1"><a class="reference internal" href="../prompts.html">Prompt Routines</a></li>
60 | <li class="toctree-l1"><a class="reference internal" href="../modules.html">mcp_server_webcrawl</a></li>
61 | </ul>
62 |
63 | </div>
64 | </div>
65 | </nav>
66 |
67 | <section data-toggle="wy-nav-shift" class="wy-nav-content-wrap"><nav class="wy-nav-top" aria-label="Mobile navigation menu" >
68 | <i data-toggle="wy-nav-top" class="fa fa-bars"></i>
69 | <a href="../index.html">mcp-server-webcrawl</a>
70 | </nav>
71 |
72 | <div class="wy-nav-content">
73 | <div class="rst-content">
74 | <div role="navigation" aria-label="Page navigation">
75 | <ul class="wy-breadcrumbs">
76 | <li><a href="../index.html" class="icon icon-home" aria-label="Home"></a></li>
77 | <li class="breadcrumb-item"><a href="../guides.html">Setup Guides</a></li>
78 | <li class="breadcrumb-item active">Katana MCP Setup Guide</li>
79 | <li class="wy-breadcrumbs-aside">
80 | <a href="../_sources/guides/katana.rst.txt" rel="nofollow"> View page source</a>
81 | </li>
82 | </ul>
83 | <hr/>
84 | </div>
85 | <div role="main" class="document" itemscope="itemscope" itemtype="http://schema.org/Article">
86 | <div itemprop="articleBody">
87 |
88 | <section id="katana-mcp-setup-guide">
89 | <h1>Katana MCP Setup Guide<a class="headerlink" href="#katana-mcp-setup-guide" title="Link to this heading"></a></h1>
90 | <p>Instructions for setting up <a class="reference external" href="https://pragmar.com/mcp-server-webcrawl/">mcp-server-webcrawl</a> with <a class="reference external" href="https://github.com/projectdiscovery/katana">Katana</a> crawler.
91 | This allows your LLM (e.g. Claude Desktop) to search content and metadata from websites you’ve crawled using Katana.</p>
92 | <iframe width="560" height="315" src="https://www.youtube.com/embed/sOMaojm0R0Y" frameborder="0" allowfullscreen></iframe><p>Follow along with the video, or the step-action guide below.</p>
93 | <section id="requirements">
94 | <h2>Requirements<a class="headerlink" href="#requirements" title="Link to this heading"></a></h2>
95 | <p>Before you begin, ensure you have:</p>
96 | <ul class="simple">
97 | <li><p><a class="reference external" href="https://claude.ai/download">Claude Desktop</a> installed</p></li>
98 | <li><p><a class="reference external" href="https://python.org">Python</a> 3.10 or later installed</p></li>
99 | <li><p><a class="reference external" href="https://go.dev/doc/install">Go programming language</a> installed</p></li>
100 | <li><p><a class="reference external" href="https://github.com/projectdiscovery/katana">Katana crawler</a> installed</p></li>
101 | <li><p>Basic familiarity with command line interfaces</p></li>
102 | </ul>
103 | </section>
104 | <section id="what-is-katana">
105 | <h2>What is Katana?<a class="headerlink" href="#what-is-katana" title="Link to this heading"></a></h2>
106 | <p>Katana is an open-source web crawler from Project Discovery that offers:</p>
107 | <ul class="simple">
108 | <li><p>Fast and efficient web crawling capabilities</p></li>
109 | <li><p>Command-line interface for flexibility and automation</p></li>
110 | <li><p>Highly configurable crawling parameters</p></li>
111 | <li><p>Ability to store complete HTTP responses for analysis</p></li>
112 | </ul>
113 | </section>
114 | <section id="installation-steps">
115 | <h2>Installation Steps<a class="headerlink" href="#installation-steps" title="Link to this heading"></a></h2>
116 | <section id="install-mcp-server-webcrawl">
117 | <h3>1. Install mcp-server-webcrawl<a class="headerlink" href="#install-mcp-server-webcrawl" title="Link to this heading"></a></h3>
118 | <p>Open your terminal or command line and install the package:</p>
119 | <div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="n">pip</span> <span class="n">install</span> <span class="n">mcp</span><span class="o">-</span><span class="n">server</span><span class="o">-</span><span class="n">webcrawl</span>
120 | </pre></div>
121 | </div>
122 | <p>Verify installation was successful:</p>
123 | <div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="n">mcp</span><span class="o">-</span><span class="n">server</span><span class="o">-</span><span class="n">webcrawl</span> <span class="o">--</span><span class="n">help</span>
124 | </pre></div>
125 | </div>
126 | </section>
127 | <section id="install-and-run-katana">
128 | <h3>2. Install and Run Katana<a class="headerlink" href="#install-and-run-katana" title="Link to this heading"></a></h3>
129 | <ol class="arabic">
130 | <li><p>Verify Go is installed and on your PATH:</p>
131 | <div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="n">go</span> <span class="n">version</span>
132 | </pre></div>
133 | </div>
134 | </li>
135 | <li><p>Install Katana using Go:</p>
136 | <div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="n">go</span> <span class="n">install</span> <span class="n">github</span><span class="o">.</span><span class="n">com</span><span class="o">/</span><span class="n">projectdiscovery</span><span class="o">/</span><span class="n">katana</span><span class="o">/</span><span class="n">cmd</span><span class="o">/</span><span class="n">katana</span><span class="nd">@latest</span>
137 | </pre></div>
138 | </div>
139 | </li>
140 | <li><p>Create a directory for your crawls and run Katana with storage options:</p>
141 | <div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="c1"># Create a directory for storing crawls</span>
142 | <span class="n">mkdir</span> <span class="n">crawls</span>
143 |
144 | <span class="c1"># Run Katana with storage options</span>
145 | <span class="n">katana</span> <span class="o">-</span><span class="n">u</span> <span class="n">https</span><span class="p">:</span><span class="o">//</span><span class="n">example</span><span class="o">.</span><span class="n">com</span> <span class="o">-</span><span class="n">store</span><span class="o">-</span><span class="n">response</span> <span class="o">-</span><span class="n">store</span><span class="o">-</span><span class="n">response</span><span class="o">-</span><span class="nb">dir</span> <span class="n">archives</span><span class="o">/</span><span class="n">example</span><span class="o">.</span><span class="n">com</span><span class="o">/</span>
146 | </pre></div>
147 | </div>
148 | </li>
149 | <li><p>Repeat for additional websites as needed:</p>
150 | <div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="n">katana</span> <span class="o">-</span><span class="n">u</span> <span class="n">https</span><span class="p">:</span><span class="o">//</span><span class="n">pragmar</span><span class="o">.</span><span class="n">com</span> <span class="o">-</span><span class="n">store</span><span class="o">-</span><span class="n">response</span> <span class="o">-</span><span class="n">store</span><span class="o">-</span><span class="n">response</span><span class="o">-</span><span class="nb">dir</span> <span class="n">archives</span><span class="o">/</span><span class="n">pragmar</span><span class="o">.</span><span class="n">com</span><span class="o">/</span>
151 | </pre></div>
152 | </div>
153 | </li>
154 | </ol>
155 | <p>In this case, the ./archives directory is the datasrc. The crawler will create
156 | a separate host directory for each unique host within
157 | the specified directory. This is consistent with Katana's behavior;
158 | a path such as example.com/example.com is expected. Sites with external dependencies
159 | will branch out by origin host within the -store-response-dir, yet remain
160 | searchable as a single site search.</p>
161 | </section>
162 | <section id="configure-claude-desktop">
163 | <h3>3. Configure Claude Desktop<a class="headerlink" href="#configure-claude-desktop" title="Link to this heading"></a></h3>
164 | <ol class="arabic simple">
165 | <li><p>Open Claude Desktop</p></li>
166 | <li><p>Go to <strong>File → Settings → Developer → Edit Config</strong></p></li>
167 | <li><p>Add the following configuration (modify paths as needed):</p></li>
168 | </ol>
169 | <div class="highlight-json notranslate"><div class="highlight"><pre><span></span><span class="p">{</span>
170 | <span class="w"> </span><span class="nt">"mcpServers"</span><span class="p">:</span><span class="w"> </span><span class="p">{</span>
171 | <span class="w"> </span><span class="nt">"webcrawl"</span><span class="p">:</span><span class="w"> </span><span class="p">{</span>
172 | <span class="w"> </span><span class="nt">"command"</span><span class="p">:</span><span class="w"> </span><span class="s2">"/path/to/mcp-server-webcrawl"</span><span class="p">,</span>
173 | <span class="w"> </span><span class="nt">"args"</span><span class="p">:</span><span class="w"> </span><span class="p">[</span><span class="s2">"--crawler"</span><span class="p">,</span><span class="w"> </span><span class="s2">"katana"</span><span class="p">,</span><span class="w"> </span><span class="s2">"--datasrc"</span><span class="p">,</span>
174 | <span class="w"> </span><span class="s2">"/path/to/katana/crawls/"</span><span class="p">]</span>
175 | <span class="w"> </span><span class="p">}</span>
176 | <span class="w"> </span><span class="p">}</span>
177 | <span class="p">}</span>
178 | </pre></div>
179 | </div>
180 | <div class="admonition note">
181 | <p class="admonition-title">Note</p>
182 | <ul class="simple">
183 | <li><p>On Windows, use <code class="docutils literal notranslate"><span class="pre">"mcp-server-webcrawl"</span></code> as the command</p></li>
184 | <li><p>On macOS, use the absolute path (output of <code class="docutils literal notranslate"><span class="pre">which</span> <span class="pre">mcp-server-webcrawl</span></code>)</p></li>
185 | <li><p>Change <code class="docutils literal notranslate"><span class="pre">/path/to/katana/crawls/</span></code> to the actual path where you stored your Katana crawls</p></li>
186 | </ul>
187 | </div>
188 | <ol class="arabic simple" start="4">
189 | <li><p>Save the file and <strong>completely exit</strong> Claude Desktop (not just close the window)</p></li>
190 | <li><p>Restart Claude Desktop</p></li>
191 | </ol>
192 | </section>
193 | <section id="verify-and-use">
194 | <h3>4. Verify and Use<a class="headerlink" href="#verify-and-use" title="Link to this heading"></a></h3>
195 | <ol class="arabic">
196 | <li><p>In Claude Desktop, you should now see MCP tools available under Search and Tools</p></li>
197 | <li><p>Ask Claude to list your crawled sites:</p>
198 | <div class="highlight-default notranslate"><div class="highlight"><pre><span></span>Can you list the crawled sites available?
199 | </pre></div>
200 | </div>
201 | </li>
202 | <li><p>Try searching content from your crawls:</p>
203 | <div class="highlight-default notranslate"><div class="highlight"><pre><span></span>Can you find information about [topic] on [crawled site]?
204 | </pre></div>
205 | </div>
206 | </li>
207 | <li><p>Try specialized searches that use Katana’s comprehensive data collection:</p>
208 | <div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="n">Can</span> <span class="n">you</span> <span class="n">find</span> <span class="nb">all</span> <span class="n">the</span> <span class="n">help</span> <span class="n">pages</span> <span class="n">on</span> <span class="n">this</span> <span class="n">site</span> <span class="ow">and</span> <span class="n">tell</span> <span class="n">me</span> <span class="n">how</span> <span class="n">they</span><span class="s1">'re different?</span>
209 | </pre></div>
210 | </div>
211 | </li>
212 | </ol>
213 | </section>
214 | </section>
215 | <section id="troubleshooting">
216 | <h2>Troubleshooting<a class="headerlink" href="#troubleshooting" title="Link to this heading"></a></h2>
217 | <ul class="simple">
218 | <li><p>If Claude doesn’t show MCP tools after restart, verify your configuration file is correctly formatted</p></li>
219 | <li><p>Ensure Python and mcp-server-webcrawl are properly installed</p></li>
220 | <li><p>Check that your Katana crawls directory path in the configuration is correct</p></li>
221 | <li><p>Make sure the <code class="docutils literal notranslate"><span class="pre">-store-response</span></code> flag was used during crawling, as this is required to save content</p></li>
222 | <li><p>Verify that each crawl completed successfully and files were saved to the expected location</p></li>
223 | <li><p>Remember that the first time you use a function, Claude will ask for permission</p></li>
224 | </ul>
225 | <p>For more details, including API documentation and other crawler options, visit the <a class="reference external" href="https://github.com/pragmar/mcp-server-webcrawl">mcp-server-webcrawl documentation</a>.</p>
226 | </section>
227 | </section>
228 |
229 |
230 | </div>
231 | </div>
232 | <footer><div class="rst-footer-buttons" role="navigation" aria-label="Footer">
233 | <a href="interrobot.html" class="btn btn-neutral float-left" title="InterroBot MCP Setup Guide" accesskey="p" rel="prev"><span class="fa fa-arrow-circle-left" aria-hidden="true"></span> Previous</a>
234 | <a href="siteone.html" class="btn btn-neutral float-right" title="SiteOne MCP Setup Guide" accesskey="n" rel="next">Next <span class="fa fa-arrow-circle-right" aria-hidden="true"></span></a>
235 | </div>
236 |
237 | <hr/>
238 |
239 | <div role="contentinfo">
240 | <p>© Copyright 2025, pragmar.</p>
241 | </div>
242 |
243 | Built with <a href="https://www.sphinx-doc.org/">Sphinx</a> using a
244 | <a href="https://github.com/readthedocs/sphinx_rtd_theme">theme</a>
245 | provided by <a href="https://readthedocs.org">Read the Docs</a>.
246 |
247 |
248 | </footer>
249 | </div>
250 | </div>
251 | </section>
252 | </div>
253 | <script>
254 | jQuery(function () {
255 | SphinxRtdTheme.Navigation.enable(true);
256 | });
257 | </script>
258 |
259 | </body>
260 | </html>
```
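A minimal sketch of the resulting datasrc layout after the two crawls above, assuming the commands shown in the guide; exact file names and any index files are determined by Katana, and the external-origin directory is hypothetical, included only to illustrate the branching behavior described above:

```
archives/                         # --datasrc passed to mcp-server-webcrawl
├── example.com/                  # -store-response-dir of the first crawl
│   └── example.com/              # host directory created by Katana
│       └── ...                   # stored responses (names chosen by Katana)
└── pragmar.com/
    ├── pragmar.com/
    │   └── ...
    └── cdn.example-assets.com/   # hypothetical external origin, still searchable as part of the same site
```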
--------------------------------------------------------------------------------
/docs/guides/archivebox.html:
--------------------------------------------------------------------------------
```html
1 |
2 |
3 | <!DOCTYPE html>
4 | <html class="writer-html5" lang="en" data-content_root="../">
5 | <head>
6 | <meta charset="utf-8" /><meta name="viewport" content="width=device-width, initial-scale=1" />
7 |
8 | <meta name="viewport" content="width=device-width, initial-scale=1.0" />
9 | <title>ArchiveBox MCP Setup Guide — mcp-server-webcrawl documentation</title>
10 | <link rel="stylesheet" type="text/css" href="../_static/pygments.css?v=80d5e7a1" />
11 | <link rel="stylesheet" type="text/css" href="../_static/css/theme.css?v=e59714d7" />
12 |
13 |
14 | <script src="../_static/jquery.js?v=5d32c60e"></script>
15 | <script src="../_static/_sphinx_javascript_frameworks_compat.js?v=2cd50e6c"></script>
16 | <script src="../_static/documentation_options.js?v=5929fcd5"></script>
17 | <script src="../_static/doctools.js?v=888ff710"></script>
18 | <script src="../_static/sphinx_highlight.js?v=dc90522c"></script>
19 | <script src="../_static/js/theme.js"></script>
20 | <link rel="index" title="Index" href="../genindex.html" />
21 | <link rel="search" title="Search" href="../search.html" />
22 | <link rel="next" title="HTTrack MCP Setup Guide" href="httrack.html" />
23 | <link rel="prev" title="Setup Guides" href="../guides.html" />
24 | </head>
25 |
26 | <body class="wy-body-for-nav">
27 | <div class="wy-grid-for-nav">
28 | <nav data-toggle="wy-nav-shift" class="wy-nav-side">
29 | <div class="wy-side-scroll">
30 | <div class="wy-side-nav-search" >
31 |
32 |
33 |
34 | <a href="../index.html" class="icon icon-home">
35 | mcp-server-webcrawl
36 | </a>
37 | <div role="search">
38 | <form id="rtd-search-form" class="wy-form" action="../search.html" method="get">
39 | <input type="text" name="q" placeholder="Search docs" aria-label="Search docs" />
40 | <input type="hidden" name="check_keywords" value="yes" />
41 | <input type="hidden" name="area" value="default" />
42 | </form>
43 | </div>
44 | </div><div class="wy-menu wy-menu-vertical" data-spy="affix" role="navigation" aria-label="Navigation menu">
45 | <p class="caption" role="heading"><span class="caption-text">Contents:</span></p>
46 | <ul class="current">
47 | <li class="toctree-l1"><a class="reference internal" href="../installation.html">Installation</a></li>
48 | <li class="toctree-l1 current"><a class="reference internal" href="../guides.html">Setup Guides</a><ul class="current">
49 | <li class="toctree-l2 current"><a class="current reference internal" href="#">ArchiveBox MCP Setup Guide</a></li>
50 | <li class="toctree-l2"><a class="reference internal" href="httrack.html">HTTrack MCP Setup Guide</a></li>
51 | <li class="toctree-l2"><a class="reference internal" href="interrobot.html">InterroBot MCP Setup Guide</a></li>
52 | <li class="toctree-l2"><a class="reference internal" href="katana.html">Katana MCP Setup Guide</a></li>
53 | <li class="toctree-l2"><a class="reference internal" href="siteone.html">SiteOne MCP Setup Guide</a></li>
54 | <li class="toctree-l2"><a class="reference internal" href="warc.html">WARC MCP Setup Guide</a></li>
55 | <li class="toctree-l2"><a class="reference internal" href="wget.html">wget MCP Setup Guide</a></li>
56 | </ul>
57 | </li>
58 | <li class="toctree-l1"><a class="reference internal" href="../usage.html">Usage</a></li>
59 | <li class="toctree-l1"><a class="reference internal" href="../prompts.html">Prompt Routines</a></li>
60 | <li class="toctree-l1"><a class="reference internal" href="../modules.html">mcp_server_webcrawl</a></li>
61 | </ul>
62 |
63 | </div>
64 | </div>
65 | </nav>
66 |
67 | <section data-toggle="wy-nav-shift" class="wy-nav-content-wrap"><nav class="wy-nav-top" aria-label="Mobile navigation menu" >
68 | <i data-toggle="wy-nav-top" class="fa fa-bars"></i>
69 | <a href="../index.html">mcp-server-webcrawl</a>
70 | </nav>
71 |
72 | <div class="wy-nav-content">
73 | <div class="rst-content">
74 | <div role="navigation" aria-label="Page navigation">
75 | <ul class="wy-breadcrumbs">
76 | <li><a href="../index.html" class="icon icon-home" aria-label="Home"></a></li>
77 | <li class="breadcrumb-item"><a href="../guides.html">Setup Guides</a></li>
78 | <li class="breadcrumb-item active">ArchiveBox MCP Setup Guide</li>
79 | <li class="wy-breadcrumbs-aside">
80 | <a href="../_sources/guides/archivebox.rst.txt" rel="nofollow"> View page source</a>
81 | </li>
82 | </ul>
83 | <hr/>
84 | </div>
85 | <div role="main" class="document" itemscope="itemscope" itemtype="http://schema.org/Article">
86 | <div itemprop="articleBody">
87 |
88 | <section id="archivebox-mcp-setup-guide">
89 | <h1>ArchiveBox MCP Setup Guide<a class="headerlink" href="#archivebox-mcp-setup-guide" title="Link to this heading"></a></h1>
90 | <p>Instructions for setting up <a class="reference external" href="https://pragmar.com/mcp-server-webcrawl/">mcp-server-webcrawl</a> with <a class="reference external" href="https://archivebox.io/">ArchiveBox</a>.
91 | This allows your LLM (e.g. Claude Desktop) to search content and metadata from websites you’ve archived using ArchiveBox.</p>
92 | <iframe width="560" height="315" src="https://www.youtube.com/embed/0KFqhSYf3f4" frameborder="0" allowfullscreen></iframe><p>Follow along with the video, or the step-action guide below.</p>
93 | <section id="requirements">
94 | <h2>Requirements<a class="headerlink" href="#requirements" title="Link to this heading"></a></h2>
95 | <p>Before you begin, ensure you have:</p>
96 | <ul class="simple">
97 | <li><p><a class="reference external" href="https://claude.ai/download">Claude Desktop</a> installed</p></li>
98 | <li><p><a class="reference external" href="https://python.org">Python</a> 3.10 or later installed</p></li>
99 | <li><p><a class="reference external" href="https://archivebox.io/">ArchiveBox</a> installed</p></li>
100 | <li><p>Basic familiarity with command line interfaces</p></li>
101 | </ul>
102 | </section>
103 | <section id="what-is-archivebox">
104 | <h2>What is ArchiveBox?<a class="headerlink" href="#what-is-archivebox" title="Link to this heading"></a></h2>
105 | <p>ArchiveBox is a powerful open-source web archiving solution that offers:</p>
106 | <ul class="simple">
107 | <li><p>Multiple output formats (HTML, PDF, screenshots, WARC, etc.)</p></li>
108 | <li><p>Comprehensive metadata</p></li>
109 | <li><p>CLI + webadmin for browsing and managing archives</p></li>
110 | <li><p>Support for various input sources (URLs, browser bookmarks, RSS feeds)</p></li>
111 | <li><p>Self-hosted solution for long-term web content preservation</p></li>
112 | </ul>
113 | </section>
114 | <section id="installation-steps">
115 | <h2>Installation Steps<a class="headerlink" href="#installation-steps" title="Link to this heading"></a></h2>
116 | <section id="install-mcp-server-webcrawl">
117 | <h3>1. Install mcp-server-webcrawl<a class="headerlink" href="#install-mcp-server-webcrawl" title="Link to this heading"></a></h3>
118 | <p>Open your terminal or command line and install the package:</p>
119 | <div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="n">pip</span> <span class="n">install</span> <span class="n">mcp</span><span class="o">-</span><span class="n">server</span><span class="o">-</span><span class="n">webcrawl</span>
120 | </pre></div>
121 | </div>
122 | <p>Verify installation was successful:</p>
123 | <div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="n">mcp</span><span class="o">-</span><span class="n">server</span><span class="o">-</span><span class="n">webcrawl</span> <span class="o">--</span><span class="n">help</span>
124 | </pre></div>
125 | </div>
126 | </section>
127 | <section id="install-and-set-up-archivebox">
128 | <h3>2. Install and Set Up ArchiveBox<a class="headerlink" href="#install-and-set-up-archivebox" title="Link to this heading"></a></h3>
129 | <p>macOS/Linux only; Windows may work under Docker but is untested.</p>
130 | <ol class="arabic">
131 | <li><p>Install ArchiveBox (macOS/Linux):</p>
132 | <div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="n">pip</span> <span class="n">install</span> <span class="n">archivebox</span>
133 | </pre></div>
134 | </div>
135 | </li>
136 | <li><p>macOS only, install brew and wget:</p>
137 | <div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="n">brew</span> <span class="n">install</span> <span class="n">wget</span>
138 | </pre></div>
139 | </div>
140 | </li>
141 | <li><p>Create ArchiveBox collections. Unlike other crawlers that focus on single websites, ArchiveBox uses a collection-based approach where each collection can contain multiple URLs. You can create separate collections for different projects or group related URLs together:</p>
142 | <div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="c1"># Create a directory structure for your collections</span>
143 | <span class="n">mkdir</span> <span class="o">~/</span><span class="n">archivebox</span><span class="o">-</span><span class="n">data</span>
144 |
145 | <span class="c1"># Create an "example" collection</span>
146 | <span class="n">mkdir</span> <span class="o">~/</span><span class="n">archivebox</span><span class="o">-</span><span class="n">data</span><span class="o">/</span><span class="n">example</span>
147 | <span class="n">cd</span> <span class="o">~/</span><span class="n">archivebox</span><span class="o">-</span><span class="n">data</span><span class="o">/</span><span class="n">example</span>
148 | <span class="n">archivebox</span> <span class="n">init</span>
149 | <span class="n">archivebox</span> <span class="n">add</span> <span class="n">https</span><span class="p">:</span><span class="o">//</span><span class="n">example</span><span class="o">.</span><span class="n">com</span>
150 |
151 | <span class="c1"># Create a "pragmar" collection</span>
152 | <span class="n">mkdir</span> <span class="o">~/</span><span class="n">archivebox</span><span class="o">-</span><span class="n">data</span><span class="o">/</span><span class="n">pragmar</span>
153 | <span class="n">cd</span> <span class="o">~/</span><span class="n">archivebox</span><span class="o">-</span><span class="n">data</span><span class="o">/</span><span class="n">pragmar</span>
154 | <span class="n">archivebox</span> <span class="n">init</span>
155 | <span class="n">archivebox</span> <span class="n">add</span> <span class="n">https</span><span class="p">:</span><span class="o">//</span><span class="n">pragmar</span><span class="o">.</span><span class="n">com</span>
156 | </pre></div>
157 | </div>
158 | </li>
159 | <li><p>Each <code class="docutils literal notranslate"><span class="pre">archivebox</span> <span class="pre">init</span></code> creates a complete ArchiveBox instance with its own database and archive directory structure. The typical structure includes:</p>
160 | <div class="highlight-default notranslate"><div class="highlight"><pre><span></span>collection-name/
161 | ├── archive/ # Archived content organized by timestamp
162 | ├── logs/ # ArchiveBox operation logs
163 | ├── sources/ # Source URL lists and metadata
164 | └── index.sqlite3 # Database containing all metadata
165 | </pre></div>
166 | </div>
167 | </li>
168 | </ol>
169 | </section>
170 | <section id="configure-claude-desktop">
171 | <h3>3. Configure Claude Desktop<a class="headerlink" href="#configure-claude-desktop" title="Link to this heading"></a></h3>
172 | <ol class="arabic simple">
173 | <li><p>Open Claude Desktop</p></li>
174 | <li><p>Go to <strong>File → Settings → Developer → Edit Config</strong></p></li>
175 | <li><p>Add the following configuration (modify paths as needed):</p></li>
176 | </ol>
177 | <div class="highlight-json notranslate"><div class="highlight"><pre><span></span><span class="p">{</span>
178 | <span class="w"> </span><span class="nt">"mcpServers"</span><span class="p">:</span><span class="w"> </span><span class="p">{</span>
179 | <span class="w"> </span><span class="nt">"webcrawl"</span><span class="p">:</span><span class="w"> </span><span class="p">{</span>
180 | <span class="w"> </span><span class="nt">"command"</span><span class="p">:</span><span class="w"> </span><span class="s2">"/path/to/mcp-server-webcrawl"</span><span class="p">,</span>
181 | <span class="w"> </span><span class="nt">"args"</span><span class="p">:</span><span class="w"> </span><span class="p">[</span><span class="s2">"--crawler"</span><span class="p">,</span><span class="w"> </span><span class="s2">"archivebox"</span><span class="p">,</span><span class="w"> </span><span class="s2">"--datasrc"</span><span class="p">,</span>
182 | <span class="w"> </span><span class="s2">"/path/to/archivebox-data/"</span><span class="p">]</span>
183 | <span class="w"> </span><span class="p">}</span>
184 | <span class="w"> </span><span class="p">}</span>
185 | <span class="p">}</span>
186 | </pre></div>
187 | </div>
188 | <div class="admonition note">
189 | <p class="admonition-title">Note</p>
190 | <ul class="simple">
191 | <li><p>On Windows, use <code class="docutils literal notranslate"><span class="pre">"mcp-server-webcrawl"</span></code> as the command</p></li>
192 | <li><p>On macOS/Linux, use the absolute path (output of <code class="docutils literal notranslate"><span class="pre">which</span> <span class="pre">mcp-server-webcrawl</span></code>)</p></li>
193 | <li><p>The datasrc path should point to the parent directory containing your ArchiveBox collections (e.g., <code class="docutils literal notranslate"><span class="pre">~/archivebox-data/</span></code>), not to individual collection directories</p></li>
194 | <li><p>Each collection directory (example, pragmar, etc.) will appear as a separate “site” in MCP</p></li>
195 | </ul>
196 | </div>
197 | <ol class="arabic simple" start="4">
198 | <li><p>Save the file and <strong>completely exit</strong> Claude Desktop (not just close the window)</p></li>
199 | <li><p>Restart Claude Desktop</p></li>
200 | </ol>
201 | </section>
202 | <section id="verify-and-use">
203 | <h3>4. Verify and Use<a class="headerlink" href="#verify-and-use" title="Link to this heading"></a></h3>
204 | <ol class="arabic">
205 | <li><p>In Claude Desktop, you should now see MCP tools available under Search and Tools</p></li>
206 | <li><p>Ask Claude to list your archived sites:</p>
207 | <div class="highlight-default notranslate"><div class="highlight"><pre><span></span>Can you list the crawled sites available?
208 | </pre></div>
209 | </div>
210 | </li>
211 | <li><p>Try searching content from your archives:</p>
212 | <div class="highlight-default notranslate"><div class="highlight"><pre><span></span>Can you find information about [topic] on [archived site]?
213 | </pre></div>
214 | </div>
215 | </li>
216 | <li><p>Use the rich metadata for content discovery:</p>
217 | <div class="highlight-default notranslate"><div class="highlight"><pre><span></span>Can you find all the archived pages related to [keyword] from [archive]?
218 | </pre></div>
219 | </div>
220 | </li>
221 | </ol>
222 | </section>
223 | </section>
224 | <section id="troubleshooting">
225 | <h2>Troubleshooting<a class="headerlink" href="#troubleshooting" title="Link to this heading"></a></h2>
226 | <ul class="simple">
227 | <li><p>If Claude doesn’t show MCP tools after restart, verify your configuration file is correctly formatted</p></li>
228 | <li><p>Ensure Python and mcp-server-webcrawl are properly installed</p></li>
229 | <li><p>Check that your ArchiveBox archive directory path in the configuration is correct</p></li>
230 | <li><p>Make sure ArchiveBox has successfully archived the websites and created the database</p></li>
231 | <li><p>Verify that files exist in your archive/[timestamp] directories</p></li>
232 | <li><p>Remember that the first time you use a function, Claude will ask for permission</p></li>
233 | <li><p>For large archives, initial indexing may take some time during the first search</p></li>
234 | </ul>
235 | <p>ArchiveBox’s comprehensive archiving capabilities combined with mcp-server-webcrawl provide powerful tools for content preservation, research, and analysis across your archived web content.</p>
236 | <p>For more details, including API documentation and other crawler options, visit the <a class="reference external" href="https://github.com/pragmar/mcp-server-webcrawl">mcp-server-webcrawl documentation</a>.</p>
237 | </section>
238 | </section>
239 |
240 |
241 | </div>
242 | </div>
243 | <footer><div class="rst-footer-buttons" role="navigation" aria-label="Footer">
244 | <a href="../guides.html" class="btn btn-neutral float-left" title="Setup Guides" accesskey="p" rel="prev"><span class="fa fa-arrow-circle-left" aria-hidden="true"></span> Previous</a>
245 | <a href="httrack.html" class="btn btn-neutral float-right" title="HTTrack MCP Setup Guide" accesskey="n" rel="next">Next <span class="fa fa-arrow-circle-right" aria-hidden="true"></span></a>
246 | </div>
247 |
248 | <hr/>
249 |
250 | <div role="contentinfo">
251 | <p>© Copyright 2025, pragmar.</p>
252 | </div>
253 |
254 | Built with <a href="https://www.sphinx-doc.org/">Sphinx</a> using a
255 | <a href="https://github.com/readthedocs/sphinx_rtd_theme">theme</a>
256 | provided by <a href="https://readthedocs.org">Read the Docs</a>.
257 |
258 |
259 | </footer>
260 | </div>
261 | </div>
262 | </section>
263 | </div>
264 | <script>
265 | jQuery(function () {
266 | SphinxRtdTheme.Navigation.enable(true);
267 | });
268 | </script>
269 |
270 | </body>
271 | </html>
```
--------------------------------------------------------------------------------
/src/mcp_server_webcrawl/utils/search.py:
--------------------------------------------------------------------------------
```python
1 | from logging import Logger
2 | from typing import Any
3 |
4 | from mcp_server_webcrawl.utils.logger import get_logger
5 | from mcp_server_webcrawl.utils.parser import SearchLexer, SearchParser, SearchSubquery
6 |
7 | # url is technically fts but handled differently, uses LIKE; without type in
8 | # fts field mode, the "A long chained OR should not return all results" test fails
9 | FTS5_MATCH_FIELDS: list[str] = ["type", "headers", "content"]
10 |
11 | logger: Logger = get_logger()
12 |
13 | class ParameterManager:
14 | """
15 | Helper class to manage SQL parameter naming and counting.
16 | """
17 | def __init__(self):
18 | self.params: dict[str, str | int | float] = {}
19 | self.counter: int = 0
20 |
21 | def add_param(self, value: str | int | float) -> str:
22 | """
23 | Add a parameter and return its name.
24 | """
25 |         assert isinstance(value, (str, int, float)), "Parameter value must be str, int, or float."
26 | param_name: str = f"query{self.counter}"
27 | self.params[param_name] = value
28 | self.counter += 1
29 | return param_name
30 |
31 | def get_params(self) -> dict[str, str | int | float]:
32 | """
33 | Get all accumulated parameters.
34 | """
35 | return self.params
36 |
37 | class SearchQueryParser:
38 | """
39 | Implementation of ply lexer to capture field-expanded boolean queries.
40 | """
41 |
42 | def __init__(self):
43 | self.lexer: SearchLexer = SearchLexer()
44 | self.parser: SearchParser = SearchParser(self.lexer)
45 |
46 | def get_fulltext_terms(self, query: str) -> list[str]:
47 | """
48 | Extract fulltext search terms from a query string.
49 | Returns list of search terms suitable for snippet extraction.
50 | """
51 | parsed_query: list[SearchSubquery] = self.parse(query)
52 | search_terms: list[str] = []
53 | fulltext_fields: tuple[str | None, ...] = ("content", "headers", "fulltext", "", None)
54 |
55 | # prepare for match, lowercase, and eliminate wildcards
56 | for subquery in parsed_query:
57 | if subquery.field in fulltext_fields:
58 | term: str = str(subquery.value).lower().strip("*")
59 | if term:
60 | search_terms.append(term)
61 |
62 | return search_terms
63 |
64 | def parse(self, query_string: str) -> list[SearchSubquery]:
65 | """
66 | Parse a query string into a list of SearchSubquery instances
67 | """
68 | result: SearchSubquery | list[SearchSubquery] = self.parser.parser.parse(query_string, lexer=self.lexer.lexer)
69 |
70 | if isinstance(result, SearchSubquery):
71 | return [result]
72 | elif isinstance(result, list) and all(isinstance(item, SearchSubquery) for item in result):
73 | return result
74 | else:
75 | return []
76 |
77 | def to_sqlite_fts(
78 | self,
79 | parsed_query: list[SearchSubquery],
80 | swap_values: dict[str, dict[str, str | int]] = {}
81 | ) -> tuple[list[str], dict[str, str | int]]:
82 | """
83 | Convert the parsed query to SQLite FTS5 compatible WHERE clause components.
84 | Returns a tuple of (query_parts, params) where query_parts is a list of SQL
85 | conditions and params is a dictionary of parameter values with named parameters.
86 | """
87 | query_parts: list[str] = []
88 | param_manager: ParameterManager = ParameterManager()
89 | current_index: int = 0
90 |
91 | while current_index < len(parsed_query):
92 | subquery: SearchSubquery = parsed_query[current_index]
93 |
94 | # fts vs pure sql is handled differently
95 | if not subquery.field or subquery.field in FTS5_MATCH_FIELDS:
96 |
97 | # check if previous subquery targeted this FTS field with NOT
98 | previous_subquery: SearchSubquery | None = parsed_query[current_index - 1] if current_index > 0 else None
99 | has_unary_not: bool = "NOT" in subquery.modifiers
100 | has_binary_not: bool = previous_subquery and previous_subquery.operator == "NOT"
101 | should_negate: bool = has_unary_not or has_binary_not
102 |
103 | # group consecutive fulltext terms with their operators
104 | fts_field_query: dict[str, str | int] = self.__build_fts_field_subquery(parsed_query, subquery.field, current_index, swap_values)
105 | if fts_field_query["querystring"]:
106 | param_name: str = param_manager.add_param(fts_field_query["querystring"])
107 | field_name: str = "fulltext" if subquery.field is None else subquery.field
108 | safe_sql_field: str = subquery.get_safe_sql_field(field_name)
109 |
110 | # handle NOT with subquery to avoid JOIN issues
111 | if should_negate:
112 | # generate subquery exclusion pattern to avoid JOIN + NOT (MATCH) issues
113 | sql_part: str = f"ResourcesFullText.Id NOT IN (SELECT Id FROM ResourcesFullText WHERE {safe_sql_field} MATCH :{param_name})"
114 | else:
115 | sql_part: str = f"{safe_sql_field} MATCH :{param_name}"
116 |
117 | query_parts.append(sql_part)
118 | current_index = fts_field_query["next_index"]
119 |
120 | else:
121 |
122 | # handle field searches
123 | sql_part: str = ""
124 | field: str = subquery.field
125 | processed_value: str | int | float = self.__process_field_value(field, subquery.value, swap_values)
126 | value_type: str = subquery.type
127 | modifiers: list[str] = subquery.modifiers
128 |
129 | # check if prior subquery targeted this with NOT
130 | previous_subquery: SearchSubquery | None = parsed_query[current_index - 1] if current_index > 0 else None
131 |
132 | # NOT modifier if present
133 | if "NOT" in modifiers:
134 | sql_part += "NOT "
135 | elif previous_subquery and previous_subquery.operator == "NOT":
136 | sql_part += "NOT "
137 |
138 | safe_sql_field: str = subquery.get_safe_sql_field(field)
139 | if field in self.parser.numeric_fields:
140 | param_name: str = param_manager.add_param(processed_value)
141 | sql_part += f"{safe_sql_field} {subquery.comparator} :{param_name}"
142 | else:
143 |                     # headers currently handled by the FTS5_MATCH_FIELDS handler
144 | if field == "url":
145 |                     # Use LIKE for certain field searches instead of MATCH, maximizing hits
146 |                     # with %LIKE%. Think of https://example.com/logo.png?cache=20250112
147 |                     # and a search of url: *.png, and the dozens of ways a broader match better
148 |                     # fits the intention
149 | sql_part += f"{safe_sql_field} LIKE :"
150 | trimmed_url: str = str(processed_value).strip("*\"'`")
151 | param_name: str = param_manager.add_param(f"%{trimmed_url}%")
152 | sql_part += param_name
153 | elif value_type == "phrase":
154 | formatted_term: str = self.__format_search_term(processed_value, value_type)
155 | param_name: str = param_manager.add_param(formatted_term)
156 | sql_part += f"{safe_sql_field} MATCH :{param_name}"
157 | else:
158 | # default fts query
159 | param_name: str = param_manager.add_param(processed_value)
160 | safe_sql_field: str = subquery.get_safe_sql_field("fulltext")
161 | sql_part += f"{safe_sql_field} MATCH :{param_name}"
162 |
163 | query_parts.append(sql_part)
164 | current_index += 1
165 |
166 | # add operator between clauses
167 | if current_index < len(parsed_query):
168 | # look at the previous subquery's operator to determine how to connect
169 | previous_subquery: SearchSubquery | None = parsed_query[current_index - 1] if current_index > 0 else None
170 | if previous_subquery and previous_subquery.operator:
171 | # skip NOT - it will be handled by the next clause
172 | # sqlite doesn't support interclause NOT, errors/0 results
173 | # AND NOT is the way (FTS is different)
174 | op: str = previous_subquery.operator if previous_subquery.operator != "NOT" else "AND"
175 | else:
176 | op: str = "AND" # default
177 | query_parts.append(op)
178 |
179 | return query_parts, param_manager.get_params()
180 |
181 | def __build_fts_field_subquery(
182 | self,
183 | parsed_query: list[SearchSubquery],
184 | field: str | None,
185 | start_index: int,
186 | swap_values: dict[str, dict[str, str | int]] = {}
187 | ) -> dict[str, str | int]:
188 | """
189 | The rule is one MATCH per column for fts5, so multiple pure booleans are compressed
190 |         into their own little querystring, attempting to preserve the Boolean intent of the
191 | original SearchSubquery substructure. There are complexity limits here. Group IDs
192 | preserve the parenthetical home of each SearchSubquery, None if not in parens.
193 | """
194 |
195 | current_index: int = start_index
196 |
197 | # this modifies subqueries in place, prevents fts conversion leaking
198 | parsed_query: list[SearchSubquery] = self.__normalize_fts_match_operators(parsed_query)
199 |
200 | # determine the condition for continuing the loop based on field type
201 | def continue_sequencing(subquery_field: str | None) -> bool:
202 | return subquery_field is None if field is None else subquery_field == field
203 |
204 | # group consecutive, group is None unless parenthetical (A OR B)
205 | groups: list[tuple[Any, list[tuple[str, str | None]]]] = []
206 | current_group: list[tuple[str, str | None]] = []
207 | current_group_id: Any = None
208 |
209 | while current_index < len(parsed_query) and continue_sequencing(parsed_query[current_index].field):
210 | subquery: SearchSubquery = parsed_query[current_index]
211 |
212 | # new group
213 | if subquery.group != current_group_id:
214 | if current_group:
215 | groups.append((current_group_id, current_group))
216 | current_group = []
217 | current_group_id = subquery.group
218 |
219 | processed_value: str | int | float = self.__process_field_value(field, subquery.value, swap_values)
220 | formatted_term: str = self.__format_search_term(processed_value, subquery.type, subquery.modifiers)
221 | current_group.append((formatted_term, subquery.operator))
222 | current_index += 1
223 |
224 | # last group
225 | if current_group:
226 | groups.append((current_group_id, current_group))
227 |
228 | # build query string with parentheses for grouped terms
229 | query_parts: list[str] = []
230 | for group_id, group_terms in groups:
231 | if group_id is not None and len(group_terms) > 1:
232 | # multiple terms in a group, add parentheses
233 | group_str: str = ""
234 | for i, (term, operator) in enumerate(group_terms):
235 | group_str += term
236 | if operator and i < len(group_terms) - 1:
237 | group_str += f" {operator} "
238 | query_parts.append(f"({group_str})")
239 | else:
240 | # single term or ungrouped, no parentheses
241 | for i, (term, operator) in enumerate(group_terms):
242 | query_parts.append(term)
243 | if operator and i < len(group_terms) - 1:
244 | query_parts.append(operator)
245 |
246 | # add inter-group operator (from last term in previous group)
247 | if groups.index((group_id, group_terms)) < len(groups) - 1:
248 | last_term: tuple[str, str | None] = group_terms[-1]
249 | if last_term[1]: # operator exists
250 | query_parts.append(last_term[1])
251 |
252 | querystring: str = " ".join(query_parts)
253 | return {
254 | "querystring": querystring,
255 | "next_index": current_index
256 | }
257 |
258 | def __format_search_term(
259 | self,
260 | value: str | int | float,
261 | value_type: str,
262 | modifiers: list[str] | None = None
263 | ) -> str:
264 | """
265 | Format a fulltext search term based on type and modifiers. This takes some
266 |         of the sharp edges off the secondary fts5 parser in conversion.
267 |
268 | Args:
269 | value: The search value
270 | value_type: Type of value ('term', 'phrase', 'wildcard')
271 | modifiers: List of modifiers (e.g., ['NOT'])
272 |
273 | Returns:
274 | Formatted search term string
275 | """
276 | modifiers: list[str] = modifiers or []
277 | value_string: str = str(value)
278 |
279 | if value_type == "phrase":
280 | return f'"{value_string}"'
281 | elif value_type == "wildcard":
282 |             # for wildcards, only quote when hyphens or spaces require it
283 | if "-" in value_string or " " in value_string:
284 | return f'"{value_string}"*'
285 | else:
286 | return f"{value_string}*"
287 | else:
288 | # for terms like one-click etc.
289 | # avoid confusing the secondary fts parser
290 | # where hyphens in unquoted matches can be confused for
291 | # fts negation (-term)
292 | if '-' in value_string:
293 | return f'"{value_string}"'
294 | else:
295 | return value_string
296 |
297 | def __normalize_fts_match_operators(self, parsed_query: list[SearchSubquery]) -> list[SearchSubquery]:
298 | """
299 |         Clean up operators on fulltext sequences so they don't leak into interclause SQL.
300 |         Why? ONE MATCH per column. SearchSubqueries sharing an fts field must be compressed
301 |         into a single MATCH. If the next clause does not share the same field as the current
302 |         one, its operator must be set to None so it does not leak into the next field. Basically,
303 |         this firewalls boolean logic for combined fts subqueries. The flagship error of not
304 |         doing this is to have "this OR that OR there" return unfiltered or 0 results instead
305 |         of the appropriate number (unfiltered in the case of a leaky OR against status: >=100,
306 |         which should limit the result rather than define it).
307 |
308 | NOT operations must NEVER be cleared because they always represent
309 | separate SQL exclusion clauses. Only clear AND/OR when they would cause leakage.
310 | """
311 | for i in range(len(parsed_query) - 1):
312 | current: SearchSubquery = parsed_query[i]
313 | next_item: SearchSubquery = parsed_query[i + 1]
314 |
315 | # never clear NOT operators, they need separate SQL clauses for exclusion
316 | if current.operator == "NOT":
317 | continue
318 |
319 | # only clear AND/OR operators in transitions that would cause SQL leakage
320 | # while preserving legitimate inter-clause boolean operations
321 | # clear when transitioning from fulltext to non-FTS field
322 | if (current.field is None and
323 | next_item.field is not None and
324 | next_item.field not in FTS5_MATCH_FIELDS):
325 | current.operator = None
326 |
327 | # clear when transitioning from FTS field to non-FTS field
328 | elif (current.field in FTS5_MATCH_FIELDS and
329 | next_item.field is not None and
330 | next_item.field not in FTS5_MATCH_FIELDS):
331 | current.operator = None
332 |
333 | return parsed_query
334 |
335 | def __process_field_value(
336 | self,
337 | field: str | None,
338 | value_dict: dict[str, str] | str | int,
339 | swap_values: dict[str, dict[str, str | int]] | None = None
340 | ) -> str | int | float:
341 | """
342 | Process and validate a field value with type conversion and swapping.
343 |
344 | Args:
345 | field: The field name (or None for fulltext)
346 | value_dict: Dictionary with 'value' and 'type' keys, or raw value
347 | swap_values: Optional dictionary for value replacement
348 |
349 | Returns:
350 | Processed value (string, int, or float)
351 | """
352 | if isinstance(value_dict, dict):
353 | value: str | int = value_dict["value"]
354 | else:
355 | value: str | int = value_dict # raw value
356 |
357 | if swap_values:
358 | swap_key: str = field if field else ""
359 | if swap_key in swap_values and value in swap_values[swap_key]:
360 | value = swap_values[swap_key][value]
361 |
362 | if field and field in self.parser.numeric_fields:
363 | try:
364 | return int(value)
365 | except ValueError:
366 | try:
367 | return float(value)
368 | except ValueError:
369 | raise ValueError(f"Field {field} requires a numeric value, got: {value}")
370 |
371 | return value
372 |
```
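
The `__format_search_term` rules above are easy to verify in isolation: phrases are quoted, wildcards get a trailing `*` (quoted first when hyphens or spaces would otherwise split the token), and hyphenated terms are quoted so FTS5 does not read the hyphen as negation. A minimal standalone sketch of those same rules, with a hypothetical function name and written outside the class purely for illustration:

```python
# Illustrative sketch mirroring the __format_search_term rules above;
# not the package's actual implementation.
def format_search_term(value: str, value_type: str) -> str:
    if value_type == "phrase":
        return f'"{value}"'                     # phrases are always quoted
    if value_type == "wildcard":
        # quote first when hyphens/spaces would otherwise split the token
        return f'"{value}"*' if "-" in value or " " in value else f"{value}*"
    # plain terms: quote hyphenated values like one-click so the hyphen
    # is not parsed as FTS5 negation (-term)
    return f'"{value}"' if "-" in value else value

assert format_search_term("quick start", "phrase") == '"quick start"'
assert format_search_term("deploy", "wildcard") == "deploy*"
assert format_search_term("one-click", "term") == '"one-click"'
```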
--------------------------------------------------------------------------------
/docs/mcp_server_webcrawl.utils.html:
--------------------------------------------------------------------------------
```html
1 |
2 |
3 | <!DOCTYPE html>
4 | <html class="writer-html5" lang="en" data-content_root="./">
5 | <head>
6 | <meta charset="utf-8" /><meta name="viewport" content="width=device-width, initial-scale=1" />
7 |
8 | <meta name="viewport" content="width=device-width, initial-scale=1.0" />
9 | <title>mcp_server_webcrawl.utils package — mcp-server-webcrawl documentation</title>
10 | <link rel="stylesheet" type="text/css" href="_static/pygments.css?v=80d5e7a1" />
11 | <link rel="stylesheet" type="text/css" href="_static/css/theme.css?v=e59714d7" />
12 |
13 |
14 | <script src="_static/jquery.js?v=5d32c60e"></script>
15 | <script src="_static/_sphinx_javascript_frameworks_compat.js?v=2cd50e6c"></script>
16 | <script src="_static/documentation_options.js?v=5929fcd5"></script>
17 | <script src="_static/doctools.js?v=888ff710"></script>
18 | <script src="_static/sphinx_highlight.js?v=dc90522c"></script>
19 | <script src="_static/js/theme.js"></script>
20 | <link rel="index" title="Index" href="genindex.html" />
21 | <link rel="search" title="Search" href="search.html" />
22 | <link rel="prev" title="mcp_server_webcrawl.templates package" href="mcp_server_webcrawl.templates.html" />
23 | </head>
24 |
25 | <body class="wy-body-for-nav">
26 | <div class="wy-grid-for-nav">
27 | <nav data-toggle="wy-nav-shift" class="wy-nav-side">
28 | <div class="wy-side-scroll">
29 | <div class="wy-side-nav-search" >
30 |
31 |
32 |
33 | <a href="index.html" class="icon icon-home">
34 | mcp-server-webcrawl
35 | </a>
36 | <div role="search">
37 | <form id="rtd-search-form" class="wy-form" action="search.html" method="get">
38 | <input type="text" name="q" placeholder="Search docs" aria-label="Search docs" />
39 | <input type="hidden" name="check_keywords" value="yes" />
40 | <input type="hidden" name="area" value="default" />
41 | </form>
42 | </div>
43 | </div><div class="wy-menu wy-menu-vertical" data-spy="affix" role="navigation" aria-label="Navigation menu">
44 | <p class="caption" role="heading"><span class="caption-text">Contents:</span></p>
45 | <ul class="current">
46 | <li class="toctree-l1"><a class="reference internal" href="installation.html">Installation</a></li>
47 | <li class="toctree-l1"><a class="reference internal" href="guides.html">Setup Guides</a></li>
48 | <li class="toctree-l1"><a class="reference internal" href="usage.html">Usage</a></li>
49 | <li class="toctree-l1"><a class="reference internal" href="prompts.html">Prompt Routines</a></li>
50 | <li class="toctree-l1"><a class="reference internal" href="interactive.html">Interactive Mode</a></li>
51 | <li class="toctree-l1 current"><a class="reference internal" href="modules.html">mcp_server_webcrawl</a><ul class="current">
52 | <li class="toctree-l2 current"><a class="reference internal" href="mcp_server_webcrawl.html">mcp_server_webcrawl package</a></li>
53 | </ul>
54 | </li>
55 | </ul>
56 |
57 | </div>
58 | </div>
59 | </nav>
60 |
61 | <section data-toggle="wy-nav-shift" class="wy-nav-content-wrap"><nav class="wy-nav-top" aria-label="Mobile navigation menu" >
62 | <i data-toggle="wy-nav-top" class="fa fa-bars"></i>
63 | <a href="index.html">mcp-server-webcrawl</a>
64 | </nav>
65 |
66 | <div class="wy-nav-content">
67 | <div class="rst-content">
68 | <div role="navigation" aria-label="Page navigation">
69 | <ul class="wy-breadcrumbs">
70 | <li><a href="index.html" class="icon icon-home" aria-label="Home"></a></li>
71 | <li class="breadcrumb-item"><a href="modules.html">mcp_server_webcrawl</a></li>
72 | <li class="breadcrumb-item"><a href="mcp_server_webcrawl.html">mcp_server_webcrawl package</a></li>
73 | <li class="breadcrumb-item active">mcp_server_webcrawl.utils package</li>
74 | <li class="wy-breadcrumbs-aside">
75 | <a href="_sources/mcp_server_webcrawl.utils.rst.txt" rel="nofollow"> View page source</a>
76 | </li>
77 | </ul>
78 | <hr/>
79 | </div>
80 | <div role="main" class="document" itemscope="itemscope" itemtype="http://schema.org/Article">
81 | <div itemprop="articleBody">
82 |
83 | <section id="mcp-server-webcrawl-utils-package">
84 | <h1>mcp_server_webcrawl.utils package<a class="headerlink" href="#mcp-server-webcrawl-utils-package" title="Link to this heading"></a></h1>
85 | <section id="submodules">
86 | <h2>Submodules<a class="headerlink" href="#submodules" title="Link to this heading"></a></h2>
87 | </section>
88 | <section id="module-mcp_server_webcrawl.utils.cli">
89 | <span id="mcp-server-webcrawl-utils-cli-module"></span><h2>mcp_server_webcrawl.utils.cli module<a class="headerlink" href="#module-mcp_server_webcrawl.utils.cli" title="Link to this heading"></a></h2>
90 | <dl class="py function">
91 | <dt class="sig sig-object py" id="mcp_server_webcrawl.utils.cli.get_help_short_message">
92 | <span class="sig-name descname"><span class="pre">get_help_short_message</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">version</span></span></em><span class="sig-paren">)</span><a class="reference internal" href="_modules/mcp_server_webcrawl/utils/cli.html#get_help_short_message"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#mcp_server_webcrawl.utils.cli.get_help_short_message" title="Link to this definition"></a></dt>
93 | <dd><dl class="field-list simple">
94 | <dt class="field-odd">Parameters<span class="colon">:</span></dt>
95 | <dd class="field-odd"><p><strong>version</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.14)"><em>str</em></a>) – </p>
96 | </dd>
97 | <dt class="field-even">Return type<span class="colon">:</span></dt>
98 | <dd class="field-even"><p><a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.14)">str</a></p>
99 | </dd>
100 | </dl>
101 | </dd></dl>
102 |
103 | <dl class="py function">
104 | <dt class="sig sig-object py" id="mcp_server_webcrawl.utils.cli.get_help_long_message">
105 | <span class="sig-name descname"><span class="pre">get_help_long_message</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">version</span></span></em><span class="sig-paren">)</span><a class="reference internal" href="_modules/mcp_server_webcrawl/utils/cli.html#get_help_long_message"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#mcp_server_webcrawl.utils.cli.get_help_long_message" title="Link to this definition"></a></dt>
106 | <dd><dl class="field-list simple">
107 | <dt class="field-odd">Parameters<span class="colon">:</span></dt>
108 | <dd class="field-odd"><p><strong>version</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.14)"><em>str</em></a>) – </p>
109 | </dd>
110 | <dt class="field-even">Return type<span class="colon">:</span></dt>
111 | <dd class="field-even"><p><a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.14)">str</a></p>
112 | </dd>
113 | </dl>
114 | </dd></dl>
115 |
116 | </section>
117 | <section id="module-mcp_server_webcrawl.utils.logger">
118 | <span id="mcp-server-webcrawl-utils-logger-module"></span><h2>mcp_server_webcrawl.utils.logger module<a class="headerlink" href="#module-mcp_server_webcrawl.utils.logger" title="Link to this heading"></a></h2>
119 | <dl class="py function">
120 | <dt class="sig sig-object py" id="mcp_server_webcrawl.utils.logger.get_logger_configuration">
121 | <span class="sig-name descname"><span class="pre">get_logger_configuration</span></span><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="reference internal" href="_modules/mcp_server_webcrawl/utils/logger.html#get_logger_configuration"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#mcp_server_webcrawl.utils.logger.get_logger_configuration" title="Link to this definition"></a></dt>
122 | <dd><p>Get log name, path, and level (in that order)</p>
123 | <dl class="field-list simple">
124 | <dt class="field-odd">Returns<span class="colon">:</span></dt>
125 | <dd class="field-odd"><p>A tuple containing name, path, and level</p>
126 | </dd>
127 | <dt class="field-even">Return type<span class="colon">:</span></dt>
128 | <dd class="field-even"><p><a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#tuple" title="(in Python v3.14)">tuple</a>[<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.14)">str</a>, Path, <a class="reference external" href="https://docs.python.org/3/library/functions.html#int" title="(in Python v3.14)">int</a>]</p>
129 | </dd>
130 | </dl>
131 | </dd></dl>
132 |
133 | <dl class="py function">
134 | <dt class="sig sig-object py" id="mcp_server_webcrawl.utils.logger.get_logger">
135 | <span class="sig-name descname"><span class="pre">get_logger</span></span><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="reference internal" href="_modules/mcp_server_webcrawl/utils/logger.html#get_logger"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#mcp_server_webcrawl.utils.logger.get_logger" title="Link to this definition"></a></dt>
136 | <dd><p>Get logger, usually in order to write to it</p>
137 | <dl class="field-list simple">
138 | <dt class="field-odd">Returns<span class="colon">:</span></dt>
139 | <dd class="field-odd"><p>a writable logging object (error/warn/info/debug)</p>
140 | </dd>
141 | <dt class="field-even">Return type<span class="colon">:</span></dt>
142 | <dd class="field-even"><p>Logger</p>
143 | </dd>
144 | </dl>
145 | </dd></dl>
146 |
147 | <dl class="py function">
148 | <dt class="sig sig-object py" id="mcp_server_webcrawl.utils.logger.initialize_logger">
149 | <span class="sig-name descname"><span class="pre">initialize_logger</span></span><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="reference internal" href="_modules/mcp_server_webcrawl/utils/logger.html#initialize_logger"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#mcp_server_webcrawl.utils.logger.initialize_logger" title="Link to this definition"></a></dt>
150 | <dd><p>Validate and set up logger for writing</p>
151 | <dl class="field-list simple">
152 | <dt class="field-odd">Returns<span class="colon">:</span></dt>
153 | <dd class="field-odd"><p>None</p>
154 | </dd>
155 | <dt class="field-even">Return type<span class="colon">:</span></dt>
156 | <dd class="field-even"><p>None</p>
157 | </dd>
158 | </dl>
159 | </dd></dl>
160 |
161 | </section>
162 | <section id="module-mcp_server_webcrawl.utils.server">
163 | <span id="mcp-server-webcrawl-utils-server-module"></span><h2>mcp_server_webcrawl.utils.server module<a class="headerlink" href="#module-mcp_server_webcrawl.utils.server" title="Link to this heading"></a></h2>
164 | <dl class="py function">
165 | <dt class="sig sig-object py" id="mcp_server_webcrawl.utils.server.initialize_mcp_server">
166 | <span class="sig-name descname"><span class="pre">initialize_mcp_server</span></span><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="reference internal" href="_modules/mcp_server_webcrawl/utils/server.html#initialize_mcp_server"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#mcp_server_webcrawl.utils.server.initialize_mcp_server" title="Link to this definition"></a></dt>
167 | <dd><p>MCP stdio streams require utf-8 explicitly set for Windows (default cp1252)
168 | or internationalized content will fail.</p>
169 | <dl class="field-list simple">
170 | <dt class="field-odd">Return type<span class="colon">:</span></dt>
171 | <dd class="field-odd"><p>None</p>
172 | </dd>
173 | </dl>
174 | </dd></dl>
175 |
176 | </section>
177 | <section id="module-mcp_server_webcrawl.utils.tools">
178 | <span id="mcp-server-webcrawl-utils-tools-module"></span><h2>mcp_server_webcrawl.utils.tools module<a class="headerlink" href="#module-mcp_server_webcrawl.utils.tools" title="Link to this heading"></a></h2>
179 | <dl class="py function">
180 | <dt class="sig sig-object py" id="mcp_server_webcrawl.utils.tools.get_crawler_tools">
181 | <span class="sig-name descname"><span class="pre">get_crawler_tools</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">sites</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">None</span></span></em><span class="sig-paren">)</span><a class="reference internal" href="_modules/mcp_server_webcrawl/utils/tools.html#get_crawler_tools"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#mcp_server_webcrawl.utils.tools.get_crawler_tools" title="Link to this definition"></a></dt>
182 | <dd><p>Generate crawler tools based on available sites.</p>
183 | <dl class="field-list simple">
184 | <dt class="field-odd">Parameters<span class="colon">:</span></dt>
185 | <dd class="field-odd"><p><strong>sites</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#list" title="(in Python v3.14)"><em>list</em></a><em>[</em><a class="reference internal" href="mcp_server_webcrawl.models.html#mcp_server_webcrawl.models.sites.SiteResult" title="mcp_server_webcrawl.models.sites.SiteResult"><em>SiteResult</em></a><em>] </em><em>| </em><em>None</em>) – optional list of site results to include in tool descriptions</p>
186 | </dd>
187 | <dt class="field-even">Returns<span class="colon">:</span></dt>
188 | <dd class="field-even"><p>List of Tool objects for sites and resources</p>
189 | </dd>
190 | </dl>
191 | </dd></dl>
192 |
193 | </section>
194 | <section id="module-mcp_server_webcrawl.utils">
195 | <span id="module-contents"></span><h2>Module contents<a class="headerlink" href="#module-mcp_server_webcrawl.utils" title="Link to this heading"></a></h2>
196 | <dl class="py function">
197 | <dt class="sig sig-object py" id="mcp_server_webcrawl.utils.to_isoformat_zulu">
198 | <span class="sig-name descname"><span class="pre">to_isoformat_zulu</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">dt</span></span></em><span class="sig-paren">)</span><a class="reference internal" href="_modules/mcp_server_webcrawl/utils.html#to_isoformat_zulu"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#mcp_server_webcrawl.utils.to_isoformat_zulu" title="Link to this definition"></a></dt>
199 | <dd><p>Convert datetime to iso Z.</p>
200 | <p>python<=3.10 struggles with Z and fractions of seconds, will
201 | throw. smooth out the iso string, second precision isn’t key here</p>
202 | <dl class="field-list simple">
203 | <dt class="field-odd">Parameters<span class="colon">:</span></dt>
204 | <dd class="field-odd"><p><strong>dt</strong> (<a class="reference external" href="https://docs.python.org/3/library/datetime.html#datetime.datetime" title="(in Python v3.14)"><em>datetime</em></a>) – </p>
205 | </dd>
206 | </dl>
207 | </dd></dl>
208 |
209 | <dl class="py function">
210 | <dt class="sig sig-object py" id="mcp_server_webcrawl.utils.from_isoformat_zulu">
211 | <span class="sig-name descname"><span class="pre">from_isoformat_zulu</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">dt_string</span></span></em><span class="sig-paren">)</span><a class="reference internal" href="_modules/mcp_server_webcrawl/utils.html#from_isoformat_zulu"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#mcp_server_webcrawl.utils.from_isoformat_zulu" title="Link to this definition"></a></dt>
212 | <dd><p>Convert ISO string to datetime.</p>
213 | <p>python<=3.10 struggles with Z and fractions of seconds, will
214 | throw. smooth out the iso string, second precision isn’t key here</p>
215 | <dl class="field-list simple">
216 | <dt class="field-odd">Parameters<span class="colon">:</span></dt>
217 | <dd class="field-odd"><p><strong>dt_string</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.14)"><em>str</em></a><em> | </em><em>None</em>) – </p>
218 | </dd>
219 | <dt class="field-even">Return type<span class="colon">:</span></dt>
220 | <dd class="field-even"><p><a class="reference external" href="https://docs.python.org/3/library/datetime.html#datetime.datetime" title="(in Python v3.14)"><em>datetime</em></a></p>
221 | </dd>
222 | </dl>
223 | </dd></dl>
224 |
225 | </section>
226 | </section>
227 |
228 |
229 | </div>
230 | </div>
231 | <footer><div class="rst-footer-buttons" role="navigation" aria-label="Footer">
232 | <a href="mcp_server_webcrawl.templates.html" class="btn btn-neutral float-left" title="mcp_server_webcrawl.templates package" accesskey="p" rel="prev"><span class="fa fa-arrow-circle-left" aria-hidden="true"></span> Previous</a>
233 | </div>
234 |
235 | <hr/>
236 |
237 | <div role="contentinfo">
238 | <p>© Copyright 2025, pragmar.</p>
239 | </div>
240 |
241 | Built with <a href="https://www.sphinx-doc.org/">Sphinx</a> using a
242 | <a href="https://github.com/readthedocs/sphinx_rtd_theme">theme</a>
243 | provided by <a href="https://readthedocs.org">Read the Docs</a>.
244 |
245 |
246 | </footer>
247 | </div>
248 | </div>
249 | </section>
250 | </div>
251 | <script>
252 | jQuery(function () {
253 | SphinxRtdTheme.Navigation.enable(true);
254 | });
255 | </script>
256 |
257 | </body>
258 | </html>
```
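
The `to_isoformat_zulu`/`from_isoformat_zulu` helpers documented above exist because `datetime.fromisoformat` on Python <= 3.10 rejects a trailing `Z` and some fractional-second forms. A hedged sketch of that kind of smoothing, with hypothetical helper names (the real implementations live in `mcp_server_webcrawl/utils` and are not reproduced here):

```python
from datetime import datetime, timezone

def from_isoformat_zulu_sketch(dt_string: str) -> datetime:
    # assumes a UTC "Z" timestamp; fromisoformat on <=3.10 rejects "Z",
    # so rewrite it as "+00:00" and drop the fractional seconds
    cleaned = dt_string.strip().replace("Z", "+00:00")
    if "." in cleaned:
        head, _, tail = cleaned.partition(".")
        offset = tail[tail.index("+"):] if "+" in tail else ""
        cleaned = head + offset
    return datetime.fromisoformat(cleaned)

def to_isoformat_zulu_sketch(dt: datetime) -> str:
    # whole-second precision, rendered with a Z suffix
    utc = dt.astimezone(timezone.utc).replace(microsecond=0)
    return utc.isoformat().replace("+00:00", "Z")

print(from_isoformat_zulu_sketch("2025-01-02T03:04:05.678901Z"))   # 2025-01-02 03:04:05+00:00
print(to_isoformat_zulu_sketch(datetime(2025, 1, 2, 3, 4, 5, tzinfo=timezone.utc)))  # 2025-01-02T03:04:05Z
```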
--------------------------------------------------------------------------------
/src/mcp_server_webcrawl/interactive/views/document.py:
--------------------------------------------------------------------------------
```python
1 | import curses
2 | import textwrap
3 |
4 | from dataclasses import dataclass
5 | from typing import TYPE_CHECKING, Optional
6 |
7 | from mcp_server_webcrawl.interactive.ui import DocumentMode, ThemeDefinition, ViewBounds
8 | from mcp_server_webcrawl.interactive.views.base import BaseCursesView, CONTENT_MARGIN
9 | from mcp_server_webcrawl.interactive.highlights import HighlightProcessor, HighlightSpan
10 | from mcp_server_webcrawl.models.resources import ResourceResult
11 | from mcp_server_webcrawl.interactive.ui import safe_addstr
12 | if TYPE_CHECKING:
13 | from mcp_server_webcrawl.interactive.session import InteractiveSession
14 |
15 | DOCUMENT_MODE_NEXT: dict[DocumentMode, DocumentMode] = {
16 | DocumentMode.MARKDOWN: DocumentMode.RAW,
17 | DocumentMode.RAW: DocumentMode.HEADERS,
18 | DocumentMode.HEADERS: DocumentMode.MARKDOWN
19 | }
20 |
21 | @dataclass
22 | class DocumentLineData:
23 | """
24 | Container for processed document line data with highlights.
25 | """
26 | original_line: str
27 | clean_text: str
28 | highlights: list[HighlightSpan]
29 |
30 |
31 | class SearchDocumentView(BaseCursesView):
32 | """
33 | Document viewer with markdown/raw/headers modes, scrolling support, and search highlighting.
34 | """
35 |
36 | def __init__(self, session: 'InteractiveSession'):
37 | """
38 | Initialize the document view.
39 |
40 | Args:
41 | session: The interactive session instance
42 | """
43 | super().__init__(session)
44 | self.__document: Optional[ResourceResult] = None
45 | self.__scroll_offset: int = 0
46 | self.__document_mode: DocumentMode = DocumentMode.MARKDOWN
47 | self.__cached_content_lines: Optional[list[str]] = None
48 | self.__cached_mode: Optional[DocumentMode] = None
49 | self.__cached_query: Optional[str] = None
50 | self.__search_terms: list[str] = []
51 |
52 | @property
53 | def document_mode(self) -> DocumentMode:
54 | return self.__document_mode
55 |
56 | @property
57 | def scroll_offset(self) -> int:
58 | return self.__scroll_offset
59 |
60 | @property
61 | def url(self) -> str:
62 | return self.__document.url if self.__document else ""
63 |
64 | def clear(self) -> None:
65 | """
66 | Clear the document.
67 | """
68 | self.__document = None
69 | self.__scroll_offset = 0
70 | self.__invalidate_cache()
71 |
72 | def draw_inner_footer(self, stdscr: curses.window, bounds: ViewBounds, text: str) -> None:
73 | """
74 | Draw document footer with scroll position and mode switcher.
75 |
76 | Args:
77 | stdscr: The curses window to draw on
78 | bounds: The view bounds defining the drawing area
79 | text: The footer text to display
80 | """
81 | if not self.__document:
82 | super().draw_inner_footer(stdscr, bounds, text)
83 | return
84 |
85 | style: int = self._get_inner_header_style()
86 | footer_y: int = bounds.y + bounds.height - 1
87 |
88 | terminal_height: int
89 | terminal_height, _ = stdscr.getmaxyx()
90 | if footer_y >= terminal_height:
91 | return
92 |
93 | content_lines: list[str] = self.__get_content_lines()
94 | content_height: int = max(0, bounds.height - 4)
95 | total_lines: int = len(content_lines)
96 | showing_start: int = self.__scroll_offset + 1
97 | showing_end: int = min(total_lines, self.__scroll_offset + content_height)
98 | left_info: str = f"Viewing lines {showing_start}-{showing_end} of {total_lines}"
99 | modes: list[tuple[str, DocumentMode]] = [
100 | (" MD ", DocumentMode.MARKDOWN),
101 | (" RAW ", DocumentMode.RAW),
102 | (" HDR ", DocumentMode.HEADERS)
103 | ]
104 |
105 | mode_buttons_width: int = sum(len(mode_name) for mode_name, _ in modes)
106 |
107 | mode_start_x: int = bounds.width - mode_buttons_width - 1
108 | document_mode_style: int = self.session.get_theme_color_pair(ThemeDefinition.DOCUMENT_MODE)
109 | safe_addstr(stdscr, footer_y, 0, self._get_bounded_line(), style)
110 | safe_addstr(stdscr, footer_y, 1, left_info, style)
111 | if mode_start_x > len(left_info) + 3:
112 | current_x: int = mode_start_x
113 | for mode_name, mode_enum in modes:
114 | is_current: bool = self.__document_mode == mode_enum
115 | mode_style: int = document_mode_style if is_current else style
116 | if current_x + len(mode_name) <= bounds.width:
117 | safe_addstr(stdscr, footer_y, current_x, mode_name, mode_style)
118 | current_x += len(mode_name)
119 |
120 | def handle_input(self, key: int) -> bool:
121 | """
122 | Handle document navigation input.
123 |
124 | Args:
125 | key: The curses key code from user input
126 |
127 | Returns:
128 | bool: True if the input was handled, False otherwise
129 | """
130 | if not self._focused or not self.__document:
131 | return False
132 |
133 | handlers: dict[int, callable] = {
134 | curses.KEY_UP: self.__scroll_up,
135 | curses.KEY_DOWN: self.__scroll_down,
136 | curses.KEY_LEFT: self.__jump_to_previous_highlight,
137 | curses.KEY_RIGHT: self.__jump_to_next_highlight,
138 | curses.KEY_PPAGE: lambda: self.__scroll_page_up(max(1, self.bounds.height - 4)),
139 | curses.KEY_NPAGE: lambda: self.__scroll_page_down(max(1, self.bounds.height - 4)),
140 | curses.KEY_HOME: self.__scroll_to_top,
141 | curses.KEY_END: self.__scroll_to_bottom,
142 | ord('\t'): self.__cycle_mode,
143 | }
144 |
145 | handler = handlers.get(key)
146 | if handler:
147 | handler()
148 | return True
149 |
150 | return False
151 |
152 | def render(self, stdscr: curses.window) -> None:
153 | """
154 | Render the document view within bounds with search highlighting.
155 |
156 | Args:
157 | stdscr: The curses window to draw on
158 | """
159 | if not self._renderable(stdscr):
160 | return
161 | if not self.__document:
162 | self.__render_no_document(stdscr)
163 | return
164 |
165 | xb: int = self.bounds.x
166 | yb: int = self.bounds.y
167 | y_current: int = yb + 2
168 | y_max: int = yb + self.bounds.height
169 |
170 | content_height: int = max(0, self.bounds.height - 4)
171 | content_width: int = self.bounds.width - 4
172 | content_lines: list[str] = self.__get_content_lines()
173 | visible_lines: list[str] = content_lines[self.__scroll_offset: self.__scroll_offset + content_height]
174 |
175 | self.__update_search_terms()
176 |
177 | for i, line in enumerate(visible_lines):
178 | line_y: int = y_current + i
179 | if line_y >= self.bounds.height:
180 | break
181 |
182 | if self.__search_terms and line.strip():
183 | self.__render_line_with_highlights(stdscr, line, line_y, 2, content_width)
184 | else:
185 | display_line: str = line[:content_width] if len(line) > content_width else line
186 | safe_addstr(stdscr, line_y, 2, display_line)
187 |
188 | def update(self, document: ResourceResult) -> None:
189 | """
190 | Update the document and reset scroll position.
191 |
192 | Args:
193 | document: The resource result document to display
194 | """
195 | self.__document = document
196 | self.__scroll_offset = 0
197 | self.__invalidate_cache()
198 |
199 | def __calculate_max_scroll(self) -> int:
200 | """
201 | Calculate maximum scroll offset based on content and view size.
202 |
203 | Returns:
204 | int: The maximum scroll offset value
205 | """
206 | if not self.__document:
207 | return 0
208 |
209 | content_lines: list[str] = self.__get_content_lines()
210 | content_height: int = max(0, self.bounds.height - 4)
211 |
212 | return max(0, len(content_lines) - content_height)
213 |
214 | def __cycle_mode(self) -> None:
215 | """
216 | Cycle to the next document mode.
217 | """
218 | self.__document_mode = DOCUMENT_MODE_NEXT.get(
219 | self.__document_mode,
220 | DocumentMode.MARKDOWN
221 | )
222 | self.__scroll_offset = 0
223 | self.__invalidate_cache()
224 |
225 | def __get_content_lines(self) -> list[str]:
226 | """
227 | Get content lines based on current mode with caching.
228 |
229 | Returns:
230 | list[str]: The content lines for the current document mode
231 | """
232 | current_query: str = self.session.searchform.query if hasattr(self.session, 'searchform') else ""
233 |
234 | if (self.__cached_content_lines is not None and
235 | self.__cached_mode == self.__document_mode and
236 | self.__cached_query == current_query):
237 | return self.__cached_content_lines
238 |
239 | if not self.__document:
240 | return []
241 |
242 | content_lines: list[str]
243 | if self.__document_mode == DocumentMode.MARKDOWN:
244 | content_lines = self.__get_markdown_lines()
245 | elif self.__document_mode == DocumentMode.RAW:
246 | content_lines = self.__get_raw_lines()
247 | elif self.__document_mode == DocumentMode.HEADERS:
248 | content_lines = self.__get_header_lines()
249 | else:
250 | content_lines = ["Unknown document mode"]
251 |
252 | self.__cached_content_lines = content_lines
253 | self.__cached_mode = self.__document_mode
254 | self.__cached_query = current_query
255 |
256 | return content_lines
257 |
258 | def __get_header_lines(self) -> list[str]:
259 | """
260 | Get headers with proper wrapping.
261 |
262 | Returns:
263 | list[str]: The wrapped header lines
264 | """
265 | if not self.__document.headers:
266 | return ["No headers available for this resource."]
267 |
268 | return self.__wrap_text_content(self.__document.headers)
269 |
270 | def __get_markdown_lines(self) -> list[str]:
271 | """
272 | Get markdown content with proper wrapping.
273 |
274 | Returns:
275 | list[str]: The wrapped markdown content lines
276 | """
277 | raw_markdown: str = self.__document.get_extra("markdown")
278 | if not raw_markdown:
279 | return ["", "Markdown unavailable for this resource."]
280 |
281 | return self.__wrap_text_content(raw_markdown)
282 |
283 | def __get_raw_lines(self) -> list[str]:
284 | """
285 | Get raw content with proper wrapping.
286 |
287 | Returns:
288 | list[str]: The wrapped raw content lines
289 | """
290 | if not self.__document.content:
291 | return ["No raw content available for this resource."]
292 |
293 | return self.__wrap_text_content(self.__document.content.strip())
294 |
295 | def __invalidate_cache(self) -> None:
296 | """
297 | Invalidate cached content lines.
298 | """
299 | self.__cached_content_lines = None
300 | self.__cached_mode = None
301 | self.__cached_query = None
302 |
303 | def __jump_to_next_highlight(self) -> None:
304 | """
305 | Jump to next highlight, positioning it at line 5 of screen.
306 | """
307 | if not self.__search_terms:
308 | return
309 |
310 | content_lines: list[str] = self.__get_content_lines()
311 | current_line: int = self.__scroll_offset + 3
312 |
313 | for line_num in range(current_line + 1, len(content_lines)):
314 | highlights: list[HighlightSpan] = HighlightProcessor.find_highlights_in_text(
315 | content_lines[line_num],
316 | self.__search_terms
317 | )
318 | if highlights:
319 | self.__scroll_offset = max(0, line_num - 3)
320 | return
321 |
322 | for line_num in range(0, current_line + 1):
323 | highlights: list[HighlightSpan] = HighlightProcessor.find_highlights_in_text(
324 | content_lines[line_num],
325 | self.__search_terms
326 | )
327 | if highlights:
328 | self.__scroll_offset = max(0, line_num - 3)
329 | return
330 |
331 | def __jump_to_previous_highlight(self) -> None:
332 | """
333 | Jump to previous highlight, positioning it at line 5 of screen.
334 | """
335 | if not self.__search_terms:
336 | return
337 |
338 | content_lines: list[str] = self.__get_content_lines()
339 | current_line: int = self.__scroll_offset + 3
340 |
341 | for line_num in range(current_line - 1, -1, -1):
342 | highlights: list[HighlightSpan] = HighlightProcessor.find_highlights_in_text(
343 | content_lines[line_num],
344 | self.__search_terms
345 | )
346 | if highlights:
347 | self.__scroll_offset = max(0, line_num - 3)
348 | return
349 |
350 | for line_num in range(len(content_lines) - 1, current_line - 1, -1):
351 | highlights: list[HighlightSpan] = HighlightProcessor.find_highlights_in_text(
352 | content_lines[line_num],
353 | self.__search_terms
354 | )
355 | if highlights:
356 | self.__scroll_offset = max(0, line_num - 3)
357 | return
358 |
359 | def __render_line_with_highlights(self, stdscr: curses.window, line: str, y: int, x: int, max_width: int) -> None:
360 | """
361 | Render a line with search term highlighting using the shared utility.
362 |
363 | Args:
364 | stdscr: The curses window to draw on
365 | line: The text line to render
366 | y: Y position to render at
367 | x: X position to render at
368 | max_width: Maximum width for rendering
369 | """
370 | if not line.strip():
371 | return
372 |
373 | highlights: list[HighlightSpan] = HighlightProcessor.find_highlights_in_text(line, self.__search_terms)
374 | normal_style: int = curses.A_NORMAL
375 | highlight_style: int = self.session.get_theme_color_pair(ThemeDefinition.SNIPPET_HIGHLIGHT)
376 | HighlightProcessor.render_text_with_highlights(
377 | stdscr, line, highlights, x, y, max_width, normal_style, highlight_style
378 | )
379 |
380 | def __render_no_document(self, stdscr: curses.window) -> None:
381 | """
382 | Render message when no document is loaded.
383 |
384 | Args:
385 | stdscr: The curses window to draw on
386 | """
387 | x: int = self.bounds.x
388 | y: int = self.bounds.y
389 | width: int = self.bounds.width
390 | height: int = self.bounds.height
391 |
392 | if height > 2 and width > 20:
393 | safe_addstr(stdscr, y + 2, x + 2, "No document loaded.", curses.A_DIM)
394 |
395 | def __scroll_down(self, lines: int = 1) -> None:
396 | """
397 | Scroll down by specified number of lines.
398 |
399 | Args:
400 | lines: Number of lines to scroll down
401 | """
402 | max_scroll: int = self.__calculate_max_scroll()
403 | self.__scroll_offset = min(max_scroll, self.__scroll_offset + lines)
404 |
405 | def __scroll_page_down(self, page_size: int = 10) -> None:
406 | """
407 | Scroll down by page.
408 |
409 | Args:
410 | page_size: Number of lines to scroll for a page
411 | """
412 | self.__scroll_down(page_size)
413 |
414 | def __scroll_page_up(self, page_size: int = 10) -> None:
415 | """
416 | Scroll up by page.
417 |
418 | Args:
419 | page_size: Number of lines to scroll for a page
420 | """
421 | self.__scroll_up(page_size)
422 |
423 | def __scroll_to_bottom(self) -> None:
424 | """
425 | Scroll to bottom of document.
426 | """
427 | self.__scroll_offset = self.__calculate_max_scroll()
428 |
429 | def __scroll_to_top(self) -> None:
430 | """
431 | Scroll to top of document.
432 | """
433 | self.__scroll_offset = 0
434 |
435 | def __scroll_up(self, lines: int = 1) -> None:
436 | """
437 | Scroll up by specified number of lines.
438 |
439 | Args:
440 | lines: Number of lines to scroll up
441 | """
442 | self.__scroll_offset = max(0, self.__scroll_offset - lines)
443 |
444 | def __update_search_terms(self) -> None:
445 | """
446 | Update search terms from current search form query using shared utility.
447 | """
448 | if hasattr(self.session, 'searchform') and self.session.searchform:
449 | query: str = self.session.searchform.query
450 | self.__search_terms = HighlightProcessor.extract_search_terms(query)
451 | else:
452 | self.__search_terms = []
453 |
454 | def __wrap_text_content(self, raw_text: str) -> list[str]:
455 | """
456 | Wrap text content for display with proper line handling.
457 |
458 | Args:
459 | raw_text: The raw text content to wrap
460 |
461 | Returns:
462 | list[str]: The wrapped text lines
463 | """
464 | if not raw_text:
465 | return []
466 |
467 | content_width: int = max(20, self.bounds.width - CONTENT_MARGIN)
468 | wrapped_lines: list[str] = []
469 | text_lines: list[str] = raw_text.split("\n")
470 |
471 | for line in text_lines:
472 | if not line.strip():
473 | wrapped_lines.append("")
474 | else:
475 | wrapped: str = textwrap.fill(
476 | line.rstrip(),
477 | width=content_width,
478 | expand_tabs=True,
479 | replace_whitespace=False,
480 | break_long_words=True,
481 | break_on_hyphens=True
482 | )
483 | wrapped_lines.extend(wrapped.split("\n"))
484 |
485 | return wrapped_lines
486 |
```
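
Two small patterns above carry most of the viewer's navigation behavior: `DOCUMENT_MODE_NEXT` cycles the display mode via a lookup table, and `__calculate_max_scroll` clamps the scroll offset to the wrapped content height. A minimal standalone sketch of both, using a stand-in enum since the real `DocumentMode` lives in `interactive.ui`:

```python
from enum import Enum, auto

class Mode(Enum):  # stand-in for DocumentMode
    MARKDOWN = auto()
    RAW = auto()
    HEADERS = auto()

# MD -> RAW -> HDR -> MD, mirroring DOCUMENT_MODE_NEXT
MODE_NEXT = {Mode.MARKDOWN: Mode.RAW, Mode.RAW: Mode.HEADERS, Mode.HEADERS: Mode.MARKDOWN}

def cycle_mode(current: Mode) -> Mode:
    # default back to MARKDOWN if the mode is somehow unknown
    return MODE_NEXT.get(current, Mode.MARKDOWN)

def clamp_scroll(offset: int, total_lines: int, view_height: int) -> int:
    # never scroll past the last full page of wrapped content
    max_scroll = max(0, total_lines - view_height)
    return max(0, min(offset, max_scroll))

assert cycle_mode(Mode.HEADERS) is Mode.MARKDOWN
assert clamp_scroll(500, total_lines=120, view_height=40) == 80
```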
--------------------------------------------------------------------------------
/docs/mcp_server_webcrawl.crawlers.html:
--------------------------------------------------------------------------------
```html
1 |
2 |
3 | <!DOCTYPE html>
4 | <html class="writer-html5" lang="en" data-content_root="./">
5 | <head>
6 | <meta charset="utf-8" /><meta name="viewport" content="width=device-width, initial-scale=1" />
7 |
8 | <meta name="viewport" content="width=device-width, initial-scale=1.0" />
9 | <title>mcp_server_webcrawl.crawlers package — mcp-server-webcrawl documentation</title>
10 | <link rel="stylesheet" type="text/css" href="_static/pygments.css?v=80d5e7a1" />
11 | <link rel="stylesheet" type="text/css" href="_static/css/theme.css?v=e59714d7" />
12 |
13 |
14 | <script src="_static/jquery.js?v=5d32c60e"></script>
15 | <script src="_static/_sphinx_javascript_frameworks_compat.js?v=2cd50e6c"></script>
16 | <script src="_static/documentation_options.js?v=5929fcd5"></script>
17 | <script src="_static/doctools.js?v=888ff710"></script>
18 | <script src="_static/sphinx_highlight.js?v=dc90522c"></script>
19 | <script src="_static/js/theme.js"></script>
20 | <link rel="index" title="Index" href="genindex.html" />
21 | <link rel="search" title="Search" href="search.html" />
22 | <link rel="next" title="mcp_server_webcrawl.crawlers.base package" href="mcp_server_webcrawl.crawlers.base.html" />
23 | <link rel="prev" title="mcp_server_webcrawl package" href="mcp_server_webcrawl.html" />
24 | </head>
25 |
26 | <body class="wy-body-for-nav">
27 | <div class="wy-grid-for-nav">
28 | <nav data-toggle="wy-nav-shift" class="wy-nav-side">
29 | <div class="wy-side-scroll">
30 | <div class="wy-side-nav-search" >
31 |
32 |
33 |
34 | <a href="index.html" class="icon icon-home">
35 | mcp-server-webcrawl
36 | </a>
37 | <div role="search">
38 | <form id="rtd-search-form" class="wy-form" action="search.html" method="get">
39 | <input type="text" name="q" placeholder="Search docs" aria-label="Search docs" />
40 | <input type="hidden" name="check_keywords" value="yes" />
41 | <input type="hidden" name="area" value="default" />
42 | </form>
43 | </div>
44 | </div><div class="wy-menu wy-menu-vertical" data-spy="affix" role="navigation" aria-label="Navigation menu">
45 | <p class="caption" role="heading"><span class="caption-text">Contents:</span></p>
46 | <ul class="current">
47 | <li class="toctree-l1"><a class="reference internal" href="installation.html">Installation</a></li>
48 | <li class="toctree-l1"><a class="reference internal" href="guides.html">Setup Guides</a></li>
49 | <li class="toctree-l1"><a class="reference internal" href="usage.html">Usage</a></li>
50 | <li class="toctree-l1"><a class="reference internal" href="prompts.html">Prompt Routines</a></li>
51 | <li class="toctree-l1"><a class="reference internal" href="interactive.html">Interactive Mode</a></li>
52 | <li class="toctree-l1 current"><a class="reference internal" href="modules.html">mcp_server_webcrawl</a><ul class="current">
53 | <li class="toctree-l2 current"><a class="reference internal" href="mcp_server_webcrawl.html">mcp_server_webcrawl package</a></li>
54 | </ul>
55 | </li>
56 | </ul>
57 |
58 | </div>
59 | </div>
60 | </nav>
61 |
62 | <section data-toggle="wy-nav-shift" class="wy-nav-content-wrap"><nav class="wy-nav-top" aria-label="Mobile navigation menu" >
63 | <i data-toggle="wy-nav-top" class="fa fa-bars"></i>
64 | <a href="index.html">mcp-server-webcrawl</a>
65 | </nav>
66 |
67 | <div class="wy-nav-content">
68 | <div class="rst-content">
69 | <div role="navigation" aria-label="Page navigation">
70 | <ul class="wy-breadcrumbs">
71 | <li><a href="index.html" class="icon icon-home" aria-label="Home"></a></li>
72 | <li class="breadcrumb-item"><a href="modules.html">mcp_server_webcrawl</a></li>
73 | <li class="breadcrumb-item"><a href="mcp_server_webcrawl.html">mcp_server_webcrawl package</a></li>
74 | <li class="breadcrumb-item active">mcp_server_webcrawl.crawlers package</li>
75 | <li class="wy-breadcrumbs-aside">
76 | <a href="_sources/mcp_server_webcrawl.crawlers.rst.txt" rel="nofollow"> View page source</a>
77 | </li>
78 | </ul>
79 | <hr/>
80 | </div>
81 | <div role="main" class="document" itemscope="itemscope" itemtype="http://schema.org/Article">
82 | <div itemprop="articleBody">
83 |
84 | <section id="mcp-server-webcrawl-crawlers-package">
85 | <h1>mcp_server_webcrawl.crawlers package<a class="headerlink" href="#mcp-server-webcrawl-crawlers-package" title="Link to this heading"></a></h1>
86 | <section id="subpackages">
87 | <h2>Subpackages<a class="headerlink" href="#subpackages" title="Link to this heading"></a></h2>
88 | <div class="toctree-wrapper compound">
89 | <ul>
90 | <li class="toctree-l1"><a class="reference internal" href="mcp_server_webcrawl.crawlers.base.html">mcp_server_webcrawl.crawlers.base package</a><ul>
91 | <li class="toctree-l2"><a class="reference internal" href="mcp_server_webcrawl.crawlers.base.html#submodules">Submodules</a></li>
92 | <li class="toctree-l2"><a class="reference internal" href="mcp_server_webcrawl.crawlers.base.html#module-mcp_server_webcrawl.crawlers.base.adapter">mcp_server_webcrawl.crawlers.base.adapter module</a></li>
93 | <li class="toctree-l2"><a class="reference internal" href="mcp_server_webcrawl.crawlers.base.html#module-mcp_server_webcrawl.crawlers.base.api">mcp_server_webcrawl.crawlers.base.api module</a></li>
94 | <li class="toctree-l2"><a class="reference internal" href="mcp_server_webcrawl.crawlers.base.html#module-mcp_server_webcrawl.crawlers.base.crawler">mcp_server_webcrawl.crawlers.base.crawler module</a></li>
95 | <li class="toctree-l2"><a class="reference internal" href="mcp_server_webcrawl.crawlers.base.html#module-mcp_server_webcrawl.crawlers.base.indexed">mcp_server_webcrawl.crawlers.base.indexed module</a></li>
96 | <li class="toctree-l2"><a class="reference internal" href="mcp_server_webcrawl.crawlers.base.html#module-mcp_server_webcrawl.crawlers.base.tests">mcp_server_webcrawl.crawlers.base.tests module</a></li>
97 | <li class="toctree-l2"><a class="reference internal" href="mcp_server_webcrawl.crawlers.base.html#module-mcp_server_webcrawl.crawlers.base">Module contents</a></li>
98 | </ul>
99 | </li>
100 | <li class="toctree-l1"><a class="reference internal" href="mcp_server_webcrawl.crawlers.archivebox.html">mcp_server_webcrawl.crawlers.archivebox package</a><ul>
101 | <li class="toctree-l2"><a class="reference internal" href="mcp_server_webcrawl.crawlers.archivebox.html#submodules">Submodules</a></li>
102 | <li class="toctree-l2"><a class="reference internal" href="mcp_server_webcrawl.crawlers.archivebox.html#module-mcp_server_webcrawl.crawlers.archivebox.adapter">mcp_server_webcrawl.crawlers.archivebox.adapter module</a></li>
103 | <li class="toctree-l2"><a class="reference internal" href="mcp_server_webcrawl.crawlers.archivebox.html#module-mcp_server_webcrawl.crawlers.archivebox.crawler">mcp_server_webcrawl.crawlers.archivebox.crawler module</a></li>
104 | <li class="toctree-l2"><a class="reference internal" href="mcp_server_webcrawl.crawlers.archivebox.html#module-mcp_server_webcrawl.crawlers.archivebox.tests">mcp_server_webcrawl.crawlers.archivebox.tests module</a></li>
105 | <li class="toctree-l2"><a class="reference internal" href="mcp_server_webcrawl.crawlers.archivebox.html#module-mcp_server_webcrawl.crawlers.archivebox">Module contents</a></li>
106 | </ul>
107 | </li>
108 | <li class="toctree-l1"><a class="reference internal" href="mcp_server_webcrawl.crawlers.httrack.html">mcp_server_webcrawl.crawlers.httrack package</a><ul>
109 | <li class="toctree-l2"><a class="reference internal" href="mcp_server_webcrawl.crawlers.httrack.html#submodules">Submodules</a></li>
110 | <li class="toctree-l2"><a class="reference internal" href="mcp_server_webcrawl.crawlers.httrack.html#module-mcp_server_webcrawl.crawlers.httrack.adapter">mcp_server_webcrawl.crawlers.httrack.adapter module</a></li>
111 | <li class="toctree-l2"><a class="reference internal" href="mcp_server_webcrawl.crawlers.httrack.html#module-mcp_server_webcrawl.crawlers.httrack.crawler">mcp_server_webcrawl.crawlers.httrack.crawler module</a></li>
112 | <li class="toctree-l2"><a class="reference internal" href="mcp_server_webcrawl.crawlers.httrack.html#module-mcp_server_webcrawl.crawlers.httrack.tests">mcp_server_webcrawl.crawlers.httrack.tests module</a></li>
113 | <li class="toctree-l2"><a class="reference internal" href="mcp_server_webcrawl.crawlers.httrack.html#module-mcp_server_webcrawl.crawlers.httrack">Module contents</a></li>
114 | </ul>
115 | </li>
116 | <li class="toctree-l1"><a class="reference internal" href="mcp_server_webcrawl.crawlers.interrobot.html">mcp_server_webcrawl.crawlers.interrobot package</a><ul>
117 | <li class="toctree-l2"><a class="reference internal" href="mcp_server_webcrawl.crawlers.interrobot.html#submodules">Submodules</a></li>
118 | <li class="toctree-l2"><a class="reference internal" href="mcp_server_webcrawl.crawlers.interrobot.html#module-mcp_server_webcrawl.crawlers.interrobot.adapter">mcp_server_webcrawl.crawlers.interrobot.adapter module</a></li>
119 | <li class="toctree-l2"><a class="reference internal" href="mcp_server_webcrawl.crawlers.interrobot.html#module-mcp_server_webcrawl.crawlers.interrobot.crawler">mcp_server_webcrawl.crawlers.interrobot.crawler module</a></li>
120 | <li class="toctree-l2"><a class="reference internal" href="mcp_server_webcrawl.crawlers.interrobot.html#module-mcp_server_webcrawl.crawlers.interrobot.tests">mcp_server_webcrawl.crawlers.interrobot.tests module</a></li>
121 | <li class="toctree-l2"><a class="reference internal" href="mcp_server_webcrawl.crawlers.interrobot.html#module-mcp_server_webcrawl.crawlers.interrobot">Module contents</a></li>
122 | </ul>
123 | </li>
124 | <li class="toctree-l1"><a class="reference internal" href="mcp_server_webcrawl.crawlers.katana.html">mcp_server_webcrawl.crawlers.katana package</a><ul>
125 | <li class="toctree-l2"><a class="reference internal" href="mcp_server_webcrawl.crawlers.katana.html#submodules">Submodules</a></li>
126 | <li class="toctree-l2"><a class="reference internal" href="mcp_server_webcrawl.crawlers.katana.html#module-mcp_server_webcrawl.crawlers.katana.adapter">mcp_server_webcrawl.crawlers.katana.adapter module</a></li>
127 | <li class="toctree-l2"><a class="reference internal" href="mcp_server_webcrawl.crawlers.katana.html#module-mcp_server_webcrawl.crawlers.katana.crawler">mcp_server_webcrawl.crawlers.katana.crawler module</a></li>
128 | <li class="toctree-l2"><a class="reference internal" href="mcp_server_webcrawl.crawlers.katana.html#module-mcp_server_webcrawl.crawlers.katana.tests">mcp_server_webcrawl.crawlers.katana.tests module</a></li>
129 | <li class="toctree-l2"><a class="reference internal" href="mcp_server_webcrawl.crawlers.katana.html#module-mcp_server_webcrawl.crawlers.katana">Module contents</a></li>
130 | </ul>
131 | </li>
132 | <li class="toctree-l1"><a class="reference internal" href="mcp_server_webcrawl.crawlers.siteone.html">mcp_server_webcrawl.crawlers.siteone package</a><ul>
133 | <li class="toctree-l2"><a class="reference internal" href="mcp_server_webcrawl.crawlers.siteone.html#submodules">Submodules</a></li>
134 | <li class="toctree-l2"><a class="reference internal" href="mcp_server_webcrawl.crawlers.siteone.html#module-mcp_server_webcrawl.crawlers.siteone.adapter">mcp_server_webcrawl.crawlers.siteone.adapter module</a></li>
135 | <li class="toctree-l2"><a class="reference internal" href="mcp_server_webcrawl.crawlers.siteone.html#module-mcp_server_webcrawl.crawlers.siteone.crawler">mcp_server_webcrawl.crawlers.siteone.crawler module</a></li>
136 | <li class="toctree-l2"><a class="reference internal" href="mcp_server_webcrawl.crawlers.siteone.html#module-mcp_server_webcrawl.crawlers.siteone.tests">mcp_server_webcrawl.crawlers.siteone.tests module</a></li>
137 | <li class="toctree-l2"><a class="reference internal" href="mcp_server_webcrawl.crawlers.siteone.html#module-mcp_server_webcrawl.crawlers.siteone">Module contents</a></li>
138 | </ul>
139 | </li>
140 | <li class="toctree-l1"><a class="reference internal" href="mcp_server_webcrawl.crawlers.warc.html">mcp_server_webcrawl.crawlers.warc package</a><ul>
141 | <li class="toctree-l2"><a class="reference internal" href="mcp_server_webcrawl.crawlers.warc.html#submodules">Submodules</a></li>
142 | <li class="toctree-l2"><a class="reference internal" href="mcp_server_webcrawl.crawlers.warc.html#module-mcp_server_webcrawl.crawlers.warc.adapter">mcp_server_webcrawl.crawlers.warc.adapter module</a></li>
143 | <li class="toctree-l2"><a class="reference internal" href="mcp_server_webcrawl.crawlers.warc.html#module-mcp_server_webcrawl.crawlers.warc.crawler">mcp_server_webcrawl.crawlers.warc.crawler module</a></li>
144 | <li class="toctree-l2"><a class="reference internal" href="mcp_server_webcrawl.crawlers.warc.html#module-mcp_server_webcrawl.crawlers.warc.tests">mcp_server_webcrawl.crawlers.warc.tests module</a></li>
145 | <li class="toctree-l2"><a class="reference internal" href="mcp_server_webcrawl.crawlers.warc.html#module-mcp_server_webcrawl.crawlers.warc">Module contents</a></li>
146 | </ul>
147 | </li>
148 | <li class="toctree-l1"><a class="reference internal" href="mcp_server_webcrawl.crawlers.wget.html">mcp_server_webcrawl.crawlers.wget package</a><ul>
149 | <li class="toctree-l2"><a class="reference internal" href="mcp_server_webcrawl.crawlers.wget.html#submodules">Submodules</a></li>
150 | <li class="toctree-l2"><a class="reference internal" href="mcp_server_webcrawl.crawlers.wget.html#module-mcp_server_webcrawl.crawlers.wget.adapter">mcp_server_webcrawl.crawlers.wget.adapter module</a></li>
151 | <li class="toctree-l2"><a class="reference internal" href="mcp_server_webcrawl.crawlers.wget.html#module-mcp_server_webcrawl.crawlers.wget.crawler">mcp_server_webcrawl.crawlers.wget.crawler module</a></li>
152 | <li class="toctree-l2"><a class="reference internal" href="mcp_server_webcrawl.crawlers.wget.html#module-mcp_server_webcrawl.crawlers.wget.tests">mcp_server_webcrawl.crawlers.wget.tests module</a></li>
153 | <li class="toctree-l2"><a class="reference internal" href="mcp_server_webcrawl.crawlers.wget.html#module-mcp_server_webcrawl.crawlers.wget">Module contents</a></li>
154 | </ul>
155 | </li>
156 | </ul>
157 | </div>
158 | </section>
159 | <section id="module-mcp_server_webcrawl.crawlers">
160 | <span id="module-contents"></span><h2>Module contents<a class="headerlink" href="#module-mcp_server_webcrawl.crawlers" title="Link to this heading"></a></h2>
161 | <dl class="py function">
162 | <dt class="sig sig-object py" id="mcp_server_webcrawl.crawlers.get_fixture_directory">
163 | <span class="sig-name descname"><span class="pre">get_fixture_directory</span></span><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="reference internal" href="_modules/mcp_server_webcrawl/crawlers.html#get_fixture_directory"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#mcp_server_webcrawl.crawlers.get_fixture_directory" title="Link to this definition"></a></dt>
164 | <dd><dl class="field-list simple">
165 | <dt class="field-odd">Return type<span class="colon">:</span></dt>
166 | <dd class="field-odd"><p><a class="reference external" href="https://docs.python.org/3/library/pathlib.html#pathlib.Path" title="(in Python v3.14)"><em>Path</em></a></p>
167 | </dd>
168 | </dl>
169 | </dd></dl>
170 |
171 | <dl class="py function">
172 | <dt class="sig sig-object py" id="mcp_server_webcrawl.crawlers.get_crawler">
173 | <span class="sig-name descname"><span class="pre">get_crawler</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">crawler_name</span></span></em><span class="sig-paren">)</span><a class="reference internal" href="_modules/mcp_server_webcrawl/crawlers.html#get_crawler"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#mcp_server_webcrawl.crawlers.get_crawler" title="Link to this definition"></a></dt>
174 | <dd><p>lazy load crawler, some classes have additional package dependencies</p>
175 | <dl class="field-list simple">
176 | <dt class="field-odd">Parameters<span class="colon">:</span></dt>
177 | <dd class="field-odd"><p><strong>crawler_name</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.14)"><em>str</em></a>) – </p>
178 | </dd>
179 | <dt class="field-even">Return type<span class="colon">:</span></dt>
180 | <dd class="field-even"><p><a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.14)">str</a> | None</p>
181 | </dd>
182 | </dl>
183 | </dd></dl>
184 |
185 | </section>
186 | </section>
187 |
188 |
189 | </div>
190 | </div>
191 | <footer><div class="rst-footer-buttons" role="navigation" aria-label="Footer">
192 | <a href="mcp_server_webcrawl.html" class="btn btn-neutral float-left" title="mcp_server_webcrawl package" accesskey="p" rel="prev"><span class="fa fa-arrow-circle-left" aria-hidden="true"></span> Previous</a>
193 | <a href="mcp_server_webcrawl.crawlers.base.html" class="btn btn-neutral float-right" title="mcp_server_webcrawl.crawlers.base package" accesskey="n" rel="next">Next <span class="fa fa-arrow-circle-right" aria-hidden="true"></span></a>
194 | </div>
195 |
196 | <hr/>
197 |
198 | <div role="contentinfo">
199 | <p>© Copyright 2025, pragmar.</p>
200 | </div>
201 |
202 | Built with <a href="https://www.sphinx-doc.org/">Sphinx</a> using a
203 | <a href="https://github.com/readthedocs/sphinx_rtd_theme">theme</a>
204 | provided by <a href="https://readthedocs.org">Read the Docs</a>.
205 |
206 |
207 | </footer>
208 | </div>
209 | </div>
210 | </section>
211 | </div>
212 | <script>
213 | jQuery(function () {
214 | SphinxRtdTheme.Navigation.enable(true);
215 | });
216 | </script>
217 |
218 | </body>
219 | </html>
```
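
`get_crawler` is documented above as a lazy loader: crawler modules are imported only on demand because some pull in extra package dependencies. A sketch of that pattern using `importlib`; the registry entries and class names below are placeholders, not the package's actual mapping:

```python
import importlib

# placeholder registry: crawler name -> (module path, class name)
_CRAWLERS = {
    "wget": ("mcp_server_webcrawl.crawlers.wget.crawler", "WgetCrawler"),
    "warc": ("mcp_server_webcrawl.crawlers.warc.crawler", "WarcCrawler"),
}

def get_crawler_sketch(crawler_name: str):
    # import only when asked for, so optional dependencies of unused
    # crawlers are never loaded
    entry = _CRAWLERS.get(crawler_name)
    if entry is None:
        return None
    module_path, class_name = entry
    module = importlib.import_module(module_path)
    return getattr(module, class_name)
```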
--------------------------------------------------------------------------------
/docs/_modules/mcp_server_webcrawl/crawlers/interrobot/crawler.html:
--------------------------------------------------------------------------------
```html
1 |
2 |
3 | <!DOCTYPE html>
4 | <html class="writer-html5" lang="en" data-content_root="../../../../">
5 | <head>
6 | <meta charset="utf-8" />
7 | <meta name="viewport" content="width=device-width, initial-scale=1.0" />
8 | <title>mcp_server_webcrawl.crawlers.interrobot.crawler — mcp-server-webcrawl documentation</title>
9 | <link rel="stylesheet" type="text/css" href="../../../../_static/pygments.css?v=80d5e7a1" />
10 | <link rel="stylesheet" type="text/css" href="../../../../_static/css/theme.css?v=e59714d7" />
11 |
12 |
13 | <script src="../../../../_static/jquery.js?v=5d32c60e"></script>
14 | <script src="../../../../_static/_sphinx_javascript_frameworks_compat.js?v=2cd50e6c"></script>
15 | <script src="../../../../_static/documentation_options.js?v=5929fcd5"></script>
16 | <script src="../../../../_static/doctools.js?v=888ff710"></script>
17 | <script src="../../../../_static/sphinx_highlight.js?v=dc90522c"></script>
18 | <script src="../../../../_static/js/theme.js"></script>
19 | <link rel="index" title="Index" href="../../../../genindex.html" />
20 | <link rel="search" title="Search" href="../../../../search.html" />
21 | </head>
22 |
23 | <body class="wy-body-for-nav">
24 | <div class="wy-grid-for-nav">
25 | <nav data-toggle="wy-nav-shift" class="wy-nav-side">
26 | <div class="wy-side-scroll">
27 | <div class="wy-side-nav-search" >
28 |
29 |
30 |
31 | <a href="../../../../index.html" class="icon icon-home">
32 | mcp-server-webcrawl
33 | </a>
34 | <div role="search">
35 | <form id="rtd-search-form" class="wy-form" action="../../../../search.html" method="get">
36 | <input type="text" name="q" placeholder="Search docs" aria-label="Search docs" />
37 | <input type="hidden" name="check_keywords" value="yes" />
38 | <input type="hidden" name="area" value="default" />
39 | </form>
40 | </div>
41 | </div><div class="wy-menu wy-menu-vertical" data-spy="affix" role="navigation" aria-label="Navigation menu">
42 | <p class="caption" role="heading"><span class="caption-text">Contents:</span></p>
43 | <ul>
44 | <li class="toctree-l1"><a class="reference internal" href="../../../../installation.html">Installation</a></li>
45 | <li class="toctree-l1"><a class="reference internal" href="../../../../guides.html">Setup Guides</a></li>
46 | <li class="toctree-l1"><a class="reference internal" href="../../../../usage.html">Usage</a></li>
47 | <li class="toctree-l1"><a class="reference internal" href="../../../../prompts.html">Prompt Routines</a></li>
48 | <li class="toctree-l1"><a class="reference internal" href="../../../../interactive.html">Interactive Mode</a></li>
49 | <li class="toctree-l1"><a class="reference internal" href="../../../../modules.html">mcp_server_webcrawl</a></li>
50 | </ul>
51 |
52 | </div>
53 | </div>
54 | </nav>
55 |
56 | <section data-toggle="wy-nav-shift" class="wy-nav-content-wrap"><nav class="wy-nav-top" aria-label="Mobile navigation menu" >
57 | <i data-toggle="wy-nav-top" class="fa fa-bars"></i>
58 | <a href="../../../../index.html">mcp-server-webcrawl</a>
59 | </nav>
60 |
61 | <div class="wy-nav-content">
62 | <div class="rst-content">
63 | <div role="navigation" aria-label="Page navigation">
64 | <ul class="wy-breadcrumbs">
65 | <li><a href="../../../../index.html" class="icon icon-home" aria-label="Home"></a></li>
66 | <li class="breadcrumb-item"><a href="../../../index.html">Module code</a></li>
67 | <li class="breadcrumb-item"><a href="../../crawlers.html">mcp_server_webcrawl.crawlers</a></li>
68 | <li class="breadcrumb-item active">mcp_server_webcrawl.crawlers.interrobot.crawler</li>
69 | <li class="wy-breadcrumbs-aside">
70 | </li>
71 | </ul>
72 | <hr/>
73 | </div>
74 | <div role="main" class="document" itemscope="itemscope" itemtype="http://schema.org/Article">
75 | <div itemprop="articleBody">
76 |
77 | <h1>Source code for mcp_server_webcrawl.crawlers.interrobot.crawler</h1><div class="highlight"><pre>
78 | <span></span>
79 | <span class="kn">from</span> <span class="nn">pathlib</span> <span class="kn">import</span> <span class="n">Path</span>
80 |
81 | <span class="kn">from</span> <span class="nn">mcp.types</span> <span class="kn">import</span> <span class="n">Tool</span>
82 |
83 | <span class="kn">from</span> <span class="nn">mcp_server_webcrawl.models.sites</span> <span class="kn">import</span> <span class="n">SiteResult</span>
84 | <span class="kn">from</span> <span class="nn">mcp_server_webcrawl.models.resources</span> <span class="kn">import</span> <span class="p">(</span>
85 | <span class="n">RESOURCES_FIELDS_DEFAULT</span><span class="p">,</span>
86 | <span class="n">RESOURCES_FIELDS_BASE</span><span class="p">,</span>
87 | <span class="n">RESOURCES_DEFAULT_SORT_MAPPING</span><span class="p">,</span>
88 | <span class="n">RESOURCES_FIELDS_OPTIONS</span><span class="p">,</span>
89 | <span class="p">)</span>
90 | <span class="kn">from</span> <span class="nn">mcp_server_webcrawl.crawlers.base.crawler</span> <span class="kn">import</span> <span class="n">BaseCrawler</span>
91 | <span class="kn">from</span> <span class="nn">mcp_server_webcrawl.crawlers.interrobot.adapter</span> <span class="kn">import</span> <span class="p">(</span>
92 | <span class="n">get_sites</span><span class="p">,</span>
93 | <span class="n">get_resources</span><span class="p">,</span>
94 | <span class="n">INTERROBOT_RESOURCE_FIELD_MAPPING</span><span class="p">,</span>
95 | <span class="n">INTERROBOT_SITE_FIELD_MAPPING</span><span class="p">,</span>
96 | <span class="n">INTERROBOT_SITE_FIELD_REQUIRED</span><span class="p">,</span>
97 | <span class="p">)</span>
98 | <span class="kn">from</span> <span class="nn">mcp_server_webcrawl.utils.tools</span> <span class="kn">import</span> <span class="n">get_crawler_tools</span>
99 | <span class="kn">from</span> <span class="nn">mcp_server_webcrawl.utils.logger</span> <span class="kn">import</span> <span class="n">get_logger</span>
100 |
101 | <span class="n">logger</span> <span class="o">=</span> <span class="n">get_logger</span><span class="p">()</span>
102 |
103 |
104 | <div class="viewcode-block" id="InterroBotCrawler">
105 | <a class="viewcode-back" href="../../../../mcp_server_webcrawl.crawlers.interrobot.html#mcp_server_webcrawl.crawlers.interrobot.crawler.InterroBotCrawler">[docs]</a>
106 | <span class="k">class</span> <span class="nc">InterroBotCrawler</span><span class="p">(</span><span class="n">BaseCrawler</span><span class="p">):</span>
107 | <span class="w"> </span><span class="sd">"""</span>
108 | <span class="sd"> A crawler implementation for InterroBot data sources.</span>
109 | <span class="sd"> Provides functionality for accessing and searching web content from InterroBot.</span>
110 | <span class="sd"> """</span>
111 |
112 | <div class="viewcode-block" id="InterroBotCrawler.__init__">
113 | <a class="viewcode-back" href="../../../../mcp_server_webcrawl.crawlers.interrobot.html#mcp_server_webcrawl.crawlers.interrobot.crawler.InterroBotCrawler.__init__">[docs]</a>
114 | <span class="k">def</span> <span class="fm">__init__</span><span class="p">(</span>
115 | <span class="bp">self</span><span class="p">,</span>
116 | <span class="n">datasrc</span><span class="p">:</span> <span class="n">Path</span><span class="p">,</span>
117 | <span class="p">)</span> <span class="o">-></span> <span class="kc">None</span><span class="p">:</span>
118 | <span class="w"> </span><span class="sd">"""</span>
119 | <span class="sd"> Initialize the InterroBotCrawler with a data source path and required adapter functions.</span>
120 |
121 | <span class="sd"> Args:</span>
122 | <span class="sd"> datasrc: Path to the data source</span>
123 | <span class="sd"> """</span>
124 | <span class="nb">super</span><span class="p">()</span><span class="o">.</span><span class="fm">__init__</span><span class="p">(</span><span class="n">datasrc</span><span class="p">,</span> <span class="n">get_sites</span><span class="p">,</span> <span class="n">get_resources</span><span class="p">,</span> <span class="n">resource_field_mapping</span><span class="o">=</span><span class="n">INTERROBOT_RESOURCE_FIELD_MAPPING</span><span class="p">)</span>
125 | <span class="k">assert</span> <span class="n">datasrc</span><span class="o">.</span><span class="n">is_file</span><span class="p">()</span> <span class="ow">and</span> <span class="n">datasrc</span><span class="o">.</span><span class="n">suffix</span> <span class="o">==</span> <span class="s2">".db"</span><span class="p">,</span> <span class="sa">f</span><span class="s2">"</span><span class="si">{</span><span class="bp">self</span><span class="o">.</span><span class="vm">__class__</span><span class="o">.</span><span class="vm">__name__</span><span class="si">}</span><span class="s2"> datasrc must be a db file"</span></div>
126 |
127 |
128 | <div class="viewcode-block" id="InterroBotCrawler.mcp_list_tools">
129 | <a class="viewcode-back" href="../../../../mcp_server_webcrawl.crawlers.interrobot.html#mcp_server_webcrawl.crawlers.interrobot.crawler.InterroBotCrawler.mcp_list_tools">[docs]</a>
130 | <span class="k">async</span> <span class="k">def</span> <span class="nf">mcp_list_tools</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-></span> <span class="nb">list</span><span class="p">[</span><span class="n">Tool</span><span class="p">]:</span>
131 | <span class="w"> </span><span class="sd">"""</span>
132 | <span class="sd"> List available tools for this crawler.</span>
133 |
134 | <span class="sd"> Returns:</span>
135 | <span class="sd"> List of Tool objects</span>
136 | <span class="sd"> """</span>
137 | <span class="c1"># get the default crawler tools, then override necessary fields</span>
138 | <span class="n">all_sites</span><span class="p">:</span> <span class="nb">list</span><span class="p">[</span><span class="n">SiteResult</span><span class="p">]</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_adapter_get_sites</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_datasrc</span><span class="p">)</span>
139 | <span class="n">all_sites_ids</span><span class="p">:</span> <span class="nb">list</span><span class="p">[</span><span class="nb">int</span><span class="p">]</span> <span class="o">=</span> <span class="p">[</span><span class="n">s</span><span class="o">.</span><span class="n">id</span> <span class="k">for</span> <span class="n">s</span> <span class="ow">in</span> <span class="n">all_sites</span> <span class="k">if</span> <span class="n">s</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span> <span class="ow">and</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">s</span><span class="o">.</span><span class="n">id</span><span class="p">,</span> <span class="nb">int</span><span class="p">)]</span>
140 | <span class="n">default_tools</span><span class="p">:</span> <span class="nb">list</span><span class="p">[</span><span class="n">Tool</span><span class="p">]</span> <span class="o">=</span> <span class="n">get_crawler_tools</span><span class="p">(</span><span class="n">sites</span><span class="o">=</span><span class="n">all_sites</span><span class="p">)</span>
141 | <span class="k">assert</span> <span class="nb">len</span><span class="p">(</span><span class="n">default_tools</span><span class="p">)</span> <span class="o">==</span> <span class="mi">2</span><span class="p">,</span> <span class="s2">"expected exactly 2 Tools: sites and resources"</span>
142 |
143 | <span class="c1"># can replace get_crawler_tools or extend, here it is overwritten from default</span>
144 | <span class="c1"># you'd think maybe pass changes in, but no, it's better ad hoc</span>
145 | <span class="n">default_sites_tool</span><span class="p">:</span> <span class="n">Tool</span>
146 | <span class="n">default_resources_tool</span><span class="p">:</span> <span class="n">Tool</span>
147 | <span class="n">default_sites_tool</span><span class="p">,</span> <span class="n">default_resources_tool</span> <span class="o">=</span> <span class="n">default_tools</span>
148 | <span class="n">sites_field_options</span><span class="p">:</span> <span class="nb">list</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="nb">list</span><span class="p">(</span><span class="nb">set</span><span class="p">(</span><span class="n">INTERROBOT_SITE_FIELD_MAPPING</span><span class="o">.</span><span class="n">keys</span><span class="p">())</span> <span class="o">-</span> <span class="nb">set</span><span class="p">(</span><span class="n">INTERROBOT_SITE_FIELD_REQUIRED</span><span class="p">))</span>
149 | <span class="n">dst_props</span><span class="p">:</span> <span class="nb">dict</span> <span class="o">=</span> <span class="n">default_sites_tool</span><span class="o">.</span><span class="n">inputSchema</span><span class="p">[</span><span class="s2">"properties"</span><span class="p">]</span>
150 | <span class="n">dst_props</span><span class="p">[</span><span class="s2">"fields"</span><span class="p">][</span><span class="s2">"items"</span><span class="p">][</span><span class="s2">"enum"</span><span class="p">]</span> <span class="o">=</span> <span class="n">sites_field_options</span>
151 |
152 | <span class="n">resources_sort_options</span><span class="p">:</span> <span class="nb">list</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="nb">list</span><span class="p">(</span><span class="n">RESOURCES_DEFAULT_SORT_MAPPING</span><span class="o">.</span><span class="n">keys</span><span class="p">())</span>
153 | <span class="n">all_sites_display</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="s2">", "</span><span class="o">.</span><span class="n">join</span><span class="p">([</span><span class="sa">f</span><span class="s2">"</span><span class="si">{</span><span class="n">s</span><span class="o">.</span><span class="n">name</span><span class="si">}</span><span class="s2"> (site: </span><span class="si">{</span><span class="n">s</span><span class="o">.</span><span class="n">id</span><span class="si">}</span><span class="s2">)"</span> <span class="k">for</span> <span class="n">s</span> <span class="ow">in</span> <span class="n">all_sites</span><span class="p">])</span>
154 |
155 | <span class="n">drt_props</span><span class="p">:</span> <span class="nb">dict</span> <span class="o">=</span> <span class="n">default_resources_tool</span><span class="o">.</span><span class="n">inputSchema</span><span class="p">[</span><span class="s2">"properties"</span><span class="p">]</span>
156 | <span class="n">drt_props</span><span class="p">[</span><span class="s2">"fields"</span><span class="p">][</span><span class="s2">"items"</span><span class="p">][</span><span class="s2">"enum"</span><span class="p">]</span> <span class="o">=</span> <span class="n">RESOURCES_FIELDS_OPTIONS</span>
157 | <span class="n">drt_props</span><span class="p">[</span><span class="s2">"sort"</span><span class="p">][</span><span class="s2">"enum"</span><span class="p">]</span> <span class="o">=</span> <span class="n">resources_sort_options</span>
158 | <span class="n">drt_props</span><span class="p">[</span><span class="s2">"sites"</span><span class="p">][</span><span class="s2">"items"</span><span class="p">][</span><span class="s2">"enum"</span><span class="p">]</span> <span class="o">=</span> <span class="n">all_sites_ids</span>
159 | <span class="n">drt_props</span><span class="p">[</span><span class="s2">"sites"</span><span class="p">][</span><span class="s2">"description"</span><span class="p">]</span> <span class="o">=</span> <span class="p">(</span><span class="s2">"Optional "</span>
160 | <span class="s2">"list of project ID to filter search results to a specific site. In 95% "</span>
161 | <span class="s2">"of scenarios, you'd filter to only one site, but many site filtering is offered "</span>
162 | <span class="sa">f</span><span class="s2">"for advanced search scenarios. Available sites include </span><span class="si">{</span><span class="n">all_sites_display</span><span class="si">}</span><span class="s2">."</span><span class="p">)</span>
163 |
164 | <span class="k">return</span> <span class="p">[</span><span class="n">default_sites_tool</span><span class="p">,</span> <span class="n">default_resources_tool</span><span class="p">]</span></div>
165 | </div>
166 |
167 | </pre></div>
168 |
169 | </div>
170 | </div>
171 | <footer>
172 |
173 | <hr/>
174 |
175 | <div role="contentinfo">
176 | <p>© Copyright 2025, pragmar.</p>
177 | </div>
178 |
179 | Built with <a href="https://www.sphinx-doc.org/">Sphinx</a> using a
180 | <a href="https://github.com/readthedocs/sphinx_rtd_theme">theme</a>
181 | provided by <a href="https://readthedocs.org">Read the Docs</a>.
182 |
183 |
184 | </footer>
185 | </div>
186 | </div>
187 | </section>
188 | </div>
189 | <script>
190 | jQuery(function () {
191 | SphinxRtdTheme.Navigation.enable(true);
192 | });
193 | </script>
194 |
195 | </body>
196 | </html>
```
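The rendered source above shows how `InterroBotCrawler` narrows the default tool schemas to InterroBot's field, sort, and site-ID options. As a quick orientation, here is a minimal usage sketch that constructs the crawler against a hypothetical InterroBot database and runs a search through the inherited `get_resources_api` (defined in `base/crawler.py`, dumped next). The `./interrobot.db` path and site id `1` are placeholders for illustration, not values from this repository.

```python
# Minimal usage sketch; the database path and site id are assumed placeholders.
from pathlib import Path

from mcp_server_webcrawl.crawlers.interrobot.crawler import InterroBotCrawler

# the constructor asserts the datasrc is an existing .db file
crawler = InterroBotCrawler(Path("./interrobot.db"))

# get_resources_api is inherited from BaseCrawler; the "snippets" extra adds
# search snippets to each result when a non-empty query is supplied
api_result = crawler.get_resources_api(
    sites=[1],
    query="contact",
    limit=5,
    extras=["snippets"],
)
print(api_result.to_json())
```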
--------------------------------------------------------------------------------
/src/mcp_server_webcrawl/crawlers/base/crawler.py:
--------------------------------------------------------------------------------
```python
1 | import anyio
2 | import re
3 | import sqlite3
4 | import traceback
5 |
6 | from pathlib import Path
7 | from typing import Any, Callable, Final
8 | from urllib.parse import urlparse
9 |
10 | from mcp.server import NotificationOptions, Server
11 | from mcp.server.models import InitializationOptions
12 | from mcp.types import EmbeddedResource, ImageContent, TextContent, Tool
13 |
14 | from mcp_server_webcrawl.crawlers.base.api import BaseJsonApi
15 | from mcp_server_webcrawl.crawlers.base.adapter import IndexState
16 | from mcp_server_webcrawl.models.base import METADATA_VALUE_TYPE
17 | from mcp_server_webcrawl.models.sites import SITES_TOOL_NAME
18 | from mcp_server_webcrawl.models.resources import (
19 | ResourceResult,
20 | ResourceResultType,
21 | RESOURCES_DEFAULT_FIELD_MAPPING,
22 | RESOURCE_EXTRAS_ALLOWED,
23 | RESOURCES_TOOL_NAME,
24 | )
25 | from mcp_server_webcrawl.extras.thumbnails import ThumbnailManager
26 | from mcp_server_webcrawl.extras.markdown import get_markdown
27 | from mcp_server_webcrawl.extras.regex import get_regex
28 | from mcp_server_webcrawl.extras.snippets import get_snippets
29 | from mcp_server_webcrawl.extras.xpath import get_xpath
30 |
31 | from mcp_server_webcrawl.utils.logger import get_logger
32 |
33 | OVERRIDE_ERROR_MESSAGE: Final[str] = """BaseCrawler subclasses must implement the following \
34 | methods: mcp_list_tools, mcp_call_tool, at minimum."""
35 |
36 | logger = get_logger()
37 |
38 |
39 | class BaseCrawler:
40 | """
41 | Base crawler class that implements MCP server functionality.
42 |
43 | This class provides the foundation for specialized crawlers to interact with
44 | the MCP server and handle tool operations for web resources.
45 | """
46 |
47 | def __init__(
48 | self,
49 | datasrc: Path,
50 | get_sites_func: Callable,
51 | get_resources_func: Callable,
52 | resource_field_mapping: dict[str, str] = RESOURCES_DEFAULT_FIELD_MAPPING,
53 | ) -> None:
54 | """
55 | Initialize the BaseCrawler with a data source path and required adapter functions.
56 |
57 | Args:
58 | datasrc: path to the data source
59 | get_sites_func: function to retrieve sites from the data source
60 | get_resources_func: function to retrieve resources from the data source
61 | resource_field_mapping: mapping of resource field names to display names
62 | """
63 |
64 | from mcp_server_webcrawl import __name__ as module_name, __version__ as module_version
65 |
66 | assert datasrc is not None, f"{self.__class__.__name__} needs a datasrc, regardless of action"
67 | assert callable(get_sites_func), f"{self.__class__.__name__} requires a callable get_sites_func"
68 | assert callable(get_resources_func), f"{self.__class__.__name__} requires a callable get_resources_func"
69 | assert isinstance(resource_field_mapping, dict), f"{self.__class__.__name__} resource_field_mapping must be a dict"
70 |
71 | self._datasrc: Path = Path(datasrc)
72 |
73 | self._module_name: str = module_name
74 | self._module_version: str = module_version
75 |
76 | self._server = Server(self._module_name)
77 | self._server.list_tools()(self.mcp_list_tools)
78 | self._server.call_tool()(self.mcp_call_tool)
79 | self._server.list_prompts()(self.mcp_list_prompts)
80 | self._server.list_resources()(self.mcp_list_resources)
81 |
82 | self._resource_field_mapping = resource_field_mapping
83 | self._adapter_get_sites = get_sites_func
84 | self._adapter_get_resources = get_resources_func
85 |
86 | @property
87 | def datasrc(self) -> Path:
88 | return self._datasrc
89 |
90 | async def mcp_list_prompts(self) -> list:
91 | """List available prompts (currently none)."""
92 | return []
93 |
94 | async def mcp_list_resources(self) -> list:
95 | """List available resources (currently none)."""
96 | return []
97 |
98 | async def serve(self, stdin: anyio.AsyncFile[str] | None, stdout: anyio.AsyncFile[str] | None) -> dict[str, Any]:
99 | """
100 | Launch the awaitable server.
101 |
102 | Args:
103 | stdin: input stream for the server
104 | stdout: output stream for the server
105 |
106 | Returns:
107 | The MCP server over stdio
108 | """
109 | return await self._server.run(stdin, stdout, self.get_initialization_options())
110 |
111 | def get_initialization_options(self) -> InitializationOptions:
112 | """
113 | Get the MCP initialization object.
114 |
115 | Returns:
116 | Dictionary containing project information
117 |             InitializationOptions carrying the server name, version, and capabilities
118 | notification_events = NotificationOptions(prompts_changed=False, resources_changed=False, tools_changed=False)
119 | capabilities = self._server.get_capabilities(notification_options=notification_events, experimental_capabilities={})
120 | return InitializationOptions(server_name=self._module_name, server_version=self._module_version, capabilities=capabilities)
121 |
122 | def get_sites_api_json(self, **kwargs) -> str:
123 | """
124 | Get sites API result as JSON.
125 |
126 | Returns:
127 | JSON string of sites API results
128 | """
129 | json_result = self.get_sites_api(**kwargs)
130 | return json_result.to_json()
131 |
132 | def get_resources_api_json(self, **kwargs) -> str:
133 | """
134 | Get resources API result as JSON.
135 |
136 | Returns:
137 | JSON string of resources API results
138 | """
139 | json_result = self.get_resources_api(**kwargs)
140 | return json_result.to_json()
141 |
142 | def get_sites_api(
143 | self,
144 | ids: list[int] | None = None,
145 | fields: list[str] | None = None,
146 | ) -> BaseJsonApi:
147 | sites = self._adapter_get_sites(self._datasrc, ids=ids, fields=fields)
148 | sites_kwargs = {
149 | "ids": ids,
150 | "fields": fields,
151 | }
152 | json_result = BaseJsonApi("GetProjects", sites_kwargs)
153 | json_result.set_results(sites, len(sites), 0, len(sites))
154 | return json_result
155 |
156 | def get_resources_api(
157 | self,
158 | sites: list[int] | None = None,
159 | query: str = "",
160 | fields: list[str] | None = None,
161 | sort: str | None = None,
162 | limit: int = 20,
163 | offset: int = 0,
164 | extras: list[str] | None = None,
165 | extrasRegex: list[str] | None = None,
166 | extrasXpath: list[str] | None = None,
167 | ) -> BaseJsonApi:
168 | resources_kwargs: dict[str, METADATA_VALUE_TYPE] = {
169 | "sites": sites,
170 | "query": query,
171 | "fields": fields,
172 | "sort": sort,
173 | "limit": limit,
174 | "offset": offset,
175 | }
176 |
177 | def no_results() -> BaseJsonApi:
178 | api_result = BaseJsonApi("GetResources", resources_kwargs, index_state=IndexState())
179 | api_result.set_results([], 0, 0, limit)
180 | return api_result
181 |
182 | if not sites:
183 | all_sites = self._adapter_get_sites(self._datasrc)
184 | if not all_sites:
185 | return no_results()
186 | # set to default of all sites if not specified
187 | sites = [site.id for site in all_sites]
188 |
189 |         # sometimes the AI decides a quoted-empty or wildcard query is a good idea,
190 |         # but it amounts to no query; just normalize it here
191 |         if query.strip() in ('""', "''", "``", "*"):
192 | query = ""
193 |
194 | site_matches = self._adapter_get_sites(self._datasrc, ids=sites)
195 | if not site_matches:
196 | return no_results()
197 |
198 | extras = extras or []
199 | extrasXpath = extrasXpath or []
200 | extrasRegex = extrasRegex or []
201 | fields = fields or []
202 | fields_extras_override: list[str] = fields.copy()
203 |
204 | set_extras: set[str] = set(extras)
205 | set_extras_content: set[str] = {"markdown", "snippets", "xpath", "regex"}
206 | set_extras_headers: set[str] = {"snippets", "regex"}
207 | add_content: bool = bool(set_extras_content & set_extras)
208 | add_headers: bool = bool(set_extras_headers & set_extras)
209 |
210 | if add_content and "content" not in fields:
211 | fields_extras_override.append("content")
212 |
213 | if add_headers and "headers" not in fields:
214 | fields_extras_override.append("headers")
215 |
216 | results, total, index_state = self._adapter_get_resources(
217 | self._datasrc,
218 | sites=sites,
219 | query=query,
220 | fields=fields_extras_override,
221 | sort=sort,
222 | limit=limit,
223 | offset=offset,
224 | )
225 |
226 | if "markdown" in extras:
227 | result: ResourceResult
228 | for result in results:
229 | markdown_result: str | None = None
230 | if result.type == ResourceResultType.PAGE:
231 | markdown_result = get_markdown(result.content)
232 | result.set_extra("markdown", markdown_result)
233 |
234 | if "xpath" in extras:
235 | result: ResourceResult
236 | for result in results:
237 | xpath_result: list[dict[str, str | int | float]] = get_xpath(result.content, extrasXpath)
238 | result.set_extra("xpath", xpath_result)
239 |
240 | if "regex" in extras:
241 | result: ResourceResult
242 | for result in results:
243 | regex_result: list[dict[str, str | int | float]] = get_regex(result.headers, result.content, extrasRegex)
244 | result.set_extra("regex", regex_result)
245 |
246 | if "snippets" in extras and query.strip():
247 | result: ResourceResult
248 | for result in results:
249 | snippets: str | None = get_snippets(result.url, result.headers, result.content, query)
250 | result.set_extra("snippets", snippets)
251 |
252 | extras_only_fields = set(fields_extras_override) - set(fields)
253 | if extras_only_fields:
254 | for result in results:
255 | for field in extras_only_fields:
256 | if hasattr(result, field):
257 | setattr(result, field, None)
258 |
259 | # note: thumbnails extra a special case, handled in mcp_call_tool
260 | api_result = BaseJsonApi("GetResources", resources_kwargs, index_state=index_state)
261 | api_result.set_results(results, total, offset, limit)
262 | return api_result
263 |
264 | async def mcp_list_tools(self) -> list[Tool]:
265 | """
266 | List available tools.
267 |
268 | Returns:
269 | List of available tools
270 |
271 | Raises:
272 | NotImplementedError: This method must be implemented by subclasses
273 | """
274 | # each crawler subclass must provide this method
275 | raise NotImplementedError(OVERRIDE_ERROR_MESSAGE)
276 |
277 | async def mcp_call_tool(self, name: str, arguments: dict[str, Any] | None
278 | ) -> list[TextContent | ImageContent | EmbeddedResource]:
279 | """
280 |         Handle tool execution requests. Subclasses can override this, or call super() and tweak the result.
281 |         Basically, it is a passthrough to the sites and resources APIs.
282 |
283 | Args:
284 | name: name of the tool to call
285 | arguments: arguments to pass to the tool
286 |
287 | Returns:
288 | List of content objects resulting from the tool execution
289 |
290 | Raises:
291 | ValueError: If the specified tool does not exist
292 | """
293 | try:
294 | if name == SITES_TOOL_NAME:
295 | ids: list[int] = [] if not arguments or "ids" not in arguments else arguments["ids"]
296 | fields: list[str] = [] if not arguments or "fields" not in arguments else arguments["fields"]
297 |
298 | assert isinstance(ids, list) and all(isinstance(item, int) for item in ids)
299 | assert isinstance(fields, list) and all(isinstance(item, str) for item in fields)
300 |
301 | results_json = self.get_sites_api_json(
302 | ids=ids,
303 | fields=fields
304 | )
305 | return [TextContent(type="text", text=results_json)]
306 |
307 | elif name == RESOURCES_TOOL_NAME:
308 |
309 | extras: list[str] = [] if not arguments or "extras" not in arguments else arguments["extras"]
310 |
311 | # in case there is any LLM confusion of XPath/xpath or Markdown/markdown, these are
312 | # defined lowercase in the MCP Tool definition, but have counter-weighting as proper nouns
313 | extras = [extra.lower() for extra in extras if isinstance(extra, str)]
314 | extrasRegex: list[str] = [] if not arguments or "extrasRegex" not in arguments else arguments["extrasRegex"]
315 | extrasXpath: list[str] = [] if not arguments or "extrasXpath" not in arguments else arguments["extrasXpath"]
316 |
317 | extras_set: set[str] = set(extras)
318 | extras_removed: set[str] = extras_set - RESOURCE_EXTRAS_ALLOWED
319 | if extras_removed:
320 | # only allow known extras
321 | extras = list(RESOURCE_EXTRAS_ALLOWED.intersection(extras))
322 |
323 | # regular args pass through to the result
324 | query: str = "" if not arguments or "query" not in arguments else arguments["query"]
325 | fields: list[str] = [] if not arguments or "fields" not in arguments else arguments["fields"]
326 | sites: list[int] = [] if not arguments or "sites" not in arguments else arguments["sites"]
327 | sort: str | None = None if not arguments or "sort" not in arguments else arguments["sort"]
328 | limit: int = 20 if not arguments or "limit" not in arguments else arguments["limit"]
329 | offset: int = 0 if not arguments or "offset" not in arguments else arguments["offset"]
330 |
331 |                 # Claude keeps quoting the sort value, even though it is properly enumerated in the Tool definition
332 | clean_sort = sort.strip("\"'`") if isinstance(sort, str) else None
333 |
334 | assert isinstance(query, str)
335 | assert isinstance(fields, list) and all(isinstance(item, str) for item in fields)
336 | assert isinstance(sites, list) and all(isinstance(item, int) for item in sites)
337 | assert isinstance(sort, (str, type(None)))
338 | assert isinstance(limit, int)
339 | assert isinstance(offset, int)
340 | assert isinstance(extras, list) and all(isinstance(item, str) for item in extras)
341 | assert isinstance(extrasXpath, list) and all(isinstance(item, str) for item in extrasXpath)
342 |
343 | api_result: BaseJsonApi = self.get_resources_api(
344 | sites=sites,
345 | query=query,
346 | fields=fields,
347 | sort=clean_sort,
348 | limit=limit,
349 | offset=offset,
350 | extras=extras,
351 | extrasRegex=extrasRegex,
352 | extrasXpath=extrasXpath,
353 | )
354 |                 # sometimes nudging makes things worse; the AI doubles down on the perceived
355 |                 # righteousness of its position. just let it have it. it claims in the end it's
356 |                 # a JSON encoding confusion with the +/- leading char. who knows? more
357 |                 # importantly, who cares? play it loose.
358 | # if sort != clean_sort:
359 | # # let the MCP host know the error of its ways
360 | # api_result.append_error(f"invalid sort ({sort}) requested [{', '.join(RESOURCES_DEFAULT_SORT_MAPPING.keys())}]")
361 | if extras_removed:
362 | # only allow known extras
363 | api_result.append_error(f"invalid extras requested ({', '.join(extras_removed)})")
364 |
365 |
366 | crawl_results: list[ResourceResult] = api_result.get_results()
367 | results_json = api_result.to_json()
368 | mcp_result = [TextContent(type="text", text=results_json)]
369 |
370 | if "thumbnails" in extras:
371 | crawl_results: list[ResourceResult] = api_result.get_results()
372 | mcp_result += self.get_thumbnails(crawl_results) or []
373 |
374 | return mcp_result
375 | else:
376 | raise ValueError(f"No such tool ({name})")
377 |
378 | except sqlite3.Error as ex:
379 | return [TextContent(type="text", text=f"mcp_call_tool/database\n{str(ex)}\n{traceback.format_exc()}")]
380 | except Exception as ex:
381 | return [TextContent(type="text", text=f"mcp_call_tool/exception\n{str(ex)}\n{traceback.format_exc()}")]
382 |
383 | def get_thumbnails(self, results: list[ResourceResult]) -> list[ImageContent]:
384 |
385 | thumbnails_result: list[ImageContent] = []
386 | image_paths = list(set([result.url for result in results if result.url and
387 | result.type == ResourceResultType.IMAGE]))
388 | valid_paths = []
389 | for path in image_paths:
390 | parsed = urlparse(path)
391 | if parsed.scheme in ("http", "https") and parsed.netloc:
392 | clean_path: str = f"{parsed.scheme}://{parsed.netloc}{parsed.path}"
393 | valid_paths.append(clean_path)
394 | elif re.search(r"\.(jpg|jpeg|png|gif|bmp|webp)$", path, re.IGNORECASE):
395 | clean_path: str = path.split("?")[0]
396 | valid_paths.append(clean_path)
397 |
398 | if valid_paths:
399 | try:
400 | thumbnail_manager = ThumbnailManager()
401 | thumbnail_data = thumbnail_manager.get_thumbnails(valid_paths)
402 | for thumbnail_url, thumbnail_base64 in thumbnail_data.items():
403 | if thumbnail_base64 is None:
404 |                         logger.debug(f"Thumbnail request failed for {thumbnail_url}")
405 | continue
406 | image_content = ImageContent(type="image", data=thumbnail_base64, mimeType="image/webp")
407 | thumbnails_result.append(image_content)
408 | logger.debug(f"Fetched {len(thumbnail_data)} thumbnails out of {len(valid_paths)} requested URLs")
409 | except Exception as ex:
410 | logger.error(f"Error fetching thumbnails: {ex}\n{traceback.format_exc()}")
411 |
412 | return thumbnails_result
413 |
414 | def _convert_to_resource_types(self, types: list[str] | None) -> list[ResourceResultType] | None:
415 | """
416 | Convert string type values to ResourceResultType enums. Silently ignore invalid type strings.
417 |
418 | Args:
419 | types: optional list of string type values
420 |
421 | Returns:
422 | Optional list of ResourceResultType enums, or None if no valid types
423 | """
424 | if not types:
425 | return None
426 |
427 | result = [rt for rt in ResourceResultType if rt.value in types]
428 | return result if result else None
429 |
430 |
```
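Taken together with the InterroBot example above, the extension pattern `BaseCrawler` expects is: pass adapter functions to `BaseCrawler.__init__` and override `mcp_list_tools`, which otherwise raises `NotImplementedError`. The sketch below illustrates that wiring with placeholder adapters; `my_get_sites` and `my_get_resources` are hypothetical stand-ins, not part of the package, and a real adapter would return `SiteResult` objects and a `(results, total, index_state)` tuple as `get_resources_api` expects.

```python
# Hypothetical subclass sketch; my_get_sites / my_get_resources are stand-ins
# for the adapter functions a real crawler module provides.
from pathlib import Path

from mcp.types import Tool

from mcp_server_webcrawl.crawlers.base.crawler import BaseCrawler
from mcp_server_webcrawl.crawlers.base.adapter import IndexState
from mcp_server_webcrawl.utils.tools import get_crawler_tools


def my_get_sites(datasrc: Path, ids=None, fields=None):
    # would return a list of SiteResult objects for the data source
    return []


def my_get_resources(datasrc: Path, **kwargs):
    # would return (results, total, index_state) for the search arguments
    return [], 0, IndexState()


class MyCrawler(BaseCrawler):
    def __init__(self, datasrc: Path) -> None:
        super().__init__(datasrc, my_get_sites, my_get_resources)

    async def mcp_list_tools(self) -> list[Tool]:
        # default sites/resources tools, scoped to whatever sites the adapter reports
        sites = self._adapter_get_sites(self._datasrc)
        return get_crawler_tools(sites=sites)
```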