This is page 31 of 35. Use http://codebase.md/pragmar/mcp_server_webcrawl?lines=true&page={x} to view the full context.
# Directory Structure
```
├── .gitignore
├── CONTRIBUTING.md
├── docs
│ ├── _images
│ │ ├── interactive.document.webp
│ │ ├── interactive.search.webp
│ │ └── mcpswc.svg
│ ├── _modules
│ │ ├── index.html
│ │ ├── mcp_server_webcrawl
│ │ │ ├── crawlers
│ │ │ │ ├── archivebox
│ │ │ │ │ ├── adapter.html
│ │ │ │ │ ├── crawler.html
│ │ │ │ │ └── tests.html
│ │ │ │ ├── base
│ │ │ │ │ ├── adapter.html
│ │ │ │ │ ├── api.html
│ │ │ │ │ ├── crawler.html
│ │ │ │ │ ├── indexed.html
│ │ │ │ │ └── tests.html
│ │ │ │ ├── httrack
│ │ │ │ │ ├── adapter.html
│ │ │ │ │ ├── crawler.html
│ │ │ │ │ └── tests.html
│ │ │ │ ├── interrobot
│ │ │ │ │ ├── adapter.html
│ │ │ │ │ ├── crawler.html
│ │ │ │ │ └── tests.html
│ │ │ │ ├── katana
│ │ │ │ │ ├── adapter.html
│ │ │ │ │ ├── crawler.html
│ │ │ │ │ └── tests.html
│ │ │ │ ├── siteone
│ │ │ │ │ ├── adapter.html
│ │ │ │ │ ├── crawler.html
│ │ │ │ │ └── tests.html
│ │ │ │ ├── warc
│ │ │ │ │ ├── adapter.html
│ │ │ │ │ ├── crawler.html
│ │ │ │ │ └── tests.html
│ │ │ │ └── wget
│ │ │ │ ├── adapter.html
│ │ │ │ ├── crawler.html
│ │ │ │ └── tests.html
│ │ │ ├── crawlers.html
│ │ │ ├── extras
│ │ │ │ ├── markdown.html
│ │ │ │ ├── regex.html
│ │ │ │ ├── snippets.html
│ │ │ │ ├── thumbnails.html
│ │ │ │ └── xpath.html
│ │ │ ├── interactive
│ │ │ │ ├── highlights.html
│ │ │ │ ├── search.html
│ │ │ │ ├── session.html
│ │ │ │ └── ui.html
│ │ │ ├── main.html
│ │ │ ├── models
│ │ │ │ ├── resources.html
│ │ │ │ └── sites.html
│ │ │ ├── templates
│ │ │ │ └── tests.html
│ │ │ ├── utils
│ │ │ │ ├── blobs.html
│ │ │ │ ├── cli.html
│ │ │ │ ├── logger.html
│ │ │ │ ├── querycache.html
│ │ │ │ ├── server.html
│ │ │ │ └── tools.html
│ │ │ └── utils.html
│ │ └── re.html
│ ├── _sources
│ │ ├── guides
│ │ │ ├── archivebox.rst.txt
│ │ │ ├── httrack.rst.txt
│ │ │ ├── interrobot.rst.txt
│ │ │ ├── katana.rst.txt
│ │ │ ├── siteone.rst.txt
│ │ │ ├── warc.rst.txt
│ │ │ └── wget.rst.txt
│ │ ├── guides.rst.txt
│ │ ├── index.rst.txt
│ │ ├── installation.rst.txt
│ │ ├── interactive.rst.txt
│ │ ├── mcp_server_webcrawl.crawlers.archivebox.rst.txt
│ │ ├── mcp_server_webcrawl.crawlers.base.rst.txt
│ │ ├── mcp_server_webcrawl.crawlers.httrack.rst.txt
│ │ ├── mcp_server_webcrawl.crawlers.interrobot.rst.txt
│ │ ├── mcp_server_webcrawl.crawlers.katana.rst.txt
│ │ ├── mcp_server_webcrawl.crawlers.rst.txt
│ │ ├── mcp_server_webcrawl.crawlers.siteone.rst.txt
│ │ ├── mcp_server_webcrawl.crawlers.warc.rst.txt
│ │ ├── mcp_server_webcrawl.crawlers.wget.rst.txt
│ │ ├── mcp_server_webcrawl.extras.rst.txt
│ │ ├── mcp_server_webcrawl.interactive.rst.txt
│ │ ├── mcp_server_webcrawl.models.rst.txt
│ │ ├── mcp_server_webcrawl.rst.txt
│ │ ├── mcp_server_webcrawl.templates.rst.txt
│ │ ├── mcp_server_webcrawl.utils.rst.txt
│ │ ├── modules.rst.txt
│ │ ├── prompts.rst.txt
│ │ └── usage.rst.txt
│ ├── _static
│ │ ├── _sphinx_javascript_frameworks_compat.js
│ │ ├── basic.css
│ │ ├── css
│ │ │ ├── badge_only.css
│ │ │ ├── fonts
│ │ │ │ ├── fontawesome-webfont.eot
│ │ │ │ ├── fontawesome-webfont.svg
│ │ │ │ ├── fontawesome-webfont.ttf
│ │ │ │ ├── fontawesome-webfont.woff
│ │ │ │ ├── fontawesome-webfont.woff2
│ │ │ │ ├── lato-bold-italic.woff
│ │ │ │ ├── lato-bold-italic.woff2
│ │ │ │ ├── lato-bold.woff
│ │ │ │ ├── lato-bold.woff2
│ │ │ │ ├── lato-normal-italic.woff
│ │ │ │ ├── lato-normal-italic.woff2
│ │ │ │ ├── lato-normal.woff
│ │ │ │ ├── lato-normal.woff2
│ │ │ │ ├── Roboto-Slab-Bold.woff
│ │ │ │ ├── Roboto-Slab-Bold.woff2
│ │ │ │ ├── Roboto-Slab-Regular.woff
│ │ │ │ └── Roboto-Slab-Regular.woff2
│ │ │ └── theme.css
│ │ ├── doctools.js
│ │ ├── documentation_options.js
│ │ ├── file.png
│ │ ├── fonts
│ │ │ ├── Lato
│ │ │ │ ├── lato-bold.eot
│ │ │ │ ├── lato-bold.ttf
│ │ │ │ ├── lato-bold.woff
│ │ │ │ ├── lato-bold.woff2
│ │ │ │ ├── lato-bolditalic.eot
│ │ │ │ ├── lato-bolditalic.ttf
│ │ │ │ ├── lato-bolditalic.woff
│ │ │ │ ├── lato-bolditalic.woff2
│ │ │ │ ├── lato-italic.eot
│ │ │ │ ├── lato-italic.ttf
│ │ │ │ ├── lato-italic.woff
│ │ │ │ ├── lato-italic.woff2
│ │ │ │ ├── lato-regular.eot
│ │ │ │ ├── lato-regular.ttf
│ │ │ │ ├── lato-regular.woff
│ │ │ │ └── lato-regular.woff2
│ │ │ └── RobotoSlab
│ │ │ ├── roboto-slab-v7-bold.eot
│ │ │ ├── roboto-slab-v7-bold.ttf
│ │ │ ├── roboto-slab-v7-bold.woff
│ │ │ ├── roboto-slab-v7-bold.woff2
│ │ │ ├── roboto-slab-v7-regular.eot
│ │ │ ├── roboto-slab-v7-regular.ttf
│ │ │ ├── roboto-slab-v7-regular.woff
│ │ │ └── roboto-slab-v7-regular.woff2
│ │ ├── images
│ │ │ ├── interactive.document.png
│ │ │ ├── interactive.document.webp
│ │ │ ├── interactive.search.png
│ │ │ ├── interactive.search.webp
│ │ │ └── mcpswc.svg
│ │ ├── jquery.js
│ │ ├── js
│ │ │ ├── badge_only.js
│ │ │ ├── theme.js
│ │ │ └── versions.js
│ │ ├── language_data.js
│ │ ├── minus.png
│ │ ├── plus.png
│ │ ├── pygments.css
│ │ ├── searchtools.js
│ │ └── sphinx_highlight.js
│ ├── .buildinfo
│ ├── .nojekyll
│ ├── genindex.html
│ ├── guides
│ │ ├── archivebox.html
│ │ ├── httrack.html
│ │ ├── interrobot.html
│ │ ├── katana.html
│ │ ├── siteone.html
│ │ ├── warc.html
│ │ └── wget.html
│ ├── guides.html
│ ├── index.html
│ ├── installation.html
│ ├── interactive.html
│ ├── mcp_server_webcrawl.crawlers.archivebox.html
│ ├── mcp_server_webcrawl.crawlers.base.html
│ ├── mcp_server_webcrawl.crawlers.html
│ ├── mcp_server_webcrawl.crawlers.httrack.html
│ ├── mcp_server_webcrawl.crawlers.interrobot.html
│ ├── mcp_server_webcrawl.crawlers.katana.html
│ ├── mcp_server_webcrawl.crawlers.siteone.html
│ ├── mcp_server_webcrawl.crawlers.warc.html
│ ├── mcp_server_webcrawl.crawlers.wget.html
│ ├── mcp_server_webcrawl.extras.html
│ ├── mcp_server_webcrawl.html
│ ├── mcp_server_webcrawl.interactive.html
│ ├── mcp_server_webcrawl.models.html
│ ├── mcp_server_webcrawl.templates.html
│ ├── mcp_server_webcrawl.utils.html
│ ├── modules.html
│ ├── objects.inv
│ ├── prompts.html
│ ├── py-modindex.html
│ ├── search.html
│ ├── searchindex.js
│ └── usage.html
├── LICENSE
├── MANIFEST.in
├── prompts
│ ├── audit404.md
│ ├── auditfiles.md
│ ├── auditperf.md
│ ├── auditseo.md
│ ├── gopher.md
│ ├── README.md
│ └── testsearch.md
├── pyproject.toml
├── README.md
├── setup.py
├── sphinx
│ ├── _static
│ │ └── images
│ │ ├── interactive.document.png
│ │ ├── interactive.document.webp
│ │ ├── interactive.search.png
│ │ ├── interactive.search.webp
│ │ └── mcpswc.svg
│ ├── _templates
│ │ └── layout.html
│ ├── conf.py
│ ├── guides
│ │ ├── archivebox.rst
│ │ ├── httrack.rst
│ │ ├── interrobot.rst
│ │ ├── katana.rst
│ │ ├── siteone.rst
│ │ ├── warc.rst
│ │ └── wget.rst
│ ├── guides.rst
│ ├── index.rst
│ ├── installation.rst
│ ├── interactive.rst
│ ├── make.bat
│ ├── Makefile
│ ├── mcp_server_webcrawl.crawlers.archivebox.rst
│ ├── mcp_server_webcrawl.crawlers.base.rst
│ ├── mcp_server_webcrawl.crawlers.httrack.rst
│ ├── mcp_server_webcrawl.crawlers.interrobot.rst
│ ├── mcp_server_webcrawl.crawlers.katana.rst
│ ├── mcp_server_webcrawl.crawlers.rst
│ ├── mcp_server_webcrawl.crawlers.siteone.rst
│ ├── mcp_server_webcrawl.crawlers.warc.rst
│ ├── mcp_server_webcrawl.crawlers.wget.rst
│ ├── mcp_server_webcrawl.extras.rst
│ ├── mcp_server_webcrawl.interactive.rst
│ ├── mcp_server_webcrawl.models.rst
│ ├── mcp_server_webcrawl.rst
│ ├── mcp_server_webcrawl.templates.rst
│ ├── mcp_server_webcrawl.utils.rst
│ ├── modules.rst
│ ├── prompts.rst
│ ├── readme.txt
│ └── usage.rst
└── src
└── mcp_server_webcrawl
├── __init__.py
├── crawlers
│ ├── __init__.py
│ ├── archivebox
│ │ ├── __init__.py
│ │ ├── adapter.py
│ │ ├── crawler.py
│ │ └── tests.py
│ ├── base
│ │ ├── __init__.py
│ │ ├── adapter.py
│ │ ├── api.py
│ │ ├── crawler.py
│ │ ├── indexed.py
│ │ └── tests.py
│ ├── httrack
│ │ ├── __init__.py
│ │ ├── adapter.py
│ │ ├── crawler.py
│ │ └── tests.py
│ ├── interrobot
│ │ ├── __init__.py
│ │ ├── adapter.py
│ │ ├── crawler.py
│ │ └── tests.py
│ ├── katana
│ │ ├── __init__.py
│ │ ├── adapter.py
│ │ ├── crawler.py
│ │ └── tests.py
│ ├── siteone
│ │ ├── __init__.py
│ │ ├── adapter.py
│ │ ├── crawler.py
│ │ └── tests.py
│ ├── warc
│ │ ├── __init__.py
│ │ ├── adapter.py
│ │ ├── crawler.py
│ │ └── tests.py
│ └── wget
│ ├── __init__.py
│ ├── adapter.py
│ ├── crawler.py
│ └── tests.py
├── extras
│ ├── __init__.py
│ ├── markdown.py
│ ├── regex.py
│ ├── snippets.py
│ ├── thumbnails.py
│ └── xpath.py
├── interactive
│ ├── __init__.py
│ ├── highlights.py
│ ├── search.py
│ ├── session.py
│ ├── ui.py
│ └── views
│ ├── base.py
│ ├── document.py
│ ├── help.py
│ ├── requirements.py
│ ├── results.py
│ └── searchform.py
├── main.py
├── models
│ ├── __init__.py
│ ├── base.py
│ ├── resources.py
│ └── sites.py
├── settings.py
├── templates
│ ├── __init__.py
│ ├── markdown.xslt
│ ├── tests_core.html
│ └── tests.py
└── utils
├── __init__.py
├── cli.py
├── logger.py
├── parser.py
├── parsetab.py
├── search.py
├── server.py
├── tests.py
└── tools.py
```
# Files
--------------------------------------------------------------------------------
/docs/mcp_server_webcrawl.crawlers.base.html:
--------------------------------------------------------------------------------
```html
1 |
2 |
3 | <!DOCTYPE html>
4 | <html class="writer-html5" lang="en" data-content_root="./">
5 | <head>
6 | <meta charset="utf-8" /><meta name="viewport" content="width=device-width, initial-scale=1" />
7 |
8 | <meta name="viewport" content="width=device-width, initial-scale=1.0" />
9 | <title>mcp_server_webcrawl.crawlers.base package — mcp-server-webcrawl documentation</title>
10 | <link rel="stylesheet" type="text/css" href="_static/pygments.css?v=80d5e7a1" />
11 | <link rel="stylesheet" type="text/css" href="_static/css/theme.css?v=e59714d7" />
12 |
13 |
14 | <script src="_static/jquery.js?v=5d32c60e"></script>
15 | <script src="_static/_sphinx_javascript_frameworks_compat.js?v=2cd50e6c"></script>
16 | <script src="_static/documentation_options.js?v=5929fcd5"></script>
17 | <script src="_static/doctools.js?v=888ff710"></script>
18 | <script src="_static/sphinx_highlight.js?v=dc90522c"></script>
19 | <script src="_static/js/theme.js"></script>
20 | <link rel="index" title="Index" href="genindex.html" />
21 | <link rel="search" title="Search" href="search.html" />
22 | <link rel="next" title="mcp_server_webcrawl.crawlers.archivebox package" href="mcp_server_webcrawl.crawlers.archivebox.html" />
23 | <link rel="prev" title="mcp_server_webcrawl.crawlers package" href="mcp_server_webcrawl.crawlers.html" />
24 | </head>
25 |
26 | <body class="wy-body-for-nav">
27 | <div class="wy-grid-for-nav">
28 | <nav data-toggle="wy-nav-shift" class="wy-nav-side">
29 | <div class="wy-side-scroll">
30 | <div class="wy-side-nav-search" >
31 |
32 |
33 |
34 | <a href="index.html" class="icon icon-home">
35 | mcp-server-webcrawl
36 | </a>
37 | <div role="search">
38 | <form id="rtd-search-form" class="wy-form" action="search.html" method="get">
39 | <input type="text" name="q" placeholder="Search docs" aria-label="Search docs" />
40 | <input type="hidden" name="check_keywords" value="yes" />
41 | <input type="hidden" name="area" value="default" />
42 | </form>
43 | </div>
44 | </div><div class="wy-menu wy-menu-vertical" data-spy="affix" role="navigation" aria-label="Navigation menu">
45 | <p class="caption" role="heading"><span class="caption-text">Contents:</span></p>
46 | <ul class="current">
47 | <li class="toctree-l1"><a class="reference internal" href="installation.html">Installation</a></li>
48 | <li class="toctree-l1"><a class="reference internal" href="guides.html">Setup Guides</a></li>
49 | <li class="toctree-l1"><a class="reference internal" href="usage.html">Usage</a></li>
50 | <li class="toctree-l1"><a class="reference internal" href="prompts.html">Prompt Routines</a></li>
51 | <li class="toctree-l1"><a class="reference internal" href="interactive.html">Interactive Mode</a></li>
52 | <li class="toctree-l1 current"><a class="reference internal" href="modules.html">mcp_server_webcrawl</a><ul class="current">
53 | <li class="toctree-l2 current"><a class="reference internal" href="mcp_server_webcrawl.html">mcp_server_webcrawl package</a></li>
54 | </ul>
55 | </li>
56 | </ul>
57 |
58 | </div>
59 | </div>
60 | </nav>
61 |
62 | <section data-toggle="wy-nav-shift" class="wy-nav-content-wrap"><nav class="wy-nav-top" aria-label="Mobile navigation menu" >
63 | <i data-toggle="wy-nav-top" class="fa fa-bars"></i>
64 | <a href="index.html">mcp-server-webcrawl</a>
65 | </nav>
66 |
67 | <div class="wy-nav-content">
68 | <div class="rst-content">
69 | <div role="navigation" aria-label="Page navigation">
70 | <ul class="wy-breadcrumbs">
71 | <li><a href="index.html" class="icon icon-home" aria-label="Home"></a></li>
72 | <li class="breadcrumb-item"><a href="modules.html">mcp_server_webcrawl</a></li>
73 | <li class="breadcrumb-item"><a href="mcp_server_webcrawl.html">mcp_server_webcrawl package</a></li>
74 | <li class="breadcrumb-item"><a href="mcp_server_webcrawl.crawlers.html">mcp_server_webcrawl.crawlers package</a></li>
75 | <li class="breadcrumb-item active">mcp_server_webcrawl.crawlers.base package</li>
76 | <li class="wy-breadcrumbs-aside">
77 | <a href="_sources/mcp_server_webcrawl.crawlers.base.rst.txt" rel="nofollow"> View page source</a>
78 | </li>
79 | </ul>
80 | <hr/>
81 | </div>
82 | <div role="main" class="document" itemscope="itemscope" itemtype="http://schema.org/Article">
83 | <div itemprop="articleBody">
84 |
85 | <section id="mcp-server-webcrawl-crawlers-base-package">
86 | <h1>mcp_server_webcrawl.crawlers.base package<a class="headerlink" href="#mcp-server-webcrawl-crawlers-base-package" title="Link to this heading"></a></h1>
87 | <section id="submodules">
88 | <h2>Submodules<a class="headerlink" href="#submodules" title="Link to this heading"></a></h2>
89 | </section>
90 | <section id="module-mcp_server_webcrawl.crawlers.base.adapter">
91 | <span id="mcp-server-webcrawl-crawlers-base-adapter-module"></span><h2>mcp_server_webcrawl.crawlers.base.adapter module<a class="headerlink" href="#module-mcp_server_webcrawl.crawlers.base.adapter" title="Link to this heading"></a></h2>
92 | <dl class="py class">
93 | <dt class="sig sig-object py" id="mcp_server_webcrawl.crawlers.base.adapter.IndexStatus">
94 | <em class="property"><span class="pre">class</span><span class="w"> </span></em><span class="sig-name descname"><span class="pre">IndexStatus</span></span><a class="reference internal" href="_modules/mcp_server_webcrawl/crawlers/base/adapter.html#IndexStatus"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#mcp_server_webcrawl.crawlers.base.adapter.IndexStatus" title="Link to this definition"></a></dt>
95 | <dd><p>Bases: <a class="reference external" href="https://docs.python.org/3/library/enum.html#enum.Enum" title="(in Python v3.14)"><code class="xref py py-class docutils literal notranslate"><span class="pre">Enum</span></code></a></p>
96 | <p>An enumeration.</p>
97 | <dl class="py attribute">
98 | <dt class="sig sig-object py" id="mcp_server_webcrawl.crawlers.base.adapter.IndexStatus.UNDEFINED">
99 | <span class="sig-name descname"><span class="pre">UNDEFINED</span></span><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">''</span></em><a class="headerlink" href="#mcp_server_webcrawl.crawlers.base.adapter.IndexStatus.UNDEFINED" title="Link to this definition"></a></dt>
100 | <dd></dd></dl>
101 |
102 | <dl class="py attribute">
103 | <dt class="sig sig-object py" id="mcp_server_webcrawl.crawlers.base.adapter.IndexStatus.IDLE">
104 | <span class="sig-name descname"><span class="pre">IDLE</span></span><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">'idle'</span></em><a class="headerlink" href="#mcp_server_webcrawl.crawlers.base.adapter.IndexStatus.IDLE" title="Link to this definition"></a></dt>
105 | <dd></dd></dl>
106 |
107 | <dl class="py attribute">
108 | <dt class="sig sig-object py" id="mcp_server_webcrawl.crawlers.base.adapter.IndexStatus.INDEXING">
109 | <span class="sig-name descname"><span class="pre">INDEXING</span></span><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">'indexing'</span></em><a class="headerlink" href="#mcp_server_webcrawl.crawlers.base.adapter.IndexStatus.INDEXING" title="Link to this definition"></a></dt>
110 | <dd></dd></dl>
111 |
112 | <dl class="py attribute">
113 | <dt class="sig sig-object py" id="mcp_server_webcrawl.crawlers.base.adapter.IndexStatus.PARTIAL">
114 | <span class="sig-name descname"><span class="pre">PARTIAL</span></span><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">'partial'</span></em><a class="headerlink" href="#mcp_server_webcrawl.crawlers.base.adapter.IndexStatus.PARTIAL" title="Link to this definition"></a></dt>
115 | <dd></dd></dl>
116 |
117 | <dl class="py attribute">
118 | <dt class="sig sig-object py" id="mcp_server_webcrawl.crawlers.base.adapter.IndexStatus.COMPLETE">
119 | <span class="sig-name descname"><span class="pre">COMPLETE</span></span><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">'complete'</span></em><a class="headerlink" href="#mcp_server_webcrawl.crawlers.base.adapter.IndexStatus.COMPLETE" title="Link to this definition"></a></dt>
120 | <dd></dd></dl>
121 |
122 | <dl class="py attribute">
123 | <dt class="sig sig-object py" id="mcp_server_webcrawl.crawlers.base.adapter.IndexStatus.REMOTE">
124 | <span class="sig-name descname"><span class="pre">REMOTE</span></span><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">'remote'</span></em><a class="headerlink" href="#mcp_server_webcrawl.crawlers.base.adapter.IndexStatus.REMOTE" title="Link to this definition"></a></dt>
125 | <dd></dd></dl>
126 |
127 | <dl class="py attribute">
128 | <dt class="sig sig-object py" id="mcp_server_webcrawl.crawlers.base.adapter.IndexStatus.FAILED">
129 | <span class="sig-name descname"><span class="pre">FAILED</span></span><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">'failed'</span></em><a class="headerlink" href="#mcp_server_webcrawl.crawlers.base.adapter.IndexStatus.FAILED" title="Link to this definition"></a></dt>
130 | <dd></dd></dl>
131 |
132 | </dd></dl>
133 |
134 | <dl class="py class">
135 | <dt class="sig sig-object py" id="mcp_server_webcrawl.crawlers.base.adapter.IndexState">
136 | <em class="property"><span class="pre">class</span><span class="w"> </span></em><span class="sig-name descname"><span class="pre">IndexState</span></span><a class="reference internal" href="_modules/mcp_server_webcrawl/crawlers/base/adapter.html#IndexState"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#mcp_server_webcrawl.crawlers.base.adapter.IndexState" title="Link to this definition"></a></dt>
137 | <dd><p>Bases: <a class="reference external" href="https://docs.python.org/3/library/functions.html#object" title="(in Python v3.14)"><code class="xref py py-class docutils literal notranslate"><span class="pre">object</span></code></a></p>
138 | <p>Shared state between crawler and manager for indexing progress.</p>
139 | <dl class="py attribute">
140 | <dt class="sig sig-object py" id="mcp_server_webcrawl.crawlers.base.adapter.IndexState.status">
141 | <span class="sig-name descname"><span class="pre">status</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><a class="reference internal" href="#mcp_server_webcrawl.crawlers.base.adapter.IndexStatus" title="mcp_server_webcrawl.crawlers.base.adapter.IndexStatus"><span class="pre">IndexStatus</span></a></em><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">''</span></em><a class="headerlink" href="#mcp_server_webcrawl.crawlers.base.adapter.IndexState.status" title="Link to this definition"></a></dt>
142 | <dd></dd></dl>
143 |
144 | <dl class="py attribute">
145 | <dt class="sig sig-object py" id="mcp_server_webcrawl.crawlers.base.adapter.IndexState.processed">
146 | <span class="sig-name descname"><span class="pre">processed</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><a class="reference external" href="https://docs.python.org/3/library/functions.html#int" title="(in Python v3.14)"><span class="pre">int</span></a></em><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">0</span></em><a class="headerlink" href="#mcp_server_webcrawl.crawlers.base.adapter.IndexState.processed" title="Link to this definition"></a></dt>
147 | <dd></dd></dl>
148 |
149 | <dl class="py attribute">
150 | <dt class="sig sig-object py" id="mcp_server_webcrawl.crawlers.base.adapter.IndexState.time_start">
151 | <span class="sig-name descname"><span class="pre">time_start</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><a class="reference external" href="https://docs.python.org/3/library/datetime.html#datetime.datetime" title="(in Python v3.14)"><span class="pre">datetime</span></a><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><a class="reference external" href="https://docs.python.org/3/library/constants.html#None" title="(in Python v3.14)"><span class="pre">None</span></a></em><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">None</span></em><a class="headerlink" href="#mcp_server_webcrawl.crawlers.base.adapter.IndexState.time_start" title="Link to this definition"></a></dt>
152 | <dd></dd></dl>
153 |
154 | <dl class="py attribute">
155 | <dt class="sig sig-object py" id="mcp_server_webcrawl.crawlers.base.adapter.IndexState.time_end">
156 | <span class="sig-name descname"><span class="pre">time_end</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><a class="reference external" href="https://docs.python.org/3/library/datetime.html#datetime.datetime" title="(in Python v3.14)"><span class="pre">datetime</span></a><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><a class="reference external" href="https://docs.python.org/3/library/constants.html#None" title="(in Python v3.14)"><span class="pre">None</span></a></em><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">None</span></em><a class="headerlink" href="#mcp_server_webcrawl.crawlers.base.adapter.IndexState.time_end" title="Link to this definition"></a></dt>
157 | <dd></dd></dl>
158 |
159 | <dl class="py method">
160 | <dt class="sig sig-object py" id="mcp_server_webcrawl.crawlers.base.adapter.IndexState.set_status">
161 | <span class="sig-name descname"><span class="pre">set_status</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">status</span></span></em><span class="sig-paren">)</span><a class="reference internal" href="_modules/mcp_server_webcrawl/crawlers/base/adapter.html#IndexState.set_status"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#mcp_server_webcrawl.crawlers.base.adapter.IndexState.set_status" title="Link to this definition"></a></dt>
162 | <dd><dl class="field-list simple">
163 | <dt class="field-odd">Parameters<span class="colon">:</span></dt>
164 | <dd class="field-odd"><p><strong>status</strong> (<a class="reference internal" href="#mcp_server_webcrawl.crawlers.base.adapter.IndexStatus" title="mcp_server_webcrawl.crawlers.base.adapter.IndexStatus"><em>IndexStatus</em></a>) – </p>
165 | </dd>
166 | <dt class="field-even">Return type<span class="colon">:</span></dt>
167 | <dd class="field-even"><p>None</p>
168 | </dd>
169 | </dl>
170 | </dd></dl>
171 |
172 | <dl class="py method">
173 | <dt class="sig sig-object py" id="mcp_server_webcrawl.crawlers.base.adapter.IndexState.increment_processed">
174 | <span class="sig-name descname"><span class="pre">increment_processed</span></span><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="reference internal" href="_modules/mcp_server_webcrawl/crawlers/base/adapter.html#IndexState.increment_processed"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#mcp_server_webcrawl.crawlers.base.adapter.IndexState.increment_processed" title="Link to this definition"></a></dt>
175 | <dd></dd></dl>
176 |
177 | <dl class="py property">
178 | <dt class="sig sig-object py" id="mcp_server_webcrawl.crawlers.base.adapter.IndexState.duration">
179 | <em class="property"><span class="pre">property</span><span class="w"> </span></em><span class="sig-name descname"><span class="pre">duration</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.14)"><span class="pre">str</span></a></em><a class="headerlink" href="#mcp_server_webcrawl.crawlers.base.adapter.IndexState.duration" title="Link to this definition"></a></dt>
180 | <dd></dd></dl>
181 |
182 | <dl class="py method">
183 | <dt class="sig sig-object py" id="mcp_server_webcrawl.crawlers.base.adapter.IndexState.is_timeout">
184 | <span class="sig-name descname"><span class="pre">is_timeout</span></span><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="reference internal" href="_modules/mcp_server_webcrawl/crawlers/base/adapter.html#IndexState.is_timeout"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#mcp_server_webcrawl.crawlers.base.adapter.IndexState.is_timeout" title="Link to this definition"></a></dt>
185 | <dd><p>Check if the indexing operation has exceeded the timeout threshold.</p>
186 | <dl class="field-list simple">
187 | <dt class="field-odd">Return type<span class="colon">:</span></dt>
188 | <dd class="field-odd"><p><a class="reference external" href="https://docs.python.org/3/library/functions.html#bool" title="(in Python v3.14)">bool</a></p>
189 | </dd>
190 | </dl>
191 | </dd></dl>
192 |
193 | <dl class="py method">
194 | <dt class="sig sig-object py" id="mcp_server_webcrawl.crawlers.base.adapter.IndexState.to_dict">
195 | <span class="sig-name descname"><span class="pre">to_dict</span></span><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="reference internal" href="_modules/mcp_server_webcrawl/crawlers/base/adapter.html#IndexState.to_dict"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#mcp_server_webcrawl.crawlers.base.adapter.IndexState.to_dict" title="Link to this definition"></a></dt>
196 | <dd><p>Convert the IndexState to a dictionary representation.</p>
197 | <dl class="field-list simple">
198 | <dt class="field-odd">Return type<span class="colon">:</span></dt>
199 | <dd class="field-odd"><p><a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#dict" title="(in Python v3.14)">dict</a></p>
200 | </dd>
201 | </dl>
202 | </dd></dl>
203 |
204 | <dl class="py method">
205 | <dt class="sig sig-object py" id="mcp_server_webcrawl.crawlers.base.adapter.IndexState.__init__">
206 | <span class="sig-name descname"><span class="pre">__init__</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">status</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">IndexStatus.UNDEFINED</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">processed</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">0</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">time_start</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">time_end</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">None</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#mcp_server_webcrawl.crawlers.base.adapter.IndexState.__init__" title="Link to this definition"></a></dt>
207 | <dd><dl class="field-list simple">
208 | <dt class="field-odd">Parameters<span class="colon">:</span></dt>
209 | <dd class="field-odd"><ul class="simple">
210 | <li><p><strong>status</strong> (<a class="reference internal" href="#mcp_server_webcrawl.crawlers.base.adapter.IndexStatus" title="mcp_server_webcrawl.crawlers.base.adapter.IndexStatus"><em>IndexStatus</em></a>) – </p></li>
211 | <li><p><strong>processed</strong> (<a class="reference external" href="https://docs.python.org/3/library/functions.html#int" title="(in Python v3.14)"><em>int</em></a>) – </p></li>
212 | <li><p><strong>time_start</strong> (<a class="reference external" href="https://docs.python.org/3/library/datetime.html#datetime.datetime" title="(in Python v3.14)"><em>datetime</em></a><em> | </em><em>None</em>) – </p></li>
213 | <li><p><strong>time_end</strong> (<a class="reference external" href="https://docs.python.org/3/library/datetime.html#datetime.datetime" title="(in Python v3.14)"><em>datetime</em></a><em> | </em><em>None</em>) – </p></li>
214 | </ul>
215 | </dd>
216 | <dt class="field-even">Return type<span class="colon">:</span></dt>
217 | <dd class="field-even"><p>None</p>
218 | </dd>
219 | </dl>
220 | </dd></dl>
221 |
222 | </dd></dl>
223 |
224 | <dl class="py class">
225 | <dt class="sig sig-object py" id="mcp_server_webcrawl.crawlers.base.adapter.SitesGroup">
226 | <em class="property"><span class="pre">class</span><span class="w"> </span></em><span class="sig-name descname"><span class="pre">SitesGroup</span></span><a class="reference internal" href="_modules/mcp_server_webcrawl/crawlers/base/adapter.html#SitesGroup"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#mcp_server_webcrawl.crawlers.base.adapter.SitesGroup" title="Link to this definition"></a></dt>
227 | <dd><p>Bases: <a class="reference external" href="https://docs.python.org/3/library/functions.html#object" title="(in Python v3.14)"><code class="xref py py-class docutils literal notranslate"><span class="pre">object</span></code></a></p>
228 | <p>Container class that supports searching one or more sites at once.</p>
229 | <dl class="field-list simple">
230 | <dt class="field-odd">Parameters<span class="colon">:</span></dt>
231 | <dd class="field-odd"><ul class="simple">
232 | <li><p><strong>datasrc</strong> – site datasrc</p></li>
233 | <li><p><strong>site_ids</strong> – site ids of the sites</p></li>
234 | <li><p><strong>site_paths</strong> – paths to site contents (directories)</p></li>
235 | </ul>
236 | </dd>
237 | </dl>
238 | <dl class="py method">
239 | <dt class="sig sig-object py" id="mcp_server_webcrawl.crawlers.base.adapter.SitesGroup.__init__">
240 | <span class="sig-name descname"><span class="pre">__init__</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">datasrc</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">site_ids</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">site_paths</span></span></em><span class="sig-paren">)</span><a class="reference internal" href="_modules/mcp_server_webcrawl/crawlers/base/adapter.html#SitesGroup.__init__"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#mcp_server_webcrawl.crawlers.base.adapter.SitesGroup.__init__" title="Link to this definition"></a></dt>
241 | <dd><p>Container class that supports searching one or more sites at once.</p>
242 | <dl class="field-list simple">
243 | <dt class="field-odd">Parameters<span class="colon">:</span></dt>
244 | <dd class="field-odd"><ul class="simple">
245 | <li><p><strong>datasrc</strong> (<a class="reference external" href="https://docs.python.org/3/library/pathlib.html#pathlib.Path" title="(in Python v3.14)"><em>Path</em></a>) – site datasrc</p></li>
246 | <li><p><strong>site_ids</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#list" title="(in Python v3.14)"><em>list</em></a><em>[</em><a class="reference external" href="https://docs.python.org/3/library/functions.html#int" title="(in Python v3.14)"><em>int</em></a><em>]</em>) – site ids of the sites</p></li>
247 | <li><p><strong>site_paths</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#list" title="(in Python v3.14)"><em>list</em></a><em>[</em><a class="reference external" href="https://docs.python.org/3/library/pathlib.html#pathlib.Path" title="(in Python v3.14)"><em>Path</em></a><em>]</em>) – paths to site contents (directories)</p></li>
248 | </ul>
249 | </dd>
250 | <dt class="field-even">Return type<span class="colon">:</span></dt>
251 | <dd class="field-even"><p>None</p>
252 | </dd>
253 | </dl>
254 | </dd></dl>
255 |
256 | <dl class="py method">
257 | <dt class="sig sig-object py" id="mcp_server_webcrawl.crawlers.base.adapter.SitesGroup.get_sites">
258 | <span class="sig-name descname"><span class="pre">get_sites</span></span><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="reference internal" href="_modules/mcp_server_webcrawl/crawlers/base/adapter.html#SitesGroup.get_sites"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#mcp_server_webcrawl.crawlers.base.adapter.SitesGroup.get_sites" title="Link to this definition"></a></dt>
259 | <dd><dl class="field-list simple">
260 | <dt class="field-odd">Return type<span class="colon">:</span></dt>
261 | <dd class="field-odd"><p><a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#dict" title="(in Python v3.14)">dict</a>[<a class="reference external" href="https://docs.python.org/3/library/functions.html#int" title="(in Python v3.14)">int</a>, <a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.14)">str</a>]</p>
262 | </dd>
263 | </dl>
264 | </dd></dl>
265 |
266 | </dd></dl>
267 |
268 | <dl class="py class">
269 | <dt class="sig sig-object py" id="mcp_server_webcrawl.crawlers.base.adapter.SitesStat">
270 | <em class="property"><span class="pre">class</span><span class="w"> </span></em><span class="sig-name descname"><span class="pre">SitesStat</span></span><a class="reference internal" href="_modules/mcp_server_webcrawl/crawlers/base/adapter.html#SitesStat"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#mcp_server_webcrawl.crawlers.base.adapter.SitesStat" title="Link to this definition"></a></dt>
271 | <dd><p>Bases: <a class="reference external" href="https://docs.python.org/3/library/functions.html#object" title="(in Python v3.14)"><code class="xref py py-class docutils literal notranslate"><span class="pre">object</span></code></a></p>
272 | <p>Some basic bookkeeping, for troubleshooting</p>
273 | <dl class="field-list simple">
274 | <dt class="field-odd">Parameters<span class="colon">:</span></dt>
275 | <dd class="field-odd"><ul class="simple">
276 | <li><p><strong>group</strong> – SitesGroup to track statistics for</p></li>
277 | <li><p><strong>cached</strong> – whether the group was retrieved from cache</p></li>
278 | </ul>
279 | </dd>
280 | </dl>
281 | <dl class="py method">
282 | <dt class="sig sig-object py" id="mcp_server_webcrawl.crawlers.base.adapter.SitesStat.__init__">
283 | <span class="sig-name descname"><span class="pre">__init__</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">group</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">cached</span></span></em><span class="sig-paren">)</span><a class="reference internal" href="_modules/mcp_server_webcrawl/crawlers/base/adapter.html#SitesStat.__init__"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#mcp_server_webcrawl.crawlers.base.adapter.SitesStat.__init__" title="Link to this definition"></a></dt>
284 | <dd><p>Some basic bookkeeping, for troubleshooting</p>
285 | <dl class="field-list simple">
286 | <dt class="field-odd">Parameters<span class="colon">:</span></dt>
287 | <dd class="field-odd"><ul class="simple">
288 | <li><p><strong>group</strong> (<a class="reference internal" href="#mcp_server_webcrawl.crawlers.base.adapter.SitesGroup" title="mcp_server_webcrawl.crawlers.base.adapter.SitesGroup"><em>SitesGroup</em></a>) – SitesGroup to track statistics for</p></li>
289 | <li><p><strong>cached</strong> (<a class="reference external" href="https://docs.python.org/3/library/functions.html#bool" title="(in Python v3.14)"><em>bool</em></a>) – whether the group was retrieved from cache</p></li>
290 | </ul>
291 | </dd>
292 | <dt class="field-even">Return type<span class="colon">:</span></dt>
293 | <dd class="field-even"><p>None</p>
294 | </dd>
295 | </dl>
296 | </dd></dl>
297 |
298 | </dd></dl>
299 |
300 | <dl class="py class">
301 | <dt class="sig sig-object py" id="mcp_server_webcrawl.crawlers.base.adapter.BaseManager">
302 | <em class="property"><span class="pre">class</span><span class="w"> </span></em><span class="sig-name descname"><span class="pre">BaseManager</span></span><a class="reference internal" href="_modules/mcp_server_webcrawl/crawlers/base/adapter.html#BaseManager"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#mcp_server_webcrawl.crawlers.base.adapter.BaseManager" title="Link to this definition"></a></dt>
303 | <dd><p>Bases: <a class="reference external" href="https://docs.python.org/3/library/functions.html#object" title="(in Python v3.14)"><code class="xref py py-class docutils literal notranslate"><span class="pre">object</span></code></a></p>
304 | <p>Base class for managing web crawler data in in-memory SQLite databases.
305 | Provides connection pooling and caching for efficient access.</p>
306 | <p>Initialize the manager with statistics.</p>
307 | <dl class="py method">
308 | <dt class="sig sig-object py" id="mcp_server_webcrawl.crawlers.base.adapter.BaseManager.__init__">
309 | <span class="sig-name descname"><span class="pre">__init__</span></span><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="reference internal" href="_modules/mcp_server_webcrawl/crawlers/base/adapter.html#BaseManager.__init__"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#mcp_server_webcrawl.crawlers.base.adapter.BaseManager.__init__" title="Link to this definition"></a></dt>
310 | <dd><p>Initialize the manager with statistics.</p>
311 | <dl class="field-list simple">
312 | <dt class="field-odd">Return type<span class="colon">:</span></dt>
313 | <dd class="field-odd"><p>None</p>
314 | </dd>
315 | </dl>
316 | </dd></dl>
317 |
318 | <dl class="py method">
319 | <dt class="sig sig-object py" id="mcp_server_webcrawl.crawlers.base.adapter.BaseManager.string_to_id">
320 | <em class="property"><span class="pre">static</span><span class="w"> </span></em><span class="sig-name descname"><span class="pre">string_to_id</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">value</span></span></em><span class="sig-paren">)</span><a class="reference internal" href="_modules/mcp_server_webcrawl/crawlers/base/adapter.html#BaseManager.string_to_id"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#mcp_server_webcrawl.crawlers.base.adapter.BaseManager.string_to_id" title="Link to this definition"></a></dt>
321 | <dd><p>Convert a string, such as a directory name, to a numeric ID
322 | suitable for a database primary key.</p>
323 | <p>Hash space and collision probability notes:
324 | - [:8] = 32 bits (4.29 billion values) - ~1% collision chance with 10,000 items
325 | - [:12] = 48 bits (280 trillion values) - ~0.00002% collision chance with 10,000 items
326 | - [:16] = 64 bits (max safe SQLite INTEGER) - near-zero collision, 9.22 quintillion values
327 | - SQLite INTEGER type is 64-bit signed, with max value of 9,223,372,036,854,775,807.
328 | - The big problem with larger hashspaces is the length of the ids they generate for presentation.</p>
329 | <dl class="field-list simple">
330 | <dt class="field-odd">Parameters<span class="colon">:</span></dt>
331 | <dd class="field-odd"><p><strong>value</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.14)"><em>str</em></a>) – Input string to convert to an ID</p>
332 | </dd>
333 | <dt class="field-even">Returns<span class="colon">:</span></dt>
334 | <dd class="field-even"><p>Integer ID derived from the input string</p>
335 | </dd>
336 | <dt class="field-odd">Return type<span class="colon">:</span></dt>
337 | <dd class="field-odd"><p><a class="reference external" href="https://docs.python.org/3/library/functions.html#int" title="(in Python v3.14)">int</a></p>
338 | </dd>
339 | </dl>
340 | </dd></dl>
341 |
342 | <dl class="py method">
343 | <dt class="sig sig-object py" id="mcp_server_webcrawl.crawlers.base.adapter.BaseManager.get_basic_headers">
344 | <em class="property"><span class="pre">static</span><span class="w"> </span></em><span class="sig-name descname"><span class="pre">get_basic_headers</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">file_size</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">resource_type</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">path</span></span></em><span class="sig-paren">)</span><a class="reference internal" href="_modules/mcp_server_webcrawl/crawlers/base/adapter.html#BaseManager.get_basic_headers"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#mcp_server_webcrawl.crawlers.base.adapter.BaseManager.get_basic_headers" title="Link to this definition"></a></dt>
345 | <dd><p>Generate basic HTTP headers for a resource.</p>
346 | <dl class="field-list simple">
347 | <dt class="field-odd">Parameters<span class="colon">:</span></dt>
348 | <dd class="field-odd"><ul class="simple">
349 | <li><p><strong>file_size</strong> (<a class="reference external" href="https://docs.python.org/3/library/functions.html#int" title="(in Python v3.14)"><em>int</em></a>) – size of the file in bytes</p></li>
350 | <li><p><strong>resource_type</strong> (<a class="reference internal" href="mcp_server_webcrawl.models.html#mcp_server_webcrawl.models.resources.ResourceResultType" title="mcp_server_webcrawl.models.resources.ResourceResultType"><em>ResourceResultType</em></a>) – type of resource to generate headers for</p></li>
351 | <li><p><strong>path</strong> (<a class="reference external" href="https://docs.python.org/3/library/pathlib.html#pathlib.Path" title="(in Python v3.14)"><em>Path</em></a>) – file path used for MIME type detection</p></li>
352 | </ul>
353 | </dd>
354 | <dt class="field-even">Returns<span class="colon">:</span></dt>
355 | <dd class="field-even"><p>HTTP headers string with content type and length</p>
356 | </dd>
357 | <dt class="field-odd">Return type<span class="colon">:</span></dt>
358 | <dd class="field-odd"><p><a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.14)">str</a></p>
359 | </dd>
360 | </dl>
361 | </dd></dl>
362 |
363 | <dl class="py method">
364 | <dt class="sig sig-object py" id="mcp_server_webcrawl.crawlers.base.adapter.BaseManager.read_files">
365 | <em class="property"><span class="pre">static</span><span class="w"> </span></em><span class="sig-name descname"><span class="pre">read_files</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">paths</span></span></em><span class="sig-paren">)</span><a class="reference internal" href="_modules/mcp_server_webcrawl/crawlers/base/adapter.html#BaseManager.read_files"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#mcp_server_webcrawl.crawlers.base.adapter.BaseManager.read_files" title="Link to this definition"></a></dt>
366 | <dd><p>Read content from multiple files concurrently.</p>
367 | <dl class="field-list simple">
368 | <dt class="field-odd">Parameters<span class="colon">:</span></dt>
369 | <dd class="field-odd"><p><strong>paths</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#list" title="(in Python v3.14)"><em>list</em></a><em>[</em><a class="reference external" href="https://docs.python.org/3/library/pathlib.html#pathlib.Path" title="(in Python v3.14)"><em>Path</em></a><em>]</em>) – list of file paths to read</p>
370 | </dd>
371 | <dt class="field-even">Returns<span class="colon">:</span></dt>
372 | <dd class="field-even"><p>dictionary mapping file paths to their content or None for binary/unreadable files</p>
373 | </dd>
374 | <dt class="field-odd">Return type<span class="colon">:</span></dt>
375 | <dd class="field-odd"><p><a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#dict" title="(in Python v3.14)">dict</a>[<a class="reference external" href="https://docs.python.org/3/library/pathlib.html#pathlib.Path" title="(in Python v3.14)"><em>Path</em></a>, <a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.14)">str</a> | None]</p>
376 | </dd>
377 | </dl>
378 | </dd></dl>
379 |
380 | <dl class="py method">
381 | <dt class="sig sig-object py" id="mcp_server_webcrawl.crawlers.base.adapter.BaseManager.read_file_contents">
382 | <em class="property"><span class="pre">static</span><span class="w"> </span></em><span class="sig-name descname"><span class="pre">read_file_contents</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">file_path</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">resource_type</span></span></em><span class="sig-paren">)</span><a class="reference internal" href="_modules/mcp_server_webcrawl/crawlers/base/adapter.html#BaseManager.read_file_contents"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#mcp_server_webcrawl.crawlers.base.adapter.BaseManager.read_file_contents" title="Link to this definition"></a></dt>
383 | <dd><p>Read content from text files with better error handling and encoding detection.</p>
384 | <dl class="field-list simple">
385 | <dt class="field-odd">Parameters<span class="colon">:</span></dt>
386 | <dd class="field-odd"><ul class="simple">
387 | <li><p><strong>file_path</strong> (<a class="reference external" href="https://docs.python.org/3/library/pathlib.html#pathlib.Path" title="(in Python v3.14)"><em>Path</em></a>) – path to the file to read</p></li>
388 | <li><p><strong>resource_type</strong> (<a class="reference internal" href="mcp_server_webcrawl.models.html#mcp_server_webcrawl.models.resources.ResourceResultType" title="mcp_server_webcrawl.models.resources.ResourceResultType"><em>ResourceResultType</em></a>) – type of resource to determine if content should be read</p></li>
389 | </ul>
390 | </dd>
391 | <dt class="field-even">Returns<span class="colon">:</span></dt>
392 | <dd class="field-even"><p>file content as string or None for binary/unreadable files</p>
393 | </dd>
394 | <dt class="field-odd">Return type<span class="colon">:</span></dt>
395 | <dd class="field-odd"><p><a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.14)">str</a> | None</p>
396 | </dd>
397 | </dl>
398 | </dd></dl>
399 |
400 | <dl class="py method">
401 | <dt class="sig sig-object py" id="mcp_server_webcrawl.crawlers.base.adapter.BaseManager.decruft_path">
402 | <em class="property"><span class="pre">static</span><span class="w"> </span></em><span class="sig-name descname"><span class="pre">decruft_path</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">path</span></span></em><span class="sig-paren">)</span><a class="reference internal" href="_modules/mcp_server_webcrawl/crawlers/base/adapter.html#BaseManager.decruft_path"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#mcp_server_webcrawl.crawlers.base.adapter.BaseManager.decruft_path" title="Link to this definition"></a></dt>
403 | <dd><p>Very light-touch cleanup of file naming; temporary-file names create noise,
404 | and extensions are useful in classifying resources.</p>
405 | <dl class="field-list simple">
406 | <dt class="field-odd">Parameters<span class="colon">:</span></dt>
407 | <dd class="field-odd"><p><strong>path</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.14)"><em>str</em></a>) – file path string to clean up</p>
408 | </dd>
409 | <dt class="field-even">Returns<span class="colon">:</span></dt>
410 | <dd class="field-even"><p>cleaned path string with temp files and weird extensions normalized</p>
411 | </dd>
412 | <dt class="field-odd">Return type<span class="colon">:</span></dt>
413 | <dd class="field-odd"><p><a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.14)">str</a></p>
414 | </dd>
415 | </dl>
416 | </dd></dl>
417 |
418 | <dl class="py method">
419 | <dt class="sig sig-object py" id="mcp_server_webcrawl.crawlers.base.adapter.BaseManager.get_stats">
420 | <span class="sig-name descname"><span class="pre">get_stats</span></span><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="reference internal" href="_modules/mcp_server_webcrawl/crawlers/base/adapter.html#BaseManager.get_stats"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#mcp_server_webcrawl.crawlers.base.adapter.BaseManager.get_stats" title="Link to this definition"></a></dt>
421 | <dd><dl class="field-list simple">
422 | <dt class="field-odd">Return type<span class="colon">:</span></dt>
423 | <dd class="field-odd"><p><a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#list" title="(in Python v3.14)">list</a>[<a class="reference internal" href="#mcp_server_webcrawl.crawlers.base.adapter.SitesStat" title="mcp_server_webcrawl.crawlers.base.adapter.SitesStat"><em>SitesStat</em></a>]</p>
424 | </dd>
425 | </dl>
426 | </dd></dl>
427 |
428 | <dl class="py method">
429 | <dt class="sig sig-object py" id="mcp_server_webcrawl.crawlers.base.adapter.BaseManager.get_resources_for_sites_group">
430 | <span class="sig-name descname"><span class="pre">get_resources_for_sites_group</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">sites_group</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">query</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">fields</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">sort</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">limit</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">offset</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">swap_values</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">{}</span></span></em><span class="sig-paren">)</span><a class="reference internal" href="_modules/mcp_server_webcrawl/crawlers/base/adapter.html#BaseManager.get_resources_for_sites_group"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#mcp_server_webcrawl.crawlers.base.adapter.BaseManager.get_resources_for_sites_group" title="Link to this definition"></a></dt>
431 | <dd><p>Get resources from directories using structured query parsing with SearchQueryParser.</p>
432 | <p>This method extracts types, fields, and statuses from the querystring instead of
433 | accepting them as separate arguments, using the new SearchSubquery functionality.</p>
434 | <dl class="field-list simple">
435 | <dt class="field-odd">Parameters<span class="colon">:</span></dt>
436 | <dd class="field-odd"><ul class="simple">
437 | <li><p><strong>sites_group</strong> (<a class="reference internal" href="#mcp_server_webcrawl.crawlers.base.adapter.SitesGroup" title="mcp_server_webcrawl.crawlers.base.adapter.SitesGroup"><em>SitesGroup</em></a>) – Group of sites to search in</p></li>
438 | <li><p><strong>query</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.14)"><em>str</em></a>) – Search query string that can include field:value syntax for filtering</p></li>
439 | <li><p><strong>fields</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#list" title="(in Python v3.14)"><em>list</em></a><em>[</em><a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.14)"><em>str</em></a><em>] </em><em>| </em><em>None</em>) – resource fields to be returned by the API (Content, Headers, etc.)</p></li>
440 | <li><p><strong>sort</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.14)"><em>str</em></a><em> | </em><em>None</em>) – Sort order for results</p></li>
441 | <li><p><strong>limit</strong> (<a class="reference external" href="https://docs.python.org/3/library/functions.html#int" title="(in Python v3.14)"><em>int</em></a>) – Maximum number of results to return</p></li>
442 | <li><p><strong>offset</strong> (<a class="reference external" href="https://docs.python.org/3/library/functions.html#int" title="(in Python v3.14)"><em>int</em></a>) – Number of results to skip for pagination</p></li>
443 | <li><p><strong>swap_values</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#dict" title="(in Python v3.14)"><em>dict</em></a>) – per-field parameterized values to check for (and replace)</p></li>
444 | </ul>
445 | </dd>
446 | <dt class="field-even">Returns<span class="colon">:</span></dt>
447 | <dd class="field-even"><p>Tuple of (list of ResourceResult objects, total count, connection_index_state)</p>
448 | </dd>
449 | <dt class="field-odd">Return type<span class="colon">:</span></dt>
450 | <dd class="field-odd"><p><a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#tuple" title="(in Python v3.14)">tuple</a>[<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#list" title="(in Python v3.14)">list</a>[<a class="reference internal" href="mcp_server_webcrawl.models.html#mcp_server_webcrawl.models.resources.ResourceResult" title="mcp_server_webcrawl.models.resources.ResourceResult"><em>ResourceResult</em></a>], <a class="reference external" href="https://docs.python.org/3/library/functions.html#int" title="(in Python v3.14)">int</a>, <a class="reference internal" href="#mcp_server_webcrawl.crawlers.base.adapter.IndexState" title="mcp_server_webcrawl.crawlers.base.adapter.IndexState"><em>IndexState</em></a>]</p>
451 | </dd>
452 | </dl>
453 | <p class="rubric">Notes</p>
454 | <p>Returns empty results if sites is empty or not provided.
455 | If the database is being built, it will log a message and return empty results.</p>
456 | <p>This method extracts field-specific filters from the query string using SearchQueryParser:
457 | - type:html (to filter by resource type)
458 | - status:200 (to filter by HTTP status)
459 | Any fields present in the SearchSubquery will be included in the response.</p>
460 | </dd></dl>
461 |
462 | </dd></dl>
463 |
464 | </section>
465 | <section id="module-mcp_server_webcrawl.crawlers.base.api">
466 | <span id="mcp-server-webcrawl-crawlers-base-api-module"></span><h2>mcp_server_webcrawl.crawlers.base.api module<a class="headerlink" href="#module-mcp_server_webcrawl.crawlers.base.api" title="Link to this heading"></a></h2>
467 | <dl class="py class">
468 | <dt class="sig sig-object py" id="mcp_server_webcrawl.crawlers.base.api.BaseJsonApiEncoder">
469 | <em class="property"><span class="pre">class</span><span class="w"> </span></em><span class="sig-name descname"><span class="pre">BaseJsonApiEncoder</span></span><a class="reference internal" href="_modules/mcp_server_webcrawl/crawlers/base/api.html#BaseJsonApiEncoder"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#mcp_server_webcrawl.crawlers.base.api.BaseJsonApiEncoder" title="Link to this definition"></a></dt>
470 | <dd><p>Bases: <code class="xref py py-class docutils literal notranslate"><span class="pre">JSONEncoder</span></code></p>
471 | <p>Custom JSON encoder for BaseJsonApi objects and ResourceResultType enums.</p>
472 | <p>Constructor for JSONEncoder, with sensible defaults.</p>
473 | <p>If skipkeys is false, then it is a TypeError to attempt
474 | encoding of keys that are not str, int, float or None. If
475 | skipkeys is True, such items are simply skipped.</p>
476 | <p>If ensure_ascii is true, the output is guaranteed to be str
477 | objects with all incoming non-ASCII characters escaped. If
478 | ensure_ascii is false, the output can contain non-ASCII characters.</p>
479 | <p>If check_circular is true, then lists, dicts, and custom encoded
480 | objects will be checked for circular references during encoding to
481 | prevent an infinite recursion (which would cause an OverflowError).
482 | Otherwise, no such check takes place.</p>
483 | <p>If allow_nan is true, then NaN, Infinity, and -Infinity will be
484 | encoded as such. This behavior is not JSON specification compliant,
485 | but is consistent with most JavaScript based encoders and decoders.
486 | Otherwise, it will be a ValueError to encode such floats.</p>
487 | <p>If sort_keys is true, then the output of dictionaries will be
488 | sorted by key; this is useful for regression tests to ensure
489 | that JSON serializations can be compared on a day-to-day basis.</p>
490 | <p>If indent is a non-negative integer, then JSON array
491 | elements and object members will be pretty-printed with that
492 | indent level. An indent level of 0 will only insert newlines.
493 | None is the most compact representation.</p>
494 | <p>If specified, separators should be an (item_separator, key_separator)
495 | tuple. The default is (', ', ': ') if <em>indent</em> is <code class="docutils literal notranslate"><span class="pre">None</span></code> and
496 | (',', ': ') otherwise. To get the most compact JSON representation,
497 | you should specify (',', ':') to eliminate whitespace.</p>
498 | <p>If specified, default is a function that gets called for objects
499 | that can’t otherwise be serialized. It should return a JSON encodable
500 | version of the object or raise a <code class="docutils literal notranslate"><span class="pre">TypeError</span></code>.</p>
501 | <dl class="py method">
502 | <dt class="sig sig-object py" id="mcp_server_webcrawl.crawlers.base.api.BaseJsonApiEncoder.default">
503 | <span class="sig-name descname"><span class="pre">default</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">obj</span></span></em><span class="sig-paren">)</span><a class="reference internal" href="_modules/mcp_server_webcrawl/crawlers/base/api.html#BaseJsonApiEncoder.default"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#mcp_server_webcrawl.crawlers.base.api.BaseJsonApiEncoder.default" title="Link to this definition"></a></dt>
504 | <dd><p>Override default encoder to handle custom types.</p>
505 | <dl class="field-list simple">
506 | <dt class="field-odd">Parameters<span class="colon">:</span></dt>
507 | <dd class="field-odd"><p><strong>obj</strong> – Object to encode</p>
508 | </dd>
509 | <dt class="field-even">Returns<span class="colon">:</span></dt>
510 | <dd class="field-even"><p>JSON serializable representation of the object</p>
511 | </dd>
512 | <dt class="field-odd">Return type<span class="colon">:</span></dt>
513 | <dd class="field-odd"><p><a class="reference external" href="https://docs.python.org/3/library/typing.html#typing.Any" title="(in Python v3.14)"><em>Any</em></a></p>
514 | </dd>
515 | </dl>
516 | </dd></dl>
517 |
518 | </dd></dl>
519 |
520 | <dl class="py class">
521 | <dt class="sig sig-object py" id="mcp_server_webcrawl.crawlers.base.api.BaseJsonApi">
522 | <em class="property"><span class="pre">class</span><span class="w"> </span></em><span class="sig-name descname"><span class="pre">BaseJsonApi</span></span><a class="reference internal" href="_modules/mcp_server_webcrawl/crawlers/base/api.html#BaseJsonApi"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#mcp_server_webcrawl.crawlers.base.api.BaseJsonApi" title="Link to this definition"></a></dt>
523 | <dd><p>Bases: <a class="reference external" href="https://docs.python.org/3/library/functions.html#object" title="(in Python v3.14)"><code class="xref py py-class docutils literal notranslate"><span class="pre">object</span></code></a></p>
524 | <p>Base class for JSON API responses.</p>
525 | <p>Provides a standardized structure for API responses including metadata,
526 | results, and error handling.</p>
527 | <p>Construct with the arguments of creation (aoc); these will be echoed back in the
528 | JSON response. This is an object that collapses into JSON on json dumps; this is
529 | achieved by everything within it implementing to_dict.</p>
530 | <dl class="field-list simple">
531 | <dt class="field-odd">Parameters<span class="colon">:</span></dt>
532 | <dd class="field-odd"><ul class="simple">
533 | <li><p><strong>method</strong> – API method name</p></li>
534 | <li><p><strong>args</strong> – Dictionary of API arguments</p></li>
535 | <li><p><strong>index_state</strong> – indexing, complete, remote, etc.</p></li>
536 | </ul>
537 | </dd>
538 | </dl>
539 | <dl class="py method">
540 | <dt class="sig sig-object py" id="mcp_server_webcrawl.crawlers.base.api.BaseJsonApi.__init__">
541 | <span class="sig-name descname"><span class="pre">__init__</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">method</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">args</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">index_state</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">None</span></span></em><span class="sig-paren">)</span><a class="reference internal" href="_modules/mcp_server_webcrawl/crawlers/base/api.html#BaseJsonApi.__init__"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#mcp_server_webcrawl.crawlers.base.api.BaseJsonApi.__init__" title="Link to this definition"></a></dt>
542 | <dd><p>Construct with the arguments of creation (aoc); these will be echoed back in the
543 | JSON response. This is an object that collapses into JSON on json dumps; this is
544 | achieved by everything within it implementing to_dict.</p>
545 | <dl class="field-list simple">
546 | <dt class="field-odd">Parameters<span class="colon">:</span></dt>
547 | <dd class="field-odd"><ul class="simple">
548 | <li><p><strong>method</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.14)"><em>str</em></a>) – API method name</p></li>
549 | <li><p><strong>args</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#dict" title="(in Python v3.14)"><em>dict</em></a><em>[</em><a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.14)"><em>str</em></a><em>, </em><a class="reference external" href="https://docs.python.org/3/library/typing.html#typing.Any" title="(in Python v3.14)"><em>Any</em></a><em>]</em>) – Dictionary of API arguments</p></li>
550 | <li><p><strong>index_state</strong> (<a class="reference internal" href="#mcp_server_webcrawl.crawlers.base.adapter.IndexState" title="mcp_server_webcrawl.crawlers.base.adapter.IndexState"><em>IndexState</em></a><em> | </em><em>None</em>) – indexing, complete, remote, etc.</p></li>
551 | </ul>
552 | </dd>
553 | </dl>
554 | </dd></dl>
555 |
556 | <dl class="py property">
557 | <dt class="sig sig-object py" id="mcp_server_webcrawl.crawlers.base.api.BaseJsonApi.total">
558 | <em class="property"><span class="pre">property</span><span class="w"> </span></em><span class="sig-name descname"><span class="pre">total</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><a class="reference external" href="https://docs.python.org/3/library/functions.html#int" title="(in Python v3.14)"><span class="pre">int</span></a></em><a class="headerlink" href="#mcp_server_webcrawl.crawlers.base.api.BaseJsonApi.total" title="Link to this definition"></a></dt>
559 | <dd><p>Returns the total number of results.</p>
560 | <dl class="field-list simple">
561 | <dt class="field-odd">Returns<span class="colon">:</span></dt>
562 | <dd class="field-odd"><p>Integer count of total results</p>
563 | </dd>
564 | </dl>
565 | </dd></dl>
566 |
567 | <dl class="py method">
568 | <dt class="sig sig-object py" id="mcp_server_webcrawl.crawlers.base.api.BaseJsonApi.get_results">
569 | <span class="sig-name descname"><span class="pre">get_results</span></span><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="reference internal" href="_modules/mcp_server_webcrawl/crawlers/base/api.html#BaseJsonApi.get_results"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#mcp_server_webcrawl.crawlers.base.api.BaseJsonApi.get_results" title="Link to this definition"></a></dt>
570 | <dd><p>Returns list of results.</p>
571 | <dl class="field-list simple">
572 | <dt class="field-odd">Returns<span class="colon">:</span></dt>
573 | <dd class="field-odd"><p>Results of type SiteResult or ResourceResult</p>
574 | </dd>
575 | <dt class="field-even">Return type<span class="colon">:</span></dt>
576 | <dd class="field-even"><p><a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#list" title="(in Python v3.14)">list</a>[<a class="reference internal" href="mcp_server_webcrawl.models.html#mcp_server_webcrawl.models.sites.SiteResult" title="mcp_server_webcrawl.models.sites.SiteResult"><em>SiteResult</em></a> | <a class="reference internal" href="mcp_server_webcrawl.models.html#mcp_server_webcrawl.models.resources.ResourceResult" title="mcp_server_webcrawl.models.resources.ResourceResult"><em>ResourceResult</em></a>]</p>
577 | </dd>
578 | </dl>
579 | </dd></dl>
580 |
581 | <dl class="py method">
582 | <dt class="sig sig-object py" id="mcp_server_webcrawl.crawlers.base.api.BaseJsonApi.set_results">
583 | <span class="sig-name descname"><span class="pre">set_results</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">results</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">total</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">offset</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">limit</span></span></em><span class="sig-paren">)</span><a class="reference internal" href="_modules/mcp_server_webcrawl/crawlers/base/api.html#BaseJsonApi.set_results"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#mcp_server_webcrawl.crawlers.base.api.BaseJsonApi.set_results" title="Link to this definition"></a></dt>
584 | <dd><p>Set the results of the API response.</p>
585 | <dl class="field-list simple">
586 | <dt class="field-odd">Parameters<span class="colon">:</span></dt>
587 | <dd class="field-odd"><ul class="simple">
588 | <li><p><strong>results</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#list" title="(in Python v3.14)"><em>list</em></a><em>[</em><a class="reference internal" href="mcp_server_webcrawl.models.html#mcp_server_webcrawl.models.sites.SiteResult" title="mcp_server_webcrawl.models.sites.SiteResult"><em>SiteResult</em></a><em> | </em><a class="reference internal" href="mcp_server_webcrawl.models.html#mcp_server_webcrawl.models.resources.ResourceResult" title="mcp_server_webcrawl.models.resources.ResourceResult"><em>ResourceResult</em></a><em>]</em>) – List of result objects</p></li>
589 | <li><p><strong>total</strong> (<a class="reference external" href="https://docs.python.org/3/library/functions.html#int" title="(in Python v3.14)"><em>int</em></a>) – Total number of results (including those beyond limit)</p></li>
590 | <li><p><strong>offset</strong> (<a class="reference external" href="https://docs.python.org/3/library/functions.html#int" title="(in Python v3.14)"><em>int</em></a>) – Starting position in the full result set</p></li>
591 | <li><p><strong>limit</strong> (<a class="reference external" href="https://docs.python.org/3/library/functions.html#int" title="(in Python v3.14)"><em>int</em></a>) – Maximum number of results to include</p></li>
592 | </ul>
593 | </dd>
594 | <dt class="field-even">Return type<span class="colon">:</span></dt>
595 | <dd class="field-even"><p>None</p>
596 | </dd>
597 | </dl>
598 | </dd></dl>
599 |
600 | <dl class="py method">
601 | <dt class="sig sig-object py" id="mcp_server_webcrawl.crawlers.base.api.BaseJsonApi.append_error">
602 | <span class="sig-name descname"><span class="pre">append_error</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">message</span></span></em><span class="sig-paren">)</span><a class="reference internal" href="_modules/mcp_server_webcrawl/crawlers/base/api.html#BaseJsonApi.append_error"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#mcp_server_webcrawl.crawlers.base.api.BaseJsonApi.append_error" title="Link to this definition"></a></dt>
603 | <dd><p>Add an error to the JSON response, visible to the endpoint LLM.</p>
604 | <dl class="field-list simple">
605 | <dt class="field-odd">Parameters<span class="colon">:</span></dt>
606 | <dd class="field-odd"><p><strong>message</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.14)"><em>str</em></a>) – Error message to add</p>
607 | </dd>
608 | <dt class="field-even">Return type<span class="colon">:</span></dt>
609 | <dd class="field-even"><p>None</p>
610 | </dd>
611 | </dl>
612 | </dd></dl>
613 |
614 | <dl class="py method">
615 | <dt class="sig sig-object py" id="mcp_server_webcrawl.crawlers.base.api.BaseJsonApi.to_dict">
616 | <span class="sig-name descname"><span class="pre">to_dict</span></span><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="reference internal" href="_modules/mcp_server_webcrawl/crawlers/base/api.html#BaseJsonApi.to_dict"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#mcp_server_webcrawl.crawlers.base.api.BaseJsonApi.to_dict" title="Link to this definition"></a></dt>
617 | <dd><p>Convert the object to a JSON-serializable dictionary.</p>
618 | <dl class="field-list simple">
619 | <dt class="field-odd">Returns<span class="colon">:</span></dt>
620 | <dd class="field-odd"><p>Dictionary representation of the API response</p>
621 | </dd>
622 | <dt class="field-even">Return type<span class="colon">:</span></dt>
623 | <dd class="field-even"><p><a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#dict" title="(in Python v3.14)">dict</a>[<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.14)">str</a>, <a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.14)">str</a> | <a class="reference external" href="https://docs.python.org/3/library/functions.html#int" title="(in Python v3.14)">int</a> | <a class="reference external" href="https://docs.python.org/3/library/functions.html#float" title="(in Python v3.14)">float</a> | <a class="reference external" href="https://docs.python.org/3/library/functions.html#bool" title="(in Python v3.14)">bool</a> | <a class="reference external" href="https://docs.python.org/3/library/datetime.html#datetime.datetime" title="(in Python v3.14)"><em>datetime</em></a> | <a class="reference external" href="https://docs.python.org/3/library/pathlib.html#pathlib.Path" title="(in Python v3.14)"><em>Path</em></a> | <a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#dict" title="(in Python v3.14)">dict</a> | <a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#list" title="(in Python v3.14)">list</a> | None]</p>
624 | </dd>
625 | </dl>
626 | </dd></dl>
627 |
628 | <dl class="py method">
629 | <dt class="sig sig-object py" id="mcp_server_webcrawl.crawlers.base.api.BaseJsonApi.to_json">
630 | <span class="sig-name descname"><span class="pre">to_json</span></span><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="reference internal" href="_modules/mcp_server_webcrawl/crawlers/base/api.html#BaseJsonApi.to_json"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#mcp_server_webcrawl.crawlers.base.api.BaseJsonApi.to_json" title="Link to this definition"></a></dt>
631 | <dd><p>Return a JSON string representation of this object.</p>
632 | <dl class="field-list simple">
633 | <dt class="field-odd">Returns<span class="colon">:</span></dt>
634 | <dd class="field-odd"><p>JSON string representation of the API response</p>
635 | </dd>
636 | <dt class="field-even">Return type<span class="colon">:</span></dt>
637 | <dd class="field-even"><p><a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.14)">str</a></p>
638 | </dd>
639 | </dl>
640 | </dd></dl>
641 |
642 | </dd></dl>
643 |
644 | </section>
645 | <section id="module-mcp_server_webcrawl.crawlers.base.crawler">
646 | <span id="mcp-server-webcrawl-crawlers-base-crawler-module"></span><h2>mcp_server_webcrawl.crawlers.base.crawler module<a class="headerlink" href="#module-mcp_server_webcrawl.crawlers.base.crawler" title="Link to this heading"></a></h2>
647 | <dl class="py class">
648 | <dt class="sig sig-object py" id="mcp_server_webcrawl.crawlers.base.crawler.BaseCrawler">
649 | <em class="property"><span class="pre">class</span><span class="w"> </span></em><span class="sig-name descname"><span class="pre">BaseCrawler</span></span><a class="reference internal" href="_modules/mcp_server_webcrawl/crawlers/base/crawler.html#BaseCrawler"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#mcp_server_webcrawl.crawlers.base.crawler.BaseCrawler" title="Link to this definition"></a></dt>
650 | <dd><p>Bases: <a class="reference external" href="https://docs.python.org/3/library/functions.html#object" title="(in Python v3.14)"><code class="xref py py-class docutils literal notranslate"><span class="pre">object</span></code></a></p>
651 | <p>Base crawler class that implements MCP server functionality.</p>
652 | <p>This class provides the foundation for specialized crawlers to interact with
653 | the MCP server and handle tool operations for web resources.</p>
654 | <p>Initialize the BaseCrawler with a data source path and required adapter functions.</p>
655 | <dl class="field-list simple">
656 | <dt class="field-odd">Parameters<span class="colon">:</span></dt>
657 | <dd class="field-odd"><ul class="simple">
658 | <li><p><strong>datasrc</strong> – path to the data source</p></li>
659 | <li><p><strong>get_sites_func</strong> – function to retrieve sites from the data source</p></li>
660 | <li><p><strong>get_resources_func</strong> – function to retrieve resources from the data source</p></li>
661 | <li><p><strong>resource_field_mapping</strong> – mapping of resource field names to qualified field names (e.g. 'url' to 'ResourcesFullText.Url')</p></li>
662 | </ul>
663 | </dd>
664 | </dl>
665 | <dl class="py method">
666 | <dt class="sig sig-object py" id="mcp_server_webcrawl.crawlers.base.crawler.BaseCrawler.__init__">
667 | <span class="sig-name descname"><span class="pre">__init__</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">datasrc</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">get_sites_func</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">get_resources_func</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">resource_field_mapping</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">{'content':</span> <span class="pre">'ResourcesFullText.Content',</span> <span class="pre">'created':</span> <span class="pre">'Resources.Created',</span> <span class="pre">'fulltext':</span> <span class="pre">'ResourcesFullText',</span> <span class="pre">'headers':</span> <span class="pre">'ResourcesFullText.Headers',</span> <span class="pre">'id':</span> <span class="pre">'ResourcesFullText.Id',</span> <span class="pre">'modified':</span> <span class="pre">'Resources.Modified',</span> <span class="pre">'site':</span> <span class="pre">'ResourcesFullText.Project',</span> <span class="pre">'size':</span> <span class="pre">'Resources.Size',</span> <span class="pre">'status':</span> <span class="pre">'Resources.Status',</span> <span class="pre">'time':</span> <span class="pre">'Resources.Time',</span> <span class="pre">'type':</span> <span class="pre">'ResourcesFullText.Type',</span> <span class="pre">'url':</span> <span class="pre">'ResourcesFullText.Url'}</span></span></em><span class="sig-paren">)</span><a class="reference internal" href="_modules/mcp_server_webcrawl/crawlers/base/crawler.html#BaseCrawler.__init__"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#mcp_server_webcrawl.crawlers.base.crawler.BaseCrawler.__init__" title="Link to this definition"></a></dt>
668 | <dd><p>Initialize the BaseCrawler with a data source path and required adapter functions.</p>
669 | <dl class="field-list simple">
670 | <dt class="field-odd">Parameters<span class="colon">:</span></dt>
671 | <dd class="field-odd"><ul class="simple">
672 | <li><p><strong>datasrc</strong> (<a class="reference external" href="https://docs.python.org/3/library/pathlib.html#pathlib.Path" title="(in Python v3.14)"><em>Path</em></a>) – path to the data source</p></li>
673 | <li><p><strong>get_sites_func</strong> (<a class="reference external" href="https://docs.python.org/3/library/typing.html#typing.Callable" title="(in Python v3.14)"><em>Callable</em></a>) – function to retrieve sites from the data source</p></li>
674 | <li><p><strong>get_resources_func</strong> (<a class="reference external" href="https://docs.python.org/3/library/typing.html#typing.Callable" title="(in Python v3.14)"><em>Callable</em></a>) – function to retrieve resources from the data source</p></li>
675 | <li><p><strong>resource_field_mapping</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#dict" title="(in Python v3.14)"><em>dict</em></a><em>[</em><a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.14)"><em>str</em></a><em>, </em><a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.14)"><em>str</em></a><em>]</em>) – mapping of resource field names to qualified field names (e.g. 'url' to 'ResourcesFullText.Url')</p></li>
676 | </ul>
677 | </dd>
678 | <dt class="field-even">Return type<span class="colon">:</span></dt>
679 | <dd class="field-even"><p>None</p>
680 | </dd>
681 | </dl>
682 | </dd></dl>
683 |
684 | <dl class="py property">
685 | <dt class="sig sig-object py" id="mcp_server_webcrawl.crawlers.base.crawler.BaseCrawler.datasrc">
686 | <em class="property"><span class="pre">property</span><span class="w"> </span></em><span class="sig-name descname"><span class="pre">datasrc</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><a class="reference external" href="https://docs.python.org/3/library/pathlib.html#pathlib.Path" title="(in Python v3.14)"><span class="pre">Path</span></a></em><a class="headerlink" href="#mcp_server_webcrawl.crawlers.base.crawler.BaseCrawler.datasrc" title="Link to this definition"></a></dt>
687 | <dd></dd></dl>
688 |
689 | <dl class="py method">
690 | <dt class="sig sig-object py" id="mcp_server_webcrawl.crawlers.base.crawler.BaseCrawler.mcp_list_prompts">
691 | <em class="property"><span class="pre">async</span><span class="w"> </span></em><span class="sig-name descname"><span class="pre">mcp_list_prompts</span></span><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="reference internal" href="_modules/mcp_server_webcrawl/crawlers/base/crawler.html#BaseCrawler.mcp_list_prompts"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#mcp_server_webcrawl.crawlers.base.crawler.BaseCrawler.mcp_list_prompts" title="Link to this definition"></a></dt>
692 | <dd><p>List available prompts (currently none).</p>
693 | <dl class="field-list simple">
694 | <dt class="field-odd">Return type<span class="colon">:</span></dt>
695 | <dd class="field-odd"><p><a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#list" title="(in Python v3.14)">list</a></p>
696 | </dd>
697 | </dl>
698 | </dd></dl>
699 |
700 | <dl class="py method">
701 | <dt class="sig sig-object py" id="mcp_server_webcrawl.crawlers.base.crawler.BaseCrawler.mcp_list_resources">
702 | <em class="property"><span class="pre">async</span><span class="w"> </span></em><span class="sig-name descname"><span class="pre">mcp_list_resources</span></span><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="reference internal" href="_modules/mcp_server_webcrawl/crawlers/base/crawler.html#BaseCrawler.mcp_list_resources"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#mcp_server_webcrawl.crawlers.base.crawler.BaseCrawler.mcp_list_resources" title="Link to this definition"></a></dt>
703 | <dd><p>List available resources (currently none).</p>
704 | <dl class="field-list simple">
705 | <dt class="field-odd">Return type<span class="colon">:</span></dt>
706 | <dd class="field-odd"><p><a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#list" title="(in Python v3.14)">list</a></p>
707 | </dd>
708 | </dl>
709 | </dd></dl>
710 |
711 | <dl class="py method">
712 | <dt class="sig sig-object py" id="mcp_server_webcrawl.crawlers.base.crawler.BaseCrawler.serve">
713 | <em class="property"><span class="pre">async</span><span class="w"> </span></em><span class="sig-name descname"><span class="pre">serve</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">stdin</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">stdout</span></span></em><span class="sig-paren">)</span><a class="reference internal" href="_modules/mcp_server_webcrawl/crawlers/base/crawler.html#BaseCrawler.serve"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#mcp_server_webcrawl.crawlers.base.crawler.BaseCrawler.serve" title="Link to this definition"></a></dt>
714 | <dd><p>Launch the awaitable server.</p>
715 | <dl class="field-list simple">
716 | <dt class="field-odd">Parameters<span class="colon">:</span></dt>
717 | <dd class="field-odd"><ul class="simple">
718 | <li><p><strong>stdin</strong> (<em>AsyncFile</em><em>[</em><a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.14)"><em>str</em></a><em>] </em><em>| </em><em>None</em>) – input stream for the server</p></li>
719 | <li><p><strong>stdout</strong> (<em>AsyncFile</em><em>[</em><a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.14)"><em>str</em></a><em>] </em><em>| </em><em>None</em>) – output stream for the server</p></li>
720 | </ul>
721 | </dd>
722 | <dt class="field-even">Returns<span class="colon">:</span></dt>
723 | <dd class="field-even"><p>The MCP server over stdio</p>
724 | </dd>
725 | <dt class="field-odd">Return type<span class="colon">:</span></dt>
726 | <dd class="field-odd"><p><a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#dict" title="(in Python v3.14)">dict</a>[<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.14)">str</a>, <a class="reference external" href="https://docs.python.org/3/library/typing.html#typing.Any" title="(in Python v3.14)"><em>Any</em></a>]</p>
727 | </dd>
728 | </dl>
729 | </dd></dl>
730 |
731 | <dl class="py method">
732 | <dt class="sig sig-object py" id="mcp_server_webcrawl.crawlers.base.crawler.BaseCrawler.get_initialization_options">
733 | <span class="sig-name descname"><span class="pre">get_initialization_options</span></span><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="reference internal" href="_modules/mcp_server_webcrawl/crawlers/base/crawler.html#BaseCrawler.get_initialization_options"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#mcp_server_webcrawl.crawlers.base.crawler.BaseCrawler.get_initialization_options" title="Link to this definition"></a></dt>
734 | <dd><p>Get the MCP initialization object.</p>
735 | <dl class="field-list simple">
736 | <dt class="field-odd">Returns<span class="colon">:</span></dt>
737 | <dd class="field-odd"><p>Initialization options containing project information</p>
738 | </dd>
739 | <dt class="field-even">Return type<span class="colon">:</span></dt>
740 | <dd class="field-even"><p><em>InitializationOptions</em></p>
741 | </dd>
742 | </dl>
743 | </dd></dl>
744 |
745 | <dl class="py method">
746 | <dt class="sig sig-object py" id="mcp_server_webcrawl.crawlers.base.crawler.BaseCrawler.get_sites_api_json">
747 | <span class="sig-name descname"><span class="pre">get_sites_api_json</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="o"><span class="pre">**</span></span><span class="n"><span class="pre">kwargs</span></span></em><span class="sig-paren">)</span><a class="reference internal" href="_modules/mcp_server_webcrawl/crawlers/base/crawler.html#BaseCrawler.get_sites_api_json"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#mcp_server_webcrawl.crawlers.base.crawler.BaseCrawler.get_sites_api_json" title="Link to this definition"></a></dt>
748 | <dd><p>Get sites API result as JSON.</p>
749 | <dl class="field-list simple">
750 | <dt class="field-odd">Returns<span class="colon">:</span></dt>
751 | <dd class="field-odd"><p>JSON string of sites API results</p>
752 | </dd>
753 | <dt class="field-even">Return type<span class="colon">:</span></dt>
754 | <dd class="field-even"><p><a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.14)">str</a></p>
755 | </dd>
756 | </dl>
757 | </dd></dl>
758 |
759 | <dl class="py method">
760 | <dt class="sig sig-object py" id="mcp_server_webcrawl.crawlers.base.crawler.BaseCrawler.get_resources_api_json">
761 | <span class="sig-name descname"><span class="pre">get_resources_api_json</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="o"><span class="pre">**</span></span><span class="n"><span class="pre">kwargs</span></span></em><span class="sig-paren">)</span><a class="reference internal" href="_modules/mcp_server_webcrawl/crawlers/base/crawler.html#BaseCrawler.get_resources_api_json"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#mcp_server_webcrawl.crawlers.base.crawler.BaseCrawler.get_resources_api_json" title="Link to this definition"></a></dt>
762 | <dd><p>Get resources API result as JSON.</p>
763 | <dl class="field-list simple">
764 | <dt class="field-odd">Returns<span class="colon">:</span></dt>
765 | <dd class="field-odd"><p>JSON string of resources API results</p>
766 | </dd>
767 | <dt class="field-even">Return type<span class="colon">:</span></dt>
768 | <dd class="field-even"><p><a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.14)">str</a></p>
769 | </dd>
770 | </dl>
771 | </dd></dl>
772 |
773 | <dl class="py method">
774 | <dt class="sig sig-object py" id="mcp_server_webcrawl.crawlers.base.crawler.BaseCrawler.get_sites_api">
775 | <span class="sig-name descname"><span class="pre">get_sites_api</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">ids</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">fields</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">None</span></span></em><span class="sig-paren">)</span><a class="reference internal" href="_modules/mcp_server_webcrawl/crawlers/base/crawler.html#BaseCrawler.get_sites_api"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#mcp_server_webcrawl.crawlers.base.crawler.BaseCrawler.get_sites_api" title="Link to this definition"></a></dt>
776 | <dd><dl class="field-list simple">
777 | <dt class="field-odd">Parameters<span class="colon">:</span></dt>
778 | <dd class="field-odd"><ul class="simple">
779 | <li><p><strong>ids</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#list" title="(in Python v3.14)"><em>list</em></a><em>[</em><a class="reference external" href="https://docs.python.org/3/library/functions.html#int" title="(in Python v3.14)"><em>int</em></a><em>] </em><em>| </em><em>None</em>) – </p></li>
780 | <li><p><strong>fields</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#list" title="(in Python v3.14)"><em>list</em></a><em>[</em><a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.14)"><em>str</em></a><em>] </em><em>| </em><em>None</em>) – </p></li>
781 | </ul>
782 | </dd>
783 | <dt class="field-even">Return type<span class="colon">:</span></dt>
784 | <dd class="field-even"><p><a class="reference internal" href="#mcp_server_webcrawl.crawlers.base.api.BaseJsonApi" title="mcp_server_webcrawl.crawlers.base.api.BaseJsonApi"><em>BaseJsonApi</em></a></p>
785 | </dd>
786 | </dl>
787 | </dd></dl>
788 |
789 | <dl class="py method">
790 | <dt class="sig sig-object py" id="mcp_server_webcrawl.crawlers.base.crawler.BaseCrawler.get_resources_api">
791 | <span class="sig-name descname"><span class="pre">get_resources_api</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">sites</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">query</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">''</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">fields</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">sort</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">limit</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">20</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">offset</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">0</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">extras</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">extrasRegex</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">extrasXpath</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">None</span></span></em><span class="sig-paren">)</span><a class="reference internal" 
href="_modules/mcp_server_webcrawl/crawlers/base/crawler.html#BaseCrawler.get_resources_api"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#mcp_server_webcrawl.crawlers.base.crawler.BaseCrawler.get_resources_api" title="Link to this definition"></a></dt>
792 | <dd><dl class="field-list simple">
793 | <dt class="field-odd">Parameters<span class="colon">:</span></dt>
794 | <dd class="field-odd"><ul class="simple">
795 | <li><p><strong>sites</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#list" title="(in Python v3.14)"><em>list</em></a><em>[</em><a class="reference external" href="https://docs.python.org/3/library/functions.html#int" title="(in Python v3.14)"><em>int</em></a><em>] </em><em>| </em><em>None</em>) – </p></li>
796 | <li><p><strong>query</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.14)"><em>str</em></a>) – </p></li>
797 | <li><p><strong>fields</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#list" title="(in Python v3.14)"><em>list</em></a><em>[</em><a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.14)"><em>str</em></a><em>] </em><em>| </em><em>None</em>) – </p></li>
798 | <li><p><strong>sort</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.14)"><em>str</em></a><em> | </em><em>None</em>) – </p></li>
799 | <li><p><strong>limit</strong> (<a class="reference external" href="https://docs.python.org/3/library/functions.html#int" title="(in Python v3.14)"><em>int</em></a>) – </p></li>
800 | <li><p><strong>offset</strong> (<a class="reference external" href="https://docs.python.org/3/library/functions.html#int" title="(in Python v3.14)"><em>int</em></a>) – </p></li>
801 | <li><p><strong>extras</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#list" title="(in Python v3.14)"><em>list</em></a><em>[</em><a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.14)"><em>str</em></a><em>] </em><em>| </em><em>None</em>) – </p></li>
802 | <li><p><strong>extrasRegex</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#list" title="(in Python v3.14)"><em>list</em></a><em>[</em><a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.14)"><em>str</em></a><em>] </em><em>| </em><em>None</em>) – </p></li>
803 | <li><p><strong>extrasXpath</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#list" title="(in Python v3.14)"><em>list</em></a><em>[</em><a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.14)"><em>str</em></a><em>] </em><em>| </em><em>None</em>) – </p></li>
804 | </ul>
805 | </dd>
806 | <dt class="field-even">Return type<span class="colon">:</span></dt>
807 | <dd class="field-even"><p><a class="reference internal" href="#mcp_server_webcrawl.crawlers.base.api.BaseJsonApi" title="mcp_server_webcrawl.crawlers.base.api.BaseJsonApi"><em>BaseJsonApi</em></a></p>
808 | </dd>
809 | </dl>
810 | </dd></dl>
811 |
812 | <dl class="py method">
813 | <dt class="sig sig-object py" id="mcp_server_webcrawl.crawlers.base.crawler.BaseCrawler.mcp_list_tools">
814 | <em class="property"><span class="pre">async</span><span class="w"> </span></em><span class="sig-name descname"><span class="pre">mcp_list_tools</span></span><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="reference internal" href="_modules/mcp_server_webcrawl/crawlers/base/crawler.html#BaseCrawler.mcp_list_tools"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#mcp_server_webcrawl.crawlers.base.crawler.BaseCrawler.mcp_list_tools" title="Link to this definition"></a></dt>
815 | <dd><p>List available tools.</p>
816 | <dl class="field-list simple">
817 | <dt class="field-odd">Returns<span class="colon">:</span></dt>
818 | <dd class="field-odd"><p>List of available tools</p>
819 | </dd>
820 | <dt class="field-even">Raises<span class="colon">:</span></dt>
821 | <dd class="field-even"><p><a class="reference external" href="https://docs.python.org/3/library/exceptions.html#NotImplementedError" title="(in Python v3.14)"><strong>NotImplementedError</strong></a> – This method must be implemented by subclasses</p>
822 | </dd>
823 | <dt class="field-odd">Return type<span class="colon">:</span></dt>
824 | <dd class="field-odd"><p><a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#list" title="(in Python v3.14)">list</a>[<em>Tool</em>]</p>
825 | </dd>
826 | </dl>
827 | </dd></dl>
828 |
829 | <dl class="py method">
830 | <dt class="sig sig-object py" id="mcp_server_webcrawl.crawlers.base.crawler.BaseCrawler.mcp_call_tool">
831 | <em class="property"><span class="pre">async</span><span class="w"> </span></em><span class="sig-name descname"><span class="pre">mcp_call_tool</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">name</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">arguments</span></span></em><span class="sig-paren">)</span><a class="reference internal" href="_modules/mcp_server_webcrawl/crawlers/base/crawler.html#BaseCrawler.mcp_call_tool"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#mcp_server_webcrawl.crawlers.base.crawler.BaseCrawler.mcp_call_tool" title="Link to this definition"></a></dt>
832 | <dd><p>Handle tool execution requests. Subclasses can override this outright, or call super() and then tweak the result.
833 | By default, it is essentially a passthrough.</p>
834 | <dl class="field-list simple">
835 | <dt class="field-odd">Parameters<span class="colon">:</span></dt>
836 | <dd class="field-odd"><ul class="simple">
837 | <li><p><strong>name</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.14)"><em>str</em></a>) – name of the tool to call</p></li>
838 | <li><p><strong>arguments</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#dict" title="(in Python v3.14)"><em>dict</em></a><em>[</em><a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.14)"><em>str</em></a><em>, </em><a class="reference external" href="https://docs.python.org/3/library/typing.html#typing.Any" title="(in Python v3.14)"><em>Any</em></a><em>] </em><em>| </em><em>None</em>) – arguments to pass to the tool</p></li>
839 | </ul>
840 | </dd>
841 | <dt class="field-even">Returns<span class="colon">:</span></dt>
842 | <dd class="field-even"><p>List of content objects resulting from the tool execution</p>
843 | </dd>
844 | <dt class="field-odd">Raises<span class="colon">:</span></dt>
845 | <dd class="field-odd"><p><a class="reference external" href="https://docs.python.org/3/library/exceptions.html#ValueError" title="(in Python v3.14)"><strong>ValueError</strong></a> – If the specified tool does not exist</p>
846 | </dd>
847 | <dt class="field-even">Return type<span class="colon">:</span></dt>
848 | <dd class="field-even"><p><a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#list" title="(in Python v3.14)">list</a>[<em>TextContent</em> | <em>ImageContent</em> | <em>EmbeddedResource</em>]</p>
849 | </dd>
850 | </dl>
851 | </dd></dl>
852 |
853 | <dl class="py method">
854 | <dt class="sig sig-object py" id="mcp_server_webcrawl.crawlers.base.crawler.BaseCrawler.get_thumbnails">
855 | <span class="sig-name descname"><span class="pre">get_thumbnails</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">results</span></span></em><span class="sig-paren">)</span><a class="reference internal" href="_modules/mcp_server_webcrawl/crawlers/base/crawler.html#BaseCrawler.get_thumbnails"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#mcp_server_webcrawl.crawlers.base.crawler.BaseCrawler.get_thumbnails" title="Link to this definition"></a></dt>
856 | <dd><dl class="field-list simple">
857 | <dt class="field-odd">Parameters<span class="colon">:</span></dt>
858 | <dd class="field-odd"><p><strong>results</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#list" title="(in Python v3.14)"><em>list</em></a><em>[</em><a class="reference internal" href="mcp_server_webcrawl.models.html#mcp_server_webcrawl.models.resources.ResourceResult" title="mcp_server_webcrawl.models.resources.ResourceResult"><em>ResourceResult</em></a><em>]</em>) – </p>
859 | </dd>
860 | <dt class="field-even">Return type<span class="colon">:</span></dt>
861 | <dd class="field-even"><p><a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#list" title="(in Python v3.14)">list</a>[<em>ImageContent</em>]</p>
862 | </dd>
863 | </dl>
864 | </dd></dl>
865 |
866 | </dd></dl>
867 |
868 | </section>
869 | <section id="module-mcp_server_webcrawl.crawlers.base.indexed">
870 | <span id="mcp-server-webcrawl-crawlers-base-indexed-module"></span><h2>mcp_server_webcrawl.crawlers.base.indexed module<a class="headerlink" href="#module-mcp_server_webcrawl.crawlers.base.indexed" title="Link to this heading"></a></h2>
871 | <dl class="py class">
872 | <dt class="sig sig-object py" id="mcp_server_webcrawl.crawlers.base.indexed.IndexedManager">
873 | <em class="property"><span class="pre">class</span><span class="w"> </span></em><span class="sig-name descname"><span class="pre">IndexedManager</span></span><a class="reference internal" href="_modules/mcp_server_webcrawl/crawlers/base/indexed.html#IndexedManager"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#mcp_server_webcrawl.crawlers.base.indexed.IndexedManager" title="Link to this definition"></a></dt>
874 | <dd><p>Bases: <a class="reference internal" href="#mcp_server_webcrawl.crawlers.base.adapter.BaseManager" title="mcp_server_webcrawl.crawlers.base.adapter.BaseManager"><code class="xref py py-class docutils literal notranslate"><span class="pre">BaseManager</span></code></a></p>
875 | <p>Initialize the manager with statistics.</p>
876 | <dl class="py method">
877 | <dt class="sig sig-object py" id="mcp_server_webcrawl.crawlers.base.indexed.IndexedManager.__init__">
878 | <span class="sig-name descname"><span class="pre">__init__</span></span><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="reference internal" href="_modules/mcp_server_webcrawl/crawlers/base/indexed.html#IndexedManager.__init__"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#mcp_server_webcrawl.crawlers.base.indexed.IndexedManager.__init__" title="Link to this definition"></a></dt>
879 | <dd><p>Initialize the manager with statistics.</p>
880 | </dd></dl>
881 |
882 | <dl class="py method">
883 | <dt class="sig sig-object py" id="mcp_server_webcrawl.crawlers.base.indexed.IndexedManager.get_connection">
884 | <span class="sig-name descname"><span class="pre">get_connection</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">group</span></span></em><span class="sig-paren">)</span><a class="reference internal" href="_modules/mcp_server_webcrawl/crawlers/base/indexed.html#IndexedManager.get_connection"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#mcp_server_webcrawl.crawlers.base.indexed.IndexedManager.get_connection" title="Link to this definition"></a></dt>
885 | <dd><p>Get database connection for sites in the group, creating if needed.</p>
886 | <dl class="field-list simple">
887 | <dt class="field-odd">Parameters<span class="colon">:</span></dt>
888 | <dd class="field-odd"><p><strong>group</strong> (<a class="reference internal" href="#mcp_server_webcrawl.crawlers.base.adapter.SitesGroup" title="mcp_server_webcrawl.crawlers.base.adapter.SitesGroup"><em>SitesGroup</em></a>) – group of sites to connect to</p>
889 | </dd>
890 | <dt class="field-even">Returns<span class="colon">:</span></dt>
891 | <dd class="field-even"><p><dl class="simple">
892 | <dt>Tuple of (SQLite connection to in-memory database with data loaded or None if building,</dt><dd><p>IndexState associated with this database)</p>
893 | </dd>
894 | </dl>
895 | </p>
896 | </dd>
897 | <dt class="field-odd">Return type<span class="colon">:</span></dt>
898 | <dd class="field-odd"><p><a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#tuple" title="(in Python v3.14)">tuple</a>[<a class="reference external" href="https://docs.python.org/3/library/sqlite3.html#sqlite3.Connection" title="(in Python v3.14)"><em>Connection</em></a> | None, <a class="reference internal" href="#mcp_server_webcrawl.crawlers.base.adapter.IndexState" title="mcp_server_webcrawl.crawlers.base.adapter.IndexState"><em>IndexState</em></a>]</p>
899 | </dd>
900 | </dl>
901 | </dd></dl>
902 |
903 | <dl class="py method">
904 | <dt class="sig sig-object py" id="mcp_server_webcrawl.crawlers.base.indexed.IndexedManager.get_sites_for_directories">
905 | <span class="sig-name descname"><span class="pre">get_sites_for_directories</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">datasrc</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">ids</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">fields</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">None</span></span></em><span class="sig-paren">)</span><a class="reference internal" href="_modules/mcp_server_webcrawl/crawlers/base/indexed.html#IndexedManager.get_sites_for_directories"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#mcp_server_webcrawl.crawlers.base.indexed.IndexedManager.get_sites_for_directories" title="Link to this definition"></a></dt>
906 | <dd><p>List site directories in the datasrc directory as sites.</p>
907 | <dl class="field-list simple">
908 | <dt class="field-odd">Parameters<span class="colon">:</span></dt>
909 | <dd class="field-odd"><ul class="simple">
910 | <li><p><strong>datasrc</strong> (<a class="reference external" href="https://docs.python.org/3/library/pathlib.html#pathlib.Path" title="(in Python v3.14)"><em>Path</em></a>) – path to the directory containing site subdirectories</p></li>
911 | <li><p><strong>ids</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#list" title="(in Python v3.14)"><em>list</em></a><em>[</em><a class="reference external" href="https://docs.python.org/3/library/functions.html#int" title="(in Python v3.14)"><em>int</em></a><em>] </em><em>| </em><em>None</em>) – optional list of site IDs to filter by</p></li>
912 | <li><p><strong>fields</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#list" title="(in Python v3.14)"><em>list</em></a><em>[</em><a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.14)"><em>str</em></a><em>] </em><em>| </em><em>None</em>) – optional list of fields to include in the response</p></li>
913 | </ul>
914 | </dd>
915 | <dt class="field-even">Returns<span class="colon">:</span></dt>
916 | <dd class="field-even"><p>List of SiteResult objects, one for each site directory</p>
917 | </dd>
918 | <dt class="field-odd">Return type<span class="colon">:</span></dt>
919 | <dd class="field-odd"><p><a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#list" title="(in Python v3.14)">list</a>[<a class="reference internal" href="mcp_server_webcrawl.models.html#mcp_server_webcrawl.models.sites.SiteResult" title="mcp_server_webcrawl.models.sites.SiteResult"><em>SiteResult</em></a>]</p>
920 | </dd>
921 | </dl>
922 | <p class="rubric">Notes</p>
923 | <p>Returns an empty list if the datasrc directory doesn’t exist.</p>
924 | </dd></dl>
925 |
926 | </dd></dl>
927 |
928 | <dl class="py class">
929 | <dt class="sig sig-object py" id="mcp_server_webcrawl.crawlers.base.indexed.IndexedCrawler">
930 | <em class="property"><span class="pre">class</span><span class="w"> </span></em><span class="sig-name descname"><span class="pre">IndexedCrawler</span></span><a class="reference internal" href="_modules/mcp_server_webcrawl/crawlers/base/indexed.html#IndexedCrawler"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#mcp_server_webcrawl.crawlers.base.indexed.IndexedCrawler" title="Link to this definition"></a></dt>
931 | <dd><p>Bases: <a class="reference internal" href="#mcp_server_webcrawl.crawlers.base.crawler.BaseCrawler" title="mcp_server_webcrawl.crawlers.base.crawler.BaseCrawler"><code class="xref py py-class docutils literal notranslate"><span class="pre">BaseCrawler</span></code></a></p>
932 | <p>A crawler implementation for data sources that load into an in-memory sqlite.
933 | Shares commonality between specialized crawlers.</p>
934 | <p>Initialize the IndexedCrawler with a data source path and required adapter functions.</p>
935 | <dl class="field-list simple">
936 | <dt class="field-odd">Parameters<span class="colon">:</span></dt>
937 | <dd class="field-odd"><ul class="simple">
938 | <li><p><strong>datasrc</strong> – path to the data source</p></li>
939 | <li><p><strong>get_sites_func</strong> – function to retrieve sites from the data source</p></li>
940 | <li><p><strong>get_resources_func</strong> – function to retrieve resources from the data source</p></li>
941 | <li><p><strong>resource_field_mapping</strong> – mapping of resource field names to database table/column names</p></li>
942 | </ul>
943 | </dd>
944 | </dl>
945 | <dl class="py method">
946 | <dt class="sig sig-object py" id="mcp_server_webcrawl.crawlers.base.indexed.IndexedCrawler.__init__">
947 | <span class="sig-name descname"><span class="pre">__init__</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">datasrc</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">get_sites_func</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">get_resources_func</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">resource_field_mapping</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">{'content':</span> <span class="pre">'ResourcesFullText.Content',</span> <span class="pre">'created':</span> <span class="pre">'Resources.Created',</span> <span class="pre">'fulltext':</span> <span class="pre">'ResourcesFullText',</span> <span class="pre">'headers':</span> <span class="pre">'ResourcesFullText.Headers',</span> <span class="pre">'id':</span> <span class="pre">'ResourcesFullText.Id',</span> <span class="pre">'modified':</span> <span class="pre">'Resources.Modified',</span> <span class="pre">'site':</span> <span class="pre">'ResourcesFullText.Project',</span> <span class="pre">'size':</span> <span class="pre">'Resources.Size',</span> <span class="pre">'status':</span> <span class="pre">'Resources.Status',</span> <span class="pre">'time':</span> <span class="pre">'Resources.Time',</span> <span class="pre">'type':</span> <span class="pre">'ResourcesFullText.Type',</span> <span class="pre">'url':</span> <span class="pre">'ResourcesFullText.Url'}</span></span></em><span class="sig-paren">)</span><a class="reference internal" href="_modules/mcp_server_webcrawl/crawlers/base/indexed.html#IndexedCrawler.__init__"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#mcp_server_webcrawl.crawlers.base.indexed.IndexedCrawler.__init__" title="Link to this definition"></a></dt>
948 | <dd><p>Initialize the IndexedCrawler with a data source path and required adapter functions.</p>
949 | <dl class="field-list simple">
950 | <dt class="field-odd">Parameters<span class="colon">:</span></dt>
951 | <dd class="field-odd"><ul class="simple">
952 | <li><p><strong>datasrc</strong> (<a class="reference external" href="https://docs.python.org/3/library/pathlib.html#pathlib.Path" title="(in Python v3.14)"><em>Path</em></a>) – path to the data source</p></li>
953 | <li><p><strong>get_sites_func</strong> (<a class="reference external" href="https://docs.python.org/3/library/typing.html#typing.Callable" title="(in Python v3.14)"><em>Callable</em></a>) – function to retrieve sites from the data source</p></li>
954 | <li><p><strong>get_resources_func</strong> (<a class="reference external" href="https://docs.python.org/3/library/typing.html#typing.Callable" title="(in Python v3.14)"><em>Callable</em></a>) – function to retrieve resources from the data source</p></li>
955 | <li><p><strong>resource_field_mapping</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#dict" title="(in Python v3.14)"><em>dict</em></a><em>[</em><a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.14)"><em>str</em></a><em>, </em><a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.14)"><em>str</em></a><em>]</em>) – mapping of resource field names to database table/column names</p></li>
956 | </ul>
957 | </dd>
958 | <dt class="field-even">Return type<span class="colon">:</span></dt>
959 | <dd class="field-even"><p>None</p>
960 | </dd>
961 | </dl>
962 | </dd></dl>
963 |
964 | <dl class="py method">
965 | <dt class="sig sig-object py" id="mcp_server_webcrawl.crawlers.base.indexed.IndexedCrawler.mcp_list_tools">
966 | <em class="property"><span class="pre">async</span><span class="w"> </span></em><span class="sig-name descname"><span class="pre">mcp_list_tools</span></span><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="reference internal" href="_modules/mcp_server_webcrawl/crawlers/base/indexed.html#IndexedCrawler.mcp_list_tools"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#mcp_server_webcrawl.crawlers.base.indexed.IndexedCrawler.mcp_list_tools" title="Link to this definition"></a></dt>
967 | <dd><p>List available tools for this crawler.</p>
968 | <dl class="field-list simple">
969 | <dt class="field-odd">Returns<span class="colon">:</span></dt>
970 | <dd class="field-odd"><p>List of Tool objects</p>
971 | </dd>
972 | <dt class="field-even">Return type<span class="colon">:</span></dt>
973 | <dd class="field-even"><p><a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#list" title="(in Python v3.14)">list</a>[<em>Tool</em>]</p>
974 | </dd>
975 | </dl>
976 | </dd></dl>
977 |
978 | </dd></dl>
979 |
980 | </section>
981 | <section id="module-mcp_server_webcrawl.crawlers.base.tests">
982 | <span id="mcp-server-webcrawl-crawlers-base-tests-module"></span><h2>mcp_server_webcrawl.crawlers.base.tests module<a class="headerlink" href="#module-mcp_server_webcrawl.crawlers.base.tests" title="Link to this heading"></a></h2>
983 | <dl class="py class">
984 | <dt class="sig sig-object py" id="mcp_server_webcrawl.crawlers.base.tests.BaseCrawlerTests">
985 | <em class="property"><span class="pre">class</span><span class="w"> </span></em><span class="sig-name descname"><span class="pre">BaseCrawlerTests</span></span><a class="reference internal" href="_modules/mcp_server_webcrawl/crawlers/base/tests.html#BaseCrawlerTests"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#mcp_server_webcrawl.crawlers.base.tests.BaseCrawlerTests" title="Link to this definition"></a></dt>
986 | <dd><p>Bases: <code class="xref py py-class docutils literal notranslate"><span class="pre">TestCase</span></code></p>
987 | <p>Create an instance of the class that will use the named test
988 | method when executed. Raises a ValueError if the instance does
989 | not have a method with the specified name.</p>
990 | <dl class="py method">
991 | <dt class="sig sig-object py" id="mcp_server_webcrawl.crawlers.base.tests.BaseCrawlerTests.setUp">
992 | <span class="sig-name descname"><span class="pre">setUp</span></span><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="reference internal" href="_modules/mcp_server_webcrawl/crawlers/base/tests.html#BaseCrawlerTests.setUp"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#mcp_server_webcrawl.crawlers.base.tests.BaseCrawlerTests.setUp" title="Link to this definition"></a></dt>
993 | <dd><p>Hook method for setting up the test fixture before exercising it.</p>
994 | </dd></dl>
995 |
996 | <dl class="py method">
997 | <dt class="sig sig-object py" id="mcp_server_webcrawl.crawlers.base.tests.BaseCrawlerTests.run_pragmar_search_tests">
998 | <span class="sig-name descname"><span class="pre">run_pragmar_search_tests</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">crawler</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">site_id</span></span></em><span class="sig-paren">)</span><a class="reference internal" href="_modules/mcp_server_webcrawl/crawlers/base/tests.html#BaseCrawlerTests.run_pragmar_search_tests"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#mcp_server_webcrawl.crawlers.base.tests.BaseCrawlerTests.run_pragmar_search_tests" title="Link to this definition"></a></dt>
999 | <dd><p>Run a battery of database checks and Boolean validation on the crawler.</p>
1000 | <dl class="field-list simple">
1001 | <dt class="field-odd">Parameters<span class="colon">:</span></dt>
1002 | <dd class="field-odd"><ul class="simple">
1003 | <li><p><strong>crawler</strong> (<a class="reference internal" href="#mcp_server_webcrawl.crawlers.base.crawler.BaseCrawler" title="mcp_server_webcrawl.crawlers.base.crawler.BaseCrawler"><em>BaseCrawler</em></a>) – </p></li>
1004 | <li><p><strong>site_id</strong> (<a class="reference external" href="https://docs.python.org/3/library/functions.html#int" title="(in Python v3.14)"><em>int</em></a>) – </p></li>
1005 | </ul>
1006 | </dd>
1007 | </dl>
1008 | </dd></dl>
1009 |
1010 | <dl class="py method">
1011 | <dt class="sig sig-object py" id="mcp_server_webcrawl.crawlers.base.tests.BaseCrawlerTests.run_pragmar_image_tests">
1012 | <span class="sig-name descname"><span class="pre">run_pragmar_image_tests</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">crawler</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">pragmar_site_id</span></span></em><span class="sig-paren">)</span><a class="reference internal" href="_modules/mcp_server_webcrawl/crawlers/base/tests.html#BaseCrawlerTests.run_pragmar_image_tests"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#mcp_server_webcrawl.crawlers.base.tests.BaseCrawlerTests.run_pragmar_image_tests" title="Link to this definition"></a></dt>
1013 | <dd><p>Test InterroBot-specific image handling and thumbnails.</p>
1014 | <dl class="field-list simple">
1015 | <dt class="field-odd">Parameters<span class="colon">:</span></dt>
1016 | <dd class="field-odd"><ul class="simple">
1017 | <li><p><strong>crawler</strong> (<a class="reference internal" href="#mcp_server_webcrawl.crawlers.base.crawler.BaseCrawler" title="mcp_server_webcrawl.crawlers.base.crawler.BaseCrawler"><em>BaseCrawler</em></a>) – </p></li>
1018 | <li><p><strong>pragmar_site_id</strong> (<a class="reference external" href="https://docs.python.org/3/library/functions.html#int" title="(in Python v3.14)"><em>int</em></a>) – </p></li>
1019 | </ul>
1020 | </dd>
1021 | </dl>
1022 | </dd></dl>
1023 |
1024 | <dl class="py method">
1025 | <dt class="sig sig-object py" id="mcp_server_webcrawl.crawlers.base.tests.BaseCrawlerTests.run_sites_resources_tests">
1026 | <span class="sig-name descname"><span class="pre">run_sites_resources_tests</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">crawler</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">pragmar_site_id</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">example_site_id</span></span></em><span class="sig-paren">)</span><a class="reference internal" href="_modules/mcp_server_webcrawl/crawlers/base/tests.html#BaseCrawlerTests.run_sites_resources_tests"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#mcp_server_webcrawl.crawlers.base.tests.BaseCrawlerTests.run_sites_resources_tests" title="Link to this definition"></a></dt>
1027 | <dd><dl class="field-list simple">
1028 | <dt class="field-odd">Parameters<span class="colon">:</span></dt>
1029 | <dd class="field-odd"><ul class="simple">
1030 | <li><p><strong>crawler</strong> (<a class="reference internal" href="#mcp_server_webcrawl.crawlers.base.crawler.BaseCrawler" title="mcp_server_webcrawl.crawlers.base.crawler.BaseCrawler"><em>BaseCrawler</em></a>) – </p></li>
1031 | <li><p><strong>pragmar_site_id</strong> (<a class="reference external" href="https://docs.python.org/3/library/functions.html#int" title="(in Python v3.14)"><em>int</em></a>) – </p></li>
1032 | <li><p><strong>example_site_id</strong> (<a class="reference external" href="https://docs.python.org/3/library/functions.html#int" title="(in Python v3.14)"><em>int</em></a>) – </p></li>
1033 | </ul>
1034 | </dd>
1035 | </dl>
1036 | </dd></dl>
1037 |
1038 | <dl class="py method">
1039 | <dt class="sig sig-object py" id="mcp_server_webcrawl.crawlers.base.tests.BaseCrawlerTests.run_pragmar_tokenizer_tests">
1040 | <span class="sig-name descname"><span class="pre">run_pragmar_tokenizer_tests</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">crawler</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">site_id</span></span></em><span class="sig-paren">)</span><a class="reference internal" href="_modules/mcp_server_webcrawl/crawlers/base/tests.html#BaseCrawlerTests.run_pragmar_tokenizer_tests"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#mcp_server_webcrawl.crawlers.base.tests.BaseCrawlerTests.run_pragmar_tokenizer_tests" title="Link to this definition"></a></dt>
1041 | <dd><p>FTS hyphens and underscores are particularly challenging, and thus
1042 | have a dedicated test. These must be configured in multiple places,
1043 | including the CREATE TABLE … tokenizer, as well as handled by the query
1044 | parser.</p>
1045 | <dl class="field-list simple">
1046 | <dt class="field-odd">Parameters<span class="colon">:</span></dt>
1047 | <dd class="field-odd"><ul class="simple">
1048 | <li><p><strong>crawler</strong> (<a class="reference internal" href="#mcp_server_webcrawl.crawlers.base.crawler.BaseCrawler" title="mcp_server_webcrawl.crawlers.base.crawler.BaseCrawler"><em>BaseCrawler</em></a>) – </p></li>
1049 | <li><p><strong>site_id</strong> (<a class="reference external" href="https://docs.python.org/3/library/functions.html#int" title="(in Python v3.14)"><em>int</em></a>) – </p></li>
1050 | </ul>
1051 | </dd>
1052 | </dl>
1053 | </dd></dl>
1054 |
1055 | <dl class="py method">
1056 | <dt class="sig sig-object py" id="mcp_server_webcrawl.crawlers.base.tests.BaseCrawlerTests.run_pragmar_site_tests">
1057 | <span class="sig-name descname"><span class="pre">run_pragmar_site_tests</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">crawler</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">site_id</span></span></em><span class="sig-paren">)</span><a class="reference internal" href="_modules/mcp_server_webcrawl/crawlers/base/tests.html#BaseCrawlerTests.run_pragmar_site_tests"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#mcp_server_webcrawl.crawlers.base.tests.BaseCrawlerTests.run_pragmar_site_tests" title="Link to this definition"></a></dt>
1058 | <dd><dl class="field-list simple">
1059 | <dt class="field-odd">Parameters<span class="colon">:</span></dt>
1060 | <dd class="field-odd"><ul class="simple">
1061 | <li><p><strong>crawler</strong> (<a class="reference internal" href="#mcp_server_webcrawl.crawlers.base.crawler.BaseCrawler" title="mcp_server_webcrawl.crawlers.base.crawler.BaseCrawler"><em>BaseCrawler</em></a>) – </p></li>
1062 | <li><p><strong>site_id</strong> (<a class="reference external" href="https://docs.python.org/3/library/functions.html#int" title="(in Python v3.14)"><em>int</em></a>) – </p></li>
1063 | </ul>
1064 | </dd>
1065 | </dl>
1066 | </dd></dl>
1067 |
1068 | <dl class="py method">
1069 | <dt class="sig sig-object py" id="mcp_server_webcrawl.crawlers.base.tests.BaseCrawlerTests.run_pragmar_sort_tests">
1070 | <span class="sig-name descname"><span class="pre">run_pragmar_sort_tests</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">crawler</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">site_id</span></span></em><span class="sig-paren">)</span><a class="reference internal" href="_modules/mcp_server_webcrawl/crawlers/base/tests.html#BaseCrawlerTests.run_pragmar_sort_tests"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#mcp_server_webcrawl.crawlers.base.tests.BaseCrawlerTests.run_pragmar_sort_tests" title="Link to this definition"></a></dt>
1071 | <dd><p>Test sorting functionality with performance optimizations.</p>
1072 | <dl class="field-list simple">
1073 | <dt class="field-odd">Parameters<span class="colon">:</span></dt>
1074 | <dd class="field-odd"><ul class="simple">
1075 | <li><p><strong>crawler</strong> (<a class="reference internal" href="#mcp_server_webcrawl.crawlers.base.crawler.BaseCrawler" title="mcp_server_webcrawl.crawlers.base.crawler.BaseCrawler"><em>BaseCrawler</em></a>) – </p></li>
1076 | <li><p><strong>site_id</strong> (<a class="reference external" href="https://docs.python.org/3/library/functions.html#int" title="(in Python v3.14)"><em>int</em></a>) – </p></li>
1077 | </ul>
1078 | </dd>
1079 | </dl>
1080 | </dd></dl>
1081 |
1082 | <dl class="py method">
1083 | <dt class="sig sig-object py" id="mcp_server_webcrawl.crawlers.base.tests.BaseCrawlerTests.run_pragmar_content_tests">
1084 | <span class="sig-name descname"><span class="pre">run_pragmar_content_tests</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">crawler</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">site_id</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">html_leniency</span></span></em><span class="sig-paren">)</span><a class="reference internal" href="_modules/mcp_server_webcrawl/crawlers/base/tests.html#BaseCrawlerTests.run_pragmar_content_tests"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#mcp_server_webcrawl.crawlers.base.tests.BaseCrawlerTests.run_pragmar_content_tests" title="Link to this definition"></a></dt>
1085 | <dd><dl class="field-list simple">
1086 | <dt class="field-odd">Parameters<span class="colon">:</span></dt>
1087 | <dd class="field-odd"><ul class="simple">
1088 | <li><p><strong>crawler</strong> (<a class="reference internal" href="#mcp_server_webcrawl.crawlers.base.crawler.BaseCrawler" title="mcp_server_webcrawl.crawlers.base.crawler.BaseCrawler"><em>BaseCrawler</em></a>) – </p></li>
1089 | <li><p><strong>site_id</strong> (<a class="reference external" href="https://docs.python.org/3/library/functions.html#int" title="(in Python v3.14)"><em>int</em></a>) – </p></li>
1090 | <li><p><strong>html_leniency</strong> (<a class="reference external" href="https://docs.python.org/3/library/functions.html#bool" title="(in Python v3.14)"><em>bool</em></a>) – </p></li>
1091 | </ul>
1092 | </dd>
1093 | </dl>
1094 | </dd></dl>
1095 |
1096 | <dl class="py method">
1097 | <dt class="sig sig-object py" id="mcp_server_webcrawl.crawlers.base.tests.BaseCrawlerTests.run_pragmar_report">
1098 | <span class="sig-name descname"><span class="pre">run_pragmar_report</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">crawler</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">site_id</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">heading</span></span></em><span class="sig-paren">)</span><a class="reference internal" href="_modules/mcp_server_webcrawl/crawlers/base/tests.html#BaseCrawlerTests.run_pragmar_report"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#mcp_server_webcrawl.crawlers.base.tests.BaseCrawlerTests.run_pragmar_report" title="Link to this definition"></a></dt>
1099 | <dd><p>Generate a comprehensive report of all resources for a site.
1100 | Returns a formatted string with counts and URLs by type.</p>
1101 | <dl class="field-list simple">
1102 | <dt class="field-odd">Parameters<span class="colon">:</span></dt>
1103 | <dd class="field-odd"><ul class="simple">
1104 | <li><p><strong>crawler</strong> (<a class="reference internal" href="#mcp_server_webcrawl.crawlers.base.crawler.BaseCrawler" title="mcp_server_webcrawl.crawlers.base.crawler.BaseCrawler"><em>BaseCrawler</em></a>) – </p></li>
1105 | <li><p><strong>site_id</strong> (<a class="reference external" href="https://docs.python.org/3/library/functions.html#int" title="(in Python v3.14)"><em>int</em></a>) – </p></li>
1106 | <li><p><strong>heading</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.14)"><em>str</em></a>) – </p></li>
1107 | </ul>
1108 | </dd>
1109 | </dl>
1110 | </dd></dl>
1111 |
1112 | </dd></dl>
1113 |
1114 | </section>
1115 | <section id="module-mcp_server_webcrawl.crawlers.base">
1116 | <span id="module-contents"></span><h2>Module contents<a class="headerlink" href="#module-mcp_server_webcrawl.crawlers.base" title="Link to this heading"></a></h2>
1117 | </section>
1118 | </section>
1119 |
1120 |
1121 | </div>
1122 | </div>
1123 | <footer><div class="rst-footer-buttons" role="navigation" aria-label="Footer">
1124 | <a href="mcp_server_webcrawl.crawlers.html" class="btn btn-neutral float-left" title="mcp_server_webcrawl.crawlers package" accesskey="p" rel="prev"><span class="fa fa-arrow-circle-left" aria-hidden="true"></span> Previous</a>
1125 | <a href="mcp_server_webcrawl.crawlers.archivebox.html" class="btn btn-neutral float-right" title="mcp_server_webcrawl.crawlers.archivebox package" accesskey="n" rel="next">Next <span class="fa fa-arrow-circle-right" aria-hidden="true"></span></a>
1126 | </div>
1127 |
1128 | <hr/>
1129 |
1130 | <div role="contentinfo">
1131 | <p>© Copyright 2025, pragmar.</p>
1132 | </div>
1133 |
1134 | Built with <a href="https://www.sphinx-doc.org/">Sphinx</a> using a
1135 | <a href="https://github.com/readthedocs/sphinx_rtd_theme">theme</a>
1136 | provided by <a href="https://readthedocs.org">Read the Docs</a>.
1137 |
1138 |
1139 | </footer>
1140 | </div>
1141 | </div>
1142 | </section>
1143 | </div>
1144 | <script>
1145 | jQuery(function () {
1146 | SphinxRtdTheme.Navigation.enable(true);
1147 | });
1148 | </script>
1149 |
1150 | </body>
1151 | </html>
```