This is page 1 of 33. Use http://codebase.md/pragmar/mcp_server_webcrawl?page={x} to view the full context.
# Directory Structure
```
├── .gitignore
├── CONTRIBUTING.md
├── docs
│ ├── _images
│ │ ├── interactive.document.webp
│ │ ├── interactive.search.webp
│ │ └── mcpswc.svg
│ ├── _modules
│ │ ├── index.html
│ │ ├── mcp_server_webcrawl
│ │ │ ├── crawlers
│ │ │ │ ├── archivebox
│ │ │ │ │ ├── adapter.html
│ │ │ │ │ ├── crawler.html
│ │ │ │ │ └── tests.html
│ │ │ │ ├── base
│ │ │ │ │ ├── adapter.html
│ │ │ │ │ ├── api.html
│ │ │ │ │ ├── crawler.html
│ │ │ │ │ ├── indexed.html
│ │ │ │ │ └── tests.html
│ │ │ │ ├── httrack
│ │ │ │ │ ├── adapter.html
│ │ │ │ │ ├── crawler.html
│ │ │ │ │ └── tests.html
│ │ │ │ ├── interrobot
│ │ │ │ │ ├── adapter.html
│ │ │ │ │ ├── crawler.html
│ │ │ │ │ └── tests.html
│ │ │ │ ├── katana
│ │ │ │ │ ├── adapter.html
│ │ │ │ │ ├── crawler.html
│ │ │ │ │ └── tests.html
│ │ │ │ ├── siteone
│ │ │ │ │ ├── adapter.html
│ │ │ │ │ ├── crawler.html
│ │ │ │ │ └── tests.html
│ │ │ │ ├── warc
│ │ │ │ │ ├── adapter.html
│ │ │ │ │ ├── crawler.html
│ │ │ │ │ └── tests.html
│ │ │ │ └── wget
│ │ │ │ ├── adapter.html
│ │ │ │ ├── crawler.html
│ │ │ │ └── tests.html
│ │ │ ├── crawlers.html
│ │ │ ├── extras
│ │ │ │ ├── markdown.html
│ │ │ │ ├── regex.html
│ │ │ │ ├── snippets.html
│ │ │ │ ├── thumbnails.html
│ │ │ │ └── xpath.html
│ │ │ ├── interactive
│ │ │ │ ├── highlights.html
│ │ │ │ ├── search.html
│ │ │ │ ├── session.html
│ │ │ │ └── ui.html
│ │ │ ├── main.html
│ │ │ ├── models
│ │ │ │ ├── resources.html
│ │ │ │ └── sites.html
│ │ │ ├── templates
│ │ │ │ └── tests.html
│ │ │ ├── utils
│ │ │ │ ├── blobs.html
│ │ │ │ ├── cli.html
│ │ │ │ ├── logger.html
│ │ │ │ ├── querycache.html
│ │ │ │ ├── server.html
│ │ │ │ └── tools.html
│ │ │ └── utils.html
│ │ └── re.html
│ ├── _sources
│ │ ├── guides
│ │ │ ├── archivebox.rst.txt
│ │ │ ├── httrack.rst.txt
│ │ │ ├── interrobot.rst.txt
│ │ │ ├── katana.rst.txt
│ │ │ ├── siteone.rst.txt
│ │ │ ├── warc.rst.txt
│ │ │ └── wget.rst.txt
│ │ ├── guides.rst.txt
│ │ ├── index.rst.txt
│ │ ├── installation.rst.txt
│ │ ├── interactive.rst.txt
│ │ ├── mcp_server_webcrawl.crawlers.archivebox.rst.txt
│ │ ├── mcp_server_webcrawl.crawlers.base.rst.txt
│ │ ├── mcp_server_webcrawl.crawlers.httrack.rst.txt
│ │ ├── mcp_server_webcrawl.crawlers.interrobot.rst.txt
│ │ ├── mcp_server_webcrawl.crawlers.katana.rst.txt
│ │ ├── mcp_server_webcrawl.crawlers.rst.txt
│ │ ├── mcp_server_webcrawl.crawlers.siteone.rst.txt
│ │ ├── mcp_server_webcrawl.crawlers.warc.rst.txt
│ │ ├── mcp_server_webcrawl.crawlers.wget.rst.txt
│ │ ├── mcp_server_webcrawl.extras.rst.txt
│ │ ├── mcp_server_webcrawl.interactive.rst.txt
│ │ ├── mcp_server_webcrawl.models.rst.txt
│ │ ├── mcp_server_webcrawl.rst.txt
│ │ ├── mcp_server_webcrawl.templates.rst.txt
│ │ ├── mcp_server_webcrawl.utils.rst.txt
│ │ ├── modules.rst.txt
│ │ ├── prompts.rst.txt
│ │ └── usage.rst.txt
│ ├── _static
│ │ ├── _sphinx_javascript_frameworks_compat.js
│ │ ├── basic.css
│ │ ├── css
│ │ │ ├── badge_only.css
│ │ │ ├── fonts
│ │ │ │ ├── fontawesome-webfont.eot
│ │ │ │ ├── fontawesome-webfont.svg
│ │ │ │ ├── fontawesome-webfont.ttf
│ │ │ │ ├── fontawesome-webfont.woff
│ │ │ │ ├── fontawesome-webfont.woff2
│ │ │ │ ├── lato-bold-italic.woff
│ │ │ │ ├── lato-bold-italic.woff2
│ │ │ │ ├── lato-bold.woff
│ │ │ │ ├── lato-bold.woff2
│ │ │ │ ├── lato-normal-italic.woff
│ │ │ │ ├── lato-normal-italic.woff2
│ │ │ │ ├── lato-normal.woff
│ │ │ │ ├── lato-normal.woff2
│ │ │ │ ├── Roboto-Slab-Bold.woff
│ │ │ │ ├── Roboto-Slab-Bold.woff2
│ │ │ │ ├── Roboto-Slab-Regular.woff
│ │ │ │ └── Roboto-Slab-Regular.woff2
│ │ │ └── theme.css
│ │ ├── doctools.js
│ │ ├── documentation_options.js
│ │ ├── file.png
│ │ ├── fonts
│ │ │ ├── Lato
│ │ │ │ ├── lato-bold.eot
│ │ │ │ ├── lato-bold.ttf
│ │ │ │ ├── lato-bold.woff
│ │ │ │ ├── lato-bold.woff2
│ │ │ │ ├── lato-bolditalic.eot
│ │ │ │ ├── lato-bolditalic.ttf
│ │ │ │ ├── lato-bolditalic.woff
│ │ │ │ ├── lato-bolditalic.woff2
│ │ │ │ ├── lato-italic.eot
│ │ │ │ ├── lato-italic.ttf
│ │ │ │ ├── lato-italic.woff
│ │ │ │ ├── lato-italic.woff2
│ │ │ │ ├── lato-regular.eot
│ │ │ │ ├── lato-regular.ttf
│ │ │ │ ├── lato-regular.woff
│ │ │ │ └── lato-regular.woff2
│ │ │ └── RobotoSlab
│ │ │ ├── roboto-slab-v7-bold.eot
│ │ │ ├── roboto-slab-v7-bold.ttf
│ │ │ ├── roboto-slab-v7-bold.woff
│ │ │ ├── roboto-slab-v7-bold.woff2
│ │ │ ├── roboto-slab-v7-regular.eot
│ │ │ ├── roboto-slab-v7-regular.ttf
│ │ │ ├── roboto-slab-v7-regular.woff
│ │ │ └── roboto-slab-v7-regular.woff2
│ │ ├── images
│ │ │ ├── interactive.document.png
│ │ │ ├── interactive.document.webp
│ │ │ ├── interactive.search.png
│ │ │ ├── interactive.search.webp
│ │ │ └── mcpswc.svg
│ │ ├── jquery.js
│ │ ├── js
│ │ │ ├── badge_only.js
│ │ │ ├── theme.js
│ │ │ └── versions.js
│ │ ├── language_data.js
│ │ ├── minus.png
│ │ ├── plus.png
│ │ ├── pygments.css
│ │ ├── searchtools.js
│ │ └── sphinx_highlight.js
│ ├── .buildinfo
│ ├── .nojekyll
│ ├── genindex.html
│ ├── guides
│ │ ├── archivebox.html
│ │ ├── httrack.html
│ │ ├── interrobot.html
│ │ ├── katana.html
│ │ ├── siteone.html
│ │ ├── warc.html
│ │ └── wget.html
│ ├── guides.html
│ ├── index.html
│ ├── installation.html
│ ├── interactive.html
│ ├── mcp_server_webcrawl.crawlers.archivebox.html
│ ├── mcp_server_webcrawl.crawlers.base.html
│ ├── mcp_server_webcrawl.crawlers.html
│ ├── mcp_server_webcrawl.crawlers.httrack.html
│ ├── mcp_server_webcrawl.crawlers.interrobot.html
│ ├── mcp_server_webcrawl.crawlers.katana.html
│ ├── mcp_server_webcrawl.crawlers.siteone.html
│ ├── mcp_server_webcrawl.crawlers.warc.html
│ ├── mcp_server_webcrawl.crawlers.wget.html
│ ├── mcp_server_webcrawl.extras.html
│ ├── mcp_server_webcrawl.html
│ ├── mcp_server_webcrawl.interactive.html
│ ├── mcp_server_webcrawl.models.html
│ ├── mcp_server_webcrawl.templates.html
│ ├── mcp_server_webcrawl.utils.html
│ ├── modules.html
│ ├── objects.inv
│ ├── prompts.html
│ ├── py-modindex.html
│ ├── search.html
│ ├── searchindex.js
│ └── usage.html
├── LICENSE
├── MANIFEST.in
├── prompts
│ ├── audit404.md
│ ├── auditfiles.md
│ ├── auditperf.md
│ ├── auditseo.md
│ ├── gopher.md
│ ├── README.md
│ └── testsearch.md
├── pyproject.toml
├── README.md
├── setup.py
├── sphinx
│ ├── _static
│ │ └── images
│ │ ├── interactive.document.png
│ │ ├── interactive.document.webp
│ │ ├── interactive.search.png
│ │ ├── interactive.search.webp
│ │ └── mcpswc.svg
│ ├── _templates
│ │ └── layout.html
│ ├── conf.py
│ ├── guides
│ │ ├── archivebox.rst
│ │ ├── httrack.rst
│ │ ├── interrobot.rst
│ │ ├── katana.rst
│ │ ├── siteone.rst
│ │ ├── warc.rst
│ │ └── wget.rst
│ ├── guides.rst
│ ├── index.rst
│ ├── installation.rst
│ ├── interactive.rst
│ ├── make.bat
│ ├── Makefile
│ ├── mcp_server_webcrawl.crawlers.archivebox.rst
│ ├── mcp_server_webcrawl.crawlers.base.rst
│ ├── mcp_server_webcrawl.crawlers.httrack.rst
│ ├── mcp_server_webcrawl.crawlers.interrobot.rst
│ ├── mcp_server_webcrawl.crawlers.katana.rst
│ ├── mcp_server_webcrawl.crawlers.rst
│ ├── mcp_server_webcrawl.crawlers.siteone.rst
│ ├── mcp_server_webcrawl.crawlers.warc.rst
│ ├── mcp_server_webcrawl.crawlers.wget.rst
│ ├── mcp_server_webcrawl.extras.rst
│ ├── mcp_server_webcrawl.interactive.rst
│ ├── mcp_server_webcrawl.models.rst
│ ├── mcp_server_webcrawl.rst
│ ├── mcp_server_webcrawl.templates.rst
│ ├── mcp_server_webcrawl.utils.rst
│ ├── modules.rst
│ ├── prompts.rst
│ ├── readme.txt
│ └── usage.rst
└── src
└── mcp_server_webcrawl
├── __init__.py
├── crawlers
│ ├── __init__.py
│ ├── archivebox
│ │ ├── __init__.py
│ │ ├── adapter.py
│ │ ├── crawler.py
│ │ └── tests.py
│ ├── base
│ │ ├── __init__.py
│ │ ├── adapter.py
│ │ ├── api.py
│ │ ├── crawler.py
│ │ ├── indexed.py
│ │ └── tests.py
│ ├── httrack
│ │ ├── __init__.py
│ │ ├── adapter.py
│ │ ├── crawler.py
│ │ └── tests.py
│ ├── interrobot
│ │ ├── __init__.py
│ │ ├── adapter.py
│ │ ├── crawler.py
│ │ └── tests.py
│ ├── katana
│ │ ├── __init__.py
│ │ ├── adapter.py
│ │ ├── crawler.py
│ │ └── tests.py
│ ├── siteone
│ │ ├── __init__.py
│ │ ├── adapter.py
│ │ ├── crawler.py
│ │ └── tests.py
│ ├── warc
│ │ ├── __init__.py
│ │ ├── adapter.py
│ │ ├── crawler.py
│ │ └── tests.py
│ └── wget
│ ├── __init__.py
│ ├── adapter.py
│ ├── crawler.py
│ └── tests.py
├── extras
│ ├── __init__.py
│ ├── markdown.py
│ ├── regex.py
│ ├── snippets.py
│ ├── thumbnails.py
│ └── xpath.py
├── interactive
│ ├── __init__.py
│ ├── highlights.py
│ ├── search.py
│ ├── session.py
│ ├── ui.py
│ └── views
│ ├── base.py
│ ├── document.py
│ ├── help.py
│ ├── requirements.py
│ ├── results.py
│ └── searchform.py
├── main.py
├── models
│ ├── __init__.py
│ ├── base.py
│ ├── resources.py
│ └── sites.py
├── settings.py
├── templates
│ ├── __init__.py
│ ├── markdown.xslt
│ ├── tests_core.html
│ └── tests.py
└── utils
├── __init__.py
├── cli.py
├── logger.py
├── parser.py
├── parsetab.py
├── search.py
├── server.py
├── tests.py
└── tools.py
```
# Files
--------------------------------------------------------------------------------
/docs/.nojekyll:
--------------------------------------------------------------------------------
```
```
--------------------------------------------------------------------------------
/docs/.buildinfo:
--------------------------------------------------------------------------------
```
# Sphinx build info version 1
# This file hashes the configuration used when building these files. When it is not found, a full rebuild will be done.
config: fb2a90e507460b57acda26968ed00c09
tags: 645f666f9bcd5a90fca523b33c5a78b7
```
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
```
venv/
env/
.env/
.venv/
temp/
__pycache__/
*.py[cod]
*$py.class
*.log
*.log.txt
.vscode/
.DS_Store
.pytest_cache/
*.egg-info/
dist/
build/
fixtures-report.log
prompts/archive/
fixtures/
settings_local.py
*.db-shm
*.db-wal
sphinx/_build/
sphinx/build/
sphinx/api/
sphinx/_static/images/generated/
*.doctree
*.pickle
*.mo
.doctrees/
**/__pycache__/
.coverage
parser.out
htmlcov/
soho.nascom.nasa.com/
*.7z
*.diff
*instructions.txt
README_pypi.md
```
--------------------------------------------------------------------------------
/sphinx/readme.txt:
--------------------------------------------------------------------------------
```
# to build docs
mcp_server_webcrawl>
sphinx-apidoc -o sphinx src/mcp_server_webcrawl
sphinx-build -b html ./sphinx ./docs
```
--------------------------------------------------------------------------------
/prompts/README.md:
--------------------------------------------------------------------------------
```markdown
# Prompt Routines
**mcp-server-webcrawl** provides the toolkit necessary to search web crawl data freestyle, figuring it out as you go, reacting to each query. This is what it was designed for.
It is also capable of running routines (as prompts). You can write these yourself, or use the ones provided. These prompts are **copy and paste**, used as raw Markdown. They are enabled by the advanced search provided to the LLM; queries and logic can be embedded in a procedural set of instructions, or even an input loop, as is the case with the Gopher interface.
If you want to shortcut the site selection (one less query), paste the Markdown and, in the same request, type "run pasted for [site name or URL]." It will figure it out. When pasted without additional context, you will be prompted to select a site (if no site is already in context).
| Prompt | Download | Category | Description |
|--------|----------|----------|-------------|
|🔍 **SEO Audit** | [`auditseo.md`](https://raw.githubusercontent.com/pragmar/mcp-server-webcrawl/master/prompts/auditseo.md) | audit | Technical SEO (search engine optimization) analysis. Covers the basics, with options to dive deeper. |
|🔗 **404 Audit** | [`audit404.md`](https://raw.githubusercontent.com/pragmar/mcp-server-webcrawl/master/prompts/audit404.md) | audit | Broken link detection and pattern analysis. Not only finds issues, but suggests fixes. |
|⚡ **Performance Audit** | [`auditperf.md`](https://raw.githubusercontent.com/pragmar/mcp-server-webcrawl/master/prompts/auditperf.md) | audit | Website speed and optimization analysis. Real talk. |
|📁 **File Audit** | [`auditfiles.md`](https://raw.githubusercontent.com/pragmar/mcp-server-webcrawl/master/prompts/auditfiles.md) | audit | File organization and asset analysis. Discover the composition of your website. |
|🌐 **Gopher Interface** | [`gopher.md`](https://raw.githubusercontent.com/pragmar/mcp-server-webcrawl/master/prompts/gopher.md) | interface | An old-fashioned search interface inspired by the Gopher clients of yesteryear. |
|⚙️ **Search Test** | [`testsearch.md`](https://raw.githubusercontent.com/pragmar/mcp-server-webcrawl/master/prompts/testsearch.md) | self-test | A battery of tests to check for Boolean logical inconsistencies in the search query parser and subsequent FTS5 conversion. |
```
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
```markdown
<p align="center">
<img src="sphinx/_static/images/mcpswc.svg" alt="MCP Server Webcrawl" width="60%">
</p>
<p align="center">
<a href="https://pragmar.com/mcp-server-webcrawl/" style="margin: 0 10px;">Website</a> |
<a href="https://github.com/pragmar/mcp-server-webcrawl" style="margin: 0 10px;">GitHub</a> |
<a href="https://pragmar.github.io/mcp-server-webcrawl/" style="margin: 0 10px;">Docs</a> |
<a href="https://pypi.org/project/mcp-server-webcrawl/" style="margin: 0 10px;">PyPi</a>
</p>
# mcp-server-webcrawl
Advanced search and retrieval for web crawler data. With **mcp-server-webcrawl**, your AI client filters and analyzes web content under your direction or autonomously. The server includes a fulltext search interface with boolean support, and resource filtering by type, HTTP status, and more.
**mcp-server-webcrawl** gives the LLM a complete menu with which to search, and works with a variety of web crawlers:
| Crawler/Format | Description | Platforms | Setup Guide |
|---|---|---|---|
| [**ArchiveBox**][1] | Web archiving tool | macOS/Linux | [Setup Guide][8] |
| [**HTTrack**][2] | GUI mirroring tool | macOS/Windows/Linux | [Setup Guide][9] |
| [**InterroBot**][3] | GUI crawler and analyzer | macOS/Windows/Linux | [Setup Guide][10] |
| [**Katana**][4] | CLI security-focused crawler | macOS/Windows/Linux | [Setup Guide][11] |
| [**SiteOne**][5] | GUI crawler and analyzer | macOS/Windows/Linux | [Setup Guide][12] |
| [**WARC**][6] | Standard web archive format | varies by client | [Setup Guide][13] |
| [**wget**][7] | CLI website mirroring tool | macOS/Linux | [Setup Guide][14] |
[1]: https://archivebox.io
[2]: https://github.com/xroche/httrack
[3]: https://interro.bot
[4]: https://github.com/projectdiscovery/katana
[5]: https://crawler.siteone.io
[6]: https://en.wikipedia.org/wiki/WARC_(file_format)
[7]: https://en.wikipedia.org/wiki/Wget
[8]: https://pragmar.github.io/mcp-server-webcrawl/guides/archivebox.html
[9]: https://pragmar.github.io/mcp-server-webcrawl/guides/httrack.html
[10]: https://pragmar.github.io/mcp-server-webcrawl/guides/interrobot.html
[11]: https://pragmar.github.io/mcp-server-webcrawl/guides/katana.html
[12]: https://pragmar.github.io/mcp-server-webcrawl/guides/siteone.html
[13]: https://pragmar.github.io/mcp-server-webcrawl/guides/warc.html
[14]: https://pragmar.github.io/mcp-server-webcrawl/guides/wget.html
**mcp-server-webcrawl** is free and open source, and requires Claude Desktop and Python (>=3.10). It is installed from the command line via pip:
```bash
pip install mcp-server-webcrawl
```
For step-by-step MCP server setup, refer to the [Setup Guides](https://pragmar.github.io/mcp-server-webcrawl/guides.html).
## Features
* Claude Desktop ready
* Multi-crawler compatible
* Filter by type, status, and more
* Boolean search support
* Support for Markdown and snippets
* Roll your own website knowledgebase
## Prompt Routines
**mcp-server-webcrawl** provides the toolkit necessary to search web crawl data freestyle, figuring it out as you go, reacting to each query. This is what it was designed for.
It is also capable of running routines (as prompts). You can write these yourself, or use the ones provided. These prompts are **copy and paste**, used as raw Markdown. They are enabled by the advanced search provided to the LLM; queries and logic can be embedded in a procedural set of instructions, or even an input loop, as is the case with the Gopher interface.
| Prompt | Download | Category | Description |
|--------|----------|----------|-------------|
|🔍 **SEO Audit** | [`auditseo.md`](https://raw.githubusercontent.com/pragmar/mcp-server-webcrawl/master/prompts/auditseo.md) | audit | Technical SEO (search engine optimization) analysis. Covers the basics, with options to dive deeper. |
|🔗 **404 Audit** | [`audit404.md`](https://raw.githubusercontent.com/pragmar/mcp-server-webcrawl/master/prompts/audit404.md) | audit | Broken link detection and pattern analysis. Not only finds issues, but suggests fixes. |
|⚡ **Performance Audit** | [`auditperf.md`](https://raw.githubusercontent.com/pragmar/mcp-server-webcrawl/master/prompts/auditperf.md) | audit | Website speed and optimization analysis. Real talk. |
|📁 **File Audit** | [`auditfiles.md`](https://raw.githubusercontent.com/pragmar/mcp-server-webcrawl/master/prompts/auditfiles.md) | audit | File organization and asset analysis. Discover the composition of your website. |
|🌐 **Gopher Interface** | [`gopher.md`](https://raw.githubusercontent.com/pragmar/mcp-server-webcrawl/master/prompts/gopher.md) | interface | An old-fashioned search interface inspired by the Gopher clients of yesteryear. |
|⚙️ **Search Test** | [`testsearch.md`](https://raw.githubusercontent.com/pragmar/mcp-server-webcrawl/master/prompts/testsearch.md) | self-test | A battery of tests to check for Boolean logical inconsistencies in the search query parser and subsequent FTS5 conversion. |
If you want to shortcut the site selection (one less query), paste the Markdown and, in the same request, type "run pasted for [site name or URL]." It will figure it out. When pasted without additional context, you should be prompted to select from a list of crawled sites.
## Boolean Search Syntax
The query engine supports field-specific (`field: value`) searches and complex boolean expressions. Fulltext is supported as a combination of the url, content, and headers fields.
While the API interface is designed to be consumed by the LLM directly, it can be helpful to familiarize yourself with the search syntax. Searches generated by the LLM are inspectable, but generally collapsed in the UI. If you need to see the query, expand the MCP collapsible.
**Example Queries**
| Query Example | Description |
|--------------|-------------|
| privacy | fulltext single keyword match |
| "privacy policy" | fulltext match exact phrase |
| boundar* | fulltext wildcard matches results starting with *boundar* (boundary, boundaries) |
| id: 12345 | id field matches a specific resource by ID |
| url: example.com/somedir | url field matches results with URL containing example.com/somedir |
| type: html | type field matches for HTML pages only |
| status: 200 | status field matches specific HTTP status codes (equal to 200) |
| status: >=400 | status field matches specific HTTP status code (greater than or equal to 400) |
| content: h1 | content field matches content (HTTP response body, often, but not always HTML) |
| headers: text/xml | headers field matches HTTP response headers |
| privacy AND policy | fulltext matches both |
| privacy OR policy | fulltext matches either |
| policy NOT privacy | fulltext matches policies not containing privacy |
| (login OR signin) AND form | fulltext matches login or signin, together with form |
| type: html AND status: 200 | matches only HTML pages with HTTP success |
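Under the hood, queries like these are parsed and converted to SQLite FTS5 (as noted in the Search Test prompt description above). The sketch below is illustrative only, not the server's actual parser or schema; it simply runs a few of the boolean forms from the table against a throwaway in-memory FTS5 index, assuming your Python's SQLite build includes FTS5.
```python
# Illustrative only: a throwaway FTS5 index, not the server's schema or query parser.
import sqlite3

connection = sqlite3.connect(":memory:")
connection.execute("CREATE VIRTUAL TABLE demo USING fts5(url, content)")
connection.executemany(
    "INSERT INTO demo (url, content) VALUES (?, ?)",
    [
        ("https://example.com/privacy", "our privacy policy explains data handling"),
        ("https://example.com/login", "sign in to your account using the login form"),
    ],
)
# the same boolean forms shown in the table above
for query in ("privacy AND policy", "(login OR signin) AND form", "policy NOT privacy"):
    matches = connection.execute("SELECT url FROM demo WHERE demo MATCH ?", (query,)).fetchall()
    print(query, "->", [url for (url,) in matches])
```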
## Field Search Definitions
Field search provides search precision, allowing you to specify which columns of the search index to filter. Rather than searching the entire content, you can restrict your query to specific attributes like URLs, headers, or content body. This approach improves efficiency when looking for specific attributes or patterns within crawl data.
| Field | Description |
|-------|-------------|
| id | database ID |
| url | resource URL |
| type | enumerated list of types (see types table) |
| size | file size in bytes |
| status | HTTP response codes |
| headers | HTTP response headers |
| content | HTTP body—HTML, CSS, JS, and more |
## Field Content
A subset of fields can be independently requested with results, while core fields are always on. Use of headers and content can consume tokens quickly; use them judiciously, or use extras to crunch more results into the context window. Fields are a top-level argument, independent of any field search taking place in the query.
| Field | Description |
|-------|-------------|
| id | always available |
| url | always available |
| type | always available |
| status | always available |
| created | on request |
| modified | on request |
| size | on request |
| headers | on request |
| content | on request |
## Content Types
Crawls contain resource types beyond HTML pages. The `type:` field search allows filtering by broad content type groups, particularly useful when filtering images without complex extension queries. For example, you might search for `type: html NOT content: login` to find pages without "login," or `type: img` to analyze image resources. The table below lists all supported content types in the search system.
| Type | Description |
|------|-------------|
| html | webpages |
| iframe | iframes |
| img | web images |
| audio | web audio files |
| video | web video files |
| font | web font files |
| style | CSS stylesheets |
| script | JavaScript files |
| rss | RSS syndication feeds |
| text | plain text content |
| pdf | PDF files |
| doc | MS Word documents |
| other | uncategorized |
## Extras
The `extras` parameter provides additional processing options, transforming HTTP data (markdown, snippets, regex, xpath), or connecting the LLM to external data (thumbnails). These options can be combined as needed to achieve the desired result format.
| Extra | Description |
|-------|-------------|
| thumbnails | Generates base64 encoded images to be viewed and analyzed by AI models. Enables image description, content analysis, and visual understanding while keeping token output minimal. Works with images, which can be filtered using `type: img` in queries. SVG is not supported. |
| markdown | Provides the HTML content field as concise Markdown, reducing token usage and improving readability for LLMs. Works with HTML, which can be filtered using `type: html` in queries. |
| regex | Extracts regular expression matches from crawled files such as HTML, CSS, JavaScript, etc. Not as precise a tool as XPath for HTML, but supports any text file as a data source. One or more regex patterns can be requested, using the `extrasRegex` argument. |
| snippets | Matches fulltext queries to contextual keyword usage within the content. When used without requesting the content field (or markdown extra), it can provide an efficient means of refining a search without pulling down the complete page contents. Also great for rendering old school hit-highlighted results as a list, like Google search in 1999. Works with HTML, CSS, JS, or any text-based, crawled file. |
| xpath | Extracts XPath selector data, used in scraping HTML content. Use XPath's text() selector for text-only results; element selectors return outerHTML. Only supported with `type: html`; other types will be ignored. One or more XPath selectors (//h1, count(//h1), etc.) can be requested using the `extrasXpath` argument. |
Extras provide a means of producing token-efficient HTTP content responses. Markdown produces roughly 1/3 the bytes of the source HTML, snippets are generally 500 or so bytes per result, and XPath can be as specific or broad as you choose. The more focused your requests, the more results you can fit into your LLM session.
The idea, of course, is that the LLM takes care of this for you. If you notice your LLM developing an affinity for the "content" field (full HTML), a nudge in chat to budget tokens using the extras feature should be all that is needed.
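For a concrete sense of what the xpath extra returns, here is a minimal lxml sketch (lxml is already a project dependency). The HTML and selectors are made-up examples; the server applies the same idea to crawled `type: html` resources.
```python
# Minimal sketch of XPath-style extraction with lxml; example HTML and selectors only.
from lxml import html

page = html.fromstring("<html><body><h1>Privacy Policy</h1><p>Details here.</p></body></html>")
print(page.xpath("//h1/text()"))   # text() selectors return text only: ['Privacy Policy']
print([html.tostring(el, encoding="unicode") for el in page.xpath("//h1")])  # element selectors return outerHTML
print(page.xpath("count(//h1)"))   # aggregate selectors such as count() also work: 1.0
```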
## Interactive Mode
**No AI, just classic Boolean search of your web-archives in a terminal.**
mcp-server-webcrawl can double as a terminal search for your web archives. You can run it against your local archives, but it gets more interesting when you realize you can ssh into any remote host and view archives sitting on that host. No downloads, syncs, multifactor logins, or other common drudgery required. With interactive mode, you can be in and searching a crawl sitting on a remote server in no time at all.
Launch with --crawler and --datasrc to search immediately, or set up the datasrc and crawler in-app.
```bash
mcp-server-webcrawl --crawler wget --datasrc /path/to/datasrc --interactive
# or manually enter crawler and datasrc in the UI
mcp-server-webcrawl --interactive
```
Interactive mode is a way to search through tranches of crawled data, whenever, wherever... in a terminal.

```
--------------------------------------------------------------------------------
/CONTRIBUTING.md:
--------------------------------------------------------------------------------
```markdown
# Contributing
Thank you for your interest in contributing.
This project is still finding its footing, and I'm keeping things simple for now.
## The Process
**Start with an issue.** Before writing code, open an issue to discuss the problem being addressed.
This saves everyone time and ensures we're aligned.
## What I'm Looking For
* Bugs with clear reproduction steps
* LLM API hangups that I may be unaware of (fixations, etc. leading to extra queries or misleading results)
* New and exciting prompts utilizing the embedded search method
## Guidelines
* This is evolving as I figure out what works
* No additional package requirements, please
* No new crawlers without viable long-term support (forking is encouraged!)
* Follow existing code style
* Include tests where appropriate
* Keep PRs focused and reasonably sized
I'm currently pushing to master; it's been a solo effort as of this writing. If and when a higher-level git workflow is required, I'll reorganize the repo into something more community-oriented.
Questions? Open an issue and let's talk.
```
--------------------------------------------------------------------------------
/src/mcp_server_webcrawl/crawlers/archivebox/__init__.py:
--------------------------------------------------------------------------------
```python
```
--------------------------------------------------------------------------------
/src/mcp_server_webcrawl/crawlers/base/__init__.py:
--------------------------------------------------------------------------------
```python
```
--------------------------------------------------------------------------------
/src/mcp_server_webcrawl/crawlers/httrack/__init__.py:
--------------------------------------------------------------------------------
```python
```
--------------------------------------------------------------------------------
/src/mcp_server_webcrawl/crawlers/interrobot/__init__.py:
--------------------------------------------------------------------------------
```python
```
--------------------------------------------------------------------------------
/src/mcp_server_webcrawl/crawlers/katana/__init__.py:
--------------------------------------------------------------------------------
```python
```
--------------------------------------------------------------------------------
/src/mcp_server_webcrawl/crawlers/siteone/__init__.py:
--------------------------------------------------------------------------------
```python
```
--------------------------------------------------------------------------------
/src/mcp_server_webcrawl/crawlers/warc/__init__.py:
--------------------------------------------------------------------------------
```python
```
--------------------------------------------------------------------------------
/src/mcp_server_webcrawl/crawlers/wget/__init__.py:
--------------------------------------------------------------------------------
```python
```
--------------------------------------------------------------------------------
/src/mcp_server_webcrawl/extras/__init__.py:
--------------------------------------------------------------------------------
```python
```
--------------------------------------------------------------------------------
/src/mcp_server_webcrawl/interactive/__init__.py:
--------------------------------------------------------------------------------
```python
```
--------------------------------------------------------------------------------
/src/mcp_server_webcrawl/templates/__init__.py:
--------------------------------------------------------------------------------
```python
```
--------------------------------------------------------------------------------
/sphinx/_templates/layout.html:
--------------------------------------------------------------------------------
```html
{% extends "!layout.html" %}
```
--------------------------------------------------------------------------------
/docs/_sources/modules.rst.txt:
--------------------------------------------------------------------------------
```
mcp_server_webcrawl
===================
.. toctree::
:maxdepth: 4
mcp_server_webcrawl
```
--------------------------------------------------------------------------------
/src/mcp_server_webcrawl/models/__init__.py:
--------------------------------------------------------------------------------
```python
# this is what is acceptable metadata content for crawl results
METADATA_VALUE_TYPE = str | int | float | bool | list[str] | list[int] | list[float] | None
```
--------------------------------------------------------------------------------
/docs/_static/documentation_options.js:
--------------------------------------------------------------------------------
```javascript
const DOCUMENTATION_OPTIONS = {
VERSION: '',
LANGUAGE: 'en',
COLLAPSE_INDEX: false,
BUILDER: 'html',
FILE_SUFFIX: '.html',
LINK_SUFFIX: '.html',
HAS_SOURCE: true,
SOURCELINK_SUFFIX: '.txt',
NAVIGATION_WITH_KEYS: false,
SHOW_SEARCH_SUMMARY: true,
ENABLE_SEARCH_SHORTCUTS: true,
};
```
--------------------------------------------------------------------------------
/docs/_sources/guides.rst.txt:
--------------------------------------------------------------------------------
```
Setup Guides
============
This section contains detailed setup guides for mcp-server-webcrawl in various environments and configurations.
.. toctree::
:maxdepth: 1
:caption: Available Guides:
guides/archivebox
guides/httrack
guides/interrobot
guides/katana
guides/siteone
guides/warc
guides/wget
```
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
```python
import re
from setuptools import setup
# remove centered image that doesn't render for pypi
with open("README.md", "r", encoding="utf-8") as src:
content = src.read()
modified_content = re.sub(r'<p align="center">.*?</p>', '', content, flags=re.DOTALL)
with open("README_pypi.md", "w", encoding="utf-8") as dest:
dest.write(modified_content.strip())
# set up configuration in pyproject.toml
setup()
```
--------------------------------------------------------------------------------
/src/mcp_server_webcrawl/utils/server.py:
--------------------------------------------------------------------------------
```python
import os
import sys
def initialize_mcp_server() -> None:
"""
MCP stdio streams require UTF-8 explicitly set on Windows (default cp1252),
or internationalized content will fail.
"""
if sys.platform == "win32" and os.environ.get("PYTHONIOENCODING") is None:
sys.stdin.reconfigure(encoding="utf-8")
sys.stdout.reconfigure(encoding="utf-8")
sys.stderr.reconfigure(encoding="utf-8")
```
--------------------------------------------------------------------------------
/docs/_sources/mcp_server_webcrawl.templates.rst.txt:
--------------------------------------------------------------------------------
```
mcp\_server\_webcrawl.templates package
=======================================
Submodules
----------
mcp\_server\_webcrawl.templates.tests module
--------------------------------------------
.. automodule:: mcp_server_webcrawl.templates.tests
:members:
:undoc-members:
:show-inheritance:
Module contents
---------------
.. automodule:: mcp_server_webcrawl.templates
:members:
:undoc-members:
:show-inheritance:
```
--------------------------------------------------------------------------------
/src/mcp_server_webcrawl/settings.py:
--------------------------------------------------------------------------------
```python
import logging
from pathlib import Path
# DEBUG overrides LOG_LEVEL
DEBUG: bool = False
DATA_DIRECTORY: Path = Path.home() / ".mcp_server_webcrawl"
# fixtures directory (optional, for running tests)
FIXTURES_DIRECTORY: Path | None = None
# logging.NOTSET will not write to a log file, all other levels will
# LOG_LEVEL: int = logging.ERROR
# LOG_PATH will automatically fallback to DATA_DIRECTORY / mcp-server-webcrawl.log
# LOG_PATH: Path = Path.home() / "Desktop" / "mcpdemo" / "server_log.txt"
try:
from .settings_local import *
except ImportError:
pass
```
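A minimal `settings_local.py` sketch based on the commented defaults above; the values are placeholders, and the file is deliberately excluded from packaging (see pyproject.toml).
```python
# settings_local.py -- placeholder values, adjust to taste
import logging
from pathlib import Path

DEBUG = False
LOG_LEVEL = logging.ERROR                             # logging.NOTSET disables file logging
LOG_PATH = Path.home() / "mcp-server-webcrawl.log"    # placeholder; otherwise falls back to DATA_DIRECTORY
FIXTURES_DIRECTORY = Path.home() / "fixtures"         # placeholder; only needed to run tests
```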
--------------------------------------------------------------------------------
/docs/_sources/mcp_server_webcrawl.crawlers.rst.txt:
--------------------------------------------------------------------------------
```
mcp\_server\_webcrawl.crawlers package
======================================
Subpackages
-----------
.. toctree::
:maxdepth: 2
mcp_server_webcrawl.crawlers.base
mcp_server_webcrawl.crawlers.archivebox
mcp_server_webcrawl.crawlers.httrack
mcp_server_webcrawl.crawlers.interrobot
mcp_server_webcrawl.crawlers.katana
mcp_server_webcrawl.crawlers.siteone
mcp_server_webcrawl.crawlers.warc
mcp_server_webcrawl.crawlers.wget
Module contents
---------------
.. automodule:: mcp_server_webcrawl.crawlers
:members:
:undoc-members:
:show-inheritance:
```
--------------------------------------------------------------------------------
/docs/_sources/mcp_server_webcrawl.models.rst.txt:
--------------------------------------------------------------------------------
```
mcp\_server\_webcrawl.models package
====================================
Submodules
----------
mcp\_server\_webcrawl.models.resources module
---------------------------------------------
.. automodule:: mcp_server_webcrawl.models.resources
:members:
:undoc-members:
:show-inheritance:
mcp\_server\_webcrawl.models.sites module
-----------------------------------------
.. automodule:: mcp_server_webcrawl.models.sites
:members:
:undoc-members:
:show-inheritance:
Module contents
---------------
.. automodule:: mcp_server_webcrawl.models
:members:
:undoc-members:
:show-inheritance:
```
--------------------------------------------------------------------------------
/src/mcp_server_webcrawl/main.py:
--------------------------------------------------------------------------------
```python
from pathlib import Path
from mcp.server.stdio import stdio_server
from mcp_server_webcrawl.crawlers.base.crawler import BaseCrawler
from mcp_server_webcrawl.utils.logger import get_logger, initialize_logger
from mcp_server_webcrawl.utils.server import initialize_mcp_server
logger = get_logger()
async def main(crawler: BaseCrawler, datasrc: Path):
initialize_logger()
initialize_mcp_server()
async with stdio_server() as (read_stream, write_stream):
crawler = crawler(datasrc)
logger.info(f"MCP webcrawl server initialized with adapter {crawler.__class__.__name__}")
logger.info(f"datasrc: {datasrc.absolute()}")
await crawler.serve(read_stream, write_stream)
logger.info("MCP webcrawl server exited")
```
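main() expects a crawler class and a datasrc path, presumably wired up by the CLI (utils/cli.py). A minimal sketch of a direct invocation, with a placeholder path:
```python
# Hypothetical direct invocation of the server; the CLI normally handles this.
import asyncio
from pathlib import Path

from mcp_server_webcrawl.crawlers.wget.crawler import WgetCrawler
from mcp_server_webcrawl.main import main

if __name__ == "__main__":
    asyncio.run(main(WgetCrawler, Path("/path/to/wget/archives")))  # placeholder datasrc
```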
--------------------------------------------------------------------------------
/sphinx/make.bat:
--------------------------------------------------------------------------------
```
@ECHO OFF
pushd %~dp0
REM Command file for Sphinx documentation
if "%SPHINXBUILD%" == "" (
set SPHINXBUILD=sphinx-build
)
set SOURCEDIR=.
set BUILDDIR=_build
%SPHINXBUILD% >NUL 2>NUL
if errorlevel 9009 (
echo.
echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
echo.installed, then set the SPHINXBUILD environment variable to point
echo.to the full path of the 'sphinx-build' executable. Alternatively you
echo.may add the Sphinx directory to PATH.
echo.
echo.If you don't have Sphinx installed, grab it from
echo.https://www.sphinx-doc.org/
exit /b 1
)
if "%1" == "" goto help
%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
goto end
:help
%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
:end
popd
```
--------------------------------------------------------------------------------
/src/mcp_server_webcrawl/crawlers/katana/crawler.py:
--------------------------------------------------------------------------------
```python
from pathlib import Path
from mcp_server_webcrawl.crawlers.base.indexed import IndexedCrawler
from mcp_server_webcrawl.crawlers.katana.adapter import get_sites, get_resources
from mcp_server_webcrawl.utils.logger import get_logger
logger = get_logger()
class KatanaCrawler(IndexedCrawler):
"""
A crawler implementation for HTTP text files.
Provides functionality for accessing and searching web content from captured HTTP exchanges.
"""
def __init__(self, datasrc: Path):
"""
Initialize the HTTP text crawler with a data source directory.
Args:
datasrc: The input argument as Path, it must be a directory containing
subdirectories with HTTP text files
"""
super().__init__(datasrc, get_sites, get_resources)
```
--------------------------------------------------------------------------------
/src/mcp_server_webcrawl/utils/__init__.py:
--------------------------------------------------------------------------------
```python
import re
from datetime import datetime
def to_isoformat_zulu(dt: datetime) -> str:
"""
Convert a datetime to an ISO 8601 string with a Z (Zulu) suffix.
Python <= 3.10 struggles with Z and fractional seconds and will throw;
smooth out the ISO string, second precision isn't key here.
"""
return dt.isoformat().replace("+00:00", "Z")
def from_isoformat_zulu(dt_string: str | None) -> datetime | None:
"""
Convert an ISO 8601 string (Z or offset form) to a datetime, or None if empty.
Python <= 3.10 struggles with Z and fractional seconds and will throw;
smooth out the ISO string, second precision isn't key here.
"""
if not dt_string:
return None
dt_string = dt_string.replace("Z", "+00:00")
match = re.match(r"(.*\.\d{6})\d*([-+]\d{2}:\d{2}|$)", dt_string)
if match:
dt_string = match.group(1) + (match.group(2) or "")
return datetime.fromisoformat(dt_string)
```
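A quick round trip of the two helpers above (example timestamps only):
```python
from datetime import datetime, timezone

from mcp_server_webcrawl.utils import from_isoformat_zulu, to_isoformat_zulu

dt = datetime(2024, 1, 2, 3, 4, 5, tzinfo=timezone.utc)
zulu = to_isoformat_zulu(dt)          # "2024-01-02T03:04:05Z"
assert from_isoformat_zulu(zulu) == dt
# over-long fractional seconds are trimmed to microseconds before parsing
print(from_isoformat_zulu("2024-01-02T03:04:05.1234567Z"))
```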
--------------------------------------------------------------------------------
/docs/_sources/mcp_server_webcrawl.crawlers.warc.rst.txt:
--------------------------------------------------------------------------------
```
mcp\_server\_webcrawl.crawlers.warc package
===========================================
Submodules
----------
mcp\_server\_webcrawl.crawlers.warc.adapter module
--------------------------------------------------
.. automodule:: mcp_server_webcrawl.crawlers.warc.adapter
:members:
:undoc-members:
:show-inheritance:
mcp\_server\_webcrawl.crawlers.warc.crawler module
--------------------------------------------------
.. automodule:: mcp_server_webcrawl.crawlers.warc.crawler
:members:
:undoc-members:
:show-inheritance:
mcp\_server\_webcrawl.crawlers.warc.tests module
------------------------------------------------
.. automodule:: mcp_server_webcrawl.crawlers.warc.tests
:members:
:undoc-members:
:show-inheritance:
Module contents
---------------
.. automodule:: mcp_server_webcrawl.crawlers.warc
:members:
:undoc-members:
:show-inheritance:
```
--------------------------------------------------------------------------------
/docs/_sources/mcp_server_webcrawl.crawlers.wget.rst.txt:
--------------------------------------------------------------------------------
```
mcp\_server\_webcrawl.crawlers.wget package
===========================================
Submodules
----------
mcp\_server\_webcrawl.crawlers.wget.adapter module
--------------------------------------------------
.. automodule:: mcp_server_webcrawl.crawlers.wget.adapter
:members:
:undoc-members:
:show-inheritance:
mcp\_server\_webcrawl.crawlers.wget.crawler module
--------------------------------------------------
.. automodule:: mcp_server_webcrawl.crawlers.wget.crawler
:members:
:undoc-members:
:show-inheritance:
mcp\_server\_webcrawl.crawlers.wget.tests module
------------------------------------------------
.. automodule:: mcp_server_webcrawl.crawlers.wget.tests
:members:
:undoc-members:
:show-inheritance:
Module contents
---------------
.. automodule:: mcp_server_webcrawl.crawlers.wget
:members:
:undoc-members:
:show-inheritance:
```
--------------------------------------------------------------------------------
/docs/_static/js/badge_only.js:
--------------------------------------------------------------------------------
```javascript
!function(e){var t={};function r(n){if(t[n])return t[n].exports;var o=t[n]={i:n,l:!1,exports:{}};return e[n].call(o.exports,o,o.exports,r),o.l=!0,o.exports}r.m=e,r.c=t,r.d=function(e,t,n){r.o(e,t)||Object.defineProperty(e,t,{enumerable:!0,get:n})},r.r=function(e){"undefined"!=typeof Symbol&&Symbol.toStringTag&&Object.defineProperty(e,Symbol.toStringTag,{value:"Module"}),Object.defineProperty(e,"__esModule",{value:!0})},r.t=function(e,t){if(1&t&&(e=r(e)),8&t)return e;if(4&t&&"object"==typeof e&&e&&e.__esModule)return e;var n=Object.create(null);if(r.r(n),Object.defineProperty(n,"default",{enumerable:!0,value:e}),2&t&&"string"!=typeof e)for(var o in e)r.d(n,o,function(t){return e[t]}.bind(null,o));return n},r.n=function(e){var t=e&&e.__esModule?function(){return e.default}:function(){return e};return r.d(t,"a",t),t},r.o=function(e,t){return Object.prototype.hasOwnProperty.call(e,t)},r.p="",r(r.s=4)}({4:function(e,t,r){}});
```
--------------------------------------------------------------------------------
/docs/_sources/mcp_server_webcrawl.crawlers.katana.rst.txt:
--------------------------------------------------------------------------------
```
mcp\_server\_webcrawl.crawlers.katana package
=============================================
Submodules
----------
mcp\_server\_webcrawl.crawlers.katana.adapter module
----------------------------------------------------
.. automodule:: mcp_server_webcrawl.crawlers.katana.adapter
:members:
:undoc-members:
:show-inheritance:
mcp\_server\_webcrawl.crawlers.katana.crawler module
----------------------------------------------------
.. automodule:: mcp_server_webcrawl.crawlers.katana.crawler
:members:
:undoc-members:
:show-inheritance:
mcp\_server\_webcrawl.crawlers.katana.tests module
--------------------------------------------------
.. automodule:: mcp_server_webcrawl.crawlers.katana.tests
:members:
:undoc-members:
:show-inheritance:
Module contents
---------------
.. automodule:: mcp_server_webcrawl.crawlers.katana
:members:
:undoc-members:
:show-inheritance:
```
--------------------------------------------------------------------------------
/docs/_sources/mcp_server_webcrawl.crawlers.httrack.rst.txt:
--------------------------------------------------------------------------------
```
mcp\_server\_webcrawl.crawlers.httrack package
==============================================
Submodules
----------
mcp\_server\_webcrawl.crawlers.httrack.adapter module
-----------------------------------------------------
.. automodule:: mcp_server_webcrawl.crawlers.httrack.adapter
:members:
:undoc-members:
:show-inheritance:
mcp\_server\_webcrawl.crawlers.httrack.crawler module
-----------------------------------------------------
.. automodule:: mcp_server_webcrawl.crawlers.httrack.crawler
:members:
:undoc-members:
:show-inheritance:
mcp\_server\_webcrawl.crawlers.httrack.tests module
---------------------------------------------------
.. automodule:: mcp_server_webcrawl.crawlers.httrack.tests
:members:
:undoc-members:
:show-inheritance:
Module contents
---------------
.. automodule:: mcp_server_webcrawl.crawlers.httrack
:members:
:undoc-members:
:show-inheritance:
```
--------------------------------------------------------------------------------
/docs/_sources/mcp_server_webcrawl.crawlers.siteone.rst.txt:
--------------------------------------------------------------------------------
```
mcp\_server\_webcrawl.crawlers.siteone package
==============================================
Submodules
----------
mcp\_server\_webcrawl.crawlers.siteone.adapter module
-----------------------------------------------------
.. automodule:: mcp_server_webcrawl.crawlers.siteone.adapter
:members:
:undoc-members:
:show-inheritance:
mcp\_server\_webcrawl.crawlers.siteone.crawler module
-----------------------------------------------------
.. automodule:: mcp_server_webcrawl.crawlers.siteone.crawler
:members:
:undoc-members:
:show-inheritance:
mcp\_server\_webcrawl.crawlers.siteone.tests module
---------------------------------------------------
.. automodule:: mcp_server_webcrawl.crawlers.siteone.tests
:members:
:undoc-members:
:show-inheritance:
Module contents
---------------
.. automodule:: mcp_server_webcrawl.crawlers.siteone
:members:
:undoc-members:
:show-inheritance:
```
--------------------------------------------------------------------------------
/docs/_sources/mcp_server_webcrawl.crawlers.archivebox.rst.txt:
--------------------------------------------------------------------------------
```
mcp\_server\_webcrawl.crawlers.archivebox package
=================================================
Submodules
----------
mcp\_server\_webcrawl.crawlers.archivebox.adapter module
--------------------------------------------------------
.. automodule:: mcp_server_webcrawl.crawlers.archivebox.adapter
:members:
:undoc-members:
:show-inheritance:
mcp\_server\_webcrawl.crawlers.archivebox.crawler module
--------------------------------------------------------
.. automodule:: mcp_server_webcrawl.crawlers.archivebox.crawler
:members:
:undoc-members:
:show-inheritance:
mcp\_server\_webcrawl.crawlers.archivebox.tests module
------------------------------------------------------
.. automodule:: mcp_server_webcrawl.crawlers.archivebox.tests
:members:
:undoc-members:
:show-inheritance:
Module contents
---------------
.. automodule:: mcp_server_webcrawl.crawlers.archivebox
:members:
:undoc-members:
:show-inheritance:
```
--------------------------------------------------------------------------------
/docs/_sources/mcp_server_webcrawl.crawlers.interrobot.rst.txt:
--------------------------------------------------------------------------------
```
mcp\_server\_webcrawl.crawlers.interrobot package
=================================================
Submodules
----------
mcp\_server\_webcrawl.crawlers.interrobot.adapter module
--------------------------------------------------------
.. automodule:: mcp_server_webcrawl.crawlers.interrobot.adapter
:members:
:undoc-members:
:show-inheritance:
mcp\_server\_webcrawl.crawlers.interrobot.crawler module
--------------------------------------------------------
.. automodule:: mcp_server_webcrawl.crawlers.interrobot.crawler
:members:
:undoc-members:
:show-inheritance:
mcp\_server\_webcrawl.crawlers.interrobot.tests module
------------------------------------------------------
.. automodule:: mcp_server_webcrawl.crawlers.interrobot.tests
:members:
:undoc-members:
:show-inheritance:
Module contents
---------------
.. automodule:: mcp_server_webcrawl.crawlers.interrobot
:members:
:undoc-members:
:show-inheritance:
```
--------------------------------------------------------------------------------
/docs/_sources/mcp_server_webcrawl.utils.rst.txt:
--------------------------------------------------------------------------------
```
mcp\_server\_webcrawl.utils package
===================================
Submodules
----------
mcp\_server\_webcrawl.utils.cli module
--------------------------------------
.. automodule:: mcp_server_webcrawl.utils.cli
:members:
:undoc-members:
:show-inheritance:
mcp\_server\_webcrawl.utils.logger module
-----------------------------------------
.. automodule:: mcp_server_webcrawl.utils.logger
:members:
:undoc-members:
:show-inheritance:
mcp\_server\_webcrawl.utils.server module
-----------------------------------------
.. automodule:: mcp_server_webcrawl.utils.server
:members:
:undoc-members:
:show-inheritance:
mcp\_server\_webcrawl.utils.tools module
----------------------------------------
.. automodule:: mcp_server_webcrawl.utils.tools
:members:
:undoc-members:
:show-inheritance:
Module contents
---------------
.. automodule:: mcp_server_webcrawl.utils
:members:
:undoc-members:
:show-inheritance:
```
--------------------------------------------------------------------------------
/src/mcp_server_webcrawl/crawlers/warc/crawler.py:
--------------------------------------------------------------------------------
```python
from pathlib import Path
from mcp_server_webcrawl.crawlers.base.indexed import IndexedCrawler
from mcp_server_webcrawl.crawlers.warc.adapter import get_sites, get_resources
from mcp_server_webcrawl.utils.logger import get_logger
logger = get_logger()
class WarcCrawler(IndexedCrawler):
"""
A crawler implementation for WARC (Web ARChive) files.
Provides functionality for accessing and searching web archive content.
"""
def __init__(self, datasrc: Path):
"""
Initialize the WARC crawler with a data source directory.
Supported file types: .txt, .warc, and .warc.gz
Args:
datasrc: the input argument as Path, must be a directory containing WARC files
Raises:
AssertionError: If datasrc is None or not a directory
"""
assert datasrc is not None, f"WarcCrawler needs a datasrc, regardless of action"
assert datasrc.is_dir(), "WarcCrawler datasrc must be a directory"
super().__init__(datasrc, get_sites, get_resources)
```
--------------------------------------------------------------------------------
/src/mcp_server_webcrawl/crawlers/wget/crawler.py:
--------------------------------------------------------------------------------
```python
from pathlib import Path
from mcp_server_webcrawl.crawlers.base.indexed import IndexedCrawler
from mcp_server_webcrawl.crawlers.wget.adapter import get_sites, get_resources
from mcp_server_webcrawl.utils.logger import get_logger
logger = get_logger()
class WgetCrawler(IndexedCrawler):
"""
A crawler implementation for wget captured sites.
Provides functionality for accessing and searching web content from wget captures.
"""
def __init__(self, datasrc: Path):
"""
Initialize the wget crawler with a data source directory.
Args:
datasrc: the input argument as Path, it must be a directory containing
wget captures organized as subdirectories
Raises:
AssertionError: If datasrc is None or not a directory
"""
assert datasrc is not None, f"WgetCrawler needs a datasrc, regardless of action"
assert datasrc.is_dir(), "WgetCrawler datasrc must be a directory"
super().__init__(datasrc, get_sites, get_resources)
```
--------------------------------------------------------------------------------
/docs/_sources/mcp_server_webcrawl.rst.txt:
--------------------------------------------------------------------------------
```
mcp\_server\_webcrawl package
=============================
Subpackages
-----------
.. toctree::
:maxdepth: 2
mcp_server_webcrawl.crawlers
mcp_server_webcrawl.extras
mcp_server_webcrawl.interactive
mcp_server_webcrawl.models
mcp_server_webcrawl.templates
mcp_server_webcrawl.utils
Submodules
----------
mcp\_server\_webcrawl.main module
---------------------------------
.. automodule:: mcp_server_webcrawl.main
:members:
:undoc-members:
:show-inheritance:
:noindex:
mcp\_server\_webcrawl.settings module
-------------------------------------
.. automodule:: mcp_server_webcrawl.settings
:members:
:undoc-members:
:show-inheritance:
mcp\_server\_webcrawl.settings\_local module
--------------------------------------------
.. automodule:: mcp_server_webcrawl.settings_local
:members:
:undoc-members:
:show-inheritance:
Module contents
---------------
.. automodule:: mcp_server_webcrawl
:members:
:undoc-members:
:show-inheritance:
```
--------------------------------------------------------------------------------
/src/mcp_server_webcrawl/models/base.py:
--------------------------------------------------------------------------------
```python
from typing import Union
from datetime import datetime
from pathlib import Path
METADATA_VALUE_TYPE = Union[str, int, float, bool, datetime, Path, dict, list, None]
class BaseModel:
def to_forcefield_dict(self, forcefields: list[str]) -> dict[str, METADATA_VALUE_TYPE]:
"""
Convert the object to a dictionary with specified fields forced to exist.
Creates a dictionary that includes the object's non-None values, ensuring
that all fields in the forcefields list exist, even if null.
Args:
forcefields: list of field names that must appear in the output dictionary
with at least a None value
Returns:
Dictionary containing all non-None object attributes, plus forced fields
set to None if not already present
"""
# None self-annihilates in filter, forcefields can force their existence, as null
result = {}
if forcefields:
result = {k: None for k in forcefields}
result.update(self.to_dict())
return result
```
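A small hypothetical subclass, only to show the forced-field behavior; the `to_dict()` here is made up for illustration.
```python
# Hypothetical model for illustration; real models live in models/resources.py and models/sites.py
from mcp_server_webcrawl.models.base import BaseModel

class ExampleModel(BaseModel):
    def __init__(self, id: int, url: str):
        self.id = id
        self.url = url

    def to_dict(self) -> dict:
        return {"id": self.id, "url": self.url}

example = ExampleModel(1, "https://example.com/")
# "status" was never set, but is forced into the output as None
print(example.to_forcefield_dict(["id", "url", "status"]))
# {'id': 1, 'url': 'https://example.com/', 'status': None}
```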
--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
```toml
[build-system]
requires = ["setuptools>=61.0", "wheel"]
build-backend = "setuptools.build_meta"
[project]
name = "mcp-server-webcrawl"
description = "MCP server for search and retrieval of web crawler content"
readme = "README_pypi.md"
requires-python = ">=3.10"
authors = [
{name = "Ben Caulfield"}
]
classifiers = [
"Programming Language :: Python :: 3",
"Operating System :: OS Independent",
"License :: OSI Approved :: Mozilla Public License 2.0 (MPL 2.0)",
]
dependencies = [
"mcp>=1.3.0",
"lxml>=4.6.0",
"Pillow>=9.0.0",
"aiohttp>=3.8.0",
"warcio>=1.7.0",
"ply==3.11",
"windows-curses; sys_platform == 'win32'"
]
dynamic = ["version"]
[project.scripts]
mcp-server-webcrawl = "mcp_server_webcrawl:main"
[project.urls]
Homepage = "https://pragmar.com/mcp-server-webcrawl/"
[tool.setuptools]
package-dir = {"" = "src"}
[tool.setuptools.packages.find]
where = ["src"]
include = ["mcp_server_webcrawl*"]
exclude = ["*settings_local*"]
[tool.setuptools.dynamic]
version = {attr = "mcp_server_webcrawl.__version__"}
```
--------------------------------------------------------------------------------
/docs/_sources/mcp_server_webcrawl.interactive.rst.txt:
--------------------------------------------------------------------------------
```
mcp\_server\_webcrawl.interactive package
=========================================
Submodules
----------
mcp\_server\_webcrawl.interactive.highlights module
---------------------------------------------------
.. automodule:: mcp_server_webcrawl.interactive.highlights
:members:
:undoc-members:
:show-inheritance:
mcp\_server\_webcrawl.interactive.search module
-----------------------------------------------
.. automodule:: mcp_server_webcrawl.interactive.search
:members:
:undoc-members:
:show-inheritance:
mcp\_server\_webcrawl.interactive.session module
------------------------------------------------
.. automodule:: mcp_server_webcrawl.interactive.session
:members:
:undoc-members:
:show-inheritance:
mcp\_server\_webcrawl.interactive.ui module
-------------------------------------------
.. automodule:: mcp_server_webcrawl.interactive.ui
:members:
:undoc-members:
:show-inheritance:
Module contents
---------------
.. automodule:: mcp_server_webcrawl.interactive
:members:
:undoc-members:
:show-inheritance:
```
--------------------------------------------------------------------------------
/src/mcp_server_webcrawl/crawlers/siteone/crawler.py:
--------------------------------------------------------------------------------
```python
from pathlib import Path
from mcp_server_webcrawl.crawlers.base.indexed import IndexedCrawler
from mcp_server_webcrawl.crawlers.siteone.adapter import get_sites, get_resources
from mcp_server_webcrawl.utils.logger import get_logger
logger = get_logger()
class SiteOneCrawler(IndexedCrawler):
"""
A crawler implementation for SiteOne captured sites.
Provides functionality for accessing and searching web content from SiteOne captures.
SiteOne merges a wget archive with a custom SiteOne-generated log to acquire more
fields than wget can alone.
"""
def __init__(self, datasrc: Path):
"""
Initialize the SiteOne crawler with a data source directory.
Args:
datasrc: The input argument as Path, it must be a directory containing
SiteOne captures organized as subdirectories
Raises:
AssertionError: If datasrc is None or not a directory
"""
assert datasrc is not None, f"SiteOneCrawler needs a datasrc, regardless of action"
assert datasrc.is_dir(), "SiteOneCrawler datasrc must be a directory"
super().__init__(datasrc, get_sites, get_resources)
```
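A minimal usage sketch for the class above; the archive directory path is a placeholder and must point at an existing directory of SiteOne captures:

```python
# minimal sketch; /path/to/siteone/archives is a placeholder directory of SiteOne captures
from pathlib import Path
from mcp_server_webcrawl.crawlers.siteone.crawler import SiteOneCrawler

crawler = SiteOneCrawler(Path("/path/to/siteone/archives"))
```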
--------------------------------------------------------------------------------
/src/mcp_server_webcrawl/crawlers/httrack/crawler.py:
--------------------------------------------------------------------------------
```python
from pathlib import Path
from mcp_server_webcrawl.crawlers.base.indexed import IndexedCrawler
from mcp_server_webcrawl.crawlers.httrack.adapter import get_sites, get_resources
from mcp_server_webcrawl.utils.logger import get_logger
logger = get_logger()
class HtTrackCrawler(IndexedCrawler):
"""
A crawler implementation for HTTrack captured sites.
Provides functionality for accessing and searching web content from HTTrack projects.
HTTrack creates offline mirrors of websites with preserved directory structure
and metadata in hts-log.txt files.
"""
def __init__(self, datasrc: Path):
"""
Initialize the HTTrack crawler with a data source directory.
Args:
datasrc: The input argument as Path, it must be a directory containing
HTTrack project directories, each potentially containing multiple domains
Raises:
AssertionError: If datasrc is None or not a directory
"""
assert datasrc is not None, f"HtTrackCrawler needs a datasrc, regardless of action"
assert datasrc.is_dir(), "HtTrackCrawler datasrc must be a directory"
super().__init__(datasrc, get_sites, get_resources)
```
--------------------------------------------------------------------------------
/docs/_sources/mcp_server_webcrawl.extras.rst.txt:
--------------------------------------------------------------------------------
```
mcp\_server\_webcrawl.extras package
====================================
Submodules
----------
mcp\_server\_webcrawl.extras.markdown module
--------------------------------------------
.. automodule:: mcp_server_webcrawl.extras.markdown
:members:
:undoc-members:
:show-inheritance:
mcp\_server\_webcrawl.extras.regex module
-----------------------------------------
.. automodule:: mcp_server_webcrawl.extras.regex
:members:
:undoc-members:
:show-inheritance:
mcp\_server\_webcrawl.extras.snippets module
--------------------------------------------
.. automodule:: mcp_server_webcrawl.extras.snippets
:members:
:undoc-members:
:show-inheritance:
mcp\_server\_webcrawl.extras.thumbnails module
----------------------------------------------
.. automodule:: mcp_server_webcrawl.extras.thumbnails
:members:
:undoc-members:
:show-inheritance:
mcp\_server\_webcrawl.extras.xpath module
-----------------------------------------
.. automodule:: mcp_server_webcrawl.extras.xpath
:members:
:undoc-members:
:show-inheritance:
Module contents
---------------
.. automodule:: mcp_server_webcrawl.extras
:members:
:undoc-members:
:show-inheritance:
```
--------------------------------------------------------------------------------
/src/mcp_server_webcrawl/crawlers/archivebox/crawler.py:
--------------------------------------------------------------------------------
```python
from pathlib import Path
from mcp_server_webcrawl.crawlers.base.indexed import IndexedCrawler
from mcp_server_webcrawl.crawlers.archivebox.adapter import get_sites, get_resources
from mcp_server_webcrawl.utils.logger import get_logger
logger = get_logger()
class ArchiveBoxCrawler(IndexedCrawler):
"""
A crawler implementation for ArchiveBox archived sites.
Provides functionality for accessing and searching web content from ArchiveBox archives.
ArchiveBox creates single-URL archives with metadata stored in JSON files
and HTML content preserved in index.html files.
"""
def __init__(self, datasrc: Path):
"""
Initialize the ArchiveBox crawler with a data source directory.
Args:
datasrc: The input argument as Path, it must be a directory containing
ArchiveBox archive directories, each containing individual URL entries
Raises:
AssertionError: If datasrc is None or not a directory
"""
assert datasrc is not None, f"ArchiveBoxCrawler needs a datasrc, regardless of action"
assert datasrc.is_dir(), "ArchiveBoxCrawler datasrc must be a directory"
super().__init__(datasrc, get_sites, get_resources)
```
--------------------------------------------------------------------------------
/docs/_sources/mcp_server_webcrawl.crawlers.base.rst.txt:
--------------------------------------------------------------------------------
```
mcp\_server\_webcrawl.crawlers.base package
===========================================
Submodules
----------
mcp\_server\_webcrawl.crawlers.base.adapter module
--------------------------------------------------
.. automodule:: mcp_server_webcrawl.crawlers.base.adapter
:members:
:undoc-members:
:show-inheritance:
mcp\_server\_webcrawl.crawlers.base.api module
----------------------------------------------
.. automodule:: mcp_server_webcrawl.crawlers.base.api
:members:
:undoc-members:
:show-inheritance:
mcp\_server\_webcrawl.crawlers.base.crawler module
--------------------------------------------------
.. automodule:: mcp_server_webcrawl.crawlers.base.crawler
:members:
:undoc-members:
:show-inheritance:
mcp\_server\_webcrawl.crawlers.base.indexed module
--------------------------------------------------
.. automodule:: mcp_server_webcrawl.crawlers.base.indexed
:members:
:undoc-members:
:show-inheritance:
mcp\_server\_webcrawl.crawlers.base.tests module
------------------------------------------------
.. automodule:: mcp_server_webcrawl.crawlers.base.tests
:members:
:undoc-members:
:show-inheritance:
Module contents
---------------
.. automodule:: mcp_server_webcrawl.crawlers.base
:members:
:undoc-members:
:show-inheritance:
```
--------------------------------------------------------------------------------
/sphinx/conf.py:
--------------------------------------------------------------------------------
```python
# to build docs
# mcp_server_webcrawl> sphinx-apidoc -o sphinx src/mcp_server_webcrawl
# mcp_server_webcrawl> cd .\sphinx\
# mcp_server_webcrawl\sphinx> sphinx-build -b html . ../docs
import os
import sys
# add the src directory to sys.path for autodoc discovery
sys.path.insert(0, os.path.abspath("../../src"))
project = "mcp-server-webcrawl"
copyright = "2025, pragmar"
author = "pragmar"
extensions = [
"sphinx.ext.autodoc",
"sphinx.ext.viewcode",
"sphinx.ext.napoleon",
"sphinx.ext.coverage",
"sphinx.ext.intersphinx",
]
templates_path = ["_templates"]
exclude_patterns = ["_build", "Thumbs.db", ".DS_Store", "**/settings_local.py"]
html_theme = "sphinx_rtd_theme"
html_static_path = ["_static"]
html_theme_options = {
"navigation_depth": 2, # This is the key setting to prevent deep nesting
"titles_only": False,
}
# Better autodoc formatting
autodoc_default_options = {
"members": True,
"undoc-members": True,
"show-inheritance": False,
"member-order": "bysource",
}
autodoc_member_order = "groupwise"
autodoc_typehints = "description"
autodoc_class_signature = "separated"
autodoc_mock_imports = ["mcp_server_webcrawl.settings_local"]
add_module_names = False
autoclass_content = "both"
intersphinx_mapping = {
"python": ("https://docs.python.org/3", None),
}
toctree_maxdepth = 2
```
--------------------------------------------------------------------------------
/docs/_sources/interactive.rst.txt:
--------------------------------------------------------------------------------
```
Interactive Mode
================
**No AI, just classic Boolean search of your web archives in a terminal.**
mcp-server-webcrawl can double as a terminal search for your web archives. You can run it against your local archives, but it gets more interesting when you realize you can ssh into any remote host and view archives sitting on that host. No downloads, syncs, multifactor logins, or other common drudgery required. With interactive mode, you can be in and searching a crawl sitting on a remote server in no time at all.
.. raw:: html
<iframe width="560" height="315" style="display: block;margin-bottom:1rem;" src="https://www.youtube.com/embed/8kNkP-zNzs4" frameborder="0" allowfullscreen></iframe>
Usage
-----
Interactive mode exposes the mcp-server-webcrawl search layer as a terminal UI (TUI), bypassing MCP/AI altogether. Core field and Boolean search are supported, along with the human-friendly aspects of the search interface, such as result snippets.
You launch interactive mode from the terminal, using the --interactive command line argument.
.. code-block:: bash
mcp-server-webcrawl --crawler wget --datasrc /path/to/datasrc --interactive
# or manually enter crawler and datasrc in the UI
mcp-server-webcrawl --interactive
Screencaps
----------
.. figure:: _static/images/interactive.search.webp
:alt: mcp-server-webcrawl in --interactive mode heading
:align: center
:width: 100%
Search view, showing snippets with "Solar Eclipse" highlights
.. figure:: _static/images/interactive.document.webp
:alt: mcp-server-webcrawl in --interactive mode heading
:align: center
:width: 100%
   Document presented in Markdown, with raw and HTTP header views available.
```
--------------------------------------------------------------------------------
/docs/_sources/installation.rst.txt:
--------------------------------------------------------------------------------
```
Installation
============
Install the package via pip:
.. code-block:: bash
pip install mcp-server-webcrawl
Requirements
------------
To use mcp-server-webcrawl effectively, you need:
* An MCP-capable LLM host such as Claude Desktop [1]
* Python [2] installed on your command line interface
* Basic familiarity with running Python packages
After ensuring these prerequisites are met, run the pip install command above to add the package to your environment.
MCP Configuration
-----------------
To enable your LLM host to access your web crawl data, you'll need to add an MCP server configuration. From Claude's developer settings, locate the MCP configuration section and add the appropriate configuration for your crawler type.
Setup guides and videos are available for each supported crawler:
* :doc:`ArchiveBox <guides/archivebox>`
* :doc:`HTTrack <guides/httrack>`
* :doc:`InterroBot <guides/interrobot>`
* :doc:`Katana <guides/katana>`
* :doc:`SiteOne <guides/siteone>`
* :doc:`WARC <guides/warc>`
* :doc:`Wget <guides/wget>`
Multiple Configurations
-----------------------
You can set up multiple **mcp-server-webcrawl** connections under the ``mcpServers`` section if you want to access different crawler data sources simultaneously.
.. code-block:: json
{
"mcpServers": {
"webcrawl_warc": {
"command": "/path/to/mcp-server-webcrawl",
"args": ["--crawler", "warc", "--datasrc", "/path/to/warc/archives/"]
},
"webcrawl_wget": {
"command": "/path/to/mcp-server-webcrawl",
"args": ["--crawler", "wget", "--datasrc", "/path/to/wget/archives/"]
}
}
}
After adding the configuration, save the file and restart your LLM host to apply the changes.
References
----------
[1] Claude Desktop: https://claude.ai
[2] Python: https://python.org
```
--------------------------------------------------------------------------------
/src/mcp_server_webcrawl/extras/markdown.py:
--------------------------------------------------------------------------------
```python
import re
from importlib import resources
from typing import Final, Any
from lxml import etree, html
from lxml.etree import ParserError
from logging import Logger
from mcp_server_webcrawl.utils.logger import get_logger
__XSLT_RESULT_CLEANER: Final[re.Pattern] = re.compile(r"(?:\n\s*-\s*\n|\n\s*\n)+")
__RE_HTML: Final[re.Pattern] = re.compile(r"<[a-zA-Z]+[^>]*>")
logger: Logger = get_logger()
class MarkdownTransformer:
"""
Memoizes the XSLT transformer
"""
_xslt_transform = None
@classmethod
def get_xslt_transform(cls):
"""
Get the HTML to text markdown XSLT transformer
"""
if cls._xslt_transform is None:
xslt_string: str = resources.read_text("mcp_server_webcrawl.templates", "markdown.xslt").encode("utf-8")
xslt_doc = etree.fromstring(xslt_string)
cls._xslt_transform = etree.XSLT(xslt_doc)
return cls._xslt_transform
def get_markdown(content: str) -> str | None:
"""
Transform HTML content to Markdown using XSLT.
Args:
content (str): The HTML content to transform.
Returns:
str | None: The transformed Markdown string, or None if the input is empty
or if transformation fails (e.g., due to invalid HTML or XSLT errors).
"""
transformer = MarkdownTransformer.get_xslt_transform()
content:str = content or ""
assert isinstance(content, str), "String (HTML) required for transformer"
assert transformer is not None
if content == "" or not __RE_HTML.search(content):
return None
try:
doc = html.fromstring(content)
result = str(transformer(doc))
result = __XSLT_RESULT_CLEANER.sub("\n\n", result).strip()
return result
except Exception as ex:
logger.warning(f"XSLT transform error: {type(ex).__name__}\n{ex}")
return None
```
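An illustrative call of the get_markdown function above; the HTML fragment is a made-up example, and None is returned for empty or non-HTML input or when the transform fails:

```python
# illustrative call; the HTML fragment is a made-up example
from mcp_server_webcrawl.extras.markdown import get_markdown

fragment = "<html><body><h1>Solar Eclipse</h1><p>Totality at noon.</p></body></html>"
print(get_markdown(fragment))  # Markdown text, or None if the XSLT transform fails
print(get_markdown(""))        # None: empty input short-circuits before parsing
```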
--------------------------------------------------------------------------------
/src/mcp_server_webcrawl/crawlers/__init__.py:
--------------------------------------------------------------------------------
```python
import sys
from pathlib import Path
from mcp_server_webcrawl.settings import FIXTURES_DIRECTORY
VALID_CRAWLER_CHOICES: list[str] = ["archivebox", "httrack", "interrobot", "katana", "siteone", "warc", "wget"]
def get_fixture_directory() -> Path:
# only to be used for devs on test runs, configured in settings_local.py
# settings_local.py added as sibling of settings.py if not present
# download https://github.com/pragmar/mcp-server-webcrawl-fixtures
assert FIXTURES_DIRECTORY is not None and FIXTURES_DIRECTORY.is_dir(), \
f"Fixtures not configured in settings_local.py, or is not a valid directory.\nFIXTURES_DIRECTORY: {FIXTURES_DIRECTORY}"
return FIXTURES_DIRECTORY
def get_crawler(crawler_name: str) -> type | None:
"""
lazy load crawler, some classes have additional package dependencies
"""
if crawler_name is None:
return None
crawler_name = crawler_name.lower()
if crawler_name == "archivebox":
from mcp_server_webcrawl.crawlers.archivebox.crawler import ArchiveBoxCrawler
return ArchiveBoxCrawler
elif crawler_name == "httrack":
from mcp_server_webcrawl.crawlers.httrack.crawler import HtTrackCrawler
return HtTrackCrawler
elif crawler_name == "interrobot":
from mcp_server_webcrawl.crawlers.interrobot.crawler import InterroBotCrawler
return InterroBotCrawler
elif crawler_name == "katana":
from mcp_server_webcrawl.crawlers.katana.crawler import KatanaCrawler
return KatanaCrawler
elif crawler_name == "siteone":
from mcp_server_webcrawl.crawlers.siteone.crawler import SiteOneCrawler
return SiteOneCrawler
elif crawler_name == "warc":
from mcp_server_webcrawl.crawlers.warc.crawler import WarcCrawler
return WarcCrawler
elif crawler_name == "wget":
from mcp_server_webcrawl.crawlers.wget.crawler import WgetCrawler
return WgetCrawler
else:
valid_choices = ", ".join(VALID_CRAWLER_CHOICES)
raise ValueError(f"unsupported crawler '{crawler_name}' ({valid_choices})")
```
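get_crawler returns a crawler class rather than an instance, so callers construct it with their own datasrc path. A minimal sketch, with a placeholder directory:

```python
# minimal sketch; the wget archive directory is a placeholder and must exist
from pathlib import Path
from mcp_server_webcrawl.crawlers import get_crawler

WgetCrawler = get_crawler("wget")   # returns the class; unknown names raise ValueError
crawler = WgetCrawler(Path("/path/to/wget/archives"))  # placeholder directory
```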
--------------------------------------------------------------------------------
/src/mcp_server_webcrawl/extras/xpath.py:
--------------------------------------------------------------------------------
```python
import lxml.html
from lxml import etree
from lxml.etree import ParserError, XPathEvalError, XPathSyntaxError
from logging import Logger
from mcp_server_webcrawl.utils.logger import get_logger
logger: Logger = get_logger()
def get_xpath(content: str, xpaths: list[str]) -> list[dict[str, str | int | float]]:
"""
Takes content and gets xpath hits
Arguments:
content: The HTML source
xpaths: The xpath selectors
Returns:
A list of dicts, with selector and value
"""
if not isinstance(content, str):
return []
if not isinstance(xpaths, list) or not all(isinstance(item, str) for item in xpaths):
raise ValueError("xpaths must be a list of strings")
results = []
if content == "":
return results
try:
doc: lxml.html.HtmlElement = lxml.html.fromstring(content.encode("utf-8"))
except ParserError:
return results
for xpath in xpaths:
try:
selector_result = doc.xpath(xpath)
except (XPathEvalError, XPathSyntaxError) as ex:
logger.warning(f"Invalid xpath '{xpath}': {ex}")
continue
if isinstance(selector_result, (list, tuple)):
# normal xpath query returns a list
for result in selector_result:
# a new dict for each result
xpath_hit: dict[str, str | int | float] = {"selector": xpath}
if hasattr(result, "tag"):
html_string: str = etree.tostring(result, encoding="unicode", method="html")
xpath_hit["value"] = html_string.strip()
else:
xpath_hit["value"] = str(result).strip()
results.append(xpath_hit)
else:
# single value case (count(//h1), sum(), etc.) is also valid xpath
xpath_hit: dict[str, str | int | float] = {"selector": xpath}
if isinstance(selector_result, (int, float)):
xpath_hit["value"] = selector_result
else:
xpath_hit["value"] = str(selector_result).strip()
results.append(xpath_hit)
return results
```
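An illustrative call of the get_xpath function above, mixing an element/text selector with a scalar count() selector; the HTML source is made up:

```python
# illustrative call; the HTML source is a made-up example
from mcp_server_webcrawl.extras.xpath import get_xpath

html_source = "<html><body><h1>Archive</h1><h1>Search</h1></body></html>"
hits = get_xpath(html_source, ["//h1/text()", "count(//h1)"])
# expected shape:
# [{"selector": "//h1/text()", "value": "Archive"},
#  {"selector": "//h1/text()", "value": "Search"},
#  {"selector": "count(//h1)", "value": 2.0}]
```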
--------------------------------------------------------------------------------
/src/mcp_server_webcrawl/models/sites.py:
--------------------------------------------------------------------------------
```python
from datetime import datetime
from typing import Final
from pathlib import Path
from enum import Enum
from mcp_server_webcrawl.models.base import BaseModel, METADATA_VALUE_TYPE
from mcp_server_webcrawl.utils import to_isoformat_zulu
class SiteType(Enum):
UNDEFINED = "undefined"
CRAWLED_URL = "url"
CRAWLED_LIST = "list"
SITES_TOOL_NAME: Final[str] = "webcrawl_sites"
SITES_FIELDS_BASE: Final[list[str]] = ["id", "name", "type", "urls"]
SITES_FIELDS_DEFAULT: Final[list[str]] = SITES_FIELDS_BASE + ["created", "modified"]
class SiteResult(BaseModel):
"""
Represents a website or crawl directory result.
"""
def __init__(
self,
id: int,
name: str | None = None,
type: SiteType = SiteType.CRAWLED_URL,
urls: list[str] | None = None,
path: Path = None,
created: datetime | None = None,
modified: datetime | None = None,
robots: str | None = None,
metadata: dict[str, METADATA_VALUE_TYPE] | None = None
):
"""
Initialize a SiteResult instance.
Args:
id: site identifier
name: site name, either a URL or a custom job
urls: site URL(s), multiple for list type crawls
path: path to site data, different from datasrc
created: creation timestamp
modified: last modification timestamp
robots: robots.txt content
metadata: additional metadata for the site
"""
self.id = id
self.name = name
self.type = type
self.urls = urls
self.path = path
self.created = created
self.modified = modified
self.robots = robots
self.metadata = metadata or {}
def to_dict(self) -> dict[str, METADATA_VALUE_TYPE]:
"""
Convert the object to a dictionary suitable for JSON serialization.
"""
result: dict[str, METADATA_VALUE_TYPE] = {
"id": self.id,
"name": self.name,
"type": self.type.value,
"urls": self.urls,
"created": to_isoformat_zulu(self.created) if self.created else None,
"modified": to_isoformat_zulu(self.modified) if self.modified else None,
"metadata": self.metadata if self.metadata else None,
}
return {k: v for k, v in result.items() if v is not None and not (k == "metadata" and v == {})}
```
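A minimal construction sketch for SiteResult; the id, name, and URL values are placeholders, and None-valued fields are dropped from the serialized dict:

```python
# minimal sketch; id/name/url values are placeholders
from datetime import datetime, timezone
from mcp_server_webcrawl.models.sites import SiteResult, SiteType

site = SiteResult(
    id=1,
    name="example.com",
    type=SiteType.CRAWLED_URL,
    urls=["https://example.com/"],
    created=datetime(2025, 1, 1, tzinfo=timezone.utc),
)
print(site.to_dict())  # modified (None) and empty metadata are omitted
```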
--------------------------------------------------------------------------------
/src/mcp_server_webcrawl/crawlers/wget/tests.py:
--------------------------------------------------------------------------------
```python
from mcp_server_webcrawl.crawlers import get_fixture_directory
from mcp_server_webcrawl.crawlers.wget.adapter import WgetManager
from mcp_server_webcrawl.crawlers.wget.crawler import WgetCrawler
from mcp_server_webcrawl.crawlers.base.tests import BaseCrawlerTests
from mcp_server_webcrawl.utils.logger import get_logger
logger = get_logger()
EXAMPLE_SITE_ID = WgetManager.string_to_id("example.com")
PRAGMAR_SITE_ID = WgetManager.string_to_id("pragmar.com")
class WgetTests(BaseCrawlerTests):
"""
Test suite for the wget crawler implementation.
Uses all wrapped test methods from BaseCrawlerTests.
"""
def setUp(self):
"""
Set up the test environment with fixture data.
"""
super().setUp()
self._datasrc = get_fixture_directory() / "wget"
def test_wget_pulse(self):
"""
Test basic crawler initialization.
"""
crawler = WgetCrawler(self._datasrc)
self.assertIsNotNone(crawler)
self.assertTrue(self._datasrc.is_dir())
def test_wget_sites(self):
"""
Test site retrieval API functionality.
"""
crawler = WgetCrawler(self._datasrc)
self.run_pragmar_site_tests(crawler, PRAGMAR_SITE_ID)
def test_wget_search(self):
"""
Test boolean search functionality
"""
        # fixtures moved to their own repo, lost some local media, but it checks out.
        # the wget fixture has no CSS/JS/etc., HTML-only, and doesn't do well with the
        # full array of tests concerning fulltext, media, and mixed search result
        # counts. probably needs a reduced set of tests.
        # self.run_pragmar_search_tests(crawler, PRAGMAR_SITE_ID)
return
def test_wget_resources(self):
"""
Test resource retrieval API functionality with various parameters.
"""
crawler = WgetCrawler(self._datasrc)
self.run_sites_resources_tests(crawler, PRAGMAR_SITE_ID, EXAMPLE_SITE_ID)
def test_wget_sorts(self):
"""
Test random sort functionality using the '?' sort parameter.
"""
crawler = WgetCrawler(self._datasrc)
self.run_pragmar_sort_tests(crawler, PRAGMAR_SITE_ID)
def test_wget_content_parsing(self):
"""
Test content type detection and parsing.
"""
crawler = WgetCrawler(self._datasrc)
self.run_pragmar_content_tests(crawler, PRAGMAR_SITE_ID, False)
def test_report(self):
"""
Run test report, save to data directory.
"""
crawler = WgetCrawler(self._datasrc)
logger.info(self.run_pragmar_report(crawler, PRAGMAR_SITE_ID, "wget"))
```
--------------------------------------------------------------------------------
/src/mcp_server_webcrawl/utils/logger.py:
--------------------------------------------------------------------------------
```python
import logging
from pathlib import Path
from typing import Final
import mcp_server_webcrawl.settings as settings
from mcp_server_webcrawl.settings import DEBUG, DATA_DIRECTORY
DEFAULT_LOG_KEY: Final[str] = "mcp-server-webcrawl"
DEFAULT_LOG_PATH: Final[Path] = DATA_DIRECTORY / "mcp-server-webcrawl.log"
DEFAULT_LOG_LEVEL: Final[int] = logging.WARNING
def get_logger_configuration() -> tuple[str, Path, int]:
"""
Get log name, path, and level (in that order)
Returns:
tuple[str, Path, int]: A tuple containing name, path, and level
"""
log_path: Path = DEFAULT_LOG_PATH
log_level: int = DEFAULT_LOG_LEVEL
log_level = logging.DEBUG if DEBUG else getattr(settings, "LOG_LEVEL", DEFAULT_LOG_LEVEL)
log_path = getattr(settings, "LOG_PATH", DEFAULT_LOG_PATH)
return (DEFAULT_LOG_KEY, log_path, log_level)
def get_logger() -> logging.Logger:
"""
Get logger, usually in order to write to it
Returns:
Logger: a writable logging object (error/warn/info/debug)
"""
(log_name, _, _) = get_logger_configuration()
return logging.getLogger(log_name)
def initialize_logger() -> None:
"""
Validate and set up logger for writing
Returns:
None
"""
(log_name, log_path, log_level) = get_logger_configuration()
if log_level == logging.NOTSET:
# don't set up anything, named logging will effectively evaporate
return
assert isinstance(log_level, int) and log_level != 0, "LOG_LEVEL must be set"
assert isinstance(log_path, Path), "LOG_PATH must be a Path object"
assert isinstance(log_name, str) and log_name.strip() != "", "LOG_NAME must be a non-empty string"
assert all(c.isalpha() or c in "-_" for c in log_name), "LOG_NAME must contain only A-Z, a-z, hyphens, and underscores"
# handle custom log paths differently, don't generate directories
if ".mcp_server_webcrawl" in str(log_path):
log_path.parent.mkdir(parents=True, exist_ok=True)
else:
assert log_path.parent.exists() and log_path.parent.is_dir(), \
f"Custom parent directory `{log_path.parent}` does not exist or is not a directory"
logging.basicConfig(filename=str(log_path), filemode="w", level=log_level,
format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
datefmt="%Y-%m-%d %H:%M:%S", encoding="utf-8")
logger: logging.Logger = logging.getLogger(log_name)
# just set a few ops back, concurrent logger might not be ready
if log_level <= logging.INFO:
logger.info("🖥️ starting webcrawl MCP server")
log_extra: str = "(Debug is True)" if DEBUG else ""
logger.info(f"log level set to {log_level} {log_extra}")
```
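A minimal sketch of the intended call order above: configure the named logger once at startup, then fetch it wherever log output is needed:

```python
# minimal sketch of the configure-then-use pattern shown above
from mcp_server_webcrawl.utils.logger import initialize_logger, get_logger

initialize_logger()   # no-op if the configured level resolves to logging.NOTSET
logger = get_logger()
logger.warning("index rebuild requested")  # WARNING is the default level
```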
--------------------------------------------------------------------------------
/src/mcp_server_webcrawl/crawlers/warc/tests.py:
--------------------------------------------------------------------------------
```python
from mcp_server_webcrawl.crawlers.warc.crawler import WarcCrawler
from mcp_server_webcrawl.crawlers.warc.adapter import WarcManager
from mcp_server_webcrawl.crawlers.base.tests import BaseCrawlerTests
from mcp_server_webcrawl.crawlers import get_fixture_directory
from mcp_server_webcrawl.utils.logger import get_logger
EXAMPLE_WARC_ID: int = WarcManager.string_to_id("example.warc.gz")
PRAGMAR_WARC_ID: int = WarcManager.string_to_id("pragmar.warc.gz")
logger = get_logger()
class WarcTests(BaseCrawlerTests):
"""
Test suite for the WARC crawler implementation.
Uses all wrapped test methods from BaseCrawlerTests.
"""
def setUp(self):
"""
Set up the test environment with fixture data.
"""
super().setUp()
self._datasrc = get_fixture_directory() / "warc"
def test_warc_pulse(self):
"""
Test basic crawler initialization.
"""
crawler = WarcCrawler(self._datasrc)
self.assertIsNotNone(crawler)
self.assertTrue(self._datasrc.is_dir())
def test_warc_sites(self):
"""
Test site retrieval API functionality.
"""
crawler = WarcCrawler(self._datasrc)
self.run_pragmar_site_tests(crawler, PRAGMAR_WARC_ID)
def test_warc_search(self):
"""
Test boolean search functionality
"""
crawler = WarcCrawler(self._datasrc)
self.run_pragmar_search_tests(crawler, PRAGMAR_WARC_ID)
def test_warc_resources(self):
"""
Test resource retrieval API functionality with various parameters.
"""
crawler = WarcCrawler(self._datasrc)
self.run_sites_resources_tests(crawler, PRAGMAR_WARC_ID, EXAMPLE_WARC_ID)
    # the pragmar WARC fixture legitimately contains no images; this may be default
    # behavior of wget WARC generation (unverified), so image handling is a blind spot here
# def test_interrobot_images(self):
# """
# Test InterroBot-specific image handling and thumbnails.
# """
# crawler = WarcCrawler(self._datasrc)
# self.run_pragmar_image_tests(crawler, PRAGMAR_WARC_ID)
def test_warc_sorts(self):
"""
Test random sort functionality using the '?' sort parameter.
"""
crawler = WarcCrawler(self._datasrc)
self.run_pragmar_sort_tests(crawler, PRAGMAR_WARC_ID)
def test_warc_content_parsing(self):
"""
Test content type detection and parsing for WARC files.
"""
crawler = WarcCrawler(self._datasrc)
self.run_pragmar_content_tests(crawler, PRAGMAR_WARC_ID, True)
def test_report(self):
"""
Run test report, save to data directory.
"""
crawler = WarcCrawler(self._datasrc)
logger.info(self.run_pragmar_report(crawler, PRAGMAR_WARC_ID, "WARC"))
```
--------------------------------------------------------------------------------
/src/mcp_server_webcrawl/crawlers/katana/tests.py:
--------------------------------------------------------------------------------
```python
from logging import Logger
from mcp_server_webcrawl.crawlers.katana.crawler import KatanaCrawler
from mcp_server_webcrawl.crawlers.katana.adapter import KatanaManager
from mcp_server_webcrawl.crawlers.base.adapter import SitesGroup
from mcp_server_webcrawl.crawlers.base.tests import BaseCrawlerTests
from mcp_server_webcrawl.crawlers import get_fixture_directory
from mcp_server_webcrawl.utils.logger import get_logger
# calculate ids for test directories using the same hash function as adapter
EXAMPLE_SITE_ID = KatanaManager.string_to_id("example.com")
PRAGMAR_SITE_ID = KatanaManager.string_to_id("pragmar.com")
logger: Logger = get_logger()
class KatanaTests(BaseCrawlerTests):
"""
    Test suite for the Katana crawler implementation.
    Tests parsing and retrieval of web content from Katana HTTP text files.
"""
def setUp(self):
"""
set up the test environment with fixture data.
"""
super().setUp()
self._datasrc = get_fixture_directory() / "katana"
def test_katana_pulse(self):
"""
basic crawler initialization.
"""
crawler = KatanaCrawler(self._datasrc)
self.assertIsNotNone(crawler)
self.assertTrue(self._datasrc.is_dir())
def test_katana_sites(self):
"""
site retrieval API functionality.
"""
crawler = KatanaCrawler(self._datasrc)
self.run_pragmar_site_tests(crawler, PRAGMAR_SITE_ID)
def test_katana_search(self):
"""
boolean search tests
"""
crawler = KatanaCrawler(self._datasrc)
self.run_pragmar_search_tests(crawler, PRAGMAR_SITE_ID)
def test_pragmar_tokenizer(self):
"""
tokenizer search tests
"""
crawler = KatanaCrawler(self._datasrc)
self.run_pragmar_tokenizer_tests(crawler, PRAGMAR_SITE_ID)
def test_katana_resources(self):
"""
resource retrieval API functionality with various parameters.
"""
crawler = KatanaCrawler(self._datasrc)
self.run_sites_resources_tests(crawler, PRAGMAR_SITE_ID, EXAMPLE_SITE_ID)
    def test_katana_images(self):
        """
        Test Katana image handling and thumbnails.
"""
crawler = KatanaCrawler(self._datasrc)
self.run_pragmar_image_tests(crawler, PRAGMAR_SITE_ID)
def test_katana_sorts(self):
"""
random sort functionality using the '?' sort parameter.
"""
crawler = KatanaCrawler(self._datasrc)
self.run_pragmar_sort_tests(crawler, PRAGMAR_SITE_ID)
def test_katana_content_parsing(self):
"""
content type detection and parsing for HTTP text files.
"""
crawler = KatanaCrawler(self._datasrc)
self.run_pragmar_content_tests(crawler, PRAGMAR_SITE_ID, False)
def test_report(self):
"""
Run test report, save to data directory.
"""
crawler = KatanaCrawler(self._datasrc)
logger.info(self.run_pragmar_report(crawler, PRAGMAR_SITE_ID, "Katana"))
```
--------------------------------------------------------------------------------
/docs/_sources/index.rst.txt:
--------------------------------------------------------------------------------
```
.. image:: _static/images/mcpswc.svg
:alt: mcp-server-webcrawl heading
:align: center
:width: 100%
.. raw:: html
<div style="text-align: center; margin-bottom: 2em;">
<a href="https://pragmar.com/mcp-server-webcrawl/" style="margin: 0 4px;">Website</a> |
<a href="https://github.com/pragmar/mcp-server-webcrawl" style="margin: 0 4px;">Github</a> |
<a href="https://pragmar.github.io/mcp-server-webcrawl/" style="margin: 0 4px;">Docs</a> |
<a href="https://pypi.org/project/mcp-server-webcrawl/" style="margin: 0 4px;">PyPi</a>
</div>
mcp-server-webcrawl
===============================================
Advanced search and retrieval for web crawler data. With **mcp-server-webcrawl**, your AI client filters
and analyzes web content under your direction or autonomously. The server includes a full-text search
interface with boolean support, and resource filtering by type, HTTP status, and more.
**mcp-server-webcrawl** provides the LLM a complete menu with which to search your web content, and works with
a variety of web crawlers:
.. list-table:: Supported Crawlers
:header-rows: 1
:widths: 30 50 20
* - Crawler/Format
- Description
- Setup Guide
* - `ArchiveBox <https://archivebox.io>`_
- Self-hosted web archiving tool
- `Setup Guide <https://pragmar.github.io/mcp-server-webcrawl/guides/archivebox.html>`_
* - `HTTrack <https://www.httrack.com>`_
- GUI/CLI website mirroring tool
- `Setup Guide <https://pragmar.github.io/mcp-server-webcrawl/guides/httrack.html>`_
* - `InterroBot <https://interro.bot>`_
- GUI crawler and analyzer
- `Setup Guide <https://pragmar.github.io/mcp-server-webcrawl/guides/interrobot.html>`_
* - `Katana <https://github.com/projectdiscovery/katana>`_
- CLI security-focused crawler
- `Setup Guide <https://pragmar.github.io/mcp-server-webcrawl/guides/katana.html>`_
* - `SiteOne <https://crawler.siteone.io>`_
- GUI crawler and analyzer
- `Setup Guide <https://pragmar.github.io/mcp-server-webcrawl/guides/siteone.html>`_
* - `WARC <https://en.wikipedia.org/wiki/WARC_(file_format)>`_
- Standard web archive format
- `Setup Guide <https://pragmar.github.io/mcp-server-webcrawl/guides/warc.html>`_
* - `wget <https://en.wikipedia.org/wiki/Wget>`_
- CLI website mirroring tool
- `Setup Guide <https://pragmar.github.io/mcp-server-webcrawl/guides/wget.html>`_
**mcp-server-webcrawl** is free and open source. It requires Claude Desktop and Python (>=3.10), and is installed on the command line via pip:
.. code-block:: bash
pip install mcp-server-webcrawl
.. raw:: html
<iframe width="560" height="315" style="display: block;margin-bottom:1rem;" src="https://www.youtube.com/embed/Sid-GBxII1o" frameborder="0" allowfullscreen></iframe>
.. toctree::
:maxdepth: 1
:caption: Contents:
installation
guides
usage
prompts
interactive
modules
Indices and tables
------------------
* :ref:`genindex`
* :ref:`modindex`
* :ref:`search`
```
--------------------------------------------------------------------------------
/src/mcp_server_webcrawl/__init__.py:
--------------------------------------------------------------------------------
```python
import os
import sys
import asyncio
import tracemalloc
import unittest
import logging
from pathlib import Path
from argparse import ArgumentParser
from mcp_server_webcrawl.utils.cli import get_help_short_message, get_help_long_message
from mcp_server_webcrawl.settings import DEBUG, DATA_DIRECTORY, FIXTURES_DIRECTORY
from mcp_server_webcrawl.crawlers import get_crawler, VALID_CRAWLER_CHOICES
__version__: str = "0.15.0"
__name__: str = "mcp-server-webcrawl"
if DEBUG:
tracemalloc.start()
class CustomHelpArgumentParser(ArgumentParser):
def print_help(self, file=None):
print(get_help_long_message(__version__))
def main() -> None:
"""
    Main entry point for the package. mcp-server-webcrawl should be on PATH if installed via pip.
"""
if len(sys.argv) == 1:
# \n parser error follows short message
sys.stderr.write(get_help_short_message(__version__) + "\n")
    parser: CustomHelpArgumentParser = CustomHelpArgumentParser(description="InterroBot MCP Server")
parser.add_argument("-c", "--crawler", type=str, choices=VALID_CRAWLER_CHOICES,
help="Specify which crawler to use (default: interrobot)")
parser.add_argument("--run-tests", action="store_true", help="Run tests instead of server")
parser.add_argument("-i", "--interactive", action="store_true", help="Run interactive terminal search mode")
parser.add_argument("-d", "--datasrc", type=str, help="Path to datasrc (required unless testing)")
args = parser.parse_args()
if args.run_tests:
# Check if FIXTURES_DIRECTORY is configured and exists
if FIXTURES_DIRECTORY is None or not FIXTURES_DIRECTORY.exists() or not FIXTURES_DIRECTORY.is_dir():
sys.stderr.write(f"Fixtures not configured in settings_local.py, or is not a valid directory.\nFIXTURES_DIRECTORY: {FIXTURES_DIRECTORY}")
sys.exit(1)
# testing captures some cross-fixture file information, useful for debug
# force=True gets this to write during tests (usually quieted during run)
unittest_log: Path = DATA_DIRECTORY / "fixtures-report.log"
logging.basicConfig(level=logging.INFO, filename=unittest_log, filemode='w', force=True)
file_directory = os.path.dirname(os.path.abspath(__file__))
sys.exit(unittest.main(module=None, argv=["", "discover", "-s", file_directory, "-p", "*test*.py"]))
if args.interactive:
from mcp_server_webcrawl.interactive.session import InteractiveSession
intersession = InteractiveSession(args.crawler, args.datasrc)
intersession.run()
sys.exit(0)
if not args.datasrc:
parser.error("the -d/--datasrc argument is required when not in test mode")
if not args.crawler or args.crawler.lower() not in VALID_CRAWLER_CHOICES:
valid_crawlers = ", ".join(VALID_CRAWLER_CHOICES)
parser.error(f"the -c/--crawler argument must be one of: {valid_crawlers}")
# cli interaction prior to loading the server
from mcp_server_webcrawl.main import main as mcp_main
crawler = get_crawler(args.crawler)
asyncio.run(mcp_main(crawler, Path(args.datasrc)))
__all__ = ["main"]
```
--------------------------------------------------------------------------------
/docs/_sources/guides/wget.rst.txt:
--------------------------------------------------------------------------------
```
wget MCP Setup Guide
====================
Instructions for setting up `mcp-server-webcrawl <https://pragmar.com/mcp-server-webcrawl/>`_ with
`wget <https://en.wikipedia.org/wiki/Wget>`_.
This allows your LLM (e.g. Claude Desktop) to search content and metadata from websites you've crawled.
.. raw:: html
<iframe width="560" height="315" src="https://www.youtube.com/embed/uqEEqVsofhc" frameborder="0" allowfullscreen></iframe>
Follow along with the video, or the step-action guide.
Requirements
------------
Before you begin, ensure you have:
- `Claude Desktop <https://claude.ai/download>`_ installed
- `Python <https://python.org>`_ 3.10 or later installed
- Basic familiarity with command line interfaces
- wget installed (macOS users can install via Homebrew, Windows users need WSL/Ubuntu)
Installation Steps
------------------
1. Install mcp-server-webcrawl
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Open your terminal or command line and install the package:
.. code-block:: bash
pip install mcp-server-webcrawl
Verify installation was successful by checking the version:
.. code-block:: bash
mcp-server-webcrawl --help
2. Configure Claude Desktop
~~~~~~~~~~~~~~~~~~~~~~~~~~~
1. Open Claude Desktop
2. Go to **File → Settings → Developer → Edit Config**
3. Add the following configuration (modify paths as needed):
.. code-block:: json
{
"mcpServers": {
"webcrawl": {
"command": "/path/to/mcp-server-webcrawl",
"args": ["--crawler", "wget", "--datasrc",
"/path/to/wget/archives/"]
}
}
}
.. note::
- On Windows, use ``"mcp-server-webcrawl"`` as the command
- On macOS, use the absolute path (output of ``which mcp-server-webcrawl``)
- Change ``/path/to/wget/archives/`` to your actual directory path
4. Save the file and **completely exit** Claude Desktop (not just close the window)
5. Restart Claude Desktop
3. Crawl Websites with wget
~~~~~~~~~~~~~~~~~~~~~~~~~~~
1. Open Terminal (macOS) or Ubuntu/WSL (Windows)
2. Navigate to your target directory for storing crawls
3. Run wget with the mirror option:
.. code-block:: bash
wget --mirror https://example.com
4. Verify and Use
~~~~~~~~~~~~~~~~~
1. In Claude Desktop, you should now see an MCP tool option under Search and Tools
2. Ask Claude to list your crawled sites:
.. code-block:: text
Can you list the crawled sites available?
3. Try searching content from your crawls:
.. code-block:: text
Can you find information about [topic] on [crawled site]?
Troubleshooting
---------------
- If Claude doesn't show MCP tools after restart, verify your configuration file is correctly formatted
- Ensure Python and mcp-server-webcrawl are properly installed, and on PATH or using absolute paths
- Check that your crawl directory path in the configuration is correct
- Remember that the first time you use a function, Claude will ask for permission
- Indexing for file-based archives (wget included) requires build time on the first search; the time depends on archive size
For more details, including API documentation and other crawler options, visit the `mcp-server-webcrawl documentation <https://github.com/pragmar/mcp-server-webcrawl>`_.
```
--------------------------------------------------------------------------------
/docs/_static/css/badge_only.css:
--------------------------------------------------------------------------------
```css
.clearfix{*zoom:1}.clearfix:after,.clearfix:before{display:table;content:""}.clearfix:after{clear:both}@font-face{font-family:FontAwesome;font-style:normal;font-weight:400;src:url(fonts/fontawesome-webfont.eot?674f50d287a8c48dc19ba404d20fe713?#iefix) format("embedded-opentype"),url(fonts/fontawesome-webfont.woff2?af7ae505a9eed503f8b8e6982036873e) format("woff2"),url(fonts/fontawesome-webfont.woff?fee66e712a8a08eef5805a46892932ad) format("woff"),url(fonts/fontawesome-webfont.ttf?b06871f281fee6b241d60582ae9369b9) format("truetype"),url(fonts/fontawesome-webfont.svg?912ec66d7572ff821749319396470bde#FontAwesome) format("svg")}.fa:before{font-family:FontAwesome;font-style:normal;font-weight:400;line-height:1}.fa:before,a .fa{text-decoration:inherit}.fa:before,a .fa,li .fa{display:inline-block}li .fa-large:before{width:1.875em}ul.fas{list-style-type:none;margin-left:2em;text-indent:-.8em}ul.fas li .fa{width:.8em}ul.fas li .fa-large:before{vertical-align:baseline}.fa-book:before,.icon-book:before{content:"\f02d"}.fa-caret-down:before,.icon-caret-down:before{content:"\f0d7"}.fa-caret-up:before,.icon-caret-up:before{content:"\f0d8"}.fa-caret-left:before,.icon-caret-left:before{content:"\f0d9"}.fa-caret-right:before,.icon-caret-right:before{content:"\f0da"}.rst-versions{position:fixed;bottom:0;left:0;width:300px;color:#fcfcfc;background:#1f1d1d;font-family:Lato,proxima-nova,Helvetica Neue,Arial,sans-serif;z-index:400}.rst-versions a{color:#2980b9;text-decoration:none}.rst-versions .rst-badge-small{display:none}.rst-versions .rst-current-version{padding:12px;background-color:#272525;display:block;text-align:right;font-size:90%;cursor:pointer;color:#27ae60}.rst-versions .rst-current-version:after{clear:both;content:"";display:block}.rst-versions .rst-current-version .fa{color:#fcfcfc}.rst-versions .rst-current-version .fa-book,.rst-versions .rst-current-version .icon-book{float:left}.rst-versions .rst-current-version.rst-out-of-date{background-color:#e74c3c;color:#fff}.rst-versions .rst-current-version.rst-active-old-version{background-color:#f1c40f;color:#000}.rst-versions.shift-up{height:auto;max-height:100%;overflow-y:scroll}.rst-versions.shift-up .rst-other-versions{display:block}.rst-versions .rst-other-versions{font-size:90%;padding:12px;color:grey;display:none}.rst-versions .rst-other-versions hr{display:block;height:1px;border:0;margin:20px 0;padding:0;border-top:1px solid #413d3d}.rst-versions .rst-other-versions dd{display:inline-block;margin:0}.rst-versions .rst-other-versions dd a{display:inline-block;padding:6px;color:#fcfcfc}.rst-versions .rst-other-versions .rtd-current-item{font-weight:700}.rst-versions.rst-badge{width:auto;bottom:20px;right:20px;left:auto;border:none;max-width:300px;max-height:90%}.rst-versions.rst-badge .fa-book,.rst-versions.rst-badge .icon-book{float:none;line-height:30px}.rst-versions.rst-badge.shift-up .rst-current-version{text-align:right}.rst-versions.rst-badge.shift-up .rst-current-version .fa-book,.rst-versions.rst-badge.shift-up .rst-current-version .icon-book{float:left}.rst-versions.rst-badge>.rst-current-version{width:auto;height:30px;line-height:30px;padding:0 6px;display:block;text-align:center}@media screen and (max-width:768px){.rst-versions{width:85%;display:none}.rst-versions.shift{display:block}}#flyout-search-form{padding:6px}
```
--------------------------------------------------------------------------------
/src/mcp_server_webcrawl/crawlers/interrobot/crawler.py:
--------------------------------------------------------------------------------
```python
from pathlib import Path
from mcp.types import Tool
from mcp_server_webcrawl.models.sites import SiteResult
from mcp_server_webcrawl.models.resources import (
RESOURCES_FIELDS_DEFAULT,
RESOURCES_FIELDS_BASE,
RESOURCES_DEFAULT_SORT_MAPPING,
RESOURCES_FIELDS_OPTIONS,
)
from mcp_server_webcrawl.crawlers.base.crawler import BaseCrawler
from mcp_server_webcrawl.crawlers.interrobot.adapter import (
get_sites,
get_resources,
INTERROBOT_RESOURCE_FIELD_MAPPING,
INTERROBOT_SITE_FIELD_MAPPING,
INTERROBOT_SITE_FIELD_REQUIRED,
)
from mcp_server_webcrawl.utils.tools import get_crawler_tools
from mcp_server_webcrawl.utils.logger import get_logger
logger = get_logger()
class InterroBotCrawler(BaseCrawler):
"""
A crawler implementation for InterroBot data sources.
Provides functionality for accessing and searching web content from InterroBot.
"""
def __init__(
self,
datasrc: Path,
) -> None:
"""
Initialize the InterroBotCrawler with a data source path and required adapter functions.
Args:
datasrc: Path to the data source
"""
super().__init__(datasrc, get_sites, get_resources, resource_field_mapping=INTERROBOT_RESOURCE_FIELD_MAPPING)
assert datasrc.is_file() and datasrc.suffix == ".db", f"{self.__class__.__name__} datasrc must be a db file"
async def mcp_list_tools(self) -> list[Tool]:
"""
List available tools for this crawler.
Returns:
List of Tool objects
"""
# get the default crawler tools, then override necessary fields
all_sites: list[SiteResult] = self._adapter_get_sites(self._datasrc)
all_sites_ids: list[int] = [s.id for s in all_sites if s is not None and isinstance(s.id, int)]
default_tools: list[Tool] = get_crawler_tools(sites=all_sites)
assert len(default_tools) == 2, "expected exactly 2 Tools: sites and resources"
# can replace get_crawler_tools or extend, here it is overwritten from default
        # overrides could be passed into get_crawler_tools, but ad hoc modification here is simpler
default_sites_tool: Tool
default_resources_tool: Tool
default_sites_tool, default_resources_tool = default_tools
sites_field_options: list[str] = list(set(INTERROBOT_SITE_FIELD_MAPPING.keys()) - set(INTERROBOT_SITE_FIELD_REQUIRED))
dst_props: dict = default_sites_tool.inputSchema["properties"]
dst_props["fields"]["items"]["enum"] = sites_field_options
resources_sort_options: list[str] = list(RESOURCES_DEFAULT_SORT_MAPPING.keys())
all_sites_display: str = ", ".join([f"{s.name} (site: {s.id})" for s in all_sites])
drt_props: dict = default_resources_tool.inputSchema["properties"]
drt_props["fields"]["items"]["enum"] = RESOURCES_FIELDS_OPTIONS
drt_props["sort"]["enum"] = resources_sort_options
drt_props["sites"]["items"]["enum"] = all_sites_ids
drt_props["sites"]["description"] = ("Optional "
"list of project ID to filter search results to a specific site. In 95% "
"of scenarios, you'd filter to only one site, but many site filtering is offered "
f"for advanced search scenarios. Available sites include {all_sites_display}.")
return [default_sites_tool, default_resources_tool]
```
--------------------------------------------------------------------------------
/src/mcp_server_webcrawl/utils/parsetab.py:
--------------------------------------------------------------------------------
```python
# parsetab.py
# This file is automatically generated. Do not edit.
# pylint: disable=W,C,R
_tabversion = '3.10'
_lr_method = 'LALR'
_lr_signature = 'rightNOTleftANDleftORAND COLON COMPARATOR COMP_OP FIELD LPAREN NOT OR QUOTED_STRING RPAREN TERM URL_FIELD WILDCARD\n query : expression\n \n expression : expression AND expression\n | expression OR expression\n | expression NOT expression\n \n expression : NOT expression\n \n expression : LPAREN expression RPAREN\n \n expression : URL_FIELD\n \n value : TERM\n | WILDCARD\n | QUOTED_STRING\n \n expression : value\n \n expression : FIELD COLON COMP_OP value\n | FIELD COLON value\n | FIELD COMPARATOR value\n '
_lr_action_items = {'NOT':([0,2,3,4,5,6,8,9,10,11,12,13,14,15,18,19,20,21,23,24,25,],[3,13,3,3,-7,-11,-8,-9,-10,3,3,3,13,13,-2,-3,13,-6,-13,-14,-12,]),'LPAREN':([0,3,4,11,12,13,],[4,4,4,4,4,4,]),'URL_FIELD':([0,3,4,11,12,13,],[5,5,5,5,5,5,]),'FIELD':([0,3,4,11,12,13,],[7,7,7,7,7,7,]),'TERM':([0,3,4,11,12,13,16,17,22,],[8,8,8,8,8,8,8,8,8,]),'WILDCARD':([0,3,4,11,12,13,16,17,22,],[9,9,9,9,9,9,9,9,9,]),'QUOTED_STRING':([0,3,4,11,12,13,16,17,22,],[10,10,10,10,10,10,10,10,10,]),'$end':([1,2,5,6,8,9,10,14,18,19,20,21,23,24,25,],[0,-1,-7,-11,-8,-9,-10,-5,-2,-3,-4,-6,-13,-14,-12,]),'AND':([2,5,6,8,9,10,14,15,18,19,20,21,23,24,25,],[11,-7,-11,-8,-9,-10,11,11,-2,-3,11,-6,-13,-14,-12,]),'OR':([2,5,6,8,9,10,14,15,18,19,20,21,23,24,25,],[12,-7,-11,-8,-9,-10,12,12,12,-3,12,-6,-13,-14,-12,]),'RPAREN':([5,6,8,9,10,14,15,18,19,20,21,23,24,25,],[-7,-11,-8,-9,-10,-5,21,-2,-3,-4,-6,-13,-14,-12,]),'COLON':([7,],[16,]),'COMPARATOR':([7,],[17,]),'COMP_OP':([16,],[22,]),}
_lr_action = {}
for _k, _v in _lr_action_items.items():
for _x,_y in zip(_v[0],_v[1]):
if not _x in _lr_action: _lr_action[_x] = {}
_lr_action[_x][_k] = _y
del _lr_action_items
_lr_goto_items = {'query':([0,],[1,]),'expression':([0,3,4,11,12,13,],[2,14,15,18,19,20,]),'value':([0,3,4,11,12,13,16,17,22,],[6,6,6,6,6,6,23,24,25,]),}
_lr_goto = {}
for _k, _v in _lr_goto_items.items():
for _x, _y in zip(_v[0], _v[1]):
if not _x in _lr_goto: _lr_goto[_x] = {}
_lr_goto[_x][_k] = _y
del _lr_goto_items
_lr_productions = [
("S' -> query","S'",1,None,None,None),
('query -> expression','query',1,'p_query','search.py',317),
('expression -> expression AND expression','expression',3,'p_expression_binary','search.py',403),
('expression -> expression OR expression','expression',3,'p_expression_binary','search.py',404),
('expression -> expression NOT expression','expression',3,'p_expression_binary','search.py',405),
('expression -> NOT expression','expression',2,'p_expression_not','search.py',462),
('expression -> LPAREN expression RPAREN','expression',3,'p_expression_group','search.py',477),
('expression -> URL_FIELD','expression',1,'p_expression_url_field','search.py',483),
('value -> TERM','value',1,'p_value','search.py',503),
('value -> WILDCARD','value',1,'p_value','search.py',504),
('value -> QUOTED_STRING','value',1,'p_value','search.py',505),
('expression -> value','expression',1,'p_expression_term','search.py',519),
('expression -> FIELD COLON COMP_OP value','expression',4,'p_expression_field_search','search.py',532),
('expression -> FIELD COLON value','expression',3,'p_expression_field_search','search.py',533),
('expression -> FIELD COMPARATOR value','expression',3,'p_expression_field_search','search.py',534),
]
```
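The generated table above encodes the Boolean query grammar used by search.py (AND/OR/NOT, parentheses, quoted strings, and field expressions). A few query shapes suggested by the productions, shown as plain strings; actual tokenization (which words count as FIELD, TERM, and so on) lives in the lexer and is not part of this file:

```python
# query shapes suggested by the productions in the generated table above;
# tokenization rules live in search.py, not in this file
example_queries = [
    '"solar eclipse"',                  # QUOTED_STRING used as a value expression
    "eclipse AND NOT lunar",            # binary AND with a NOT expression
    '(privacy OR terms) AND "policy"',  # LPAREN expression RPAREN grouping
]
```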
--------------------------------------------------------------------------------
/docs/_sources/guides/siteone.rst.txt:
--------------------------------------------------------------------------------
```
SiteOne MCP Setup Guide
=======================
Instructions for setting up `mcp-server-webcrawl <https://pragmar.com/mcp-server-webcrawl/>`_ with SiteOne crawler.
This allows your LLM (e.g. Claude Desktop) to search content and metadata from websites you've crawled using SiteOne.
.. raw:: html
<iframe width="560" height="315" src="https://www.youtube.com/embed/JOGRYbo6WwI" frameborder="0" allowfullscreen></iframe>
Follow along with the video, or the step-action guide below.
Requirements
------------
Before you begin, ensure you have:
- `Claude Desktop <https://claude.ai/download>`_ installed
- `Python <https://python.org>`_ 3.10 or later installed
- `SiteOne Crawler <https://crawler.siteone.io>`_ installed
- Basic familiarity with command line interfaces
What is SiteOne?
----------------
SiteOne is a GUI crawler that offers:
- User-friendly desktop interface for setting up and managing crawls
- Offline website generation capabilities
- Comprehensive crawl reporting
- Intuitive controls for non-technical users
Installation Steps
------------------
1. Install mcp-server-webcrawl
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Open your terminal or command line and install the package::
pip install mcp-server-webcrawl
Verify installation was successful::
mcp-server-webcrawl --help
2. Create Crawls with SiteOne
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
1. Open SiteOne Crawler application
2. Enter a URL to crawl (e.g., example.com)
3. **Important**: Check the "Generate offline website" option (this is required for MCP integration)
4. Click the start button to begin crawling
5. Repeat for additional sites as needed (e.g., pragmar.com)
6. Note the directory where SiteOne is storing the generated offline content (this is shown in the application)
3. Configure Claude Desktop
~~~~~~~~~~~~~~~~~~~~~~~~~~~
1. Open Claude Desktop
2. Go to **File → Settings → Developer → Edit Config**
3. Add the following configuration (modify paths as needed):
.. code-block:: json
{
"mcpServers": {
"webcrawl": {
"command": "/path/to/mcp-server-webcrawl",
"args": ["--crawler", "siteone", "--datasrc",
"/path/to/siteone/archives/"]
}
}
}
.. note::
- On Windows, use ``"mcp-server-webcrawl"`` as the command
- On macOS, use the absolute path (output of ``which mcp-server-webcrawl``)
- Change ``/path/to/siteone/archives/`` to the actual path where SiteOne stores offline website content
4. Save the file and **completely exit** Claude Desktop (not just close the window)
5. Restart Claude Desktop
4. Verify and Use
~~~~~~~~~~~~~~~~~
1. In Claude Desktop, you should now see MCP tools available under Search and Tools
2. Ask Claude to list your crawled sites::
Can you list the crawled sites available?
3. Try searching content from your crawls::
Can you find information about [topic] on [crawled site]?
4. Explore specific topics on your crawled sites::
I'm interested in [keyword] in [crawled domain]. Can you tell me about it?
Troubleshooting
---------------
- If Claude doesn't show MCP tools after restart, verify your configuration file is correctly formatted
- Ensure Python and mcp-server-webcrawl are properly installed
- Check that your SiteOne archives path in the configuration is correct
- Make sure the "Generate offline website" option was checked when creating crawls
- Verify that each crawl completed successfully and files were saved to the expected location
- Remember that the first time you use a function, Claude will ask for permission
For more details, including API documentation and other crawler options, visit the `mcp-server-webcrawl documentation <https://github.com/pragmar/mcp-server-webcrawl>`_.
```
--------------------------------------------------------------------------------
/docs/_sources/guides/warc.rst.txt:
--------------------------------------------------------------------------------
```
WARC MCP Setup Guide
====================
Instructions for setting up `mcp-server-webcrawl <https://pragmar.com/mcp-server-webcrawl/>`_ with
`WARC <https://en.wikipedia.org/wiki/WARC_\(file_format\)>`_ files to allow your LLM (e.g.
Claude Desktop) to search content and metadata from websites you've archived in WARC format.
.. raw:: html
<iframe width="560" height="315" src="https://www.youtube.com/embed/fx-4WZu-UT8" frameborder="0" allowfullscreen></iframe>
Follow along with the video, or the step-action guide below.
Requirements
------------
Before you begin, ensure you have:
- `Claude Desktop <https://claude.ai/download>`_ installed
- `Python <https://python.org>`_ 3.10 or later installed
- Basic familiarity with command line interfaces
- wget installed (macOS users can install via Homebrew, Windows users need WSL/Ubuntu)
What are WARC Files?
--------------------
WARC files are single-file archives that store complete crawl data including:
- HTTP status codes
- HTTP headers
- Response content
Compared to wget running in mirror mode:
- **WARC**: More comprehensive (preserves status codes and headers) but slower crawling
- **wget mirror**: Faster crawling but doesn't preserve status codes or headers
Installation Steps
------------------
1. Install mcp-server-webcrawl
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Open your terminal or command line and install the package::
pip install mcp-server-webcrawl
Verify installation was successful::
mcp-server-webcrawl --help
2. Configure Claude Desktop
~~~~~~~~~~~~~~~~~~~~~~~~~~~
1. Open Claude Desktop
2. Go to **File → Settings → Developer → Edit Config**
3. Add the following configuration (modify paths as needed):
.. code-block:: json
{
"mcpServers": {
"webcrawl": {
"command": "/path/to/mcp-server-webcrawl",
"args": ["--crawler", "warc", "--datasrc",
"/path/to/warc/archives/"]
}
}
}
.. note::
- On Windows, use ``"mcp-server-webcrawl"`` as the command
- On macOS, use the absolute path (output of ``which mcp-server-webcrawl``)
- Change ``/path/to/warc/archives/`` to your actual directory path where WARC files are stored
4. Save the file and **completely exit** Claude Desktop (not just close the window)
5. Restart Claude Desktop
3. Create WARC Files with Wget
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
1. Open Terminal (macOS) or Ubuntu/WSL (Windows)
2. Navigate to your target directory for storing WARC files
3. Run wget with WARC options:
.. code-block:: bash
# basic WARC capture
wget --warc-file=example --recursive https://example.com
# more comprehensive capture with page requirements (CSS, images, etc.)
wget --warc-file=example --recursive --page-requisites https://example.com
Your WARC files will be created with a .warc.gz extension in your current directory.
4. Verify and Use
~~~~~~~~~~~~~~~~~
1. In Claude Desktop, you should now see MCP tools available under Search and Tools
2. Ask Claude to list your crawled sites::
Can you list the crawled sites available?
3. Try searching content from your crawls::
Can you find information about [topic] on [crawled site]?
Troubleshooting
---------------
- If Claude doesn't show MCP tools after restart, verify your configuration file is correctly formatted
- Ensure Python and mcp-server-webcrawl are properly installed
- Check that your WARC directory path in the configuration is correct
- Make sure your WARC files have the correct extension (typically .warc.gz)
- Remember that the first time you use each function, Claude will ask for permission
- For large WARC files, initial indexing may take some time
For more details, including API documentation and other crawler options, visit the `mcp-server-webcrawl documentation <https://github.com/pragmar/mcp-server-webcrawl>`_.
```
--------------------------------------------------------------------------------
/src/mcp_server_webcrawl/extras/regex.py:
--------------------------------------------------------------------------------
```python
import re
from functools import lru_cache
from typing import Final
from logging import Logger
from mcp_server_webcrawl.utils.logger import get_logger
__REGEX_PATTERNS_REGEX_HAZARDS: Final[list[str]] = [
r"\([^)]*\*[^)]*\+", # (.*)*+, (.+)*+, etc.
r"\([^)]*\+[^)]*\*", # (.+)*., (.*)++, etc.
r"\([^)]*\+[^)]*\+", # (.+)+, (.++)+ etc.
r"\([^)]*\*[^)]*\*", # (.*)*, (.**) etc.
r"\.\*.*\.\*", # .*.* patterns
r"\.\+.*\.\+", # .+.+ patterns
r"\([^)]*\?\)\*", # (a?)* patterns
r"\([^)]*\?\)\+", # (a?)+ patterns
r"\([^)]*[*+?][^)]*[*+?][^)]*\)[*+]", # 2+ quantifiers inside, then quantifier outside
]
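# illustrative: a nested-quantifier pattern such as "(a*b*)*" matches the
# hazard checks above and will be discarded rather than compiled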
logger: Logger = get_logger()
@lru_cache(maxsize=None)
def __get_compiled_hazard_patterns():
"""
Lazy load compiled patterns
"""
compiled_patterns = []
for hazard in __REGEX_PATTERNS_REGEX_HAZARDS:
try:
compiled_patterns.append(re.compile(hazard))
except re.error as ex:
logger.warning(f"Invalid hazard pattern {hazard}: {ex}")
continue
return compiled_patterns
def __regex_is_hazardous(pattern: str) -> bool:
"""
Check if a regex pattern might cause catastrophic backtracking
or otherwise unacceptable performance over up to 100 HTML files
"""
compiled_hazards = __get_compiled_hazard_patterns()
for hazard_pattern in compiled_hazards:
try:
if hazard_pattern.search(pattern):
logger.error(f"hazardous regex discarded {pattern} matched {hazard_pattern.pattern}")
return True
except re.error as ex:
logger.warning(f"Error checking hazard pattern {hazard_pattern.pattern}: {ex}")
continue
return False
def get_regex(headers: str, content: str, patterns: list[str]) -> list[dict[str, str | int]]:
"""
Takes headers and content and gets regex matches
Arguments:
headers: The headers to search
content: The content to search
patterns: The regex patterns
Returns:
A list of dicts, with selector, value, groups, position info, and source
"""
if not isinstance(content, str):
content = ""
if not isinstance(headers, str):
headers = ""
if not isinstance(patterns, list) or not all(isinstance(item, str) for item in patterns):
raise ValueError("patterns must be a list of strings")
results = []
if content == "" and headers == "":
return results
re_patterns = []
for pattern in patterns:
if __regex_is_hazardous(pattern):
logger.warning(f"Hazardous regex pattern '{pattern}'")
continue
try:
re_pattern = re.compile(pattern)
re_patterns.append(re_pattern)
except re.error as ex:
logger.warning(f"Invalid regex pattern '{pattern}': {ex}")
continue
# search headers and content
search_targets = [("headers", headers), ("content", content)]
for re_pattern in re_patterns:
for source_name, search_text in search_targets:
if not search_text:
continue
for match in re_pattern.finditer(search_text):
regex_hit: dict[str, str | int] = {
"selector": re_pattern.pattern,
"value": match.group(0),
"source": source_name # headers or content
}
if match.groups():
for i, group in enumerate(match.groups(), 1):
if group is not None:
regex_hit[f"group_{i}"] = group
regex_hit["start"] = match.start()
regex_hit["end"] = match.end()
results.append(regex_hit)
return results
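# example (illustrative), searching content only:
#   get_regex("", "Contact: hello@example.com", [r"[\w.]+@[\w.]+"])
#   returns a single hit with selector set to the pattern, value
#   "hello@example.com", source "content", and start/end character offsets
#   (group_N keys are added only when the pattern defines capturing groups)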
```
--------------------------------------------------------------------------------
/docs/_sources/guides/interrobot.rst.txt:
--------------------------------------------------------------------------------
```
InterroBot MCP Setup Guide
==========================
Instructions for setting up `mcp-server-webcrawl <https://pragmar.com/mcp-server-webcrawl/>`_ with InterroBot.
This allows your LLM (e.g. Claude Desktop) to search content and metadata from websites you've crawled with InterroBot.
.. raw:: html
<iframe width="560" height="315" src="https://www.youtube.com/embed/55y8oKWXJLs" frameborder="0" allowfullscreen></iframe>
Follow along with the video, or the step-action guide below.
Requirements
------------
Before you begin, ensure you have:
- `Claude Desktop <https://claude.ai/download>`_ installed
- `Python <https://python.org>`_ 3.10 or later installed
- `InterroBot <https://interro.bot>`_ installed
- Basic familiarity with command line interfaces
What is InterroBot?
-------------------
InterroBot is a commercial web crawler and analyzer that works seamlessly with mcp-server-webcrawl, providing several advantages:
- User-friendly graphical interface for managing crawls
- Comprehensive data collection including page content and metadata
- Natively indexed, with no index-build lag on the first search
- Cross-platform \(Windows, macOS, Android\)
Installation Steps
------------------
1. Install mcp-server-webcrawl
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Open your terminal or command line and install the package::
pip install mcp-server-webcrawl
Verify installation was successful::
mcp-server-webcrawl --help
2. Create Crawls with InterroBot
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
1. Open InterroBot
2. For a new project, you'll see an empty project screen
3. Add websites to crawl by entering URLs (e.g., example.com, pragmar.com)
4. Wait for the crawling to complete (typically takes a few seconds to minutes depending on site size)
5. Note the location of your InterroBot database file, which will be needed for configuration. It is available in InterroBot options, under Advanced section:
- On Windows: Typically in ``[homedir]/Documents/InterroBot/interrobot.v2.db``
- On macOS: Path can be found in InterroBot settings page
3. Configure Claude Desktop
~~~~~~~~~~~~~~~~~~~~~~~~~~~
1. Open Claude Desktop
2. Go to **File → Settings → Developer → Edit Config**
3. Add the following configuration (modify paths as needed):
.. code-block:: json
{
"mcpServers": {
"webcrawl": {
"command": "/path/to/mcp-server-webcrawl",
"args": ["--crawler", "interrobot", "--datasrc",
"[homedir]/Documents/InterroBot/interrobot.v2.db"]
}
}
}
.. note::
- On Windows, use ``"mcp-server-webcrawl"`` as the command
- On macOS, use the absolute path (output of ``which mcp-server-webcrawl``)
- Replace ``[homedir]/Documents/InterroBot/interrobot.v2.db`` with the actual path to your InterroBot database file, available in InterroBot options
4. Save the file and **completely exit** Claude Desktop (not just close the window)
5. Restart Claude Desktop
4. Verify and Use
~~~~~~~~~~~~~~~~~
1. In Claude Desktop, you should now see MCP tools available under Search and Tools
2. Ask Claude to list your crawled sites::
Can you list the crawled sites available?
3. Try searching content from your crawls::
Can you find information about [topic] on [crawled site]?
4. Explore specific capabilities, such as generating site reports::
Can you give me a file type summary for [crawled site]? Which types of files are there, page count, etc.
Troubleshooting
---------------
- If Claude doesn't show MCP tools after restart, verify your configuration file is correctly formatted
- Ensure Python and mcp-server-webcrawl are properly installed
- Check that your InterroBot database path in the configuration is correct
- Make sure InterroBot has successfully completed crawling the websites
- Remember that the first time you use a function, Claude will ask for permission
- For large websites with many pages, search queries might take longer to process initially
For more details, including API documentation and other crawler options, visit the `mcp-server-webcrawl documentation <https://github.com/pragmar/mcp-server-webcrawl>`_.
```
--------------------------------------------------------------------------------
/src/mcp_server_webcrawl/templates/tests.py:
--------------------------------------------------------------------------------
```python
import re
import unittest
from importlib import resources
from urllib.request import urlopen
from mcp_server_webcrawl.utils.logger import get_logger
from mcp_server_webcrawl.extras.markdown import get_markdown
logger = get_logger()
class TemplateTests(unittest.TestCase):
"""
Test suite for the custom HTML to markdown converter.
Why custom? It's a bit faster, and that is the only reason.
The maximum load is 100 transforms (1 per result, at the maximum of
100 results), so speed matters. The default result set is 20.
This converter does a few things differently to tailor to LLM
interaction.
* aggressively removes images (html2text selectively renders)
* links with block descendants will render like a <p>
(html2text treats as <a><br>)
"""
def setUp(self):
"""
Set up the test environment with fixture data.
"""
super().setUp()
def test_core_html(self):
core_html: str = resources.read_text("mcp_server_webcrawl.templates", "tests_core.html")
markdown = get_markdown(core_html)
# h1-6
self.assertIn("# Lorem Ipsum Dolor Sit Amet", markdown)
self.assertIn("## Consectetur Adipiscing Elit", markdown)
self.assertIn("### Nemo Enim Ipsam Voluptatem", markdown)
self.assertIn("#### Sed Quia Non Numquam", markdown)
self.assertIn("##### Nisi Ut Aliquid Ex Ea", markdown)
self.assertIn("###### At Vero Eos Et Accusamus", markdown)
# no content loss - key phrases should be preserved
self.assertIn("Lorem ipsum dolor sit amet", markdown)
self.assertIn("Definition List Example", markdown)
self.assertIn("More Text Elements", markdown)
# inline formatting (proper spacing)
self.assertIn("amet, **consectetur adipiscing elit**. Sed", markdown)
self.assertIn("laborum. **Sed ut perspiciatis** unde", markdown)
self.assertIn("consequat. *Duis aute irure dolor* in", markdown)
self.assertIn("laudantium. *Totam rem aperiam*, eaque", markdown)
# link formatting (proper spacing)
self.assertIn("veniam, quis nostrud exercitation ullamco", markdown) # Fragment links as plain text
self.assertIn("and a link back to top. Nam", markdown)
# list formatting
self.assertIn("* Similique sunt in culpa", markdown)
self.assertIn("1. Temporibus autem quibusdam", markdown)
# dl/dt
self.assertIn("**Lorem Ipsum**", markdown)
self.assertIn(" Dolor sit amet, consectetur adipiscing elit", markdown)
self.assertIn("**Ut Enim**", markdown)
self.assertIn(" Ad minim veniam, quis nostrud exercitation", markdown)
self.assertIn("**Duis Aute**", markdown)
self.assertIn(" Irure dolor in reprehenderit in voluptate", markdown)
# table structure
self.assertIn("| Lorem | Ipsum | Dolor | Sit |", markdown)
self.assertIn("|---|---|---|---|", markdown)
self.assertIn("| Consectetur | Adipiscing | Elit | Sed |", markdown)
# code formatting
self.assertIn("Here we have some `inline code` and", markdown)
self.assertIn("```\nfunction lorem() {\n return \"ipsum dolor sit amet\";\n}\n```", markdown)
# blockquotes
self.assertIn("> \"Sed ut perspiciatis unde omnis iste natus", markdown)
# horizontal rule
self.assertIn("---", markdown)
# no double spacing for inline elements
self.assertNotIn("** ", markdown) # No double spaces after bold
self.assertNotIn(" **", markdown) # No double spaces before bold
self.assertNotIn("* ", markdown) # No double spaces after emphasis
self.assertNotIn(" *", markdown) # No double spaces before emphasis
# structural integrity - count major elements
heading_count = len(re.findall(r"^#{1,6} ", markdown, re.MULTILINE))
self.assertEqual(heading_count, 11, "Should have exactly 11 headings")
table_count = len(re.findall(r"^\|.*\|$", markdown, re.MULTILINE))
self.assertGreater(table_count, 5, "Should have multiple table rows")
```
--------------------------------------------------------------------------------
/docs/_sources/prompts.rst.txt:
--------------------------------------------------------------------------------
```
Prompt Routines
===============
**mcp-server-webcrawl** provides the toolkit necessary to search web crawl data freestyle, figuring it out as you go, reacting to each query. This is what it was designed for.
It is also capable of running routines (as prompts). You can write these yourself, or use the ones provided. These prompts are **copy and paste**, used as raw Markdown. They are enabled by the advanced search provided to the LLM; queries and logic can be embedded in a procedural set of instructions, or even an input loop, as is the case with Gopher Service.
If you want to shortcut the site selection (one less query), paste the Markdown and, in the same request, type "run pasted for [site name or URL]"; the LLM will figure it out. When pasted without additional context, you will be prompted to select a site.
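For example, after pasting the contents of ``gopher.md``, you might add (illustrative)::
run pasted for example.com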
+---------------------------+-------------------+------------+-------------------------------------------------------------------------+
| Prompt | Download | Category | Description |
+===========================+===================+============+=========================================================================+
| 🔍 **SEO Audit** | `auditseo.md`_ | audit | Technical SEO (search engine optimization) analysis. Covers the |
| | | | basics, with options to dive deeper. |
+---------------------------+-------------------+------------+-------------------------------------------------------------------------+
| 🔗 **404 Audit** | `audit404.md`_ | audit | Broken link detection and pattern analysis. Not only finds issues, |
| | | | but suggests fixes. |
+---------------------------+-------------------+------------+-------------------------------------------------------------------------+
| ⚡ **Performance Audit** | `auditperf.md`_ | audit | Website speed and optimization analysis. Real talk. |
+---------------------------+-------------------+------------+-------------------------------------------------------------------------+
| 📁 **File Audit** | `auditfiles.md`_ | audit | File organization and asset analysis. Discover the composition of |
| | | | your website. |
+---------------------------+-------------------+------------+-------------------------------------------------------------------------+
| 🌐 **Gopher Interface** | `gopher.md`_ | interface | An old-fashioned search interface inspired by the Gopher clients of |
| | | | yesteryear. |
+---------------------------+-------------------+------------+-------------------------------------------------------------------------+
| ⚙️ **Search Test** | `testsearch.md`_ | self-test | A battery of tests to check for Boolean logical inconsistencies in |
| | | | the search query parser and subsequent FTS5 conversion. |
+---------------------------+-------------------+------------+-------------------------------------------------------------------------+
.. _auditseo.md: https://raw.githubusercontent.com/pragmar/mcp-server-webcrawl/master/prompts/auditseo.md
.. _audit404.md: https://raw.githubusercontent.com/pragmar/mcp-server-webcrawl/master/prompts/audit404.md
.. _auditperf.md: https://raw.githubusercontent.com/pragmar/mcp-server-webcrawl/master/prompts/auditperf.md
.. _auditfiles.md: https://raw.githubusercontent.com/pragmar/mcp-server-webcrawl/master/prompts/auditfiles.md
.. _gopher.md: https://raw.githubusercontent.com/pragmar/mcp-server-webcrawl/master/prompts/gopher.md
.. _testsearch.md: https://raw.githubusercontent.com/pragmar/mcp-server-webcrawl/master/prompts/testsearch.md
```
--------------------------------------------------------------------------------
/docs/_static/_sphinx_javascript_frameworks_compat.js:
--------------------------------------------------------------------------------
```javascript
/* Compatibility shim for jQuery and underscore.js.
*
* Copyright Sphinx contributors
* Released under the two clause BSD licence
*/
/**
* small helper function to urldecode strings
*
* See https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/decodeURIComponent#Decoding_query_parameters_from_a_URL
*/
jQuery.urldecode = function(x) {
if (!x) {
return x
}
return decodeURIComponent(x.replace(/\+/g, ' '));
};
/**
* small helper function to urlencode strings
*/
jQuery.urlencode = encodeURIComponent;
/**
* This function returns the parsed url parameters of the
* current request. Multiple values per key are supported,
* it will always return arrays of strings for the value parts.
*/
jQuery.getQueryParameters = function(s) {
if (typeof s === 'undefined')
s = document.location.search;
var parts = s.substr(s.indexOf('?') + 1).split('&');
var result = {};
for (var i = 0; i < parts.length; i++) {
var tmp = parts[i].split('=', 2);
var key = jQuery.urldecode(tmp[0]);
var value = jQuery.urldecode(tmp[1]);
if (key in result)
result[key].push(value);
else
result[key] = [value];
}
return result;
};
/**
* highlight a given string on a jquery object by wrapping it in
* span elements with the given class name.
*/
jQuery.fn.highlightText = function(text, className) {
function highlight(node, addItems) {
if (node.nodeType === 3) {
var val = node.nodeValue;
var pos = val.toLowerCase().indexOf(text);
if (pos >= 0 &&
!jQuery(node.parentNode).hasClass(className) &&
!jQuery(node.parentNode).hasClass("nohighlight")) {
var span;
var isInSVG = jQuery(node).closest("body, svg, foreignObject").is("svg");
if (isInSVG) {
span = document.createElementNS("http://www.w3.org/2000/svg", "tspan");
} else {
span = document.createElement("span");
span.className = className;
}
span.appendChild(document.createTextNode(val.substr(pos, text.length)));
node.parentNode.insertBefore(span, node.parentNode.insertBefore(
document.createTextNode(val.substr(pos + text.length)),
node.nextSibling));
node.nodeValue = val.substr(0, pos);
if (isInSVG) {
var rect = document.createElementNS("http://www.w3.org/2000/svg", "rect");
var bbox = node.parentElement.getBBox();
rect.x.baseVal.value = bbox.x;
rect.y.baseVal.value = bbox.y;
rect.width.baseVal.value = bbox.width;
rect.height.baseVal.value = bbox.height;
rect.setAttribute('class', className);
addItems.push({
"parent": node.parentNode,
"target": rect});
}
}
}
else if (!jQuery(node).is("button, select, textarea")) {
jQuery.each(node.childNodes, function() {
highlight(this, addItems);
});
}
}
var addItems = [];
var result = this.each(function() {
highlight(this, addItems);
});
for (var i = 0; i < addItems.length; ++i) {
jQuery(addItems[i].parent).before(addItems[i].target);
}
return result;
};
/*
* backward compatibility for jQuery.browser
* This will be supported until firefox bug is fixed.
*/
if (!jQuery.browser) {
jQuery.uaMatch = function(ua) {
ua = ua.toLowerCase();
var match = /(chrome)[ \/]([\w.]+)/.exec(ua) ||
/(webkit)[ \/]([\w.]+)/.exec(ua) ||
/(opera)(?:.*version|)[ \/]([\w.]+)/.exec(ua) ||
/(msie) ([\w.]+)/.exec(ua) ||
ua.indexOf("compatible") < 0 && /(mozilla)(?:.*? rv:([\w.]+)|)/.exec(ua) ||
[];
return {
browser: match[ 1 ] || "",
version: match[ 2 ] || "0"
};
};
jQuery.browser = {};
jQuery.browser[jQuery.uaMatch(navigator.userAgent).browser] = true;
}
```
--------------------------------------------------------------------------------
/docs/search.html:
--------------------------------------------------------------------------------
```html
<!DOCTYPE html>
<html class="writer-html5" lang="en" data-content_root="./">
<head>
<meta charset="utf-8" />
<meta name="viewport" content="width=device-width, initial-scale=1.0" />
<title>Search — mcp-server-webcrawl documentation</title>
<link rel="stylesheet" type="text/css" href="_static/pygments.css?v=80d5e7a1" />
<link rel="stylesheet" type="text/css" href="_static/css/theme.css?v=e59714d7" />
<script src="_static/jquery.js?v=5d32c60e"></script>
<script src="_static/_sphinx_javascript_frameworks_compat.js?v=2cd50e6c"></script>
<script src="_static/documentation_options.js?v=5929fcd5"></script>
<script src="_static/doctools.js?v=888ff710"></script>
<script src="_static/sphinx_highlight.js?v=dc90522c"></script>
<script src="_static/js/theme.js"></script>
<script src="_static/searchtools.js"></script>
<script src="_static/language_data.js"></script>
<link rel="index" title="Index" href="genindex.html" />
<link rel="search" title="Search" href="#" />
</head>
<body class="wy-body-for-nav">
<div class="wy-grid-for-nav">
<nav data-toggle="wy-nav-shift" class="wy-nav-side">
<div class="wy-side-scroll">
<div class="wy-side-nav-search" >
<a href="index.html" class="icon icon-home">
mcp-server-webcrawl
</a>
<div role="search">
<form id="rtd-search-form" class="wy-form" action="#" method="get">
<input type="text" name="q" placeholder="Search docs" aria-label="Search docs" />
<input type="hidden" name="check_keywords" value="yes" />
<input type="hidden" name="area" value="default" />
</form>
</div>
</div><div class="wy-menu wy-menu-vertical" data-spy="affix" role="navigation" aria-label="Navigation menu">
<p class="caption" role="heading"><span class="caption-text">Contents:</span></p>
<ul>
<li class="toctree-l1"><a class="reference internal" href="installation.html">Installation</a></li>
<li class="toctree-l1"><a class="reference internal" href="guides.html">Setup Guides</a></li>
<li class="toctree-l1"><a class="reference internal" href="usage.html">Usage</a></li>
<li class="toctree-l1"><a class="reference internal" href="prompts.html">Prompt Routines</a></li>
<li class="toctree-l1"><a class="reference internal" href="interactive.html">Interactive Mode</a></li>
<li class="toctree-l1"><a class="reference internal" href="modules.html">mcp_server_webcrawl</a></li>
</ul>
</div>
</div>
</nav>
<section data-toggle="wy-nav-shift" class="wy-nav-content-wrap"><nav class="wy-nav-top" aria-label="Mobile navigation menu" >
<i data-toggle="wy-nav-top" class="fa fa-bars"></i>
<a href="index.html">mcp-server-webcrawl</a>
</nav>
<div class="wy-nav-content">
<div class="rst-content">
<div role="navigation" aria-label="Page navigation">
<ul class="wy-breadcrumbs">
<li><a href="index.html" class="icon icon-home" aria-label="Home"></a></li>
<li class="breadcrumb-item active">Search</li>
<li class="wy-breadcrumbs-aside">
</li>
</ul>
<hr/>
</div>
<div role="main" class="document" itemscope="itemscope" itemtype="http://schema.org/Article">
<div itemprop="articleBody">
<noscript>
<div id="fallback" class="admonition warning">
<p class="last">
Please activate JavaScript to enable the search functionality.
</p>
</div>
</noscript>
<div id="search-results">
</div>
</div>
</div>
<footer>
<hr/>
<div role="contentinfo">
<p>© Copyright 2025, pragmar.</p>
</div>
Built with <a href="https://www.sphinx-doc.org/">Sphinx</a> using a
<a href="https://github.com/readthedocs/sphinx_rtd_theme">theme</a>
provided by <a href="https://readthedocs.org">Read the Docs</a>.
</footer>
</div>
</div>
</section>
</div>
<script>
jQuery(function () {
SphinxRtdTheme.Navigation.enable(true);
});
</script>
<script>
jQuery(function() { Search.loadIndex("searchindex.js"); });
</script>
<script id="searchindexloader"></script>
</body>
</html>
```
--------------------------------------------------------------------------------
/docs/_static/doctools.js:
--------------------------------------------------------------------------------
```javascript
/*
* doctools.js
* ~~~~~~~~~~~
*
* Base JavaScript utilities for all Sphinx HTML documentation.
*
* :copyright: Copyright 2007-2023 by the Sphinx team, see AUTHORS.
* :license: BSD, see LICENSE for details.
*
*/
"use strict";
const BLACKLISTED_KEY_CONTROL_ELEMENTS = new Set([
"TEXTAREA",
"INPUT",
"SELECT",
"BUTTON",
]);
const _ready = (callback) => {
if (document.readyState !== "loading") {
callback();
} else {
document.addEventListener("DOMContentLoaded", callback);
}
};
/**
* Small JavaScript module for the documentation.
*/
const Documentation = {
init: () => {
Documentation.initDomainIndexTable();
Documentation.initOnKeyListeners();
},
/**
* i18n support
*/
TRANSLATIONS: {},
PLURAL_EXPR: (n) => (n === 1 ? 0 : 1),
LOCALE: "unknown",
// gettext and ngettext don't access this so that the functions
// can safely bound to a different name (_ = Documentation.gettext)
gettext: (string) => {
const translated = Documentation.TRANSLATIONS[string];
switch (typeof translated) {
case "undefined":
return string; // no translation
case "string":
return translated; // translation exists
default:
return translated[0]; // (singular, plural) translation tuple exists
}
},
ngettext: (singular, plural, n) => {
const translated = Documentation.TRANSLATIONS[singular];
if (typeof translated !== "undefined")
return translated[Documentation.PLURAL_EXPR(n)];
return n === 1 ? singular : plural;
},
addTranslations: (catalog) => {
Object.assign(Documentation.TRANSLATIONS, catalog.messages);
Documentation.PLURAL_EXPR = new Function(
"n",
`return (${catalog.plural_expr})`
);
Documentation.LOCALE = catalog.locale;
},
/**
* helper function to focus on search bar
*/
focusSearchBar: () => {
document.querySelectorAll("input[name=q]")[0]?.focus();
},
/**
* Initialise the domain index toggle buttons
*/
initDomainIndexTable: () => {
const toggler = (el) => {
const idNumber = el.id.substr(7);
const toggledRows = document.querySelectorAll(`tr.cg-${idNumber}`);
if (el.src.substr(-9) === "minus.png") {
el.src = `${el.src.substr(0, el.src.length - 9)}plus.png`;
toggledRows.forEach((el) => (el.style.display = "none"));
} else {
el.src = `${el.src.substr(0, el.src.length - 8)}minus.png`;
toggledRows.forEach((el) => (el.style.display = ""));
}
};
const togglerElements = document.querySelectorAll("img.toggler");
togglerElements.forEach((el) =>
el.addEventListener("click", (event) => toggler(event.currentTarget))
);
togglerElements.forEach((el) => (el.style.display = ""));
if (DOCUMENTATION_OPTIONS.COLLAPSE_INDEX) togglerElements.forEach(toggler);
},
initOnKeyListeners: () => {
// only install a listener if it is really needed
if (
!DOCUMENTATION_OPTIONS.NAVIGATION_WITH_KEYS &&
!DOCUMENTATION_OPTIONS.ENABLE_SEARCH_SHORTCUTS
)
return;
document.addEventListener("keydown", (event) => {
// bail for input elements
if (BLACKLISTED_KEY_CONTROL_ELEMENTS.has(document.activeElement.tagName)) return;
// bail with special keys
if (event.altKey || event.ctrlKey || event.metaKey) return;
if (!event.shiftKey) {
switch (event.key) {
case "ArrowLeft":
if (!DOCUMENTATION_OPTIONS.NAVIGATION_WITH_KEYS) break;
const prevLink = document.querySelector('link[rel="prev"]');
if (prevLink && prevLink.href) {
window.location.href = prevLink.href;
event.preventDefault();
}
break;
case "ArrowRight":
if (!DOCUMENTATION_OPTIONS.NAVIGATION_WITH_KEYS) break;
const nextLink = document.querySelector('link[rel="next"]');
if (nextLink && nextLink.href) {
window.location.href = nextLink.href;
event.preventDefault();
}
break;
}
}
// some keyboard layouts may need Shift to get /
switch (event.key) {
case "/":
if (!DOCUMENTATION_OPTIONS.ENABLE_SEARCH_SHORTCUTS) break;
Documentation.focusSearchBar();
event.preventDefault();
}
});
},
};
// quick alias for translations
const _ = Documentation.gettext;
_ready(Documentation.init);
```
--------------------------------------------------------------------------------
/docs/_sources/guides/httrack.rst.txt:
--------------------------------------------------------------------------------
```
HTTrack MCP Setup Guide
========================
Instructions for setting up `mcp-server-webcrawl <https://pragmar.com/mcp-server-webcrawl/>`_ with `HTTrack Website Copier <https://www.httrack.com/>`_.
This allows your LLM (e.g. Claude Desktop) to search content and metadata from websites you've mirrored using HTTrack.
.. raw:: html
<iframe width="560" height="315" src="https://www.youtube.com/embed/HAVfvmrZjRk" frameborder="0" allowfullscreen></iframe>
Follow along with the video, or the step-action guide below.
Requirements
------------
Before you begin, ensure you have:
- `Claude Desktop <https://claude.ai/download>`_ installed
- `Python <https://python.org>`_ 3.10 or later installed
- `HTTrack Website Copier <https://www.httrack.com/>`_ installed
- Basic familiarity with command line interfaces
What is HTTrack?
----------------
HTTrack is a well-established open source website mirror tool that offers:
- Complete website mirroring with organized project directories
- User-friendly wizard-style interface for setup
- Comprehensive content capture including HTML, CSS, images, and other assets
- Ability to manage multiple site mirrors efficiently
- Cross-platform support (Windows, macOS, Linux)
Installation Steps
------------------
1. Install mcp-server-webcrawl
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Open your terminal or command line and install the package::
pip install mcp-server-webcrawl
Verify installation was successful::
mcp-server-webcrawl --help
2. Create Website Mirrors with HTTrack
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
1. Open HTTrack Website Copier application
2. Create a new project (e.g., "example") and specify where to save it
3. Add the URL you want to mirror (e.g., https://example.com)
4. Use the wizard interface to configure your crawling options
5. Start the mirroring process and wait for completion
6. Repeat for additional sites as needed (e.g., create another project for pragmar.com)
HTTrack will create organized project directories under your specified location (typically "My Web Sites" on Windows or "websites" on macOS/Linux). Each project contains the complete website mirror with all HTML files, images, CSS, and other assets properly organized.
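As a rough illustration (actual folder names depend on your HTTrack settings), a datasrc on Windows might contain project directories such as::
My Web Sites/example/
My Web Sites/pragmar/
Point ``--datasrc`` at the parent directory ("My Web Sites" in this sketch), not at an individual project.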
3. Configure Claude Desktop
~~~~~~~~~~~~~~~~~~~~~~~~~~~
1. Open Claude Desktop
2. Go to **File → Settings → Developer → Edit Config**
3. Add the following configuration (modify paths as needed):
.. code-block:: json
{
"mcpServers": {
"webcrawl": {
"command": "/path/to/mcp-server-webcrawl",
"args": ["--crawler", "httrack", "--datasrc",
"/path/to/httrack/projects/"]
}
}
}
.. note::
- On macOS/Linux, use the absolute path (output of ``which mcp-server-webcrawl``), and the default path is typically ``"~/websites"``
- The datasrc path should point to your HTTrack project directory containing all your mirrored sites
4. Save the file and **completely exit** Claude Desktop (not just close the window)
5. Restart Claude Desktop
4. Verify and Use
~~~~~~~~~~~~~~~~~
1. In Claude Desktop, you should now see MCP tools available under Search and Tools
2. Ask Claude to list your crawled sites::
Can you list the crawled sites available?
3. Try searching content from your crawls::
Can you find information about [topic] on [crawled site]?
4. Conduct content audits and SEO analysis::
Can you analyze the content structure and SEO elements for [crawled site]?
Troubleshooting
---------------
- If Claude doesn't show MCP tools after restart, verify your configuration file is correctly formatted
- Ensure Python and mcp-server-webcrawl are properly installed
- Check that your HTTrack project directory path in the configuration is correct
- Make sure HTTrack has successfully completed mirroring the websites and created the project directories
- Remember that the first time you use a function, Claude will ask for permission
- For large websites, initial indexing may take some time during the first search
HTTrack's project structure makes it easy to manage multiple site mirrors, and when combined with mcp-server-webcrawl, it provides a foundation for content analysis, SEO audits, and searchable archives.
For more details, including API documentation and other crawler options, visit the `mcp-server-webcrawl documentation <https://github.com/pragmar/mcp-server-webcrawl>`_.
```
--------------------------------------------------------------------------------
/docs/_sources/guides/katana.rst.txt:
--------------------------------------------------------------------------------
```
Katana MCP Setup Guide
======================
Instructions for setting up `mcp-server-webcrawl <https://pragmar.com/mcp-server-webcrawl/>`_ with `Katana <https://github.com/projectdiscovery/katana>`_ crawler.
This allows your LLM (e.g. Claude Desktop) to search content and metadata from websites you've crawled using Katana.
.. raw:: html
<iframe width="560" height="315" src="https://www.youtube.com/embed/sOMaojm0R0Y" frameborder="0" allowfullscreen></iframe>
Follow along with the video, or the step-action guide below.
Requirements
------------
Before you begin, ensure you have:
- `Claude Desktop <https://claude.ai/download>`_ installed
- `Python <https://python.org>`_ 3.10 or later installed
- `Go programming language <https://go.dev/doc/install>`_ installed
- `Katana crawler <https://github.com/projectdiscovery/katana>`_ installed
- Basic familiarity with command line interfaces
What is Katana?
---------------
Katana is an open-source web crawler from Project Discovery that offers:
- Fast and efficient web crawling capabilities
- Command-line interface for flexibility and automation
- Highly configurable crawling parameters
- Ability to store complete HTTP responses for analysis
Installation Steps
------------------
1. Install mcp-server-webcrawl
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Open your terminal or command line and install the package::
pip install mcp-server-webcrawl
Verify installation was successful::
mcp-server-webcrawl --help
2. Install and Run Katana
~~~~~~~~~~~~~~~~~~~~~~~~~
1. Verify Go is installed and on your PATH::
go version
2. Install Katana using Go::
go install github.com/projectdiscovery/katana/cmd/katana@latest
3. Create a directory for your crawls and run Katana with storage options::
# Create a directory for storing crawls
mkdir archives
# Run Katana with storage options
katana -u https://example.com -store-response -store-response-dir archives/example.com/
4. Repeat for additional websites as needed::
katana -u https://pragmar.com -store-response -store-response-dir archives/pragmar.com/
In this case, the ./archives directory is the datasrc. Katana creates a
separate directory for each unique host inside the directory you pass to
-store-response-dir, so a path like example.com/example.com is expected.
Sites with external dependencies will branch out into additional origin-host
directories under the same -store-response-dir, and the whole crawl remains
searchable as a single site, as sketched below.
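A rough sketch of the resulting layout (actual filenames vary by crawl)::
archives/example.com/example.com/
archives/pragmar.com/pragmar.com/
Here ``archives/`` is the datasrc, and each directory passed to ``-store-response-dir`` corresponds to one searchable site.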
3. Configure Claude Desktop
~~~~~~~~~~~~~~~~~~~~~~~~~~~
1. Open Claude Desktop
2. Go to **File → Settings → Developer → Edit Config**
3. Add the following configuration (modify paths as needed):
.. code-block:: json
{
"mcpServers": {
"webcrawl": {
"command": "/path/to/mcp-server-webcrawl",
"args": ["--crawler", "katana", "--datasrc",
"/path/to/katana/crawls/"]
}
}
}
.. note::
- On Windows, use ``"mcp-server-webcrawl"`` as the command
- On macOS, use the absolute path (output of ``which mcp-server-webcrawl``)
- Change ``/path/to/katana/crawls/`` to the actual path where you stored your Katana crawls
4. Save the file and **completely exit** Claude Desktop (not just close the window)
5. Restart Claude Desktop
4. Verify and Use
~~~~~~~~~~~~~~~~~
1. In Claude Desktop, you should now see MCP tools available under Search and Tools
2. Ask Claude to list your crawled sites::
Can you list the crawled sites available?
3. Try searching content from your crawls::
Can you find information about [topic] on [crawled site]?
4. Try specialized searches that use Katana's comprehensive data collection::
Can you find all the help pages on this site and tell me how they're different?
Troubleshooting
---------------
- If Claude doesn't show MCP tools after restart, verify your configuration file is correctly formatted
- Ensure Python and mcp-server-webcrawl are properly installed
- Check that your Katana crawls directory path in the configuration is correct
- Make sure the ``-store-response`` flag was used during crawling, as this is required to save content
- Verify that each crawl completed successfully and files were saved to the expected location
- Remember that the first time you use a function, Claude will ask for permission
For more details, including API documentation and other crawler options, visit the `mcp-server-webcrawl documentation <https://github.com/pragmar/mcp-server-webcrawl>`_.
```
--------------------------------------------------------------------------------
/docs/_static/language_data.js:
--------------------------------------------------------------------------------
```javascript
/*
* language_data.js
* ~~~~~~~~~~~~~~~~
*
* This script contains the language-specific data used by searchtools.js,
* namely the list of stopwords, stemmer, scorer and splitter.
*
* :copyright: Copyright 2007-2023 by the Sphinx team, see AUTHORS.
* :license: BSD, see LICENSE for details.
*
*/
var stopwords = ["a", "and", "are", "as", "at", "be", "but", "by", "for", "if", "in", "into", "is", "it", "near", "no", "not", "of", "on", "or", "such", "that", "the", "their", "then", "there", "these", "they", "this", "to", "was", "will", "with"];
/* Non-minified version is copied as a separate JS file, is available */
/**
* Porter Stemmer
*/
var Stemmer = function() {
var step2list = {
ational: 'ate',
tional: 'tion',
enci: 'ence',
anci: 'ance',
izer: 'ize',
bli: 'ble',
alli: 'al',
entli: 'ent',
eli: 'e',
ousli: 'ous',
ization: 'ize',
ation: 'ate',
ator: 'ate',
alism: 'al',
iveness: 'ive',
fulness: 'ful',
ousness: 'ous',
aliti: 'al',
iviti: 'ive',
biliti: 'ble',
logi: 'log'
};
var step3list = {
icate: 'ic',
ative: '',
alize: 'al',
iciti: 'ic',
ical: 'ic',
ful: '',
ness: ''
};
var c = "[^aeiou]"; // consonant
var v = "[aeiouy]"; // vowel
var C = c + "[^aeiouy]*"; // consonant sequence
var V = v + "[aeiou]*"; // vowel sequence
var mgr0 = "^(" + C + ")?" + V + C; // [C]VC... is m>0
var meq1 = "^(" + C + ")?" + V + C + "(" + V + ")?$"; // [C]VC[V] is m=1
var mgr1 = "^(" + C + ")?" + V + C + V + C; // [C]VCVC... is m>1
var s_v = "^(" + C + ")?" + v; // vowel in stem
this.stemWord = function (w) {
var stem;
var suffix;
var firstch;
var origword = w;
if (w.length < 3)
return w;
var re;
var re2;
var re3;
var re4;
firstch = w.substr(0,1);
if (firstch == "y")
w = firstch.toUpperCase() + w.substr(1);
// Step 1a
re = /^(.+?)(ss|i)es$/;
re2 = /^(.+?)([^s])s$/;
if (re.test(w))
w = w.replace(re,"$1$2");
else if (re2.test(w))
w = w.replace(re2,"$1$2");
// Step 1b
re = /^(.+?)eed$/;
re2 = /^(.+?)(ed|ing)$/;
if (re.test(w)) {
var fp = re.exec(w);
re = new RegExp(mgr0);
if (re.test(fp[1])) {
re = /.$/;
w = w.replace(re,"");
}
}
else if (re2.test(w)) {
var fp = re2.exec(w);
stem = fp[1];
re2 = new RegExp(s_v);
if (re2.test(stem)) {
w = stem;
re2 = /(at|bl|iz)$/;
re3 = new RegExp("([^aeiouylsz])\\1$");
re4 = new RegExp("^" + C + v + "[^aeiouwxy]$");
if (re2.test(w))
w = w + "e";
else if (re3.test(w)) {
re = /.$/;
w = w.replace(re,"");
}
else if (re4.test(w))
w = w + "e";
}
}
// Step 1c
re = /^(.+?)y$/;
if (re.test(w)) {
var fp = re.exec(w);
stem = fp[1];
re = new RegExp(s_v);
if (re.test(stem))
w = stem + "i";
}
// Step 2
re = /^(.+?)(ational|tional|enci|anci|izer|bli|alli|entli|eli|ousli|ization|ation|ator|alism|iveness|fulness|ousness|aliti|iviti|biliti|logi)$/;
if (re.test(w)) {
var fp = re.exec(w);
stem = fp[1];
suffix = fp[2];
re = new RegExp(mgr0);
if (re.test(stem))
w = stem + step2list[suffix];
}
// Step 3
re = /^(.+?)(icate|ative|alize|iciti|ical|ful|ness)$/;
if (re.test(w)) {
var fp = re.exec(w);
stem = fp[1];
suffix = fp[2];
re = new RegExp(mgr0);
if (re.test(stem))
w = stem + step3list[suffix];
}
// Step 4
re = /^(.+?)(al|ance|ence|er|ic|able|ible|ant|ement|ment|ent|ou|ism|ate|iti|ous|ive|ize)$/;
re2 = /^(.+?)(s|t)(ion)$/;
if (re.test(w)) {
var fp = re.exec(w);
stem = fp[1];
re = new RegExp(mgr1);
if (re.test(stem))
w = stem;
}
else if (re2.test(w)) {
var fp = re2.exec(w);
stem = fp[1] + fp[2];
re2 = new RegExp(mgr1);
if (re2.test(stem))
w = stem;
}
// Step 5
re = /^(.+?)e$/;
if (re.test(w)) {
var fp = re.exec(w);
stem = fp[1];
re = new RegExp(mgr1);
re2 = new RegExp(meq1);
re3 = new RegExp("^" + C + v + "[^aeiouwxy]$");
if (re.test(stem) || (re2.test(stem) && !(re3.test(stem))))
w = stem;
}
re = /ll$/;
re2 = new RegExp(mgr1);
if (re.test(w) && re2.test(w)) {
re = /.$/;
w = w.replace(re,"");
}
// and turn initial Y back to y
if (firstch == "y")
w = firstch.toLowerCase() + w.substr(1);
return w;
}
}
```
--------------------------------------------------------------------------------
/src/mcp_server_webcrawl/crawlers/siteone/tests.py:
--------------------------------------------------------------------------------
```python
from mcp_server_webcrawl.crawlers.siteone.crawler import SiteOneCrawler
from mcp_server_webcrawl.crawlers.base.tests import BaseCrawlerTests
from mcp_server_webcrawl.crawlers import get_fixture_directory
from mcp_server_webcrawl.crawlers.siteone.adapter import SiteOneManager
from mcp_server_webcrawl.utils.logger import get_logger
logger = get_logger()
# calculate using same hash function as adapter
EXAMPLE_SITE_ID = SiteOneManager.string_to_id("example.com")
PRAGMAR_SITE_ID = SiteOneManager.string_to_id("pragmar.com")
class SiteOneTests(BaseCrawlerTests):
"""
Test suite for the SiteOne crawler implementation.
Uses all wrapped test methods from BaseCrawlerTests plus SiteOne-specific features.
"""
def setUp(self):
"""
Set up the test environment with fixture data.
"""
super().setUp()
self._datasrc = get_fixture_directory() / "siteone"
def test_siteone_pulse(self):
"""
Test basic crawler initialization.
"""
crawler = SiteOneCrawler(self._datasrc)
self.assertIsNotNone(crawler)
self.assertTrue(self._datasrc.is_dir())
def test_siteone_sites(self):
"""
Test site retrieval API functionality.
"""
crawler = SiteOneCrawler(self._datasrc)
self.run_pragmar_site_tests(crawler, PRAGMAR_SITE_ID)
def test_siteone_search(self):
"""
Test boolean search functionality
"""
crawler = SiteOneCrawler(self._datasrc)
self.run_pragmar_search_tests(crawler, PRAGMAR_SITE_ID)
def test_siteone_resources(self):
"""
Test resource retrieval API functionality with various parameters.
"""
crawler = SiteOneCrawler(self._datasrc)
self.run_sites_resources_tests(crawler, PRAGMAR_SITE_ID, EXAMPLE_SITE_ID)
def test_siteone_images(self):
"""
Test SiteOne image handling and thumbnails.
"""
crawler = SiteOneCrawler(self._datasrc)
self.run_pragmar_image_tests(crawler, PRAGMAR_SITE_ID)
def test_siteone_sorts(self):
"""
Test random sort functionality using the '?' sort parameter.
"""
crawler = SiteOneCrawler(self._datasrc)
self.run_pragmar_sort_tests(crawler, PRAGMAR_SITE_ID)
def test_siteone_content_parsing(self):
"""
Test content type detection and parsing.
"""
crawler = SiteOneCrawler(self._datasrc)
self.run_pragmar_content_tests(crawler, PRAGMAR_SITE_ID, False)
def test_siteone_advanced_features(self):
"""
Test SiteOne-specific advanced features not covered by base tests.
"""
crawler = SiteOneCrawler(self._datasrc)
# numeric status operators (SiteOne-specific feature)
status_resources_gt = crawler.get_resources_api(
sites=[PRAGMAR_SITE_ID],
query="status: >400",
)
self.assertTrue(status_resources_gt.total > 0, "Numeric status operator should return results")
for resource in status_resources_gt._results:
self.assertGreater(resource.status, 400)
# redirect status codes
status_resources_redirect = crawler.get_resources_api(
sites=[PRAGMAR_SITE_ID],
query="status: 301"
)
self.assertTrue(status_resources_redirect.total > 0, "301 status filtering should return results")
for resource in status_resources_redirect._results:
self.assertEqual(resource.status, 301)
# 404 with size validation
status_resources_not_found = crawler.get_resources_api(
sites=[PRAGMAR_SITE_ID],
query="status: 404",
fields=["size"]
)
self.assertTrue(status_resources_not_found.total > 0, "404 status filtering should return results")
for resource in status_resources_not_found._results:
self.assertEqual(resource.status, 404)
not_found_result = status_resources_not_found._results[0].to_dict()
self.assertIn("size", not_found_result)
self.assertGreater(not_found_result["size"], 0, "404 responses should still have size > 0")
custom_fields = ["content", "headers", "time"]
field_resources = crawler.get_resources_api(
sites=[PRAGMAR_SITE_ID],
fields=custom_fields
)
self.assertTrue(field_resources.total > 0)
# Test the SiteOne-specific forcefield dict method
resource_dict = field_resources._results[0].to_forcefield_dict(custom_fields)
for field in custom_fields:
self.assertIn(field, resource_dict, f"Field '{field}' should be in forcefield response")
def test_report(self):
"""
Run test report, save to data directory.
"""
crawler = SiteOneCrawler(self._datasrc)
logger.info(self.run_pragmar_report(crawler, PRAGMAR_SITE_ID, "SiteOne"))
```
--------------------------------------------------------------------------------
/docs/_static/pygments.css:
--------------------------------------------------------------------------------
```css
pre { line-height: 125%; }
td.linenos .normal { color: inherit; background-color: transparent; padding-left: 5px; padding-right: 5px; }
span.linenos { color: inherit; background-color: transparent; padding-left: 5px; padding-right: 5px; }
td.linenos .special { color: #000000; background-color: #ffffc0; padding-left: 5px; padding-right: 5px; }
span.linenos.special { color: #000000; background-color: #ffffc0; padding-left: 5px; padding-right: 5px; }
.highlight .hll { background-color: #ffffcc }
.highlight { background: #f8f8f8; }
.highlight .c { color: #3D7B7B; font-style: italic } /* Comment */
.highlight .err { border: 1px solid #FF0000 } /* Error */
.highlight .k { color: #008000; font-weight: bold } /* Keyword */
.highlight .o { color: #666666 } /* Operator */
.highlight .ch { color: #3D7B7B; font-style: italic } /* Comment.Hashbang */
.highlight .cm { color: #3D7B7B; font-style: italic } /* Comment.Multiline */
.highlight .cp { color: #9C6500 } /* Comment.Preproc */
.highlight .cpf { color: #3D7B7B; font-style: italic } /* Comment.PreprocFile */
.highlight .c1 { color: #3D7B7B; font-style: italic } /* Comment.Single */
.highlight .cs { color: #3D7B7B; font-style: italic } /* Comment.Special */
.highlight .gd { color: #A00000 } /* Generic.Deleted */
.highlight .ge { font-style: italic } /* Generic.Emph */
.highlight .ges { font-weight: bold; font-style: italic } /* Generic.EmphStrong */
.highlight .gr { color: #E40000 } /* Generic.Error */
.highlight .gh { color: #000080; font-weight: bold } /* Generic.Heading */
.highlight .gi { color: #008400 } /* Generic.Inserted */
.highlight .go { color: #717171 } /* Generic.Output */
.highlight .gp { color: #000080; font-weight: bold } /* Generic.Prompt */
.highlight .gs { font-weight: bold } /* Generic.Strong */
.highlight .gu { color: #800080; font-weight: bold } /* Generic.Subheading */
.highlight .gt { color: #0044DD } /* Generic.Traceback */
.highlight .kc { color: #008000; font-weight: bold } /* Keyword.Constant */
.highlight .kd { color: #008000; font-weight: bold } /* Keyword.Declaration */
.highlight .kn { color: #008000; font-weight: bold } /* Keyword.Namespace */
.highlight .kp { color: #008000 } /* Keyword.Pseudo */
.highlight .kr { color: #008000; font-weight: bold } /* Keyword.Reserved */
.highlight .kt { color: #B00040 } /* Keyword.Type */
.highlight .m { color: #666666 } /* Literal.Number */
.highlight .s { color: #BA2121 } /* Literal.String */
.highlight .na { color: #687822 } /* Name.Attribute */
.highlight .nb { color: #008000 } /* Name.Builtin */
.highlight .nc { color: #0000FF; font-weight: bold } /* Name.Class */
.highlight .no { color: #880000 } /* Name.Constant */
.highlight .nd { color: #AA22FF } /* Name.Decorator */
.highlight .ni { color: #717171; font-weight: bold } /* Name.Entity */
.highlight .ne { color: #CB3F38; font-weight: bold } /* Name.Exception */
.highlight .nf { color: #0000FF } /* Name.Function */
.highlight .nl { color: #767600 } /* Name.Label */
.highlight .nn { color: #0000FF; font-weight: bold } /* Name.Namespace */
.highlight .nt { color: #008000; font-weight: bold } /* Name.Tag */
.highlight .nv { color: #19177C } /* Name.Variable */
.highlight .ow { color: #AA22FF; font-weight: bold } /* Operator.Word */
.highlight .w { color: #bbbbbb } /* Text.Whitespace */
.highlight .mb { color: #666666 } /* Literal.Number.Bin */
.highlight .mf { color: #666666 } /* Literal.Number.Float */
.highlight .mh { color: #666666 } /* Literal.Number.Hex */
.highlight .mi { color: #666666 } /* Literal.Number.Integer */
.highlight .mo { color: #666666 } /* Literal.Number.Oct */
.highlight .sa { color: #BA2121 } /* Literal.String.Affix */
.highlight .sb { color: #BA2121 } /* Literal.String.Backtick */
.highlight .sc { color: #BA2121 } /* Literal.String.Char */
.highlight .dl { color: #BA2121 } /* Literal.String.Delimiter */
.highlight .sd { color: #BA2121; font-style: italic } /* Literal.String.Doc */
.highlight .s2 { color: #BA2121 } /* Literal.String.Double */
.highlight .se { color: #AA5D1F; font-weight: bold } /* Literal.String.Escape */
.highlight .sh { color: #BA2121 } /* Literal.String.Heredoc */
.highlight .si { color: #A45A77; font-weight: bold } /* Literal.String.Interpol */
.highlight .sx { color: #008000 } /* Literal.String.Other */
.highlight .sr { color: #A45A77 } /* Literal.String.Regex */
.highlight .s1 { color: #BA2121 } /* Literal.String.Single */
.highlight .ss { color: #19177C } /* Literal.String.Symbol */
.highlight .bp { color: #008000 } /* Name.Builtin.Pseudo */
.highlight .fm { color: #0000FF } /* Name.Function.Magic */
.highlight .vc { color: #19177C } /* Name.Variable.Class */
.highlight .vg { color: #19177C } /* Name.Variable.Global */
.highlight .vi { color: #19177C } /* Name.Variable.Instance */
.highlight .vm { color: #19177C } /* Name.Variable.Magic */
.highlight .il { color: #666666 } /* Literal.Number.Integer.Long */
```
--------------------------------------------------------------------------------
/docs/_static/js/theme.js:
--------------------------------------------------------------------------------
```javascript
!function(n){var e={};function t(i){if(e[i])return e[i].exports;var o=e[i]={i:i,l:!1,exports:{}};return n[i].call(o.exports,o,o.exports,t),o.l=!0,o.exports}t.m=n,t.c=e,t.d=function(n,e,i){t.o(n,e)||Object.defineProperty(n,e,{enumerable:!0,get:i})},t.r=function(n){"undefined"!=typeof Symbol&&Symbol.toStringTag&&Object.defineProperty(n,Symbol.toStringTag,{value:"Module"}),Object.defineProperty(n,"__esModule",{value:!0})},t.t=function(n,e){if(1&e&&(n=t(n)),8&e)return n;if(4&e&&"object"==typeof n&&n&&n.__esModule)return n;var i=Object.create(null);if(t.r(i),Object.defineProperty(i,"default",{enumerable:!0,value:n}),2&e&&"string"!=typeof n)for(var o in n)t.d(i,o,function(e){return n[e]}.bind(null,o));return i},t.n=function(n){var e=n&&n.__esModule?function(){return n.default}:function(){return n};return t.d(e,"a",e),e},t.o=function(n,e){return Object.prototype.hasOwnProperty.call(n,e)},t.p="",t(t.s=0)}([function(n,e,t){t(1),n.exports=t(3)},function(n,e,t){(function(){var e="undefined"!=typeof window?window.jQuery:t(2);n.exports.ThemeNav={navBar:null,win:null,winScroll:!1,winResize:!1,linkScroll:!1,winPosition:0,winHeight:null,docHeight:null,isRunning:!1,enable:function(n){var t=this;void 0===n&&(n=!0),t.isRunning||(t.isRunning=!0,e((function(e){t.init(e),t.reset(),t.win.on("hashchange",t.reset),n&&t.win.on("scroll",(function(){t.linkScroll||t.winScroll||(t.winScroll=!0,requestAnimationFrame((function(){t.onScroll()})))})),t.win.on("resize",(function(){t.winResize||(t.winResize=!0,requestAnimationFrame((function(){t.onResize()})))})),t.onResize()})))},enableSticky:function(){this.enable(!0)},init:function(n){n(document);var e=this;this.navBar=n("div.wy-side-scroll:first"),this.win=n(window),n(document).on("click","[data-toggle='wy-nav-top']",(function(){n("[data-toggle='wy-nav-shift']").toggleClass("shift"),n("[data-toggle='rst-versions']").toggleClass("shift")})).on("click",".wy-menu-vertical .current ul li a",(function(){var t=n(this);n("[data-toggle='wy-nav-shift']").removeClass("shift"),n("[data-toggle='rst-versions']").toggleClass("shift"),e.toggleCurrent(t),e.hashChange()})).on("click","[data-toggle='rst-current-version']",(function(){n("[data-toggle='rst-versions']").toggleClass("shift-up")})),n("table.docutils:not(.field-list,.footnote,.citation)").wrap("<div class='wy-table-responsive'></div>"),n("table.docutils.footnote").wrap("<div class='wy-table-responsive footnote'></div>"),n("table.docutils.citation").wrap("<div class='wy-table-responsive citation'></div>"),n(".wy-menu-vertical ul").not(".simple").siblings("a").each((function(){var t=n(this);expand=n('<button class="toctree-expand" title="Open/close menu"></button>'),expand.on("click",(function(n){return e.toggleCurrent(t),n.stopPropagation(),!1})),t.prepend(expand)}))},reset:function(){var n=encodeURI(window.location.hash)||"#";try{var e=$(".wy-menu-vertical"),t=e.find('[href="'+n+'"]');if(0===t.length){var i=$('.document [id="'+n.substring(1)+'"]').closest("div.section");0===(t=e.find('[href="#'+i.attr("id")+'"]')).length&&(t=e.find('[href="#"]'))}if(t.length>0){$(".wy-menu-vertical .current").removeClass("current").attr("aria-expanded","false"),t.addClass("current").attr("aria-expanded","true"),t.closest("li.toctree-l1").parent().addClass("current").attr("aria-expanded","true");for(let n=1;n<=10;n++)t.closest("li.toctree-l"+n).addClass("current").attr("aria-expanded","true");t[0].scrollIntoView()}}catch(n){console.log("Error expanding nav for anchor",n)}},onScroll:function(){this.winScroll=!1;var 
n=this.win.scrollTop(),e=n+this.winHeight,t=this.navBar.scrollTop()+(n-this.winPosition);n<0||e>this.docHeight||(this.navBar.scrollTop(t),this.winPosition=n)},onResize:function(){this.winResize=!1,this.winHeight=this.win.height(),this.docHeight=$(document).height()},hashChange:function(){this.linkScroll=!0,this.win.one("hashchange",(function(){this.linkScroll=!1}))},toggleCurrent:function(n){var e=n.closest("li");e.siblings("li.current").removeClass("current").attr("aria-expanded","false"),e.siblings().find("li.current").removeClass("current").attr("aria-expanded","false");var t=e.find("> ul li");t.length&&(t.removeClass("current").attr("aria-expanded","false"),e.toggleClass("current").attr("aria-expanded",(function(n,e){return"true"==e?"false":"true"})))}},"undefined"!=typeof window&&(window.SphinxRtdTheme={Navigation:n.exports.ThemeNav,StickyNav:n.exports.ThemeNav}),function(){for(var n=0,e=["ms","moz","webkit","o"],t=0;t<e.length&&!window.requestAnimationFrame;++t)window.requestAnimationFrame=window[e[t]+"RequestAnimationFrame"],window.cancelAnimationFrame=window[e[t]+"CancelAnimationFrame"]||window[e[t]+"CancelRequestAnimationFrame"];window.requestAnimationFrame||(window.requestAnimationFrame=function(e,t){var i=(new Date).getTime(),o=Math.max(0,16-(i-n)),r=window.setTimeout((function(){e(i+o)}),o);return n=i+o,r}),window.cancelAnimationFrame||(window.cancelAnimationFrame=function(n){clearTimeout(n)})}()}).call(window)},function(n,e){n.exports=jQuery},function(n,e,t){}]);
```
--------------------------------------------------------------------------------
/docs/_static/sphinx_highlight.js:
--------------------------------------------------------------------------------
```javascript
/* Highlighting utilities for Sphinx HTML documentation. */
"use strict";
const SPHINX_HIGHLIGHT_ENABLED = true
/**
* highlight a given string on a node by wrapping it in
* span elements with the given class name.
*/
const _highlight = (node, addItems, text, className) => {
if (node.nodeType === Node.TEXT_NODE) {
const val = node.nodeValue;
const parent = node.parentNode;
const pos = val.toLowerCase().indexOf(text);
if (
pos >= 0 &&
!parent.classList.contains(className) &&
!parent.classList.contains("nohighlight")
) {
let span;
const closestNode = parent.closest("body, svg, foreignObject");
const isInSVG = closestNode && closestNode.matches("svg");
if (isInSVG) {
span = document.createElementNS("http://www.w3.org/2000/svg", "tspan");
} else {
span = document.createElement("span");
span.classList.add(className);
}
span.appendChild(document.createTextNode(val.substr(pos, text.length)));
const rest = document.createTextNode(val.substr(pos + text.length));
parent.insertBefore(
span,
parent.insertBefore(
rest,
node.nextSibling
)
);
node.nodeValue = val.substr(0, pos);
/* There may be more occurrences of search term in this node. So call this
* function recursively on the remaining fragment.
*/
_highlight(rest, addItems, text, className);
if (isInSVG) {
const rect = document.createElementNS(
"http://www.w3.org/2000/svg",
"rect"
);
const bbox = parent.getBBox();
rect.x.baseVal.value = bbox.x;
rect.y.baseVal.value = bbox.y;
rect.width.baseVal.value = bbox.width;
rect.height.baseVal.value = bbox.height;
rect.setAttribute("class", className);
addItems.push({ parent: parent, target: rect });
}
}
} else if (node.matches && !node.matches("button, select, textarea")) {
node.childNodes.forEach((el) => _highlight(el, addItems, text, className));
}
};
const _highlightText = (thisNode, text, className) => {
let addItems = [];
_highlight(thisNode, addItems, text, className);
addItems.forEach((obj) =>
obj.parent.insertAdjacentElement("beforebegin", obj.target)
);
};
/**
* Small JavaScript module for the documentation.
*/
const SphinxHighlight = {
/**
* highlight the search words provided in localstorage in the text
*/
highlightSearchWords: () => {
if (!SPHINX_HIGHLIGHT_ENABLED) return; // bail if no highlight
// get and clear terms from localstorage
const url = new URL(window.location);
const highlight =
localStorage.getItem("sphinx_highlight_terms")
|| url.searchParams.get("highlight")
|| "";
localStorage.removeItem("sphinx_highlight_terms")
url.searchParams.delete("highlight");
window.history.replaceState({}, "", url);
// get individual terms from highlight string
const terms = highlight.toLowerCase().split(/\s+/).filter(x => x);
if (terms.length === 0) return; // nothing to do
// There should never be more than one element matching "div.body"
const divBody = document.querySelectorAll("div.body");
const body = divBody.length ? divBody[0] : document.querySelector("body");
window.setTimeout(() => {
terms.forEach((term) => _highlightText(body, term, "highlighted"));
}, 10);
const searchBox = document.getElementById("searchbox");
if (searchBox === null) return;
searchBox.appendChild(
document
.createRange()
.createContextualFragment(
'<p class="highlight-link">' +
'<a href="javascript:SphinxHighlight.hideSearchWords()">' +
_("Hide Search Matches") +
"</a></p>"
)
);
},
/**
* helper function to hide the search marks again
*/
hideSearchWords: () => {
document
.querySelectorAll("#searchbox .highlight-link")
.forEach((el) => el.remove());
document
.querySelectorAll("span.highlighted")
.forEach((el) => el.classList.remove("highlighted"));
localStorage.removeItem("sphinx_highlight_terms")
},
initEscapeListener: () => {
// only install a listener if it is really needed
if (!DOCUMENTATION_OPTIONS.ENABLE_SEARCH_SHORTCUTS) return;
document.addEventListener("keydown", (event) => {
// bail for input elements
if (BLACKLISTED_KEY_CONTROL_ELEMENTS.has(document.activeElement.tagName)) return;
// bail with special keys
if (event.shiftKey || event.altKey || event.ctrlKey || event.metaKey) return;
if (DOCUMENTATION_OPTIONS.ENABLE_SEARCH_SHORTCUTS && (event.key === "Escape")) {
SphinxHighlight.hideSearchWords();
event.preventDefault();
}
});
},
};
_ready(() => {
/* Do not call highlightSearchWords() when we are on the search page.
* It will highlight words from the *previous* search query.
*/
if (typeof Search === "undefined") SphinxHighlight.highlightSearchWords();
SphinxHighlight.initEscapeListener();
});
```
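The recursive splitting in `_highlight` above is easier to see outside the DOM. Below is a minimal Python analogue (illustrative only, not part of the repository): wrap the first case-insensitive match of the term, then recurse on the remaining tail, exactly as the TEXT_NODE branch does per text node.

```python
def highlight(text: str, term: str, class_name: str = "highlighted") -> str:
    """Wrap each case-insensitive occurrence of term in a span, mirroring _highlight's per-node split."""
    pos = text.lower().find(term.lower())
    if not term or pos < 0:
        return text
    end = pos + len(term)
    # the head stays as-is, the match is wrapped, and the tail is processed recursively
    return (
        text[:pos]
        + f'<span class="{class_name}">{text[pos:end]}</span>'
        + highlight(text[end:], term, class_name)
    )

print(highlight("Crawl the crawler archive", "crawl"))
# <span class="highlighted">Crawl</span> the <span class="highlighted">crawl</span>er archive
```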
--------------------------------------------------------------------------------
/src/mcp_server_webcrawl/crawlers/base/api.py:
--------------------------------------------------------------------------------
```python
import json
from datetime import datetime, timezone
from time import time
from typing import Any, Final
from mcp_server_webcrawl.crawlers.base.adapter import IndexState
from mcp_server_webcrawl.models.base import METADATA_VALUE_TYPE
from mcp_server_webcrawl.models.resources import ResourceResult, ResourceResultType
from mcp_server_webcrawl.models.sites import SiteResult
from mcp_server_webcrawl.utils import to_isoformat_zulu
from mcp_server_webcrawl.utils.logger import get_logger
logger = get_logger()
OVERRIDE_ERROR_MESSAGE: Final[str] = "BaseCrawler subclasses must implement \
the following methods: handle_list_tools, handle_call_tool"
class BaseJsonApiEncoder(json.JSONEncoder):
"""
Custom JSON encoder for BaseJsonApi objects and ResourceResultType enums.
"""
def default(self, obj) -> Any:
"""
Override default encoder to handle custom types.
Args:
obj: Object to encode
Returns:
JSON serializable representation of the object
"""
if isinstance(obj, BaseJsonApi):
return obj.__dict__
elif isinstance(obj, ResourceResultType):
return obj.value
elif isinstance(obj, datetime):
return to_isoformat_zulu(obj)
return super().default(obj)
class BaseJsonApi:
"""
Base class for JSON API responses.
Provides a standardized structure for API responses including metadata,
results, and error handling.
"""
def __init__(self, method: str, args: dict[str, Any], index_state: IndexState | None = None):
"""
Construct with the arguments of creation; these are echoed back in the
JSON response. The object collapses into JSON on json.dumps, with everything
it contains implementing to_dict.
Args:
method: API method name
args: Dictionary of API arguments
index_state: indexing, complete, remote, etc.
"""
from mcp_server_webcrawl import __version__, __name__
self._start_time = time()
self.method = method
self.args = args
self.meta_generator = f"{__name__} ({__version__})"
self.meta_generated = to_isoformat_zulu(datetime.now(timezone.utc))
self.meta_index = index_state.to_dict() if index_state is not None else None
self._results: list[SiteResult | ResourceResult] = []
self._results_total: int = 0
self._results_offset: int = 0
self._results_limit: int = 0
self._errors: list[str] = []
@property
def total(self) -> int:
"""
Returns the total number of results.
Returns:
Integer count of total results
"""
return self._results_total
def get_results(self) -> list[SiteResult | ResourceResult]:
"""
Returns list of results.
Returns:
Results of type SiteResult or ResourceResult
"""
return self._results.copy()
def set_results(self, results: list[SiteResult | ResourceResult], total: int, offset: int, limit: int) -> None:
"""
Set the results of the API response.
Args:
results: List of result objects
total: Total number of results (including those beyond limit)
offset: Starting position in the full result set
limit: Maximum number of results to include
"""
self._results = results
self._results_total = total
self._results_offset = offset
self._results_limit = limit
def append_error(self, message: str) -> None:
"""
Add an error to the JSON response, visible to the endpoint LLM.
Args:
message: Error message to add
"""
self._errors.append(message)
def to_dict(self) -> dict[str, METADATA_VALUE_TYPE]:
"""
Convert the object to a JSON-serializable dictionary.
Returns:
Dictionary representation of the API response
"""
response: dict[str, Any] = {
"__meta__": {
"generator": f"{self.meta_generator}",
"generated": f"{self.meta_generated}",
"request": {
"method": f"{self.method}",
"arguments": self.args,
"time": time() - self._start_time,
},
"results": {
"total": self._results_total,
"offset": self._results_offset,
"limit": self._results_limit,
},
},
"results": [r.to_forcefield_dict(self.args["fields"]) if hasattr(r, "to_forcefield_dict") else r for r in self._results]
}
if self.meta_index is not None:
response["__meta__"]["index"] = self.meta_index
if self._errors:
response["__meta__"]["errors"] = self._errors
return response
def to_json(self) -> str:
"""
Return a JSON serializable representation of this object.
Returns:
JSON string representation of the API response
"""
return json.dumps(self.to_dict(), indent=1, cls=BaseJsonApiEncoder)
```
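A minimal usage sketch of the class above, assuming the package is installed. The method name and argument values here are placeholders, not taken from the repository; a crawler subclass would normally supply real tool arguments and result objects.

```python
from mcp_server_webcrawl.crawlers.base.api import BaseJsonApi

# hypothetical method/args for illustration only
api = BaseJsonApi(method="webcrawl_sites", args={"ids": [2], "fields": []})
api.set_results(results=[], total=0, offset=0, limit=20)  # empty page for brevity
api.append_error("example error, surfaced under __meta__.errors")
print(api.to_json())  # __meta__ (generator, generated, request, result counts) plus results[]
```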
--------------------------------------------------------------------------------
/src/mcp_server_webcrawl/crawlers/interrobot/tests.py:
--------------------------------------------------------------------------------
```python
import asyncio
from logging import Logger
from mcp.types import EmbeddedResource, ImageContent, TextContent
from mcp_server_webcrawl.crawlers.base.tests import BaseCrawlerTests
from mcp_server_webcrawl.crawlers.interrobot.crawler import InterroBotCrawler
from mcp_server_webcrawl.models.resources import RESOURCES_TOOL_NAME
from mcp_server_webcrawl.crawlers import get_fixture_directory
from mcp_server_webcrawl.utils.logger import get_logger
# these IDs belong to the db test fixture (interrobot.v2.db)
EXAMPLE_SITE_ID = 1
PRAGMAR_SITE_ID = 2
logger: Logger = get_logger()
class InterroBotTests(BaseCrawlerTests):
"""
Test suite for the InterroBot crawler implementation.
Uses all wrapped test methods from BaseCrawlerTests plus InterroBot-specific features.
"""
def setUp(self):
"""
Set up the test environment with fixture data.
"""
super().setUp()
self.fixture_path = get_fixture_directory() / "interrobot" / "interrobot.v2.db"
def test_interrobot_pulse(self):
"""
Test basic crawler initialization.
"""
crawler = InterroBotCrawler(self.fixture_path)
self.assertIsNotNone(crawler)
def test_interrobot_sites(self):
"""
Test site retrieval API functionality.
"""
crawler = InterroBotCrawler(self.fixture_path)
# Note: InterroBot uses site ID 2 for pragmar instead of calculating from string
self.run_pragmar_site_tests(crawler, PRAGMAR_SITE_ID)
def test_interrobot_search(self):
"""
Test boolean search functionality
"""
crawler = InterroBotCrawler(self.fixture_path)
self.run_pragmar_search_tests(crawler, PRAGMAR_SITE_ID)
def test_interrobot_resources(self):
"""
Test resource retrieval API functionality with various parameters.
"""
crawler = InterroBotCrawler(self.fixture_path)
self.run_sites_resources_tests(crawler, PRAGMAR_SITE_ID, EXAMPLE_SITE_ID)
def test_interrobot_images(self):
"""
Test InterroBot-specific image handling and thumbnails.
"""
crawler = InterroBotCrawler(self.fixture_path)
self.run_pragmar_image_tests(crawler, PRAGMAR_SITE_ID)
def test_interrobot_sorts(self):
"""
Test random sort functionality using the '?' sort parameter.
"""
crawler = InterroBotCrawler(self.fixture_path)
self.run_pragmar_sort_tests(crawler, PRAGMAR_SITE_ID)
def test_interrobot_content_parsing(self):
"""
Test content type detection and parsing.
"""
crawler = InterroBotCrawler(self.fixture_path)
self.run_pragmar_content_tests(crawler, PRAGMAR_SITE_ID, False)
def test_interrobot_mcp_features(self):
"""
Test InterroBot-specific MCP tool functionality.
"""
crawler = InterroBotCrawler(self.fixture_path)
list_tools_result = asyncio.run(crawler.mcp_list_tools())
self.assertIsNotNone(list_tools_result)
def test_thumbnails_sync(self):
"""
Test thumbnail generation functionality.
"""
asyncio.run(self.__test_thumbnails())
async def __test_thumbnails(self):
"""
Thumbnails are a special case for InterroBot. Other fixtures are not
dependable: images are either removed to slim the archive or not captured
with default settings. Testing thumbnails here is sufficient.
"""
crawler = InterroBotCrawler(self.fixture_path)
thumbnail_args = {
"datasrc": crawler.datasrc,
"sites": [PRAGMAR_SITE_ID],
"extras": ["thumbnails"],
"query": "type: img AND url: *.png",
"limit": 4,
}
thumbnail_result: list[TextContent | ImageContent | EmbeddedResource] = await crawler.mcp_call_tool(
RESOURCES_TOOL_NAME, thumbnail_args
)
if len(thumbnail_result) > 1:
self.assertTrue(
thumbnail_result[1].type == "image",
"ImageContent should be included in thumbnails response"
)
def test_interrobot_advanced_site_features(self):
"""
Test InterroBot-specific site features such as the urls and created fields.
"""
crawler = InterroBotCrawler(self.fixture_path)
# urls field retrieval
site_one_field_json = crawler.get_sites_api(ids=[1], fields=["urls"])
if site_one_field_json.total > 0:
result_dict = site_one_field_json._results[0].to_dict()
self.assertIn("urls", result_dict, "robots field should be present in response")
# multiple custom fields
site_multiple_fields_json = crawler.get_sites_api(ids=[1], fields=["urls", "created"])
if site_multiple_fields_json.total > 0:
result = site_multiple_fields_json._results[0].to_dict()
self.assertIn("urls", result, "robots field should be present in response")
self.assertIn("created", result, "created field should be present in response")
def test_report(self):
"""
Run test report, save to data directory.
"""
crawler = InterroBotCrawler(self.fixture_path)
logger.info(self.run_pragmar_report(crawler, PRAGMAR_SITE_ID, "InterroBot"))
```
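To exercise this suite outside a full test discovery run, a standalone loader along these lines should work. This is a sketch: it assumes the package and its bundled fixtures are importable and that BaseCrawlerTests builds on unittest.TestCase, as the setUp/assert usage above suggests.

```python
import unittest

from mcp_server_webcrawl.crawlers.interrobot.tests import InterroBotTests

# load only the InterroBot cases and run them with verbose output
suite = unittest.defaultTestLoader.loadTestsFromTestCase(InterroBotTests)
unittest.TextTestRunner(verbosity=2).run(suite)
```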
--------------------------------------------------------------------------------
/docs/_sources/guides/archivebox.rst.txt:
--------------------------------------------------------------------------------
```
ArchiveBox MCP Setup Guide
==========================
Instructions for setting up `mcp-server-webcrawl <https://pragmar.com/mcp-server-webcrawl/>`_ with `ArchiveBox <https://archivebox.io/>`_.
This allows your LLM (e.g. Claude Desktop) to search content and metadata from websites you've archived using ArchiveBox.
.. raw:: html
<iframe width="560" height="315" src="https://www.youtube.com/embed/0KFqhSYf3f4" frameborder="0" allowfullscreen></iframe>
Follow along with the video, or the step-action guide below.
Requirements
------------
Before you begin, ensure you have:
- `Claude Desktop <https://claude.ai/download>`_ installed
- `Python <https://python.org>`_ 3.10 or later installed
- `ArchiveBox <https://archivebox.io/>`_ installed
- Basic familiarity with command line interfaces
What is ArchiveBox?
-------------------
ArchiveBox is a powerful open-source web archiving solution that offers:
- Multiple output formats (HTML, PDF, screenshots, WARC, etc.)
- Comprehensive metadata
- CLI + webadmin for browsing and managing archives
- Support for various input sources (URLs, browser bookmarks, RSS feeds)
- Self-hosted solution for long-term web content preservation
Installation Steps
------------------
1. Install mcp-server-webcrawl
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Open your terminal or command line and install the package::
pip install mcp-server-webcrawl
Verify installation was successful::
mcp-server-webcrawl --help
2. Install and Set Up ArchiveBox
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
ArchiveBox is supported on macOS/Linux only; Windows may work under Docker, but this is untested.
1. Install ArchiveBox (macOS/Linux)::
pip install archivebox
2. macOS only, install Homebrew if you don't already have it, then use it to install wget::
brew install wget
3. Create ArchiveBox collections. Unlike other crawlers that focus on single websites, ArchiveBox uses a collection-based approach where each collection can contain multiple URLs. You can create separate collections for different projects or group related URLs together::
# Create a directory structure for your collections
mkdir ~/archivebox-data
# Create an "example" collection
mkdir ~/archivebox-data/example
cd ~/archivebox-data/example
archivebox init
archivebox add https://example.com
# Create a "pragmar" collection
mkdir ~/archivebox-data/pragmar
cd ~/archivebox-data/pragmar
archivebox init
archivebox add https://pragmar.com
4. Each ``archivebox init`` creates a complete ArchiveBox instance with its own database and archive directory structure. The typical structure includes::
collection-name/
├── archive/ # Archived content organized by timestamp
├── logs/ # ArchiveBox operation logs
├── sources/ # Source URL lists and metadata
└── index.sqlite3 # Database containing all metadata
3. Configure Claude Desktop
~~~~~~~~~~~~~~~~~~~~~~~~~~~
1. Open Claude Desktop
2. Go to **File → Settings → Developer → Edit Config**
3. Add the following configuration (modify paths as needed):
.. code-block:: json
{
"mcpServers": {
"webcrawl": {
"command": "/path/to/mcp-server-webcrawl",
"args": ["--crawler", "archivebox", "--datasrc",
"/path/to/archivebox-data/"]
}
}
}
.. note::
- On Windows, use ``"mcp-server-webcrawl"`` as the command
- On macOS/Linux, use the absolute path (output of ``which mcp-server-webcrawl``)
- The datasrc path should point to the parent directory containing your ArchiveBox collections (e.g., ``~/archivebox-data/``), not to individual collection directories
- Each collection directory (example, pragmar, etc.) will appear as a separate "site" in MCP
4. Save the file and **completely exit** Claude Desktop (not just close the window)
5. Restart Claude Desktop
4. Verify and Use
~~~~~~~~~~~~~~~~~
1. In Claude Desktop, you should now see MCP tools available under Search and Tools
2. Ask Claude to list your archived sites::
Can you list the crawled sites available?
3. Try searching content from your archives::
Can you find information about [topic] on [archived site]?
4. Use the rich metadata for content discovery::
Can you find all the archived pages related to [keyword] from [archive]?
Troubleshooting
---------------
- If Claude doesn't show MCP tools after restart, verify your configuration file is correctly formatted
- Ensure Python and mcp-server-webcrawl are properly installed
- Check that your ArchiveBox archive directory path in the configuration is correct
- Make sure ArchiveBox has successfully archived the websites and created the database
- Verify that files exist in your archive/[timestamp] directories
- Remember that the first time you use a function, Claude will ask for permission
- For large archives, initial indexing may take some time during the first search
ArchiveBox's comprehensive archiving capabilities combined with mcp-server-webcrawl provide powerful tools for content preservation, research, and analysis across your archived web content.
For more details, including API documentation and other crawler options, visit the `mcp-server-webcrawl documentation <https://github.com/pragmar/mcp-server-webcrawl>`_.
```
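Beyond Claude Desktop, the same datasrc can be sanity-checked directly from Python. This is a sketch only: the `ArchiveBoxCrawler` class name and the `get_sites_api()`/`to_json()` calls are assumptions inferred from the other crawlers in this repository (e.g. the InterroBot test suite), not from this guide.

```python
from pathlib import Path

# assumed class name, mirroring the crawler classes used elsewhere in this repository
from mcp_server_webcrawl.crawlers.archivebox.crawler import ArchiveBoxCrawler

# point at the parent directory that holds the collections (example, pragmar, ...)
crawler = ArchiveBoxCrawler(Path.home() / "archivebox-data")
print(crawler.get_sites_api().to_json())  # each collection should appear as a separate site
```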
--------------------------------------------------------------------------------
/docs/_modules/mcp_server_webcrawl/utils/server.html:
--------------------------------------------------------------------------------
```html
<!DOCTYPE html>
<html class="writer-html5" lang="en" data-content_root="../../../">
<head>
<meta charset="utf-8" />
<meta name="viewport" content="width=device-width, initial-scale=1.0" />
<title>mcp_server_webcrawl.utils.server — mcp-server-webcrawl documentation</title>
<link rel="stylesheet" type="text/css" href="../../../_static/pygments.css?v=80d5e7a1" />
<link rel="stylesheet" type="text/css" href="../../../_static/css/theme.css?v=e59714d7" />
<script src="../../../_static/jquery.js?v=5d32c60e"></script>
<script src="../../../_static/_sphinx_javascript_frameworks_compat.js?v=2cd50e6c"></script>
<script src="../../../_static/documentation_options.js?v=5929fcd5"></script>
<script src="../../../_static/doctools.js?v=888ff710"></script>
<script src="../../../_static/sphinx_highlight.js?v=dc90522c"></script>
<script src="../../../_static/js/theme.js"></script>
<link rel="index" title="Index" href="../../../genindex.html" />
<link rel="search" title="Search" href="../../../search.html" />
</head>
<body class="wy-body-for-nav">
<div class="wy-grid-for-nav">
<nav data-toggle="wy-nav-shift" class="wy-nav-side">
<div class="wy-side-scroll">
<div class="wy-side-nav-search" >
<a href="../../../index.html" class="icon icon-home">
mcp-server-webcrawl
</a>
<div role="search">
<form id="rtd-search-form" class="wy-form" action="../../../search.html" method="get">
<input type="text" name="q" placeholder="Search docs" aria-label="Search docs" />
<input type="hidden" name="check_keywords" value="yes" />
<input type="hidden" name="area" value="default" />
</form>
</div>
</div><div class="wy-menu wy-menu-vertical" data-spy="affix" role="navigation" aria-label="Navigation menu">
<p class="caption" role="heading"><span class="caption-text">Contents:</span></p>
<ul>
<li class="toctree-l1"><a class="reference internal" href="../../../installation.html">Installation</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../guides.html">Setup Guides</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../usage.html">Usage</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../modules.html">mcp_server_webcrawl</a></li>
</ul>
</div>
</div>
</nav>
<section data-toggle="wy-nav-shift" class="wy-nav-content-wrap"><nav class="wy-nav-top" aria-label="Mobile navigation menu" >
<i data-toggle="wy-nav-top" class="fa fa-bars"></i>
<a href="../../../index.html">mcp-server-webcrawl</a>
</nav>
<div class="wy-nav-content">
<div class="rst-content">
<div role="navigation" aria-label="Page navigation">
<ul class="wy-breadcrumbs">
<li><a href="../../../index.html" class="icon icon-home" aria-label="Home"></a></li>
<li class="breadcrumb-item"><a href="../../index.html">Module code</a></li>
<li class="breadcrumb-item"><a href="../utils.html">mcp_server_webcrawl.utils</a></li>
<li class="breadcrumb-item active">mcp_server_webcrawl.utils.server</li>
<li class="wy-breadcrumbs-aside">
</li>
</ul>
<hr/>
</div>
<div role="main" class="document" itemscope="itemscope" itemtype="http://schema.org/Article">
<div itemprop="articleBody">
<h1>Source code for mcp_server_webcrawl.utils.server</h1><div class="highlight"><pre>
<span></span><span class="kn">import</span> <span class="nn">os</span>
<span class="kn">import</span> <span class="nn">sys</span>
<div class="viewcode-block" id="initialize_mcp_server">
<a class="viewcode-back" href="../../../mcp_server_webcrawl.utils.html#mcp_server_webcrawl.utils.server.initialize_mcp_server">[docs]</a>
<span class="k">def</span> <span class="nf">initialize_mcp_server</span><span class="p">()</span> <span class="o">-></span> <span class="kc">None</span><span class="p">:</span>
<span class="w"> </span><span class="sd">"""</span>
<span class="sd"> MCP stdio streams require utf-8 explicitly set for Windows (default cp1252)</span>
<span class="sd"> or internationalized content will fail.</span>
<span class="sd"> """</span>
<span class="k">if</span> <span class="n">sys</span><span class="o">.</span><span class="n">platform</span> <span class="o">==</span> <span class="s2">"win32"</span> <span class="ow">and</span> <span class="n">os</span><span class="o">.</span><span class="n">environ</span><span class="o">.</span><span class="n">get</span><span class="p">(</span><span class="s2">"PYTHONIOENCODING"</span><span class="p">)</span> <span class="ow">is</span> <span class="kc">None</span><span class="p">:</span>
<span class="n">sys</span><span class="o">.</span><span class="n">stdin</span><span class="o">.</span><span class="n">reconfigure</span><span class="p">(</span><span class="n">encoding</span><span class="o">=</span><span class="s2">"utf-8"</span><span class="p">)</span>
<span class="n">sys</span><span class="o">.</span><span class="n">stdout</span><span class="o">.</span><span class="n">reconfigure</span><span class="p">(</span><span class="n">encoding</span><span class="o">=</span><span class="s2">"utf-8"</span><span class="p">)</span>
<span class="n">sys</span><span class="o">.</span><span class="n">stderr</span><span class="o">.</span><span class="n">reconfigure</span><span class="p">(</span><span class="n">encoding</span><span class="o">=</span><span class="s2">"utf-8"</span><span class="p">)</span></div>
</pre></div>
</div>
</div>
<footer>
<hr/>
<div role="contentinfo">
<p>© Copyright 2025, pragmar.</p>
</div>
Built with <a href="https://www.sphinx-doc.org/">Sphinx</a> using a
<a href="https://github.com/readthedocs/sphinx_rtd_theme">theme</a>
provided by <a href="https://readthedocs.org">Read the Docs</a>.
</footer>
</div>
</div>
</section>
</div>
<script>
jQuery(function () {
SphinxRtdTheme.Navigation.enable(true);
});
</script>
</body>
</html>
```
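For readability, the function embedded in the highlighted HTML above decodes to the following plain Python (reproduced from the markup, no behavior added):

```python
import os
import sys

def initialize_mcp_server() -> None:
    """
    MCP stdio streams require utf-8 explicitly set for Windows (default cp1252)
    or internationalized content will fail.
    """
    if sys.platform == "win32" and os.environ.get("PYTHONIOENCODING") is None:
        sys.stdin.reconfigure(encoding="utf-8")
        sys.stdout.reconfigure(encoding="utf-8")
        sys.stderr.reconfigure(encoding="utf-8")
```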