This is page 1 of 35. Use http://codebase.md/pragmar/mcp_server_webcrawl?lines=true&page={x} to view the full context.

# Directory Structure

```
├── .gitignore
├── CONTRIBUTING.md
├── docs
│   ├── _images
│   │   ├── interactive.document.webp
│   │   ├── interactive.search.webp
│   │   └── mcpswc.svg
│   ├── _modules
│   │   ├── index.html
│   │   ├── mcp_server_webcrawl
│   │   │   ├── crawlers
│   │   │   │   ├── archivebox
│   │   │   │   │   ├── adapter.html
│   │   │   │   │   ├── crawler.html
│   │   │   │   │   └── tests.html
│   │   │   │   ├── base
│   │   │   │   │   ├── adapter.html
│   │   │   │   │   ├── api.html
│   │   │   │   │   ├── crawler.html
│   │   │   │   │   ├── indexed.html
│   │   │   │   │   └── tests.html
│   │   │   │   ├── httrack
│   │   │   │   │   ├── adapter.html
│   │   │   │   │   ├── crawler.html
│   │   │   │   │   └── tests.html
│   │   │   │   ├── interrobot
│   │   │   │   │   ├── adapter.html
│   │   │   │   │   ├── crawler.html
│   │   │   │   │   └── tests.html
│   │   │   │   ├── katana
│   │   │   │   │   ├── adapter.html
│   │   │   │   │   ├── crawler.html
│   │   │   │   │   └── tests.html
│   │   │   │   ├── siteone
│   │   │   │   │   ├── adapter.html
│   │   │   │   │   ├── crawler.html
│   │   │   │   │   └── tests.html
│   │   │   │   ├── warc
│   │   │   │   │   ├── adapter.html
│   │   │   │   │   ├── crawler.html
│   │   │   │   │   └── tests.html
│   │   │   │   └── wget
│   │   │   │       ├── adapter.html
│   │   │   │       ├── crawler.html
│   │   │   │       └── tests.html
│   │   │   ├── crawlers.html
│   │   │   ├── extras
│   │   │   │   ├── markdown.html
│   │   │   │   ├── regex.html
│   │   │   │   ├── snippets.html
│   │   │   │   ├── thumbnails.html
│   │   │   │   └── xpath.html
│   │   │   ├── interactive
│   │   │   │   ├── highlights.html
│   │   │   │   ├── search.html
│   │   │   │   ├── session.html
│   │   │   │   └── ui.html
│   │   │   ├── main.html
│   │   │   ├── models
│   │   │   │   ├── resources.html
│   │   │   │   └── sites.html
│   │   │   ├── templates
│   │   │   │   └── tests.html
│   │   │   ├── utils
│   │   │   │   ├── blobs.html
│   │   │   │   ├── cli.html
│   │   │   │   ├── logger.html
│   │   │   │   ├── querycache.html
│   │   │   │   ├── server.html
│   │   │   │   └── tools.html
│   │   │   └── utils.html
│   │   └── re.html
│   ├── _sources
│   │   ├── guides
│   │   │   ├── archivebox.rst.txt
│   │   │   ├── httrack.rst.txt
│   │   │   ├── interrobot.rst.txt
│   │   │   ├── katana.rst.txt
│   │   │   ├── siteone.rst.txt
│   │   │   ├── warc.rst.txt
│   │   │   └── wget.rst.txt
│   │   ├── guides.rst.txt
│   │   ├── index.rst.txt
│   │   ├── installation.rst.txt
│   │   ├── interactive.rst.txt
│   │   ├── mcp_server_webcrawl.crawlers.archivebox.rst.txt
│   │   ├── mcp_server_webcrawl.crawlers.base.rst.txt
│   │   ├── mcp_server_webcrawl.crawlers.httrack.rst.txt
│   │   ├── mcp_server_webcrawl.crawlers.interrobot.rst.txt
│   │   ├── mcp_server_webcrawl.crawlers.katana.rst.txt
│   │   ├── mcp_server_webcrawl.crawlers.rst.txt
│   │   ├── mcp_server_webcrawl.crawlers.siteone.rst.txt
│   │   ├── mcp_server_webcrawl.crawlers.warc.rst.txt
│   │   ├── mcp_server_webcrawl.crawlers.wget.rst.txt
│   │   ├── mcp_server_webcrawl.extras.rst.txt
│   │   ├── mcp_server_webcrawl.interactive.rst.txt
│   │   ├── mcp_server_webcrawl.models.rst.txt
│   │   ├── mcp_server_webcrawl.rst.txt
│   │   ├── mcp_server_webcrawl.templates.rst.txt
│   │   ├── mcp_server_webcrawl.utils.rst.txt
│   │   ├── modules.rst.txt
│   │   ├── prompts.rst.txt
│   │   └── usage.rst.txt
│   ├── _static
│   │   ├── _sphinx_javascript_frameworks_compat.js
│   │   ├── basic.css
│   │   ├── css
│   │   │   ├── badge_only.css
│   │   │   ├── fonts
│   │   │   │   ├── fontawesome-webfont.eot
│   │   │   │   ├── fontawesome-webfont.svg
│   │   │   │   ├── fontawesome-webfont.ttf
│   │   │   │   ├── fontawesome-webfont.woff
│   │   │   │   ├── fontawesome-webfont.woff2
│   │   │   │   ├── lato-bold-italic.woff
│   │   │   │   ├── lato-bold-italic.woff2
│   │   │   │   ├── lato-bold.woff
│   │   │   │   ├── lato-bold.woff2
│   │   │   │   ├── lato-normal-italic.woff
│   │   │   │   ├── lato-normal-italic.woff2
│   │   │   │   ├── lato-normal.woff
│   │   │   │   ├── lato-normal.woff2
│   │   │   │   ├── Roboto-Slab-Bold.woff
│   │   │   │   ├── Roboto-Slab-Bold.woff2
│   │   │   │   ├── Roboto-Slab-Regular.woff
│   │   │   │   └── Roboto-Slab-Regular.woff2
│   │   │   └── theme.css
│   │   ├── doctools.js
│   │   ├── documentation_options.js
│   │   ├── file.png
│   │   ├── fonts
│   │   │   ├── Lato
│   │   │   │   ├── lato-bold.eot
│   │   │   │   ├── lato-bold.ttf
│   │   │   │   ├── lato-bold.woff
│   │   │   │   ├── lato-bold.woff2
│   │   │   │   ├── lato-bolditalic.eot
│   │   │   │   ├── lato-bolditalic.ttf
│   │   │   │   ├── lato-bolditalic.woff
│   │   │   │   ├── lato-bolditalic.woff2
│   │   │   │   ├── lato-italic.eot
│   │   │   │   ├── lato-italic.ttf
│   │   │   │   ├── lato-italic.woff
│   │   │   │   ├── lato-italic.woff2
│   │   │   │   ├── lato-regular.eot
│   │   │   │   ├── lato-regular.ttf
│   │   │   │   ├── lato-regular.woff
│   │   │   │   └── lato-regular.woff2
│   │   │   └── RobotoSlab
│   │   │       ├── roboto-slab-v7-bold.eot
│   │   │       ├── roboto-slab-v7-bold.ttf
│   │   │       ├── roboto-slab-v7-bold.woff
│   │   │       ├── roboto-slab-v7-bold.woff2
│   │   │       ├── roboto-slab-v7-regular.eot
│   │   │       ├── roboto-slab-v7-regular.ttf
│   │   │       ├── roboto-slab-v7-regular.woff
│   │   │       └── roboto-slab-v7-regular.woff2
│   │   ├── images
│   │   │   ├── interactive.document.png
│   │   │   ├── interactive.document.webp
│   │   │   ├── interactive.search.png
│   │   │   ├── interactive.search.webp
│   │   │   └── mcpswc.svg
│   │   ├── jquery.js
│   │   ├── js
│   │   │   ├── badge_only.js
│   │   │   ├── theme.js
│   │   │   └── versions.js
│   │   ├── language_data.js
│   │   ├── minus.png
│   │   ├── plus.png
│   │   ├── pygments.css
│   │   ├── searchtools.js
│   │   └── sphinx_highlight.js
│   ├── .buildinfo
│   ├── .nojekyll
│   ├── genindex.html
│   ├── guides
│   │   ├── archivebox.html
│   │   ├── httrack.html
│   │   ├── interrobot.html
│   │   ├── katana.html
│   │   ├── siteone.html
│   │   ├── warc.html
│   │   └── wget.html
│   ├── guides.html
│   ├── index.html
│   ├── installation.html
│   ├── interactive.html
│   ├── mcp_server_webcrawl.crawlers.archivebox.html
│   ├── mcp_server_webcrawl.crawlers.base.html
│   ├── mcp_server_webcrawl.crawlers.html
│   ├── mcp_server_webcrawl.crawlers.httrack.html
│   ├── mcp_server_webcrawl.crawlers.interrobot.html
│   ├── mcp_server_webcrawl.crawlers.katana.html
│   ├── mcp_server_webcrawl.crawlers.siteone.html
│   ├── mcp_server_webcrawl.crawlers.warc.html
│   ├── mcp_server_webcrawl.crawlers.wget.html
│   ├── mcp_server_webcrawl.extras.html
│   ├── mcp_server_webcrawl.html
│   ├── mcp_server_webcrawl.interactive.html
│   ├── mcp_server_webcrawl.models.html
│   ├── mcp_server_webcrawl.templates.html
│   ├── mcp_server_webcrawl.utils.html
│   ├── modules.html
│   ├── objects.inv
│   ├── prompts.html
│   ├── py-modindex.html
│   ├── search.html
│   ├── searchindex.js
│   └── usage.html
├── LICENSE
├── MANIFEST.in
├── prompts
│   ├── audit404.md
│   ├── auditfiles.md
│   ├── auditperf.md
│   ├── auditseo.md
│   ├── gopher.md
│   ├── README.md
│   └── testsearch.md
├── pyproject.toml
├── README.md
├── setup.py
├── sphinx
│   ├── _static
│   │   └── images
│   │       ├── interactive.document.png
│   │       ├── interactive.document.webp
│   │       ├── interactive.search.png
│   │       ├── interactive.search.webp
│   │       └── mcpswc.svg
│   ├── _templates
│   │   └── layout.html
│   ├── conf.py
│   ├── guides
│   │   ├── archivebox.rst
│   │   ├── httrack.rst
│   │   ├── interrobot.rst
│   │   ├── katana.rst
│   │   ├── siteone.rst
│   │   ├── warc.rst
│   │   └── wget.rst
│   ├── guides.rst
│   ├── index.rst
│   ├── installation.rst
│   ├── interactive.rst
│   ├── make.bat
│   ├── Makefile
│   ├── mcp_server_webcrawl.crawlers.archivebox.rst
│   ├── mcp_server_webcrawl.crawlers.base.rst
│   ├── mcp_server_webcrawl.crawlers.httrack.rst
│   ├── mcp_server_webcrawl.crawlers.interrobot.rst
│   ├── mcp_server_webcrawl.crawlers.katana.rst
│   ├── mcp_server_webcrawl.crawlers.rst
│   ├── mcp_server_webcrawl.crawlers.siteone.rst
│   ├── mcp_server_webcrawl.crawlers.warc.rst
│   ├── mcp_server_webcrawl.crawlers.wget.rst
│   ├── mcp_server_webcrawl.extras.rst
│   ├── mcp_server_webcrawl.interactive.rst
│   ├── mcp_server_webcrawl.models.rst
│   ├── mcp_server_webcrawl.rst
│   ├── mcp_server_webcrawl.templates.rst
│   ├── mcp_server_webcrawl.utils.rst
│   ├── modules.rst
│   ├── prompts.rst
│   ├── readme.txt
│   └── usage.rst
└── src
    └── mcp_server_webcrawl
        ├── __init__.py
        ├── crawlers
        │   ├── __init__.py
        │   ├── archivebox
        │   │   ├── __init__.py
        │   │   ├── adapter.py
        │   │   ├── crawler.py
        │   │   └── tests.py
        │   ├── base
        │   │   ├── __init__.py
        │   │   ├── adapter.py
        │   │   ├── api.py
        │   │   ├── crawler.py
        │   │   ├── indexed.py
        │   │   └── tests.py
        │   ├── httrack
        │   │   ├── __init__.py
        │   │   ├── adapter.py
        │   │   ├── crawler.py
        │   │   └── tests.py
        │   ├── interrobot
        │   │   ├── __init__.py
        │   │   ├── adapter.py
        │   │   ├── crawler.py
        │   │   └── tests.py
        │   ├── katana
        │   │   ├── __init__.py
        │   │   ├── adapter.py
        │   │   ├── crawler.py
        │   │   └── tests.py
        │   ├── siteone
        │   │   ├── __init__.py
        │   │   ├── adapter.py
        │   │   ├── crawler.py
        │   │   └── tests.py
        │   ├── warc
        │   │   ├── __init__.py
        │   │   ├── adapter.py
        │   │   ├── crawler.py
        │   │   └── tests.py
        │   └── wget
        │       ├── __init__.py
        │       ├── adapter.py
        │       ├── crawler.py
        │       └── tests.py
        ├── extras
        │   ├── __init__.py
        │   ├── markdown.py
        │   ├── regex.py
        │   ├── snippets.py
        │   ├── thumbnails.py
        │   └── xpath.py
        ├── interactive
        │   ├── __init__.py
        │   ├── highlights.py
        │   ├── search.py
        │   ├── session.py
        │   ├── ui.py
        │   └── views
        │       ├── base.py
        │       ├── document.py
        │       ├── help.py
        │       ├── requirements.py
        │       ├── results.py
        │       └── searchform.py
        ├── main.py
        ├── models
        │   ├── __init__.py
        │   ├── base.py
        │   ├── resources.py
        │   └── sites.py
        ├── settings.py
        ├── templates
        │   ├── __init__.py
        │   ├── markdown.xslt
        │   ├── tests_core.html
        │   └── tests.py
        └── utils
            ├── __init__.py
            ├── cli.py
            ├── logger.py
            ├── parser.py
            ├── parsetab.py
            ├── search.py
            ├── server.py
            ├── tests.py
            └── tools.py
```

# Files

--------------------------------------------------------------------------------
/docs/.nojekyll:
--------------------------------------------------------------------------------

```
1 | 
```

--------------------------------------------------------------------------------
/docs/.buildinfo:
--------------------------------------------------------------------------------

```
1 | # Sphinx build info version 1
2 | # This file hashes the configuration used when building these files. When it is not found, a full rebuild will be done.
3 | config: fb2a90e507460b57acda26968ed00c09
4 | tags: 645f666f9bcd5a90fca523b33c5a78b7
5 | 
```

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------

```
 1 | venv/
 2 | env/
 3 | .env/
 4 | .venv/
 5 | temp/
 6 | __pycache__/
 7 | *.py[cod]
 8 | *$py.class
 9 | *.log
10 | *.log.txt
11 | .vscode/
12 | .DS_Store
13 | .pytest_cache/
14 | *.egg-info/
15 | dist/
16 | build/
17 | fixtures-report.log
18 | 
19 | prompts/archive/
20 | fixtures/
21 | 
22 | settings_local.py
23 | *.db-shm
24 | *.db-wal
25 | 
26 | sphinx/_build/
27 | sphinx/build/
28 | sphinx/api/
29 | sphinx/_static/images/generated/
30 | 
31 | *.doctree
32 | *.pickle
33 | *.mo
34 | .doctrees/
35 | 
36 | **/__pycache__/
37 | .coverage
38 | parser.out
39 | htmlcov/
40 | soho.nascom.nasa.com/
41 | *.7z
42 | *.diff
43 | *instructions.txt
44 | README_pypi.md
45 | 
```

--------------------------------------------------------------------------------
/sphinx/readme.txt:
--------------------------------------------------------------------------------

```
1 | 
2 | # to build docs
3 | mcp_server_webcrawl> 
4 | sphinx-apidoc -o sphinx src/mcp_server_webcrawl
5 | sphinx-build -b html ./sphinx ./docs
6 | 
```

--------------------------------------------------------------------------------
/prompts/README.md:
--------------------------------------------------------------------------------

```markdown
 1 | # Prompt Routines
 2 | 
 3 | **mcp-server-webcrawl** provides the toolkit necessary to search web crawl data freestyle, figuring it out as you go, reacting to each query. This is what it was designed for.
 4 | 
 5 | It is also capable of running routines (as prompts). You can write these yourself, or use the ones provided. These prompts are **copy and paste**, used as raw Markdown. They are enabled by the advanced search provided to the LLM; queries and logic can be embedded in a procedural set of instructions, or even an input loop, as is the case with the Gopher Interface.
 6 | 
 7 | If you want to shortcut the site selection (one less query), paste the Markdown and, in the same request, type "run pasted for [site name or URL]." It will figure it out. When pasted without additional context, you will be prompted to select a site.
 8 | 
 9 | | Prompt | Download | Category | Description |
10 | |--------|----------|----------|-------------|
11 | |🔍 **SEO Audit** | [`auditseo.md`](https://raw.githubusercontent.com/pragmar/mcp-server-webcrawl/master/prompts/auditseo.md) | audit | Technical SEO (search engine optimization) analysis. Covers the basics, with options to dive deeper. |
12 | |🔗 **404 Audit** | [`audit404.md`](https://raw.githubusercontent.com/pragmar/mcp-server-webcrawl/master/prompts/audit404.md) | audit | Broken link detection and pattern analysis. Not only finds issues, but suggests fixes. |
13 | |⚡ **Performance Audit** | [`auditperf.md`](https://raw.githubusercontent.com/pragmar/mcp-server-webcrawl/master/prompts/auditperf.md) | audit | Website speed and optimization analysis. Real talk. |
14 | |📁 **File Audit** | [`auditfiles.md`](https://raw.githubusercontent.com/pragmar/mcp-server-webcrawl/master/prompts/auditfiles.md) | audit | File organization and asset analysis. Discover the composition of your website. |
15 | |🌐 **Gopher Interface** | [`gopher.md`](https://raw.githubusercontent.com/pragmar/mcp-server-webcrawl/master/prompts/gopher.md) | interface | An old-fashioned search interface inspired by the Gopher clients of yesteryear. |
16 | |⚙️ **Search Test** | [`testsearch.md`](https://raw.githubusercontent.com/pragmar/mcp-server-webcrawl/master/prompts/testsearch.md) | self-test | A battery of tests to check for Boolean logical inconsistencies in the search query parser and subsequent FTS5 conversion. |
17 | 
18 | 
```

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------

```markdown
  1 | <p align="center">
  2 |   <img src="sphinx/_static/images/mcpswc.svg" alt="MCP Server Webcrawl" width="60%">
  3 | </p>
  4 | 
  5 | <p align="center">
  6 |   <a href="https://pragmar.com/mcp-server-webcrawl/" style="margin: 0 10px;">Website</a> |
  7 |   <a href="https://github.com/pragmar/mcp-server-webcrawl" style="margin: 0 10px;">GitHub</a> |
  8 |   <a href="https://pragmar.github.io/mcp-server-webcrawl/" style="margin: 0 10px;">Docs</a> |
  9 |   <a href="https://pypi.org/project/mcp-server-webcrawl/" style="margin: 0 10px;">PyPi</a>
 10 | </p>
 11 | 
 12 | # mcp-server-webcrawl
 13 | 
 14 | Advanced search and retrieval for web crawler data. With **mcp-server-webcrawl**, your AI client filters and analyzes web content under your direction or autonomously. The server includes a fulltext search interface with boolean support, and resource filtering by type, HTTP status, and more.
 15 | 
 16 | **mcp-server-webcrawl** provides the LLM a complete menu with which to search, and works with a variety of web crawlers:
 17 | 
 18 | | Crawler/Format | Description | Platforms | Setup Guide |
 19 | |---|---|---|---|
 20 | | [**ArchiveBox**][1] | Web archiving tool | macOS/Linux | [Setup Guide][8] |
 21 | | [**HTTrack**][2] | GUI mirroring tool | macOS/Windows/Linux | [Setup Guide][9] |
 22 | | [**InterroBot**][3] | GUI crawler and analyzer | macOS/Windows/Linux | [Setup Guide][10] |
 23 | | [**Katana**][4] | CLI security-focused crawler | macOS/Windows/Linux | [Setup Guide][11] |
 24 | | [**SiteOne**][5] | GUI crawler and analyzer | macOS/Windows/Linux | [Setup Guide][12] |
 25 | | [**WARC**][6] | Standard web archive format | varies by client | [Setup Guide][13] |
 26 | | [**wget**][7] | CLI website mirroring tool | macOS/Linux | [Setup Guide][14] |
 27 | 
 28 | [1]: https://archivebox.io
 29 | [2]: https://github.com/xroche/httrack
 30 | [3]: https://interro.bot
 31 | [4]: https://github.com/projectdiscovery/katana
 32 | [5]: https://crawler.siteone.io
 33 | [6]: https://en.wikipedia.org/wiki/WARC_(file_format)
 34 | [7]: https://en.wikipedia.org/wiki/Wget
 35 | [8]: https://pragmar.github.io/mcp-server-webcrawl/guides/archivebox.html
 36 | [9]: https://pragmar.github.io/mcp-server-webcrawl/guides/httrack.html
 37 | [10]: https://pragmar.github.io/mcp-server-webcrawl/guides/interrobot.html
 38 | [11]: https://pragmar.github.io/mcp-server-webcrawl/guides/katana.html
 39 | [12]: https://pragmar.github.io/mcp-server-webcrawl/guides/siteone.html
 40 | [13]: https://pragmar.github.io/mcp-server-webcrawl/guides/warc.html
 41 | [14]: https://pragmar.github.io/mcp-server-webcrawl/guides/wget.html
 42 | 
 43 | **mcp-server-webcrawl** is free and open source, and requires Claude Desktop and Python (>=3.10). It is installed from the command line via pip:
 44 | 
 45 | ```bash
 46 | pip install mcp-server-webcrawl
 47 | ```
 48 | 
 49 | For step-by-step MCP server setup, refer to the [Setup Guides](https://pragmar.github.io/mcp-server-webcrawl/guides.html).
 50 | 
 51 | ## Features
 52 | 
 53 | * Claude Desktop ready
 54 | * Multi-crawler compatible
 55 | * Filter by type, status, and more
 56 | * Boolean search support
 57 | * Support for Markdown and snippets
 58 | * Roll your own website knowledgebase
 59 | 
 60 | ## Prompt Routines
 61 | 
 62 | **mcp-server-webcrawl** provides the toolkit necessary to search web crawl data freestyle, figuring it out as you go, reacting to each query. This is what it was designed for.
 63 | 
 64 | It is also capable of running routines (as prompts). You can write these yourself, or use the ones provided. These prompts are **copy and paste**, used as raw Markdown. They are enabled by the advanced search provided to the LLM; queries and logic can be embedded in a procedural set of instructions, or even an input loop, as is the case with the Gopher Interface.
 65 | 
 66 | | Prompt | Download | Category | Description |
 67 | |--------|----------|----------|-------------|
 68 | |🔍 **SEO Audit** | [`auditseo.md`](https://raw.githubusercontent.com/pragmar/mcp-server-webcrawl/master/prompts/auditseo.md) | audit | Technical SEO (search engine optimization) analysis. Covers the basics, with options to dive deeper. |
 69 | |🔗 **404 Audit** | [`audit404.md`](https://raw.githubusercontent.com/pragmar/mcp-server-webcrawl/master/prompts/audit404.md) | audit | Broken link detection and pattern analysis. Not only finds issues, but suggests fixes. |
 70 | |⚡&nbsp;**Performance&nbsp;Audit** | [`auditperf.md`](https://raw.githubusercontent.com/pragmar/mcp-server-webcrawl/master/prompts/auditperf.md) | audit | Website speed and optimization analysis. Real talk. |
 71 | |📁 **File Audit** | [`auditfiles.md`](https://raw.githubusercontent.com/pragmar/mcp-server-webcrawl/master/prompts/auditfiles.md) | audit | File organization and asset analysis. Discover the composition of your website. |
 72 | |🌐 **Gopher Interface** | [`gopher.md`](https://raw.githubusercontent.com/pragmar/mcp-server-webcrawl/master/prompts/gopher.md) | interface | An old-fashioned search interface inspired by the Gopher clients of yesteryear. |
 73 | |⚙️ **Search Test** | [`testsearch.md`](https://raw.githubusercontent.com/pragmar/mcp-server-webcrawl/master/prompts/testsearch.md) | self-test | A battery of tests to check for Boolean logical inconsistencies in the search query parser and subsequent FTS5 conversion. |
 74 | 
 75 | If you want to shortcut the site selection (one less query), paste the Markdown and, in the same request, type "run pasted for [site name or URL]." It will figure it out. When pasted without additional context, you should be prompted to select from a list of crawled sites.
 76 | 
 77 | ## Boolean Search Syntax
 78 | 
 79 | The query engine supports field-specific (`field: value`) searches and complex boolean expressions. Fulltext is supported as a combination of the url, content, and headers fields.
 80 | 
 81 | While the API interface is designed to be consumed by the LLM directly, it can be helpful to familiarize yourself with the search syntax. Searches generated by the LLM are inspectable, but generally collapsed in the UI. If you need to see the query, expand the MCP collapsible.
 82 | 
 83 | **Example Queries**
 84 | 
 85 | | Query Example | Description |
 86 | |--------------|-------------|
 87 | | privacy | fulltext single keyword match |
 88 | | "privacy policy" | fulltext match exact phrase |
 89 | | boundar* | fulltext wildcard matches results starting with *boundar* (boundary, boundaries) |
 90 | | id: 12345 | id field matches a specific resource by ID |
 91 | | url: example.com/somedir | url field matches results with URL containing example.com/somedir |
 92 | | type: html | type field matches for HTML pages only |
 93 | | status: 200 | status field matches specific HTTP status codes (equal to 200) |
 94 | | status: >=400 | status field matches specific HTTP status code (greater than or equal to 400) |
 95 | | content: h1 | content field matches content (HTTP response body, often, but not always HTML) |
 96 | | headers: text/xml | headers field matches HTTP response headers |
 97 | | privacy AND policy | fulltext matches both |
 98 | | privacy OR policy | fulltext matches either |
 99 | | policy NOT privacy | fulltext matches policies not containing privacy |
100 | | (login OR signin) AND form | fulltext matches login or signin, together with form |
101 | | type: html AND status: 200 | matches only HTML pages with an HTTP success status |
102 | 
103 | ## Field Search Definitions
104 | 
105 | Field search provides search precision, allowing you to specify which columns of the search index to filter. Rather than searching the entire content, you can restrict your query to specific attributes like URLs, headers, or content body. This approach improves efficiency when looking for specific attributes or patterns within crawl data.
106 | 
107 | | Field | Description |
108 | |-------|-------------|
109 | | id | database ID |
110 | | url | resource URL |
111 | | type | enumerated list of types (see types table) |
112 | | size | file size in bytes |
113 | | status | HTTP response codes |
114 | | headers | HTTP response headers |
115 | | content | HTTP body—HTML, CSS, JS, and more |
116 | 
117 | ## Field Content
118 | 
119 | A subset of fields can be independently requested with results, while core fields are always on. Use of headers and content can consume tokens quickly. Use judiciously, or use extras to crunch more results into the context window. Fields are a top level argument, independent of any field searching taking place in the query.
120 | 
121 | | Field | Description |
122 | |-------|-------------|
123 | | id | always available |
124 | | url | always available |
125 | | type | always available |
126 | | status | always available |
127 | | created | on request |
128 | | modified | on request |
129 | | size | on request |
130 | | headers | on request |
131 | | content | on request |
132 | 
133 | ## Content Types
134 | 
135 | Crawls contain resource types beyond HTML pages. The `type:` field search allows filtering by broad content type groups, particularly useful when filtering images without complex extension queries. For example, you might search for `type: html NOT content: login` to find pages without "login," or `type: img` to analyze image resources. The table below lists all supported content types in the search system.
136 | 
137 | | Type | Description |
138 | |------|-------------|
139 | | html | webpages |
140 | | iframe | iframes |
141 | | img | web images |
142 | | audio | web audio files |
143 | | video | web video files |
144 | | font | web font files |
145 | | style | CSS stylesheets |
146 | | script | JavaScript files |
147 | | rss | RSS syndication feeds |
148 | | text | plain text content |
149 | | pdf | PDF files |
150 | | doc | MS Word documents |
151 | | other | uncategorized |
152 | 
153 | ## Extras
154 | 
155 | The `extras` parameter provides additional processing options, transforming HTTP data (markdown, snippets, regex, xpath), or connecting the LLM to external data (thumbnails). These options can be combined as needed to achieve the desired result format.
156 | 
157 | | Extra | Description |
158 | |-------|-------------|
159 | | thumbnails | Generates base64 encoded images to be viewed and analyzed by AI models. Enables image description, content analysis, and visual understanding while keeping token output minimal. Works with images, which can be filtered using `type: img` in queries. SVG is not supported. |
160 | | markdown | Provides the HTML content field as concise Markdown, reducing token usage and improving readability for LLMs. Works with HTML, which can be filtered using `type: html` in queries. |
161 | | regex | Extracts regular expression matches from crawled files such as HTML, CSS, JavaScript, etc. Not as precise a tool as XPath for HTML, but supports any text file as a data source. One or more regex patterns can be requested, using the `extrasRegex` argument. |
162 | | snippets | Matches fulltext queries to contextual keyword usage within the content. When used without requesting the content field (or markdown extra), it can provide an efficient means of refining a search without pulling down the complete page contents. Also great for rendering old school hit-highlighted results as a list, like Google search in 1999. Works with HTML, CSS, JS, or any text-based, crawled file. |
163 | | xpath | Extracts XPath selector data, used in scraping HTML content. Use XPath's text() selector for text-only results; element selectors return outerHTML. Only supported with `type: html`; other types will be ignored. One or more XPath selectors (//h1, count(//h1), etc.) can be requested, using the `extrasXpath` argument. |
164 | 
165 | Extras provide a means of producing token-efficient HTTP content responses. Markdown produces roughly 1/3 the bytes of the source HTML, snippets are generally 500 or so bytes per result, and XPath can be as specific or broad as you choose. The more focused your requests, the more results you can fit into your LLM session.
166 | 
167 | The idea, of course, is that the LLM takes care of this for you. If you notice your LLM developing an affinity to the "content" field (full HTML), a nudge in chat to budget tokens using the extras feature should be all that is needed.
168 | 
169 | ## Interactive Mode
170 | 
171 | **No AI, just classic Boolean search of your web-archives in a terminal.**
172 | 
173 | mcp-server-webcrawl can double as a terminal search for your web archives. You can run it against your local archives, but it gets more interesting when you realize you can ssh into any remote host and view archives sitting on that host. No downloads, syncs, multifactor logins, or other common drudgery required. With interactive mode, you can be in and searching a crawl sitting on a remote server in no time at all.
174 | 
175 | Launch with --crawler and --datasrc to search immediately, or set up the datasrc and crawler in-app.
176 | 
177 | ```bash
178 | mcp-server-webcrawl --crawler wget --datasrc /path/to/datasrc --interactive
179 | # or manually enter crawler and datasrc in the UI
180 | mcp-server-webcrawl --interactive
181 | ```
182 | 
183 | Interactive mode is a way to search through tranches of crawled data, whenever, wherever... in a terminal.
184 | 
185 | ![Interactive search interface](sphinx/_static/images/interactive.search.webp)
```
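
The boolean and field syntax documented in the README above reduces to plain query strings. A few illustrative strings composed only from the documented operators and fields (the surrounding Python list is incidental packaging):

```python
# Query strings using the documented syntax; each is passed verbatim to the
# server's search interface.
example_queries = [
    '"privacy policy"',                          # exact phrase, fulltext
    'boundar*',                                  # wildcard: boundary, boundaries, ...
    'url: example.com/somedir AND type: html',   # field search plus type filter
    'type: html AND status: >=400',              # HTML pages that returned errors
    '(login OR signin) AND form',                # grouped boolean fulltext
]
```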
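
Query, fields, and extras ultimately travel together as arguments to one search call. A minimal sketch of such a call's arguments follows; `query`, `fields`, `extras`, `extrasXpath`, and `extrasRegex` are the parameter names documented above, while the dict packaging and any tool name around it are assumptions for illustration:

```python
# Hypothetical argument payload for a search call (wire format assumed);
# the parameter names are taken from the README sections above.
search_arguments = {
    "query": "type: html AND status: 200",           # boolean/field query string
    "fields": ["size", "headers"],                   # optional fields beyond the core set
    "extras": ["snippets", "xpath"],                 # token-efficient content transforms
    "extrasXpath": ["//h1/text()", "count(//img)"],  # selectors consumed by the xpath extra
}
```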

--------------------------------------------------------------------------------
/CONTRIBUTING.md:
--------------------------------------------------------------------------------

```markdown
 1 | # Contributing
 2 | 
 3 | Thank you for your interest in contributing.
 4 | 
 5 | This project is still finding its footing, and I'm keeping things simple for now.
 6 | 
 7 | ## The Process
 8 | 
 9 | **Start with an issue.** Before writing code, open an issue to discuss the problem being addressed.
10 | 
11 | This saves everyone time and ensures we're aligned.
12 | 
13 | ## What I'm Looking For
14 | 
15 | * Bugs with clear reproduction steps
16 | * LLM API hangups that I may be unaware of (fixations, etc. leading to extra queries or misleading results)
17 | * New and exciting prompts utilizing the embedded search method
18 | 
19 | ## Guidelines
20 | 
21 | * This is evolving as I figure out what works
22 | * No additional package requirements, please
23 | * No new crawlers without viable long-term support (forking is encouraged!)
24 | * Follow existing code style
25 | * Include tests where appropriate
26 | * Keep PRs focused and reasonably sized
27 | 
28 | I'm currently pushing to master; it's been a solo effort as of this writing. If and when a higher-level git workflow is required, I'll reorganize the repo into something more community-oriented.
29 | 
30 | Questions? Open an issue and let's talk.
```

--------------------------------------------------------------------------------
/src/mcp_server_webcrawl/crawlers/archivebox/__init__.py:
--------------------------------------------------------------------------------

```python
1 | 
```

--------------------------------------------------------------------------------
/src/mcp_server_webcrawl/crawlers/base/__init__.py:
--------------------------------------------------------------------------------

```python
1 | 
```

--------------------------------------------------------------------------------
/src/mcp_server_webcrawl/crawlers/httrack/__init__.py:
--------------------------------------------------------------------------------

```python
1 | 
```

--------------------------------------------------------------------------------
/src/mcp_server_webcrawl/crawlers/interrobot/__init__.py:
--------------------------------------------------------------------------------

```python
1 | 
```

--------------------------------------------------------------------------------
/src/mcp_server_webcrawl/crawlers/katana/__init__.py:
--------------------------------------------------------------------------------

```python
1 | 
```

--------------------------------------------------------------------------------
/src/mcp_server_webcrawl/crawlers/siteone/__init__.py:
--------------------------------------------------------------------------------

```python
1 | 
```

--------------------------------------------------------------------------------
/src/mcp_server_webcrawl/crawlers/warc/__init__.py:
--------------------------------------------------------------------------------

```python
1 | 
```

--------------------------------------------------------------------------------
/src/mcp_server_webcrawl/crawlers/wget/__init__.py:
--------------------------------------------------------------------------------

```python
1 | 
```

--------------------------------------------------------------------------------
/src/mcp_server_webcrawl/extras/__init__.py:
--------------------------------------------------------------------------------

```python
1 | 
```

--------------------------------------------------------------------------------
/src/mcp_server_webcrawl/interactive/__init__.py:
--------------------------------------------------------------------------------

```python
1 | 
```

--------------------------------------------------------------------------------
/src/mcp_server_webcrawl/templates/__init__.py:
--------------------------------------------------------------------------------

```python
1 | 
```

--------------------------------------------------------------------------------
/sphinx/_templates/layout.html:
--------------------------------------------------------------------------------

```html
1 | {% extends "!layout.html" %}
2 | 
```

--------------------------------------------------------------------------------
/docs/_sources/modules.rst.txt:
--------------------------------------------------------------------------------

```
1 | mcp_server_webcrawl
2 | ===================
3 | 
4 | .. toctree::
5 |    :maxdepth: 4
6 | 
7 |    mcp_server_webcrawl
8 | 
```

--------------------------------------------------------------------------------
/src/mcp_server_webcrawl/models/__init__.py:
--------------------------------------------------------------------------------

```python
1 | # this is what is acceptable metadata content for crawl results
2 | METADATA_VALUE_TYPE = str | int | float | bool | list[str] | list[int] | list[float] | None
3 | 
```

--------------------------------------------------------------------------------
/docs/_static/documentation_options.js:
--------------------------------------------------------------------------------

```javascript
 1 | const DOCUMENTATION_OPTIONS = {
 2 |     VERSION: '',
 3 |     LANGUAGE: 'en',
 4 |     COLLAPSE_INDEX: false,
 5 |     BUILDER: 'html',
 6 |     FILE_SUFFIX: '.html',
 7 |     LINK_SUFFIX: '.html',
 8 |     HAS_SOURCE: true,
 9 |     SOURCELINK_SUFFIX: '.txt',
10 |     NAVIGATION_WITH_KEYS: false,
11 |     SHOW_SEARCH_SUMMARY: true,
12 |     ENABLE_SEARCH_SHORTCUTS: true,
13 | };
```

--------------------------------------------------------------------------------
/docs/_sources/guides.rst.txt:
--------------------------------------------------------------------------------

```
 1 | Setup Guides
 2 | ============
 3 | 
 4 | This section contains detailed setup guides for mcp-server-webcrawl in various environments and configurations.
 5 | 
 6 | .. toctree::
 7 |    :maxdepth: 1
 8 |    :caption: Available Guides:
 9 | 
10 |    guides/archivebox
11 |    guides/httrack
12 |    guides/interrobot
13 |    guides/katana
14 |    guides/siteone
15 |    guides/warc
16 |    guides/wget
17 | 
```

--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------

```python
 1 | import re
 2 | from setuptools import setup
 3 | 
 4 | # remove centered image that doesn't render for pypi
 5 | with open("README.md", "r", encoding="utf-8") as src:
 6 |     content = src.read()
 7 | modified_content = re.sub(r'<p align="center">.*?</p>', '', content, flags=re.DOTALL)
 8 | with open("README_pypi.md", "w", encoding="utf-8") as dest:
 9 |     dest.write(modified_content.strip())
10 | 
11 | # set up configuration in pyproject.toml
12 | setup()
```

--------------------------------------------------------------------------------
/src/mcp_server_webcrawl/utils/server.py:
--------------------------------------------------------------------------------

```python
 1 | import os
 2 | import sys
 3 | 
 4 | def initialize_mcp_server() -> None:
 5 |     """
 6 |     MCP stdio streams require utf-8 explicitly set for Windows (default cp1252)
 7 |     or internationalized content will fail.
 8 |     """
 9 |     if sys.platform == "win32" and os.environ.get("PYTHONIOENCODING") is None:
10 |         sys.stdin.reconfigure(encoding="utf-8")
11 |         sys.stdout.reconfigure(encoding="utf-8")
12 |         sys.stderr.reconfigure(encoding="utf-8")
13 | 
```

--------------------------------------------------------------------------------
/docs/_sources/mcp_server_webcrawl.templates.rst.txt:
--------------------------------------------------------------------------------

```
 1 | mcp\_server\_webcrawl.templates package
 2 | =======================================
 3 | 
 4 | Submodules
 5 | ----------
 6 | 
 7 | mcp\_server\_webcrawl.templates.tests module
 8 | --------------------------------------------
 9 | 
10 | .. automodule:: mcp_server_webcrawl.templates.tests
11 |    :members:
12 |    :undoc-members:
13 |    :show-inheritance:
14 | 
15 | Module contents
16 | ---------------
17 | 
18 | .. automodule:: mcp_server_webcrawl.templates
19 |    :members:
20 |    :undoc-members:
21 |    :show-inheritance:
22 | 
```

--------------------------------------------------------------------------------
/src/mcp_server_webcrawl/settings.py:
--------------------------------------------------------------------------------

```python
 1 | import logging
 2 | from pathlib import Path
 3 | 
 4 | # DEBUG overrides LOG_LEVEL
 5 | DEBUG: bool = False
 6 | DATA_DIRECTORY: Path = Path.home() / ".mcp_server_webcrawl"
 7 | 
 8 | # fixtures directory (optional, for running tests)
 9 | FIXTURES_DIRECTORY: Path | None = None
10 | 
11 | # logging.NOTSET will not write to a log file, all other levels will
12 | # LOG_LEVEL: int = logging.ERROR
13 | 
14 | # LOG_PATH will automatically fallback to DATA_DIRECTORY / mcp-server-webcrawl.log
15 | # LOG_PATH: Path = Path.home() / "Desktop" / "mcpdemo" / "server_log.txt"
16 | 
17 | try:
18 |     from .settings_local import *
19 | except ImportError:
20 |     pass
21 | 
```
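
The `settings_local.py` import at the bottom of settings.py (a file .gitignore deliberately excludes) is the place for machine-specific overrides. A minimal sketch, using only names defined or commented in settings.py above; the values are illustrative:

```python
# settings_local.py -- optional, uncommitted overrides loaded by settings.py.
import logging
from pathlib import Path

DEBUG = False
FIXTURES_DIRECTORY = Path.home() / "mcp_fixtures"    # only needed for running tests
LOG_LEVEL = logging.ERROR                            # logging.NOTSET disables file logging
LOG_PATH = Path.home() / "mcp-server-webcrawl.log"   # otherwise falls back to DATA_DIRECTORY
```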

--------------------------------------------------------------------------------
/docs/_sources/mcp_server_webcrawl.crawlers.rst.txt:
--------------------------------------------------------------------------------

```
 1 | mcp\_server\_webcrawl.crawlers package
 2 | ======================================
 3 | 
 4 | Subpackages
 5 | -----------
 6 | 
 7 | .. toctree::
 8 |    :maxdepth: 2
 9 | 
10 |    mcp_server_webcrawl.crawlers.base
11 |    mcp_server_webcrawl.crawlers.archivebox
12 |    mcp_server_webcrawl.crawlers.httrack
13 |    mcp_server_webcrawl.crawlers.interrobot
14 |    mcp_server_webcrawl.crawlers.katana
15 |    mcp_server_webcrawl.crawlers.siteone
16 |    mcp_server_webcrawl.crawlers.warc
17 |    mcp_server_webcrawl.crawlers.wget
18 | 
19 | Module contents
20 | ---------------
21 | 
22 | .. automodule:: mcp_server_webcrawl.crawlers
23 |    :members:
24 |    :undoc-members:
25 |    :show-inheritance:
26 | 
```

--------------------------------------------------------------------------------
/docs/_sources/mcp_server_webcrawl.models.rst.txt:
--------------------------------------------------------------------------------

```
 1 | mcp\_server\_webcrawl.models package
 2 | ====================================
 3 | 
 4 | Submodules
 5 | ----------
 6 | 
 7 | mcp\_server\_webcrawl.models.resources module
 8 | ---------------------------------------------
 9 | 
10 | .. automodule:: mcp_server_webcrawl.models.resources
11 |    :members:
12 |    :undoc-members:
13 |    :show-inheritance:
14 | 
15 | mcp\_server\_webcrawl.models.sites module
16 | -----------------------------------------
17 | 
18 | .. automodule:: mcp_server_webcrawl.models.sites
19 |    :members:
20 |    :undoc-members:
21 |    :show-inheritance:
22 | 
23 | Module contents
24 | ---------------
25 | 
26 | .. automodule:: mcp_server_webcrawl.models
27 |    :members:
28 |    :undoc-members:
29 |    :show-inheritance:
30 | 
```

--------------------------------------------------------------------------------
/src/mcp_server_webcrawl/main.py:
--------------------------------------------------------------------------------

```python
 1 | from pathlib import Path
 2 | 
 3 | from mcp.server.stdio import stdio_server
 4 | 
 5 | from mcp_server_webcrawl.crawlers.base.crawler import BaseCrawler
 6 | from mcp_server_webcrawl.utils.logger import get_logger, initialize_logger
 7 | from mcp_server_webcrawl.utils.server import initialize_mcp_server
 8 | 
 9 | logger = get_logger()
10 | 
11 | async def main(crawler: BaseCrawler, datasrc: Path):
12 |     initialize_logger()
13 |     initialize_mcp_server()
14 |     async with stdio_server() as (read_stream, write_stream):
15 |         crawler = crawler(datasrc)
16 |         logger.info(f"MCP webcrawl server initialized with adapter {crawler.__class__.__name__}")
17 |         logger.info(f"datasrc: {datasrc.absolute()}")
18 |         await crawler.serve(read_stream, write_stream)
19 |         logger.info("MCP webcrawl server exited")
20 | 
```
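
In normal use the command-line entry point wires this up, but `main` can be driven directly. A minimal sketch, assuming a wget archive directory; note that `main` instantiates the crawler itself, so the class rather than an instance is passed:

```python
import asyncio
from pathlib import Path

from mcp_server_webcrawl.crawlers.wget.crawler import WgetCrawler
from mcp_server_webcrawl.main import main

# main() builds the crawler from the class and the data source directory, then
# serves MCP over stdio until the client disconnects. The path is illustrative.
asyncio.run(main(WgetCrawler, Path("/path/to/wget/archives")))
```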

--------------------------------------------------------------------------------
/sphinx/make.bat:
--------------------------------------------------------------------------------

```
 1 | @ECHO OFF
 2 | 
 3 | pushd %~dp0
 4 | 
 5 | REM Command file for Sphinx documentation
 6 | 
 7 | if "%SPHINXBUILD%" == "" (
 8 | 	set SPHINXBUILD=sphinx-build
 9 | )
10 | set SOURCEDIR=.
11 | set BUILDDIR=_build
12 | 
13 | %SPHINXBUILD% >NUL 2>NUL
14 | if errorlevel 9009 (
15 | 	echo.
16 | 	echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
17 | 	echo.installed, then set the SPHINXBUILD environment variable to point
18 | 	echo.to the full path of the 'sphinx-build' executable. Alternatively you
19 | 	echo.may add the Sphinx directory to PATH.
20 | 	echo.
21 | 	echo.If you don't have Sphinx installed, grab it from
22 | 	echo.https://www.sphinx-doc.org/
23 | 	exit /b 1
24 | )
25 | 
26 | if "%1" == "" goto help
27 | 
28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
29 | goto end
30 | 
31 | :help
32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
33 | 
34 | :end
35 | popd
36 | 
```

--------------------------------------------------------------------------------
/src/mcp_server_webcrawl/crawlers/katana/crawler.py:
--------------------------------------------------------------------------------

```python
 1 | from pathlib import Path
 2 | 
 3 | from mcp_server_webcrawl.crawlers.base.indexed import IndexedCrawler
 4 | from mcp_server_webcrawl.crawlers.katana.adapter import get_sites, get_resources
 5 | from mcp_server_webcrawl.utils.logger import get_logger
 6 | 
 7 | logger = get_logger()
 8 | 
 9 | class KatanaCrawler(IndexedCrawler):
10 |     """
11 |     A crawler implementation for HTTP text files.
12 |     Provides functionality for accessing and searching web content from captured HTTP exchanges.
13 |     """
14 | 
15 |     def __init__(self, datasrc: Path):
16 |         """
17 |         Initialize the HTTP text crawler with a data source directory.
18 | 
19 |         Args:
20 |             datasrc: The input argument as Path, it must be a directory containing
21 |                 subdirectories with HTTP text files
22 |         """
23 |         super().__init__(datasrc, get_sites, get_resources)
24 | 
```

--------------------------------------------------------------------------------
/src/mcp_server_webcrawl/utils/__init__.py:
--------------------------------------------------------------------------------

```python
 1 | import re
 2 | 
 3 | from datetime import datetime
 4 | 
 5 | def to_isoformat_zulu(dt: datetime):
 6 |     """
 7 |     Convert datetime to iso Z.
 8 | 
 9 |     Python <= 3.10 struggles with Z suffixes and long fractional seconds and will
10 |     throw; smooth out the ISO string, as sub-second precision isn't critical here.
11 |     """
12 |     return dt.isoformat().replace("+00:00", "Z")
13 | 
14 | def from_isoformat_zulu(dt_string: str | None) -> datetime | None:
15 |     """
16 |     Convert ISO string to datetime.
17 | 
18 |     Python <= 3.10 struggles with Z suffixes and long fractional seconds and will
19 |     throw; smooth out the ISO string, as sub-second precision isn't critical here.
20 |     """
21 | 
22 |     if not dt_string:
23 |         return None
24 |     dt_string = dt_string.replace("Z", "+00:00")
25 |     match = re.match(r"(.*\.\d{6})\d*([-+]\d{2}:\d{2}|$)", dt_string)
26 |     if match:
27 |         dt_string = match.group(1) + (match.group(2) or "")
28 |     return datetime.fromisoformat(dt_string)
29 | 
```
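
For reference, a quick round-trip through the two helpers above (values are illustrative):

```python
from datetime import datetime, timezone

from mcp_server_webcrawl.utils import from_isoformat_zulu, to_isoformat_zulu

# Parsing trims fractional seconds beyond microseconds so fromisoformat() accepts them.
parsed = from_isoformat_zulu("2025-01-01T12:00:00.1234567Z")
# -> datetime(2025, 1, 1, 12, 0, 0, 123456, tzinfo=timezone.utc)

# Formatting swaps the +00:00 offset back to the Z suffix.
stamp = to_isoformat_zulu(datetime(2025, 1, 1, 12, 0, tzinfo=timezone.utc))
# -> "2025-01-01T12:00:00Z"
```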

--------------------------------------------------------------------------------
/docs/_sources/mcp_server_webcrawl.crawlers.warc.rst.txt:
--------------------------------------------------------------------------------

```
 1 | mcp\_server\_webcrawl.crawlers.warc package
 2 | ===========================================
 3 | 
 4 | Submodules
 5 | ----------
 6 | 
 7 | mcp\_server\_webcrawl.crawlers.warc.adapter module
 8 | --------------------------------------------------
 9 | 
10 | .. automodule:: mcp_server_webcrawl.crawlers.warc.adapter
11 |    :members:
12 |    :undoc-members:
13 |    :show-inheritance:
14 | 
15 | mcp\_server\_webcrawl.crawlers.warc.crawler module
16 | --------------------------------------------------
17 | 
18 | .. automodule:: mcp_server_webcrawl.crawlers.warc.crawler
19 |    :members:
20 |    :undoc-members:
21 |    :show-inheritance:
22 | 
23 | mcp\_server\_webcrawl.crawlers.warc.tests module
24 | ------------------------------------------------
25 | 
26 | .. automodule:: mcp_server_webcrawl.crawlers.warc.tests
27 |    :members:
28 |    :undoc-members:
29 |    :show-inheritance:
30 | 
31 | Module contents
32 | ---------------
33 | 
34 | .. automodule:: mcp_server_webcrawl.crawlers.warc
35 |    :members:
36 |    :undoc-members:
37 |    :show-inheritance:
38 | 
```

--------------------------------------------------------------------------------
/docs/_sources/mcp_server_webcrawl.crawlers.wget.rst.txt:
--------------------------------------------------------------------------------

```
 1 | mcp\_server\_webcrawl.crawlers.wget package
 2 | ===========================================
 3 | 
 4 | Submodules
 5 | ----------
 6 | 
 7 | mcp\_server\_webcrawl.crawlers.wget.adapter module
 8 | --------------------------------------------------
 9 | 
10 | .. automodule:: mcp_server_webcrawl.crawlers.wget.adapter
11 |    :members:
12 |    :undoc-members:
13 |    :show-inheritance:
14 | 
15 | mcp\_server\_webcrawl.crawlers.wget.crawler module
16 | --------------------------------------------------
17 | 
18 | .. automodule:: mcp_server_webcrawl.crawlers.wget.crawler
19 |    :members:
20 |    :undoc-members:
21 |    :show-inheritance:
22 | 
23 | mcp\_server\_webcrawl.crawlers.wget.tests module
24 | ------------------------------------------------
25 | 
26 | .. automodule:: mcp_server_webcrawl.crawlers.wget.tests
27 |    :members:
28 |    :undoc-members:
29 |    :show-inheritance:
30 | 
31 | Module contents
32 | ---------------
33 | 
34 | .. automodule:: mcp_server_webcrawl.crawlers.wget
35 |    :members:
36 |    :undoc-members:
37 |    :show-inheritance:
38 | 
```

--------------------------------------------------------------------------------
/docs/_static/js/badge_only.js:
--------------------------------------------------------------------------------

```javascript
1 | !function(e){var t={};function r(n){if(t[n])return t[n].exports;var o=t[n]={i:n,l:!1,exports:{}};return e[n].call(o.exports,o,o.exports,r),o.l=!0,o.exports}r.m=e,r.c=t,r.d=function(e,t,n){r.o(e,t)||Object.defineProperty(e,t,{enumerable:!0,get:n})},r.r=function(e){"undefined"!=typeof Symbol&&Symbol.toStringTag&&Object.defineProperty(e,Symbol.toStringTag,{value:"Module"}),Object.defineProperty(e,"__esModule",{value:!0})},r.t=function(e,t){if(1&t&&(e=r(e)),8&t)return e;if(4&t&&"object"==typeof e&&e&&e.__esModule)return e;var n=Object.create(null);if(r.r(n),Object.defineProperty(n,"default",{enumerable:!0,value:e}),2&t&&"string"!=typeof e)for(var o in e)r.d(n,o,function(t){return e[t]}.bind(null,o));return n},r.n=function(e){var t=e&&e.__esModule?function(){return e.default}:function(){return e};return r.d(t,"a",t),t},r.o=function(e,t){return Object.prototype.hasOwnProperty.call(e,t)},r.p="",r(r.s=4)}({4:function(e,t,r){}});
```

--------------------------------------------------------------------------------
/docs/_sources/mcp_server_webcrawl.crawlers.katana.rst.txt:
--------------------------------------------------------------------------------

```
 1 | mcp\_server\_webcrawl.crawlers.katana package
 2 | =============================================
 3 | 
 4 | Submodules
 5 | ----------
 6 | 
 7 | mcp\_server\_webcrawl.crawlers.katana.adapter module
 8 | ----------------------------------------------------
 9 | 
10 | .. automodule:: mcp_server_webcrawl.crawlers.katana.adapter
11 |    :members:
12 |    :undoc-members:
13 |    :show-inheritance:
14 | 
15 | mcp\_server\_webcrawl.crawlers.katana.crawler module
16 | ----------------------------------------------------
17 | 
18 | .. automodule:: mcp_server_webcrawl.crawlers.katana.crawler
19 |    :members:
20 |    :undoc-members:
21 |    :show-inheritance:
22 | 
23 | mcp\_server\_webcrawl.crawlers.katana.tests module
24 | --------------------------------------------------
25 | 
26 | .. automodule:: mcp_server_webcrawl.crawlers.katana.tests
27 |    :members:
28 |    :undoc-members:
29 |    :show-inheritance:
30 | 
31 | Module contents
32 | ---------------
33 | 
34 | .. automodule:: mcp_server_webcrawl.crawlers.katana
35 |    :members:
36 |    :undoc-members:
37 |    :show-inheritance:
38 | 
```

--------------------------------------------------------------------------------
/docs/_sources/mcp_server_webcrawl.crawlers.httrack.rst.txt:
--------------------------------------------------------------------------------

```
 1 | mcp\_server\_webcrawl.crawlers.httrack package
 2 | ==============================================
 3 | 
 4 | Submodules
 5 | ----------
 6 | 
 7 | mcp\_server\_webcrawl.crawlers.httrack.adapter module
 8 | -----------------------------------------------------
 9 | 
10 | .. automodule:: mcp_server_webcrawl.crawlers.httrack.adapter
11 |    :members:
12 |    :undoc-members:
13 |    :show-inheritance:
14 | 
15 | mcp\_server\_webcrawl.crawlers.httrack.crawler module
16 | -----------------------------------------------------
17 | 
18 | .. automodule:: mcp_server_webcrawl.crawlers.httrack.crawler
19 |    :members:
20 |    :undoc-members:
21 |    :show-inheritance:
22 | 
23 | mcp\_server\_webcrawl.crawlers.httrack.tests module
24 | ---------------------------------------------------
25 | 
26 | .. automodule:: mcp_server_webcrawl.crawlers.httrack.tests
27 |    :members:
28 |    :undoc-members:
29 |    :show-inheritance:
30 | 
31 | Module contents
32 | ---------------
33 | 
34 | .. automodule:: mcp_server_webcrawl.crawlers.httrack
35 |    :members:
36 |    :undoc-members:
37 |    :show-inheritance:
38 | 
```

--------------------------------------------------------------------------------
/docs/_sources/mcp_server_webcrawl.crawlers.siteone.rst.txt:
--------------------------------------------------------------------------------

```
 1 | mcp\_server\_webcrawl.crawlers.siteone package
 2 | ==============================================
 3 | 
 4 | Submodules
 5 | ----------
 6 | 
 7 | mcp\_server\_webcrawl.crawlers.siteone.adapter module
 8 | -----------------------------------------------------
 9 | 
10 | .. automodule:: mcp_server_webcrawl.crawlers.siteone.adapter
11 |    :members:
12 |    :undoc-members:
13 |    :show-inheritance:
14 | 
15 | mcp\_server\_webcrawl.crawlers.siteone.crawler module
16 | -----------------------------------------------------
17 | 
18 | .. automodule:: mcp_server_webcrawl.crawlers.siteone.crawler
19 |    :members:
20 |    :undoc-members:
21 |    :show-inheritance:
22 | 
23 | mcp\_server\_webcrawl.crawlers.siteone.tests module
24 | ---------------------------------------------------
25 | 
26 | .. automodule:: mcp_server_webcrawl.crawlers.siteone.tests
27 |    :members:
28 |    :undoc-members:
29 |    :show-inheritance:
30 | 
31 | Module contents
32 | ---------------
33 | 
34 | .. automodule:: mcp_server_webcrawl.crawlers.siteone
35 |    :members:
36 |    :undoc-members:
37 |    :show-inheritance:
38 | 
```

--------------------------------------------------------------------------------
/docs/_sources/mcp_server_webcrawl.crawlers.archivebox.rst.txt:
--------------------------------------------------------------------------------

```
 1 | mcp\_server\_webcrawl.crawlers.archivebox package
 2 | =================================================
 3 | 
 4 | Submodules
 5 | ----------
 6 | 
 7 | mcp\_server\_webcrawl.crawlers.archivebox.adapter module
 8 | --------------------------------------------------------
 9 | 
10 | .. automodule:: mcp_server_webcrawl.crawlers.archivebox.adapter
11 |    :members:
12 |    :undoc-members:
13 |    :show-inheritance:
14 | 
15 | mcp\_server\_webcrawl.crawlers.archivebox.crawler module
16 | --------------------------------------------------------
17 | 
18 | .. automodule:: mcp_server_webcrawl.crawlers.archivebox.crawler
19 |    :members:
20 |    :undoc-members:
21 |    :show-inheritance:
22 | 
23 | mcp\_server\_webcrawl.crawlers.archivebox.tests module
24 | ------------------------------------------------------
25 | 
26 | .. automodule:: mcp_server_webcrawl.crawlers.archivebox.tests
27 |    :members:
28 |    :undoc-members:
29 |    :show-inheritance:
30 | 
31 | Module contents
32 | ---------------
33 | 
34 | .. automodule:: mcp_server_webcrawl.crawlers.archivebox
35 |    :members:
36 |    :undoc-members:
37 |    :show-inheritance:
38 | 
```

--------------------------------------------------------------------------------
/docs/_sources/mcp_server_webcrawl.crawlers.interrobot.rst.txt:
--------------------------------------------------------------------------------

```
 1 | mcp\_server\_webcrawl.crawlers.interrobot package
 2 | =================================================
 3 | 
 4 | Submodules
 5 | ----------
 6 | 
 7 | mcp\_server\_webcrawl.crawlers.interrobot.adapter module
 8 | --------------------------------------------------------
 9 | 
10 | .. automodule:: mcp_server_webcrawl.crawlers.interrobot.adapter
11 |    :members:
12 |    :undoc-members:
13 |    :show-inheritance:
14 | 
15 | mcp\_server\_webcrawl.crawlers.interrobot.crawler module
16 | --------------------------------------------------------
17 | 
18 | .. automodule:: mcp_server_webcrawl.crawlers.interrobot.crawler
19 |    :members:
20 |    :undoc-members:
21 |    :show-inheritance:
22 | 
23 | mcp\_server\_webcrawl.crawlers.interrobot.tests module
24 | ------------------------------------------------------
25 | 
26 | .. automodule:: mcp_server_webcrawl.crawlers.interrobot.tests
27 |    :members:
28 |    :undoc-members:
29 |    :show-inheritance:
30 | 
31 | Module contents
32 | ---------------
33 | 
34 | .. automodule:: mcp_server_webcrawl.crawlers.interrobot
35 |    :members:
36 |    :undoc-members:
37 |    :show-inheritance:
38 | 
```

--------------------------------------------------------------------------------
/docs/_sources/mcp_server_webcrawl.utils.rst.txt:
--------------------------------------------------------------------------------

```
 1 | mcp\_server\_webcrawl.utils package
 2 | ===================================
 3 | 
 4 | Submodules
 5 | ----------
 6 | 
 7 | mcp\_server\_webcrawl.utils.cli module
 8 | --------------------------------------
 9 | 
10 | .. automodule:: mcp_server_webcrawl.utils.cli
11 |    :members:
12 |    :undoc-members:
13 |    :show-inheritance:
14 | 
15 | mcp\_server\_webcrawl.utils.logger module
16 | -----------------------------------------
17 | 
18 | .. automodule:: mcp_server_webcrawl.utils.logger
19 |    :members:
20 |    :undoc-members:
21 |    :show-inheritance:
22 | 
23 | mcp\_server\_webcrawl.utils.server module
24 | -----------------------------------------
25 | 
26 | .. automodule:: mcp_server_webcrawl.utils.server
27 |    :members:
28 |    :undoc-members:
29 |    :show-inheritance:
30 | 
31 | mcp\_server\_webcrawl.utils.tools module
32 | ----------------------------------------
33 | 
34 | .. automodule:: mcp_server_webcrawl.utils.tools
35 |    :members:
36 |    :undoc-members:
37 |    :show-inheritance:
38 | 
39 | Module contents
40 | ---------------
41 | 
42 | .. automodule:: mcp_server_webcrawl.utils
43 |    :members:
44 |    :undoc-members:
45 |    :show-inheritance:
46 | 
```

--------------------------------------------------------------------------------
/src/mcp_server_webcrawl/crawlers/warc/crawler.py:
--------------------------------------------------------------------------------

```python
 1 | from pathlib import Path
 2 | 
 3 | from mcp_server_webcrawl.crawlers.base.indexed import IndexedCrawler
 4 | from mcp_server_webcrawl.crawlers.warc.adapter import get_sites, get_resources
 5 | from mcp_server_webcrawl.utils.logger import get_logger
 6 | 
 7 | logger = get_logger()
 8 | 
 9 | class WarcCrawler(IndexedCrawler):
10 |     """
11 |     A crawler implementation for WARC (Web ARChive) files.
12 |     Provides functionality for accessing and searching web archive content.
13 |     """
14 | 
15 |     def __init__(self, datasrc: Path):
16 |         """
17 |         Initialize the WARC crawler with a data source directory.
18 |         Supported file types: .txt, .warc, and .warc.gz
19 | 
20 |         Args:
21 |             datasrc: Path to a directory containing WARC files
22 | 
23 | 
24 |         Raises:
25 |             AssertionError: If datasrc is None or not a directory
26 |         """
27 |         assert datasrc is not None, "WarcCrawler needs a datasrc, regardless of action"
28 |         assert datasrc.is_dir(), "WarcCrawler datasrc must be a directory"
29 |         super().__init__(datasrc, get_sites, get_resources)
30 | 
```
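
A minimal usage sketch for the crawler above; the archive path is a placeholder, and the constructor only validates that it points at an existing directory.

```python
from pathlib import Path

from mcp_server_webcrawl.crawlers.warc.crawler import WarcCrawler

# placeholder path; must be an existing directory of .warc/.warc.gz (or .txt) files
warc_dir = Path("/path/to/warc/archives")
crawler = WarcCrawler(warc_dir)  # AssertionError if None or not a directory
```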

--------------------------------------------------------------------------------
/src/mcp_server_webcrawl/crawlers/wget/crawler.py:
--------------------------------------------------------------------------------

```python
 1 | from pathlib import Path
 2 | 
 3 | from mcp_server_webcrawl.crawlers.base.indexed import IndexedCrawler
 4 | from mcp_server_webcrawl.crawlers.wget.adapter import get_sites, get_resources
 5 | from mcp_server_webcrawl.utils.logger import get_logger
 6 | 
 7 | logger = get_logger()
 8 | 
 9 | class WgetCrawler(IndexedCrawler):
10 |     """
11 |     A crawler implementation for wget captured sites.
12 |     Provides functionality for accessing and searching web content from wget captures.
13 |     """
14 | 
15 |     def __init__(self, datasrc: Path):
16 |         """
17 |         Initialize the wget crawler with a data source directory.
18 | 
19 |         Args:
20 |             datasrc: Path to a directory containing wget captures
21 |                 organized as subdirectories
22 | 
23 |         Raises:
24 |             AssertionError: If datasrc is None or not a directory
25 |         """
26 |         assert datasrc is not None, "WgetCrawler needs a datasrc, regardless of action"
27 |         assert datasrc.is_dir(), "WgetCrawler datasrc must be a directory"
28 | 
29 |         super().__init__(datasrc, get_sites, get_resources)
30 | 
```

--------------------------------------------------------------------------------
/docs/_sources/mcp_server_webcrawl.rst.txt:
--------------------------------------------------------------------------------

```
 1 | mcp\_server\_webcrawl package
 2 | =============================
 3 | 
 4 | Subpackages
 5 | -----------
 6 | 
 7 | .. toctree::
 8 |    :maxdepth: 2
 9 | 
10 |    mcp_server_webcrawl.crawlers
11 |    mcp_server_webcrawl.extras
12 |    mcp_server_webcrawl.interactive
13 |    mcp_server_webcrawl.models
14 |    mcp_server_webcrawl.templates
15 |    mcp_server_webcrawl.utils
16 | 
17 | Submodules
18 | ----------
19 | 
20 | mcp\_server\_webcrawl.main module
21 | ---------------------------------
22 | 
23 | .. automodule:: mcp_server_webcrawl.main
24 |    :members:
25 |    :undoc-members:
26 |    :show-inheritance:
27 |    :noindex:
28 | 
29 | mcp\_server\_webcrawl.settings module
30 | -------------------------------------
31 | 
32 | .. automodule:: mcp_server_webcrawl.settings
33 |    :members:
34 |    :undoc-members:
35 |    :show-inheritance:
36 | 
37 | mcp\_server\_webcrawl.settings\_local module
38 | --------------------------------------------
39 | 
40 | .. automodule:: mcp_server_webcrawl.settings_local
41 |    :members:
42 |    :undoc-members:
43 |    :show-inheritance:
44 | 
45 | Module contents
46 | ---------------
47 | 
48 | .. automodule:: mcp_server_webcrawl
49 |    :members:
50 |    :undoc-members:
51 |    :show-inheritance:
52 | 
```

--------------------------------------------------------------------------------
/src/mcp_server_webcrawl/models/base.py:
--------------------------------------------------------------------------------

```python
 1 | from typing import Union
 2 | from datetime import datetime
 3 | from pathlib import Path
 4 | 
 5 | METADATA_VALUE_TYPE = Union[str, int, float, bool, datetime, Path, dict, list, None]
 6 | 
 7 | class BaseModel:
 8 | 
 9 |     def to_forcefield_dict(self, forcefields: list[str]) -> dict[str, METADATA_VALUE_TYPE]:
10 |         """
11 |         Convert the object to a dictionary with specified fields forced to exist.
12 | 
13 |         Creates a dictionary that includes all non-None object attributes, and
14 |         ensures all fields in the forcefields list exist, even if None.
15 | 
16 |         Args:
17 |             forcefields: list of field names that must appear in the output dictionary
18 |                 with at least a None value
19 | 
20 |         Returns:
21 |             Dictionary containing all non-None object attributes, plus forced fields
22 |             set to None if not already present
23 |         """
24 |         # None self-annihilates in filter, forcefields can force their existence, as null
25 |         result = {}
26 |         if forcefields:
27 |             result = {k: None for k in forcefields}
28 |         result.update(self.to_dict())
29 |         return result
30 | 
```
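
A small illustration of `to_forcefield_dict` using a hypothetical subclass; real models define their own `to_dict`, which (as the inline comment notes) drops None values.

```python
from mcp_server_webcrawl.models.base import BaseModel, METADATA_VALUE_TYPE

class ExampleModel(BaseModel):
    """Hypothetical subclass for illustration only."""

    def __init__(self, id: int, name: str | None = None):
        self.id = id
        self.name = name

    def to_dict(self) -> dict[str, METADATA_VALUE_TYPE]:
        # None values self-annihilate, mirroring the real models
        return {k: v for k, v in {"id": self.id, "name": self.name}.items() if v is not None}

example = ExampleModel(1)
assert example.to_dict() == {"id": 1}                                          # "name" dropped
assert example.to_forcefield_dict(["id", "name"]) == {"id": 1, "name": None}   # "name" forced to null
```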

--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------

```toml
 1 | [build-system]
 2 | requires = ["setuptools>=61.0", "wheel"]
 3 | build-backend = "setuptools.build_meta"
 4 | 
 5 | [project]
 6 | name = "mcp-server-webcrawl"
 7 | description = "MCP server for search and retrieval of web crawler content"
 8 | readme = "README_pypi.md"
 9 | requires-python = ">=3.10"
10 | authors = [
11 |     {name = "Ben Caulfield"}
12 | ]
13 | classifiers = [
14 |     "Programming Language :: Python :: 3",
15 |     "Operating System :: OS Independent",
16 |     "License :: OSI Approved :: Mozilla Public License 2.0 (MPL 2.0)",
17 | ]
18 | dependencies = [
19 |     "mcp>=1.3.0",
20 |     "lxml>=4.6.0",
21 |     "Pillow>=9.0.0",
22 |     "aiohttp>=3.8.0",
23 |     "warcio>=1.7.0",
24 |     "ply==3.11",
25 |     "windows-curses; sys_platform == 'win32'"
26 | ]
27 | dynamic = ["version"]
28 | 
29 | [project.scripts]
30 | mcp-server-webcrawl = "mcp_server_webcrawl:main"
31 | 
32 | [project.urls]
33 | Homepage = "https://pragmar.com/mcp-server-webcrawl/"
34 | 
35 | [tool.setuptools]
36 | package-dir = {"" = "src"}
37 | 
38 | [tool.setuptools.packages.find]
39 | where = ["src"]
40 | include = ["mcp_server_webcrawl*"]
41 | exclude = ["*settings_local*"]
42 | 
43 | [tool.setuptools.dynamic]
44 | version = {attr = "mcp_server_webcrawl.__version__"}
```
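
Given the setuptools backend and src layout declared above, a local build or editable install should look roughly like this (standard pip/build commands, shown as a sketch rather than a documented workflow):

```bash
# from a source checkout of the repository root
pip install -e .          # editable install for development

pip install build
python -m build           # produces sdist and wheel in dist/
```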

--------------------------------------------------------------------------------
/docs/_sources/mcp_server_webcrawl.interactive.rst.txt:
--------------------------------------------------------------------------------

```
 1 | mcp\_server\_webcrawl.interactive package
 2 | =========================================
 3 | 
 4 | Submodules
 5 | ----------
 6 | 
 7 | mcp\_server\_webcrawl.interactive.highlights module
 8 | ---------------------------------------------------
 9 | 
10 | .. automodule:: mcp_server_webcrawl.interactive.highlights
11 |    :members:
12 |    :undoc-members:
13 |    :show-inheritance:
14 | 
15 | mcp\_server\_webcrawl.interactive.search module
16 | -----------------------------------------------
17 | 
18 | .. automodule:: mcp_server_webcrawl.interactive.search
19 |    :members:
20 |    :undoc-members:
21 |    :show-inheritance:
22 | 
23 | mcp\_server\_webcrawl.interactive.session module
24 | ------------------------------------------------
25 | 
26 | .. automodule:: mcp_server_webcrawl.interactive.session
27 |    :members:
28 |    :undoc-members:
29 |    :show-inheritance:
30 | 
31 | mcp\_server\_webcrawl.interactive.ui module
32 | -------------------------------------------
33 | 
34 | .. automodule:: mcp_server_webcrawl.interactive.ui
35 |    :members:
36 |    :undoc-members:
37 |    :show-inheritance:
38 | 
39 | Module contents
40 | ---------------
41 | 
42 | .. automodule:: mcp_server_webcrawl.interactive
43 |    :members:
44 |    :undoc-members:
45 |    :show-inheritance:
46 | 
```

--------------------------------------------------------------------------------
/src/mcp_server_webcrawl/crawlers/siteone/crawler.py:
--------------------------------------------------------------------------------

```python
 1 | from pathlib import Path
 2 | 
 3 | from mcp_server_webcrawl.crawlers.base.indexed import IndexedCrawler
 4 | from mcp_server_webcrawl.crawlers.siteone.adapter import get_sites, get_resources
 5 | from mcp_server_webcrawl.utils.logger import get_logger
 6 | 
 7 | logger = get_logger()
 8 | 
 9 | class SiteOneCrawler(IndexedCrawler):
10 |     """
11 |     A crawler implementation for SiteOne captured sites.
12 |     Provides functionality for accessing and searching web content from SiteOne captures.
13 |     SiteOne merges a wget archive with a custom SiteOne-generated log to acquire more
14 |     fields than wget can provide alone.
15 |     """
16 | 
17 |     def __init__(self, datasrc: Path):
18 |         """
19 |         Initialize the SiteOne crawler with a data source directory.
20 | 
21 |         Args:
22 |             datasrc: Path to a directory containing SiteOne captures
23 |                 organized as subdirectories
24 | 
25 |         Raises:
26 |             AssertionError: If datasrc is None or not a directory
27 |         """
28 |         assert datasrc is not None, "SiteOneCrawler needs a datasrc, regardless of action"
29 |         assert datasrc.is_dir(), "SiteOneCrawler datasrc must be a directory"
30 | 
31 |         super().__init__(datasrc, get_sites, get_resources)
32 | 
```

--------------------------------------------------------------------------------
/src/mcp_server_webcrawl/crawlers/httrack/crawler.py:
--------------------------------------------------------------------------------

```python
 1 | from pathlib import Path
 2 | 
 3 | from mcp_server_webcrawl.crawlers.base.indexed import IndexedCrawler
 4 | from mcp_server_webcrawl.crawlers.httrack.adapter import get_sites, get_resources
 5 | from mcp_server_webcrawl.utils.logger import get_logger
 6 | 
 7 | logger = get_logger()
 8 | 
 9 | class HtTrackCrawler(IndexedCrawler):
10 |     """
11 |     A crawler implementation for HTTrack captured sites.
12 |     Provides functionality for accessing and searching web content from HTTrack projects.
13 |     HTTrack creates offline mirrors of websites with preserved directory structure
14 |     and metadata in hts-log.txt files.
15 |     """
16 | 
17 |     def __init__(self, datasrc: Path):
18 |         """
19 |         Initialize the HTTrack crawler with a data source directory.
20 | 
21 |         Args:
22 |             datasrc: Path to a directory containing HTTrack project
23 |                 directories, each potentially containing multiple domains
24 | 
25 |         Raises:
26 |             AssertionError: If datasrc is None or not a directory
27 |         """
28 |         assert datasrc is not None, "HtTrackCrawler needs a datasrc, regardless of action"
29 |         assert datasrc.is_dir(), "HtTrackCrawler datasrc must be a directory"
30 | 
31 |         super().__init__(datasrc, get_sites, get_resources)
32 | 
```

--------------------------------------------------------------------------------
/docs/_sources/mcp_server_webcrawl.extras.rst.txt:
--------------------------------------------------------------------------------

```
 1 | mcp\_server\_webcrawl.extras package
 2 | ====================================
 3 | 
 4 | Submodules
 5 | ----------
 6 | 
 7 | mcp\_server\_webcrawl.extras.markdown module
 8 | --------------------------------------------
 9 | 
10 | .. automodule:: mcp_server_webcrawl.extras.markdown
11 |    :members:
12 |    :undoc-members:
13 |    :show-inheritance:
14 | 
15 | mcp\_server\_webcrawl.extras.regex module
16 | -----------------------------------------
17 | 
18 | .. automodule:: mcp_server_webcrawl.extras.regex
19 |    :members:
20 |    :undoc-members:
21 |    :show-inheritance:
22 | 
23 | mcp\_server\_webcrawl.extras.snippets module
24 | --------------------------------------------
25 | 
26 | .. automodule:: mcp_server_webcrawl.extras.snippets
27 |    :members:
28 |    :undoc-members:
29 |    :show-inheritance:
30 | 
31 | mcp\_server\_webcrawl.extras.thumbnails module
32 | ----------------------------------------------
33 | 
34 | .. automodule:: mcp_server_webcrawl.extras.thumbnails
35 |    :members:
36 |    :undoc-members:
37 |    :show-inheritance:
38 | 
39 | mcp\_server\_webcrawl.extras.xpath module
40 | -----------------------------------------
41 | 
42 | .. automodule:: mcp_server_webcrawl.extras.xpath
43 |    :members:
44 |    :undoc-members:
45 |    :show-inheritance:
46 | 
47 | Module contents
48 | ---------------
49 | 
50 | .. automodule:: mcp_server_webcrawl.extras
51 |    :members:
52 |    :undoc-members:
53 |    :show-inheritance:
54 | 
```

--------------------------------------------------------------------------------
/src/mcp_server_webcrawl/crawlers/archivebox/crawler.py:
--------------------------------------------------------------------------------

```python
 1 | from pathlib import Path
 2 | 
 3 | from mcp_server_webcrawl.crawlers.base.indexed import IndexedCrawler
 4 | from mcp_server_webcrawl.crawlers.archivebox.adapter import get_sites, get_resources
 5 | from mcp_server_webcrawl.utils.logger import get_logger
 6 | 
 7 | logger = get_logger()
 8 | 
 9 | class ArchiveBoxCrawler(IndexedCrawler):
10 |     """
11 |     A crawler implementation for ArchiveBox archived sites.
12 |     Provides functionality for accessing and searching web content from ArchiveBox archives.
13 |     ArchiveBox creates single-URL archives with metadata stored in JSON files
14 |     and HTML content preserved in index.html files.
15 |     """
16 | 
17 |     def __init__(self, datasrc: Path):
18 |         """
19 |         Initialize the ArchiveBox crawler with a data source directory.
20 | 
21 |         Args:
22 |             datasrc: Path to a directory containing ArchiveBox archive
23 |                 directories, each containing individual URL entries
24 | 
25 |         Raises:
26 |             AssertionError: If datasrc is None or not a directory
27 |         """
28 |         assert datasrc is not None, "ArchiveBoxCrawler needs a datasrc, regardless of action"
29 |         assert datasrc.is_dir(), "ArchiveBoxCrawler datasrc must be a directory"
30 | 
31 |         super().__init__(datasrc, get_sites, get_resources)
```

--------------------------------------------------------------------------------
/docs/_sources/mcp_server_webcrawl.crawlers.base.rst.txt:
--------------------------------------------------------------------------------

```
 1 | mcp\_server\_webcrawl.crawlers.base package
 2 | ===========================================
 3 | 
 4 | Submodules
 5 | ----------
 6 | 
 7 | mcp\_server\_webcrawl.crawlers.base.adapter module
 8 | --------------------------------------------------
 9 | 
10 | .. automodule:: mcp_server_webcrawl.crawlers.base.adapter
11 |    :members:
12 |    :undoc-members:
13 |    :show-inheritance:
14 | 
15 | mcp\_server\_webcrawl.crawlers.base.api module
16 | ----------------------------------------------
17 | 
18 | .. automodule:: mcp_server_webcrawl.crawlers.base.api
19 |    :members:
20 |    :undoc-members:
21 |    :show-inheritance:
22 | 
23 | mcp\_server\_webcrawl.crawlers.base.crawler module
24 | --------------------------------------------------
25 | 
26 | .. automodule:: mcp_server_webcrawl.crawlers.base.crawler
27 |    :members:
28 |    :undoc-members:
29 |    :show-inheritance:
30 | 
31 | mcp\_server\_webcrawl.crawlers.base.indexed module
32 | --------------------------------------------------
33 | 
34 | .. automodule:: mcp_server_webcrawl.crawlers.base.indexed
35 |    :members:
36 |    :undoc-members:
37 |    :show-inheritance:
38 | 
39 | mcp\_server\_webcrawl.crawlers.base.tests module
40 | ------------------------------------------------
41 | 
42 | .. automodule:: mcp_server_webcrawl.crawlers.base.tests
43 |    :members:
44 |    :undoc-members:
45 |    :show-inheritance:
46 | 
47 | Module contents
48 | ---------------
49 | 
50 | .. automodule:: mcp_server_webcrawl.crawlers.base
51 |    :members:
52 |    :undoc-members:
53 |    :show-inheritance:
54 | 
```

--------------------------------------------------------------------------------
/sphinx/conf.py:
--------------------------------------------------------------------------------

```python
 1 | # to build docs
 2 | # mcp_server_webcrawl> sphinx-apidoc -o sphinx src/mcp_server_webcrawl 
 3 | # mcp_server_webcrawl> cd .\sphinx\
 4 | # mcp_server_webcrawl\sphinx> sphinx-build -b html . ../docs
 5 | 
 6 | import os
 7 | import sys
 8 | 
 9 | # add the src directory to sys.path for autodoc discovery
10 | sys.path.insert(0, os.path.abspath("../../src"))
11 | 
12 | project = "mcp-server-webcrawl"
13 | copyright = "2025, pragmar"
14 | author = "pragmar"
15 | 
16 | extensions = [
17 |     "sphinx.ext.autodoc",
18 |     "sphinx.ext.viewcode",
19 |     "sphinx.ext.napoleon",
20 |     "sphinx.ext.coverage",
21 |     "sphinx.ext.intersphinx",
22 | ]
23 | 
24 | templates_path = ["_templates"]
25 | exclude_patterns = ["_build", "Thumbs.db", ".DS_Store", "**/settings_local.py"]
26 | 
27 | 
28 | html_theme = "sphinx_rtd_theme"
29 | html_static_path = ["_static"]
30 | html_theme_options = {
31 |     "navigation_depth": 2,  # This is the key setting to prevent deep nesting
32 |     "titles_only": False,
33 | }
34 | # Better autodoc formatting
35 | autodoc_default_options = {
36 |     "members": True,
37 |     "undoc-members": True,
38 |     "show-inheritance": False,
39 |     "member-order": "bysource",
40 | }
41 | 
42 | autodoc_member_order = "groupwise"
43 | autodoc_typehints = "description"
44 | autodoc_class_signature = "separated"
45 | autodoc_mock_imports = ["mcp_server_webcrawl.settings_local"]
46 | add_module_names = False
47 | autoclass_content = "both"
48 | 
49 | 
50 | intersphinx_mapping = {
51 |     "python": ("https://docs.python.org/3", None),
52 | }
53 | 
54 | 
55 | 
56 | toctree_maxdepth = 2
57 | 
```

--------------------------------------------------------------------------------
/docs/_sources/interactive.rst.txt:
--------------------------------------------------------------------------------

```
 1 | Interactive Mode
 2 | ================
 3 | 
 4 | **No AI, just classic Boolean search of your web-archives in a terminal.**
 5 | 
 6 | mcp-server-webcrawl can double as a terminal search for your web archives. You can run it against your local archives, but it gets more interesting when you realize you can ssh into any remote host and view archives sitting on that host. No downloads, syncs, multifactor logins, or other common drudgery required. With interactive mode, you can be in and searching a crawl sitting on a remote server in no time at all.
 7 | 
 8 | .. raw:: html
 9 | 
10 |    <iframe width="560" height="315" style="display: block;margin-bottom:1rem;" src="https://www.youtube.com/embed/8kNkP-zNzs4" frameborder="0" allowfullscreen></iframe>
11 | 
12 | Usage
13 | -----
14 | 
15 | Interactive mode exposes the mcp-server-webcrawl search layer as a terminal UI (TUI), bypassing MCP/AI altogether. Core field and Boolean search are supported, along with the human-friendly aspects of the search interface, such as result snippets.
16 | 
17 | You launch interactive mode from the terminal, using the --interactive command line argument.
18 | 
19 | .. code-block:: bash
20 | 
21 |    mcp-server-webcrawl --crawler wget --datasrc /path/to/datasrc --interactive
22 |    # or manually enter crawler and datasrc in the UI
23 |    mcp-server-webcrawl --interactive
24 | 
25 | Screencaps
26 | ----------
27 | 
28 | .. figure:: _static/images/interactive.search.webp
29 |    :alt: mcp-server-webcrawl in --interactive mode heading
30 |    :align: center
31 |    :width: 100%
32 | 
33 |    Search view, showing snippets with "Solar Eclipse" highlights
34 | 
35 | .. figure:: _static/images/interactive.document.webp
36 |    :alt: mcp-server-webcrawl in --interactive mode heading
37 |    :align: center
38 |    :width: 100%
39 | 
40 |    Document presented in Markdown, with raw and HTTP header views available.
```
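
A sketch of the remote-host scenario described above; the host, user, and datasrc path are placeholders, and mcp-server-webcrawl must already be installed on the remote machine.

```bash
ssh user@archive-host
# then, on the remote host, point interactive mode at the archives sitting there
mcp-server-webcrawl --crawler wget --datasrc /srv/crawls/wget --interactive
```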

--------------------------------------------------------------------------------
/docs/_sources/installation.rst.txt:
--------------------------------------------------------------------------------

```
 1 | Installation
 2 | ============
 3 | 
 4 | Install the package via pip:
 5 | 
 6 | .. code-block:: bash
 7 | 
 8 |    pip install mcp-server-webcrawl
 9 | 
10 | Requirements
11 | ------------
12 | 
13 | To use mcp-server-webcrawl effectively, you need:
14 | 
15 | * An MCP-capable LLM host such as Claude Desktop [1]
16 | * Python [2] installed on your command line interface
17 | * Basic familiarity with running Python packages
18 | 
19 | After ensuring these prerequisites are met, run the pip install command above to add the package to your environment.
20 | 
21 | MCP Configuration
22 | -----------------
23 | 
24 | To enable your LLM host to access your web crawl data, you'll need to add an MCP server configuration. From Claude's developer settings, locate the MCP configuration section and add the appropriate configuration for your crawler type.
25 | 
26 | Setup guides and videos are available for each supported crawler:
27 | 
28 | * :doc:`ArchiveBox <guides/archivebox>`
29 | * :doc:`HTTrack <guides/httrack>`
30 | * :doc:`InterroBot <guides/interrobot>`
31 | * :doc:`Katana <guides/katana>`
32 | * :doc:`SiteOne <guides/siteone>`
33 | * :doc:`WARC <guides/warc>`
34 | * :doc:`Wget <guides/wget>`
35 | 
36 | Multiple Configurations
37 | -----------------------
38 | 
39 | You can set up multiple **mcp-server-webcrawl** connections under the ``mcpServers`` section if you want to access different crawler data sources simultaneously.
40 | 
41 | .. code-block:: json
42 | 
43 |    {
44 |      "mcpServers": {
45 |        "webcrawl_warc": {
46 |          "command": "/path/to/mcp-server-webcrawl",
47 |          "args": ["--crawler", "warc", "--datasrc", "/path/to/warc/archives/"]
48 |        },
49 |        "webcrawl_wget": {
50 |          "command": "/path/to/mcp-server-webcrawl",
51 |          "args": ["--crawler", "wget", "--datasrc", "/path/to/wget/archives/"]
52 |        }
53 |      }
54 |    }
55 | 
56 | After adding the configuration, save the file and restart your LLM host to apply the changes.
57 | 
58 | References
59 | ----------
60 | 
61 | [1] Claude Desktop: https://claude.ai
62 | [2] Python: https://python.org
```

--------------------------------------------------------------------------------
/src/mcp_server_webcrawl/extras/markdown.py:
--------------------------------------------------------------------------------

```python
 1 | import re
 2 | from importlib import resources
 3 | from typing import Final, Any
 4 | from lxml import etree, html
 5 | from lxml.etree import ParserError
 6 | from logging import Logger
 7 | 
 8 | from mcp_server_webcrawl.utils.logger import get_logger
 9 | 
10 | __XSLT_RESULT_CLEANER: Final[re.Pattern] = re.compile(r"(?:\n\s*-\s*\n|\n\s*\n)+")
11 | __RE_HTML: Final[re.Pattern] = re.compile(r"<[a-zA-Z]+[^>]*>")
12 | 
13 | logger: Logger = get_logger()
14 | 
15 | class MarkdownTransformer:
16 |     """
17 |     Memoizes the XSLT transformer
18 |     """
19 |     _xslt_transform = None
20 | 
21 |     @classmethod
22 |     def get_xslt_transform(cls):
23 |         """
24 |         Get the HTML to text markdown XSLT transformer
25 |         """
26 |         if cls._xslt_transform is None:
27 |             xslt_string: bytes = resources.read_text("mcp_server_webcrawl.templates", "markdown.xslt").encode("utf-8")
28 |             xslt_doc = etree.fromstring(xslt_string)
29 |             cls._xslt_transform = etree.XSLT(xslt_doc)
30 |         return cls._xslt_transform
31 | 
32 | def get_markdown(content: str) -> str | None:
33 |     """
34 |     Transform HTML content to Markdown using XSLT.
35 | 
36 |     Args:
37 |         content (str): The HTML content to transform.
38 | 
39 |     Returns:
40 |         str | None: The transformed Markdown string, or None if the input is empty
41 |             or if transformation fails (e.g., due to invalid HTML or XSLT errors).
42 |     """
43 | 
44 |     transformer = MarkdownTransformer.get_xslt_transform()
45 |     content:str = content or ""
46 |     assert isinstance(content, str), "String (HTML) required for transformer"
47 |     assert transformer is not None
48 | 
49 |     if content == "" or not __RE_HTML.search(content):
50 |         return None
51 | 
52 |     try:
53 |         doc = html.fromstring(content)
54 |         result = str(transformer(doc))
55 |         result = __XSLT_RESULT_CLEANER.sub("\n\n", result).strip()
56 |         return result
57 | 
58 |     except Exception as ex:
59 |         logger.warning(f"XSLT transform error: {type(ex).__name__}\n{ex}")
60 |         return None
61 | 
```
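
A short usage sketch for `get_markdown`; the exact Markdown produced depends on `templates/markdown.xslt`, so the output here is only printed, not asserted.

```python
from mcp_server_webcrawl.extras.markdown import get_markdown

html_fragment = "<html><body><h1>Solar Eclipse</h1><p>Totality at <b>noon</b>.</p></body></html>"
print(get_markdown(html_fragment))              # Markdown text, or None if the transform fails

# input without HTML tags short-circuits to None before the XSLT runs
assert get_markdown("plain text, no tags here") is None
```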

--------------------------------------------------------------------------------
/src/mcp_server_webcrawl/crawlers/__init__.py:
--------------------------------------------------------------------------------

```python
 1 | 
 2 | import sys
 3 | from pathlib import Path
 4 | from mcp_server_webcrawl.settings import FIXTURES_DIRECTORY
 5 | 
 6 | VALID_CRAWLER_CHOICES: list[str] = ["archivebox", "httrack", "interrobot", "katana", "siteone", "warc", "wget"]
 7 | 
 8 | def get_fixture_directory() -> Path:
 9 |     # only to be used for devs on test runs, configured in settings_local.py
10 |     # settings_local.py added as sibling of settings.py if not present
11 |     # download https://github.com/pragmar/mcp-server-webcrawl-fixtures
12 |     assert FIXTURES_DIRECTORY is not None and FIXTURES_DIRECTORY.is_dir(), \
13 |         f"Fixtures not configured in settings_local.py, or is not a valid directory.\nFIXTURES_DIRECTORY: {FIXTURES_DIRECTORY}"
14 | 
15 |     return FIXTURES_DIRECTORY
16 | 
17 | def get_crawler(crawler_name: str) -> type | None:
18 |     """
19 |     Lazily load a crawler class; some crawlers have additional package dependencies.
20 |     """
21 |     if crawler_name is None:
22 |         return None
23 |     crawler_name = crawler_name.lower()
24 |     if crawler_name == "archivebox":
25 |         from mcp_server_webcrawl.crawlers.archivebox.crawler import ArchiveBoxCrawler
26 |         return ArchiveBoxCrawler
27 |     elif crawler_name == "httrack":
28 |         from mcp_server_webcrawl.crawlers.httrack.crawler import HtTrackCrawler
29 |         return HtTrackCrawler
30 |     elif crawler_name == "interrobot":
31 |         from mcp_server_webcrawl.crawlers.interrobot.crawler import InterroBotCrawler
32 |         return InterroBotCrawler
33 |     elif crawler_name == "katana":
34 |         from mcp_server_webcrawl.crawlers.katana.crawler import KatanaCrawler
35 |         return KatanaCrawler
36 |     elif crawler_name == "siteone":
37 |         from mcp_server_webcrawl.crawlers.siteone.crawler import SiteOneCrawler
38 |         return SiteOneCrawler
39 |     elif crawler_name == "warc":
40 |         from mcp_server_webcrawl.crawlers.warc.crawler import WarcCrawler
41 |         return WarcCrawler
42 |     elif crawler_name == "wget":
43 |         from mcp_server_webcrawl.crawlers.wget.crawler import WgetCrawler
44 |         return WgetCrawler
45 |     else:
46 |         valid_choices = ", ".join(VALID_CRAWLER_CHOICES)
47 |         raise ValueError(f"unsupported crawler '{crawler_name}' ({valid_choices})")
48 | 
```
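
Usage sketch for the lazy loader above; the datasrc path is a placeholder.

```python
from pathlib import Path

from mcp_server_webcrawl.crawlers import get_crawler

crawler_class = get_crawler("wget")                      # resolves WgetCrawler lazily
crawler = crawler_class(Path("/path/to/wget/archives"))  # placeholder directory

try:
    get_crawler("notacrawler")
except ValueError as ex:
    print(ex)  # unsupported crawler, message lists VALID_CRAWLER_CHOICES
```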

--------------------------------------------------------------------------------
/src/mcp_server_webcrawl/extras/xpath.py:
--------------------------------------------------------------------------------

```python
 1 | import lxml.html
 2 | 
 3 | from lxml import etree
 4 | from lxml.etree import ParserError, XPathEvalError, XPathSyntaxError
 5 | from logging import Logger
 6 | 
 7 | from mcp_server_webcrawl.utils.logger import get_logger
 8 | 
 9 | logger: Logger = get_logger()
10 | 
11 | def get_xpath(content: str, xpaths: list[str]) -> list[dict[str, str | int | float]]:
12 |     """
13 |     Takes content and gets xpath hits
14 | 
15 |     Arguments:
16 |         content: The HTML source
17 |         xpaths: The xpath selectors
18 | 
19 |     Returns:
20 |         A list of dicts, with selector and value
21 |     """
22 | 
23 |     if not isinstance(content, str):
24 |         return []
25 | 
26 |     if not isinstance(xpaths, list) or not all(isinstance(item, str) for item in xpaths):
27 |         raise ValueError("xpaths must be a list of strings")
28 | 
29 |     results = []
30 | 
31 |     if content == "":
32 |         return results
33 | 
34 |     try:
35 |         doc: lxml.html.HtmlElement = lxml.html.fromstring(content.encode("utf-8"))
36 |     except ParserError:
37 |         return results
38 | 
39 |     for xpath in xpaths:
40 |         try:
41 |             selector_result = doc.xpath(xpath)
42 |         except (XPathEvalError, XPathSyntaxError) as ex:
43 |             logger.warning(f"Invalid xpath '{xpath}': {ex}")
44 |             continue
45 | 
46 |         if isinstance(selector_result, (list, tuple)):
47 |             # normal xpath query returns a list
48 |             for result in selector_result:
49 |                 # a new dict for each result
50 |                 xpath_hit: dict[str, str | int | float] = {"selector": xpath}
51 |                 if hasattr(result, "tag"):
52 |                     html_string: str = etree.tostring(result, encoding="unicode", method="html")
53 |                     xpath_hit["value"] = html_string.strip()
54 |                 else:
55 |                     xpath_hit["value"] = str(result).strip()
56 |                 results.append(xpath_hit)
57 |         else:
58 |             # single value case (count(//h1), sum(), etc.) is also valid xpath
59 |             xpath_hit: dict[str, str | int | float] = {"selector": xpath}
60 |             if isinstance(selector_result, (int, float)):
61 |                 xpath_hit["value"] = selector_result
62 |             else:
63 |                 xpath_hit["value"] = str(selector_result).strip()
64 |             results.append(xpath_hit)
65 | 
66 |     return results
```
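
A quick sketch of `get_xpath` against an inline HTML string, covering both the node-list and the scalar (`count()`) branches.

```python
from mcp_server_webcrawl.extras.xpath import get_xpath

html_doc = "<html><body><h1>Eclipse</h1><h1>Aftermath</h1><p>Two headings.</p></body></html>"
for hit in get_xpath(html_doc, ["//h1/text()", "count(//h1)"]):
    print(hit["selector"], "=>", hit["value"])
# //h1/text() yields stripped strings ("Eclipse", "Aftermath");
# count(//h1) comes back as a numeric value (2.0)
```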

--------------------------------------------------------------------------------
/src/mcp_server_webcrawl/models/sites.py:
--------------------------------------------------------------------------------

```python
 1 | from datetime import datetime
 2 | from typing import Final
 3 | from pathlib import Path
 4 | from enum import Enum
 5 | 
 6 | from mcp_server_webcrawl.models.base import BaseModel, METADATA_VALUE_TYPE
 7 | from mcp_server_webcrawl.utils import to_isoformat_zulu
 8 | 
 9 | class SiteType(Enum):
10 |     UNDEFINED = "undefined"
11 |     CRAWLED_URL = "url"
12 |     CRAWLED_LIST = "list"
13 | 
14 | SITES_TOOL_NAME: Final[str] = "webcrawl_sites"
15 | SITES_FIELDS_BASE: Final[list[str]] = ["id", "name", "type", "urls"]
16 | SITES_FIELDS_DEFAULT: Final[list[str]] = SITES_FIELDS_BASE + ["created", "modified"]
17 | 
18 | class SiteResult(BaseModel):
19 |     """
20 |     Represents a website or crawl directory result.
21 |     """
22 | 
23 |     def __init__(
24 |         self,
25 |         id: int,
26 |         name: str | None = None,
27 |         type: SiteType = SiteType.CRAWLED_URL,
28 |         urls: list[str] | None = None,
29 |         path: Path | None = None,
30 |         created: datetime | None = None,
31 |         modified: datetime | None = None,
32 |         robots: str | None = None,
33 |         metadata: dict[str, METADATA_VALUE_TYPE] | None = None
34 |     ):
35 |         """
36 |         Initialize a SiteResult instance.
37 | 
38 |         Args:
39 |             id: site identifier
40 |             name: site name, either a URL or a custom job
41 |             urls: site URL(s), multiple for list type crawls
42 |             path: path to site data, different from datasrc
43 |             created: creation timestamp
44 |             modified: last modification timestamp
45 |             robots: robots.txt content
46 |             metadata: additional metadata for the site
47 |         """
48 |         self.id = id
49 |         self.name = name
50 |         self.type = type
51 |         self.urls = urls
52 |         self.path = path
53 |         self.created = created
54 |         self.modified = modified
55 |         self.robots = robots
56 |         self.metadata = metadata or {}
57 | 
58 |     def to_dict(self) -> dict[str, METADATA_VALUE_TYPE]:
59 |         """
60 |         Convert the object to a dictionary suitable for JSON serialization.
61 |         """
62 |         result: dict[str, METADATA_VALUE_TYPE] = {
63 |             "id": self.id,
64 |             "name": self.name,
65 |             "type": self.type.value,
66 |             "urls": self.urls,
67 |             "created": to_isoformat_zulu(self.created) if self.created else None,
68 |             "modified": to_isoformat_zulu(self.modified) if self.modified else None,
69 |             "metadata": self.metadata if self.metadata else None,
70 |         }
71 | 
72 |         return {k: v for k, v in result.items() if v is not None and not (k == "metadata" and v == {})}
73 | 
```
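
A sketch of the serialization behavior above; it assumes `to_isoformat_zulu` accepts a timezone-aware datetime, and uses placeholder site data.

```python
from datetime import datetime, timezone

from mcp_server_webcrawl.models.sites import SiteResult, SiteType, SITES_FIELDS_DEFAULT

site = SiteResult(
    id=1,
    name="example.com",
    type=SiteType.CRAWLED_URL,
    urls=["https://example.com/"],
    created=datetime(2025, 1, 1, tzinfo=timezone.utc),
)

print(site.to_dict())                                  # None fields (modified, metadata) drop out
print(site.to_forcefield_dict(SITES_FIELDS_DEFAULT))   # "modified" forced back in as null
```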

--------------------------------------------------------------------------------
/src/mcp_server_webcrawl/crawlers/wget/tests.py:
--------------------------------------------------------------------------------

```python
 1 | from mcp_server_webcrawl.crawlers import get_fixture_directory
 2 | from mcp_server_webcrawl.crawlers.wget.adapter import WgetManager
 3 | from mcp_server_webcrawl.crawlers.wget.crawler import WgetCrawler
 4 | from mcp_server_webcrawl.crawlers.base.tests import BaseCrawlerTests
 5 | from mcp_server_webcrawl.utils.logger import get_logger
 6 | 
 7 | logger = get_logger()
 8 | 
 9 | EXAMPLE_SITE_ID = WgetManager.string_to_id("example.com")
10 | PRAGMAR_SITE_ID = WgetManager.string_to_id("pragmar.com")
11 | 
12 | class WgetTests(BaseCrawlerTests):
13 |     """
14 |     Test suite for the wget crawler implementation.
15 |     Uses all wrapped test methods from BaseCrawlerTests.
16 |     """
17 | 
18 |     def setUp(self):
19 |         """
20 |         Set up the test environment with fixture data.
21 |         """
22 |         super().setUp()
23 |         self._datasrc = get_fixture_directory() / "wget"
24 | 
25 |     def test_wget_pulse(self):
26 |         """
27 |         Test basic crawler initialization.
28 |         """
29 |         crawler = WgetCrawler(self._datasrc)
30 |         self.assertIsNotNone(crawler)
31 |         self.assertTrue(self._datasrc.is_dir())
32 | 
33 |     def test_wget_sites(self):
34 |         """
35 |         Test site retrieval API functionality.
36 |         """
37 |         crawler = WgetCrawler(self._datasrc)
38 |         self.run_pragmar_site_tests(crawler, PRAGMAR_SITE_ID)
39 | 
40 |     def test_wget_search(self):
41 |         """
42 |         Test boolean search functionality
43 |         """
44 |         # fixtures moved to their own repo and lost some local media;
45 |         # the wget fixture is HTML-only (no CSS/JS/media) and does not
46 |         # hold up against the full array of fulltext, media, and mixed
47 |         # search result count tests. probably needs a reduced test set.
48 |         # crawler = WgetCrawler(self._datasrc)
49 |         # self.run_pragmar_search_tests(crawler, PRAGMAR_SITE_ID)
50 |         return
51 | 
52 |     def test_wget_resources(self):
53 |         """
54 |         Test resource retrieval API functionality with various parameters.
55 |         """
56 |         crawler = WgetCrawler(self._datasrc)
57 |         self.run_sites_resources_tests(crawler, PRAGMAR_SITE_ID, EXAMPLE_SITE_ID)
58 | 
59 | 
60 |     def test_wget_sorts(self):
61 |         """
62 |         Test random sort functionality using the '?' sort parameter.
63 |         """
64 |         crawler = WgetCrawler(self._datasrc)
65 |         self.run_pragmar_sort_tests(crawler, PRAGMAR_SITE_ID)
66 | 
67 |     def test_wget_content_parsing(self):
68 |         """
69 |         Test content type detection and parsing.
70 |         """
71 |         crawler = WgetCrawler(self._datasrc)
72 |         self.run_pragmar_content_tests(crawler, PRAGMAR_SITE_ID, False)
73 | 
74 |     def test_report(self):
75 |         """
76 |         Run test report, save to data directory.
77 |         """
78 |         crawler = WgetCrawler(self._datasrc)
79 |         logger.info(self.run_pragmar_report(crawler, PRAGMAR_SITE_ID, "wget"))
80 | 
```

--------------------------------------------------------------------------------
/src/mcp_server_webcrawl/utils/logger.py:
--------------------------------------------------------------------------------

```python
 1 | import logging
 2 | from pathlib import Path
 3 | from typing import Final
 4 | 
 5 | import mcp_server_webcrawl.settings as settings
 6 | from mcp_server_webcrawl.settings import DEBUG, DATA_DIRECTORY
 7 | 
 8 | DEFAULT_LOG_KEY: Final[str] = "mcp-server-webcrawl"
 9 | DEFAULT_LOG_PATH: Final[Path] = DATA_DIRECTORY / "mcp-server-webcrawl.log"
10 | DEFAULT_LOG_LEVEL: Final[int] = logging.WARNING
11 | 
12 | def get_logger_configuration() -> tuple[str, Path, int]:
13 |     """
14 |     Get log name, path, and level (in that order)
15 | 
16 |     Returns:
17 |         tuple[str, Path, int]: A tuple containing name, path, and level
18 |     """
19 | 
20 |     log_path: Path = DEFAULT_LOG_PATH
21 |     log_level: int = DEFAULT_LOG_LEVEL
22 | 
23 |     log_level = logging.DEBUG if DEBUG else getattr(settings, "LOG_LEVEL", DEFAULT_LOG_LEVEL)
24 |     log_path = getattr(settings, "LOG_PATH", DEFAULT_LOG_PATH)
25 |     return (DEFAULT_LOG_KEY, log_path, log_level)
26 | 
27 | def get_logger() -> logging.Logger:
28 |     """
29 |     Get logger, usually in order to write to it
30 | 
31 |     Returns:
32 |         Logger: a writable logging object (error/warn/info/debug)
33 |     """
34 | 
35 |     (log_name, _, _) = get_logger_configuration()
36 |     return logging.getLogger(log_name)
37 | 
38 | def initialize_logger() -> None:
39 |     """
40 |     Validate and set up logger for writing
41 | 
42 |     Returns:
43 |         None
44 |     """
45 | 
46 |     (log_name, log_path, log_level) = get_logger_configuration()
47 |     if log_level == logging.NOTSET:
48 |         # don't set up anything, named logging will effectively evaporate
49 |         return
50 | 
51 |     assert isinstance(log_level, int) and log_level != 0, "LOG_LEVEL must be set"
52 |     assert isinstance(log_path, Path), "LOG_PATH must be a Path object"
53 |     assert isinstance(log_name, str) and log_name.strip() != "", "LOG_NAME must be a non-empty string"
54 |     assert all(c.isalpha() or c in "-_" for c in log_name), "LOG_NAME must contain only A-Z, a-z, hyphens, and underscores"
55 | 
56 |     # handle custom log paths differently, don't generate directories
57 |     if ".mcp_server_webcrawl" in str(log_path):
58 |         log_path.parent.mkdir(parents=True, exist_ok=True)
59 |     else:
60 |         assert log_path.parent.exists() and log_path.parent.is_dir(), \
61 |             f"Custom parent directory `{log_path.parent}` does not exist or is not a directory"
62 | 
63 |     logging.basicConfig(filename=str(log_path), filemode="w", level=log_level,
64 |         format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
65 |         datefmt="%Y-%m-%d %H:%M:%S", encoding="utf-8")
66 | 
67 |     logger: logging.Logger = logging.getLogger(log_name)
68 | 
69 |     # just set a few ops back, concurrent logger might not be ready
70 |     if log_level <= logging.INFO:
71 |         logger.info("🖥️ starting webcrawl MCP server")
72 |         log_extra: str = "(Debug is True)" if DEBUG else ""
73 |         logger.info(f"log level set to {log_level} {log_extra}")
74 | 
```
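
Typical logger wiring, as a sketch: initialize once at startup, then fetch the named logger wherever it is needed. Destination and level come from settings (or the defaults above).

```python
from mcp_server_webcrawl.utils.logger import initialize_logger, get_logger

initialize_logger()                # no-op when LOG_LEVEL is logging.NOTSET
logger = get_logger()

logger.warning("visible at the default WARNING level")
logger.info("only written when DEBUG or a lower LOG_LEVEL is configured")
```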

--------------------------------------------------------------------------------
/src/mcp_server_webcrawl/crawlers/warc/tests.py:
--------------------------------------------------------------------------------

```python
 1 | from mcp_server_webcrawl.crawlers.warc.crawler import WarcCrawler
 2 | from mcp_server_webcrawl.crawlers.warc.adapter import WarcManager
 3 | from mcp_server_webcrawl.crawlers.base.tests import BaseCrawlerTests
 4 | from mcp_server_webcrawl.crawlers import get_fixture_directory
 5 | from mcp_server_webcrawl.utils.logger import get_logger
 6 | 
 7 | EXAMPLE_WARC_ID: int = WarcManager.string_to_id("example.warc.gz")
 8 | PRAGMAR_WARC_ID: int = WarcManager.string_to_id("pragmar.warc.gz")
 9 | 
10 | logger = get_logger()
11 | 
12 | class WarcTests(BaseCrawlerTests):
13 |     """
14 |     Test suite for the WARC crawler implementation.
15 |     Uses all wrapped test methods from BaseCrawlerTests.
16 |     """
17 | 
18 |     def setUp(self):
19 |         """
20 |         Set up the test environment with fixture data.
21 |         """
22 |         super().setUp()
23 |         self._datasrc = get_fixture_directory() / "warc"
24 | 
25 |     def test_warc_pulse(self):
26 |         """
27 |         Test basic crawler initialization.
28 |         """
29 |         crawler = WarcCrawler(self._datasrc)
30 |         self.assertIsNotNone(crawler)
31 |         self.assertTrue(self._datasrc.is_dir())
32 | 
33 |     def test_warc_sites(self):
34 |         """
35 |         Test site retrieval API functionality.
36 |         """
37 |         crawler = WarcCrawler(self._datasrc)
38 |         self.run_pragmar_site_tests(crawler, PRAGMAR_WARC_ID)
39 | 
40 |     def test_warc_search(self):
41 |         """
42 |         Test boolean search functionality
43 |         """
44 |         crawler = WarcCrawler(self._datasrc)
45 |         self.run_pragmar_search_tests(crawler, PRAGMAR_WARC_ID)
46 | 
47 |     def test_warc_resources(self):
48 |         """
49 |         Test resource retrieval API functionality with various parameters.
50 |         """
51 |         crawler = WarcCrawler(self._datasrc)
52 |         self.run_sites_resources_tests(crawler, PRAGMAR_WARC_ID, EXAMPLE_WARC_ID)
53 | 
54 |     # the pragmar WARC fixture contains no images; this may be default
55 |     # behavior of wget WARC generation, not confirmed
56 |     # this is a known blind spot
57 |     # def test_warc_images(self):
58 |     #     """
59 |     #     Test WARC image handling and thumbnails.
60 |     #     """
61 |     #     crawler = WarcCrawler(self._datasrc)
62 |     #     self.run_pragmar_image_tests(crawler, PRAGMAR_WARC_ID)
63 | 
64 |     def test_warc_sorts(self):
65 |         """
66 |         Test random sort functionality using the '?' sort parameter.
67 |         """
68 |         crawler = WarcCrawler(self._datasrc)
69 |         self.run_pragmar_sort_tests(crawler, PRAGMAR_WARC_ID)
70 | 
71 |     def test_warc_content_parsing(self):
72 |         """
73 |         Test content type detection and parsing for WARC files.
74 |         """
75 |         crawler = WarcCrawler(self._datasrc)
76 |         self.run_pragmar_content_tests(crawler, PRAGMAR_WARC_ID, True)
77 | 
78 |     def test_report(self):
79 |         """
80 |         Run test report, save to data directory.
81 |         """
82 |         crawler = WarcCrawler(self._datasrc)
83 |         logger.info(self.run_pragmar_report(crawler, PRAGMAR_WARC_ID, "WARC"))
84 | 
```

--------------------------------------------------------------------------------
/src/mcp_server_webcrawl/crawlers/katana/tests.py:
--------------------------------------------------------------------------------

```python
 1 | from logging import Logger
 2 | from mcp_server_webcrawl.crawlers.katana.crawler import KatanaCrawler
 3 | from mcp_server_webcrawl.crawlers.katana.adapter import KatanaManager
 4 | from mcp_server_webcrawl.crawlers.base.adapter import SitesGroup
 5 | from mcp_server_webcrawl.crawlers.base.tests import BaseCrawlerTests
 6 | from mcp_server_webcrawl.crawlers import get_fixture_directory
 7 | from mcp_server_webcrawl.utils.logger import get_logger
 8 | 
 9 | # calculate ids for test directories using the same hash function as adapter
10 | EXAMPLE_SITE_ID = KatanaManager.string_to_id("example.com")
11 | PRAGMAR_SITE_ID = KatanaManager.string_to_id("pragmar.com")
12 | 
13 | logger: Logger = get_logger()
14 | 
15 | class KatanaTests(BaseCrawlerTests):
16 |     """
17 |     Test suite for the Katana (HTTP text) crawler implementation.
18 |     Tests parsing and retrieval of web content from HTTP text files.
19 |     """
20 | 
21 |     def setUp(self):
22 |         """
23 |         set up the test environment with fixture data.
24 |         """
25 |         super().setUp()
26 |         self._datasrc = get_fixture_directory() / "katana"
27 | 
28 |     def test_katana_pulse(self):
29 |         """
30 |         basic crawler initialization.
31 |         """
32 |         crawler = KatanaCrawler(self._datasrc)
33 |         self.assertIsNotNone(crawler)
34 |         self.assertTrue(self._datasrc.is_dir())
35 | 
36 |     def test_katana_sites(self):
37 |         """
38 |         site retrieval API functionality.
39 |         """
40 |         crawler = KatanaCrawler(self._datasrc)
41 |         self.run_pragmar_site_tests(crawler, PRAGMAR_SITE_ID)
42 | 
43 |     def test_katana_search(self):
44 |         """
45 |         boolean search tests
46 |         """
47 |         crawler = KatanaCrawler(self._datasrc)
48 |         self.run_pragmar_search_tests(crawler, PRAGMAR_SITE_ID)
49 | 
50 |     def test_pragmar_tokenizer(self):
51 |         """
52 |         tokenizer search tests
53 |         """
54 |         crawler = KatanaCrawler(self._datasrc)
55 |         self.run_pragmar_tokenizer_tests(crawler, PRAGMAR_SITE_ID)
56 | 
57 | 
58 |     def test_katana_resources(self):
59 |         """
60 |         resource retrieval API functionality with various parameters.
61 |         """
62 |         crawler = KatanaCrawler(self._datasrc)
63 |         self.run_sites_resources_tests(crawler, PRAGMAR_SITE_ID, EXAMPLE_SITE_ID)
64 | 
65 |     def test_katana_images(self):
66 |         """
67 |         Test Katana image handling and thumbnails.
68 |         """
69 |         crawler = KatanaCrawler(self._datasrc)
70 |         self.run_pragmar_image_tests(crawler, PRAGMAR_SITE_ID)
71 | 
72 |     def test_katana_sorts(self):
73 |         """
74 |         random sort functionality using the '?' sort parameter.
75 |         """
76 |         crawler = KatanaCrawler(self._datasrc)
77 |         self.run_pragmar_sort_tests(crawler, PRAGMAR_SITE_ID)
78 | 
79 |     def test_katana_content_parsing(self):
80 |         """
81 |         content type detection and parsing for HTTP text files.
82 |         """
83 |         crawler = KatanaCrawler(self._datasrc)
84 |         self.run_pragmar_content_tests(crawler, PRAGMAR_SITE_ID, False)
85 | 
86 |     def test_report(self):
87 |         """
88 |         Run test report, save to data directory.
89 |         """
90 |         crawler = KatanaCrawler(self._datasrc)
91 |         logger.info(self.run_pragmar_report(crawler, PRAGMAR_SITE_ID, "Katana"))
92 | 
```

--------------------------------------------------------------------------------
/docs/_sources/index.rst.txt:
--------------------------------------------------------------------------------

```
 1 | .. image:: _static/images/mcpswc.svg
 2 |    :alt: mcp-server-webcrawl heading
 3 |    :align: center
 4 |    :width: 100%
 5 | 
 6 | .. raw:: html
 7 | 
 8 |    <div style="text-align: center; margin-bottom: 2em;">
 9 |      <a href="https://pragmar.com/mcp-server-webcrawl/" style="margin: 0 4px;">Website</a> |
10 |      <a href="https://github.com/pragmar/mcp-server-webcrawl" style="margin: 0 4px;">Github</a> |
11 |      <a href="https://pragmar.github.io/mcp-server-webcrawl/" style="margin: 0 4px;">Docs</a> |
12 |      <a href="https://pypi.org/project/mcp-server-webcrawl/" style="margin: 0 4px;">PyPI</a>
13 | 
14 |    </div>
15 | 
16 | mcp-server-webcrawl
17 | ===============================================
18 | 
19 | Advanced search and retrieval for web crawler data. With **mcp-server-webcrawl**, your AI client filters
20 | and analyzes web content under your direction or autonomously. The server includes a full-text search
21 | interface with boolean support, and resource filtering by type, HTTP status, and more.
22 | 
23 | **mcp-server-webcrawl** provides the LLM a complete menu with which to search your web content, and works with
24 | a variety of web crawlers:
25 | 
26 | .. list-table:: Supported Crawlers
27 |    :header-rows: 1
28 |    :widths: 30 50 20
29 | 
30 |    * - Crawler/Format
31 |      - Description
32 |      - Setup Guide
33 |    * - `ArchiveBox <https://archivebox.io>`_
34 |      - Self-hosted web archiving tool
35 |      - `Setup Guide <https://pragmar.github.io/mcp-server-webcrawl/guides/archivebox.html>`_
36 |    * - `HTTrack <https://www.httrack.com>`_
37 |      - GUI/CLI website mirroring tool
38 |      - `Setup Guide <https://pragmar.github.io/mcp-server-webcrawl/guides/httrack.html>`_
39 |    * - `InterroBot <https://interro.bot>`_
40 |      - GUI crawler and analyzer
41 |      - `Setup Guide <https://pragmar.github.io/mcp-server-webcrawl/guides/interrobot.html>`_
42 |    * - `Katana <https://github.com/projectdiscovery/katana>`_
43 |      - CLI security-focused crawler
44 |      - `Setup Guide <https://pragmar.github.io/mcp-server-webcrawl/guides/katana.html>`_
45 |    * - `SiteOne <https://crawler.siteone.io>`_
46 |      - GUI crawler and analyzer
47 |      - `Setup Guide <https://pragmar.github.io/mcp-server-webcrawl/guides/siteone.html>`_
48 |    * - `WARC <https://en.wikipedia.org/wiki/WARC_(file_format)>`_
49 |      - Standard web archive format
50 |      - `Setup Guide <https://pragmar.github.io/mcp-server-webcrawl/guides/warc.html>`_
51 |    * - `wget <https://en.wikipedia.org/wiki/Wget>`_
52 |      - CLI website mirroring tool
53 |      - `Setup Guide <https://pragmar.github.io/mcp-server-webcrawl/guides/wget.html>`_
54 | 
55 | **mcp-server-webcrawl** is free and open source. It requires an MCP client such as Claude Desktop and Python (>=3.10), and is installed on the command line via pip:
56 | 
57 | .. code-block:: bash
58 | 
59 |    pip install mcp-server-webcrawl
60 | 
61 | .. raw:: html
62 | 
63 |    <iframe width="560" height="315" style="display: block;margin-bottom:1rem;" src="https://www.youtube.com/embed/Sid-GBxII1o" frameborder="0" allowfullscreen></iframe>
64 | 
65 | 
66 | .. toctree::
67 |    :maxdepth: 1
68 |    :caption: Contents:
69 | 
70 |    installation
71 |    guides
72 |    usage
73 |    prompts
74 |    interactive
75 |    modules
76 | 
77 | Indices and tables
78 | ------------------
79 | 
80 | * :ref:`genindex`
81 | * :ref:`modindex`
82 | * :ref:`search`
```

--------------------------------------------------------------------------------
/src/mcp_server_webcrawl/__init__.py:
--------------------------------------------------------------------------------

```python
 1 | import os
 2 | import sys
 3 | import asyncio
 4 | import tracemalloc
 5 | import unittest
 6 | import logging
 7 | 
 8 | from pathlib import Path
 9 | from argparse import ArgumentParser
10 | 
11 | from mcp_server_webcrawl.utils.cli import get_help_short_message, get_help_long_message
12 | from mcp_server_webcrawl.settings import DEBUG, DATA_DIRECTORY, FIXTURES_DIRECTORY
13 | from mcp_server_webcrawl.crawlers import get_crawler, VALID_CRAWLER_CHOICES
14 | 
15 | __version__: str = "0.15.0"
16 | __name__: str = "mcp-server-webcrawl"
17 | 
18 | if DEBUG:
19 |     tracemalloc.start()
20 | 
21 | class CustomHelpArgumentParser(ArgumentParser):
22 |     def print_help(self, file=None):
23 |         print(get_help_long_message(__version__))
24 | 
25 | def main() -> None:
26 |     """
27 |     Main entry point for the package. mcp-server-webcrawl should be on path if pip installed
28 |     """
29 | 
30 |     if len(sys.argv) == 1:
31 |         # \n parser error follows short message
32 |         sys.stderr.write(get_help_short_message(__version__) + "\n")
33 | 
34 |     parser: CustomHelpArgumentParser = CustomHelpArgumentParser(description="InterroBot MCP Server")
35 |     parser.add_argument("-c", "--crawler", type=str, choices=VALID_CRAWLER_CHOICES,
36 |             help="Specify which crawler to use (required unless running tests or interactive mode)")
37 |     parser.add_argument("--run-tests", action="store_true", help="Run tests instead of server")
38 |     parser.add_argument("-i", "--interactive", action="store_true", help="Run interactive terminal search mode")
39 |     parser.add_argument("-d", "--datasrc", type=str, help="Path to datasrc (required unless testing)")
40 |     args = parser.parse_args()
41 | 
42 |     if args.run_tests:
43 | 
44 |         # Check if FIXTURES_DIRECTORY is configured and exists
45 |         if FIXTURES_DIRECTORY is None or not FIXTURES_DIRECTORY.exists() or not FIXTURES_DIRECTORY.is_dir():
46 |             sys.stderr.write(f"Fixtures not configured in settings_local.py, or is not a valid directory.\nFIXTURES_DIRECTORY: {FIXTURES_DIRECTORY}")
47 |             sys.exit(1)
48 | 
49 |         # testing captures some cross-fixture file information, useful for debug
50 |         # force=True gets this to write during tests (usually quieted during run)
51 |         unittest_log: Path = DATA_DIRECTORY / "fixtures-report.log"
52 |         logging.basicConfig(level=logging.INFO, filename=unittest_log, filemode='w', force=True)
53 |         file_directory = os.path.dirname(os.path.abspath(__file__))
54 |         sys.exit(unittest.main(module=None, argv=["", "discover", "-s", file_directory, "-p", "*test*.py"]))
55 | 
56 |     if args.interactive:
57 |         from mcp_server_webcrawl.interactive.session import InteractiveSession
58 |         intersession = InteractiveSession(args.crawler, args.datasrc)
59 |         intersession.run()
60 |         sys.exit(0)
61 | 
62 |     if not args.datasrc:
63 |         parser.error("the -d/--datasrc argument is required when not in test mode")
64 | 
65 |     if not args.crawler or args.crawler.lower() not in VALID_CRAWLER_CHOICES:
66 |         valid_crawlers = ", ".join(VALID_CRAWLER_CHOICES)
67 |         parser.error(f"the -c/--crawler argument must be one of: {valid_crawlers}")
68 | 
69 |     # cli interaction prior to loading the server
70 |     from mcp_server_webcrawl.main import main as mcp_main
71 |     crawler = get_crawler(args.crawler)
72 |     asyncio.run(mcp_main(crawler, Path(args.datasrc)))
73 | 
74 | __all__ = ["main"]
75 | 
```
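
Representative invocations of the CLI entry point above; paths are placeholders, and `--run-tests` assumes FIXTURES_DIRECTORY is configured in settings_local.py.

```bash
# serve a wget archive to an MCP client over stdio
mcp-server-webcrawl --crawler wget --datasrc /path/to/wget/archives/

# terminal search UI, no MCP client involved
mcp-server-webcrawl --interactive

# developer test run (requires fixtures configured in settings_local.py)
mcp-server-webcrawl --run-tests
```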

--------------------------------------------------------------------------------
/docs/_sources/guides/wget.rst.txt:
--------------------------------------------------------------------------------

```
  1 | wget MCP Setup Guide
  2 | ====================
  3 | 
  4 | Instructions for setting up `mcp-server-webcrawl <https://pragmar.com/mcp-server-webcrawl/>`_ with
  5 | `wget <https://en.wikipedia.org/wiki/Wget>`_.
  6 | This allows your LLM (e.g. Claude Desktop) to search content and metadata from websites you've crawled.
  7 | 
  8 | .. raw:: html
  9 | 
 10 |    <iframe width="560" height="315" src="https://www.youtube.com/embed/uqEEqVsofhc" frameborder="0" allowfullscreen></iframe>
 11 | 
 12 | Follow along with the video, or the step-action guide.
 13 | 
 14 | Requirements
 15 | ------------
 16 | 
 17 | Before you begin, ensure you have:
 18 | 
 19 | - `Claude Desktop <https://claude.ai/download>`_ installed
 20 | - `Python <https://python.org>`_ 3.10 or later installed
 21 | - Basic familiarity with command line interfaces
 22 | - wget installed (macOS users can install via Homebrew, Windows users need WSL/Ubuntu)
 23 | 
 24 | Installation Steps
 25 | ------------------
 26 | 
 27 | 1. Install mcp-server-webcrawl
 28 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 29 | 
 30 | Open your terminal or command line and install the package:
 31 | 
 32 | .. code-block:: bash
 33 | 
 34 |     pip install mcp-server-webcrawl
 35 | 
 36 | Verify installation was successful by checking the help output:
 37 | 
 38 | .. code-block:: bash
 39 | 
 40 |     mcp-server-webcrawl --help
 41 | 
 42 | 2. Configure Claude Desktop
 43 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~
 44 | 
 45 | 1. Open Claude Desktop
 46 | 2. Go to **File → Settings → Developer → Edit Config**
 47 | 
 48 | 3. Add the following configuration (modify paths as needed):
 49 | 
 50 | .. code-block:: json
 51 | 
 52 |     {
 53 |       "mcpServers": {
 54 |         "webcrawl": {
 55 |           "command": "/path/to/mcp-server-webcrawl",
 56 |           "args": ["--crawler", "wget", "--datasrc",
 57 |             "/path/to/wget/archives/"]
 58 |         }
 59 |       }
 60 |     }
 61 | 
 62 | .. note::
 63 |    - On Windows, use ``"mcp-server-webcrawl"`` as the command
 64 |    - On macOS, use the absolute path (output of ``which mcp-server-webcrawl``)
 65 |    - Change ``/path/to/wget/archives/`` to your actual directory path
 66 | 
 67 | 4. Save the file and **completely exit** Claude Desktop (not just close the window)
 68 | 5. Restart Claude Desktop
 69 | 
 70 | 3. Crawl Websites with wget
 71 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~
 72 | 
 73 | 1. Open Terminal (macOS) or Ubuntu/WSL (Windows)
 74 | 2. Navigate to your target directory for storing crawls
 75 | 3. Run wget with the mirror option:
 76 | 
 77 | .. code-block:: bash
 78 | 
 79 |     wget --mirror https://example.com
 80 | 
 81 | 4. Verify and Use
 82 | ~~~~~~~~~~~~~~~~~
 83 | 
 84 | 1. In Claude Desktop, you should now see an MCP tool option under Search and Tools
 85 | 2. Ask Claude to list your crawled sites:
 86 | 
 87 | .. code-block:: text
 88 | 
 89 |     Can you list the crawled sites available?
 90 | 
 91 | 3. Try searching content from your crawls:
 92 | 
 93 | .. code-block:: text
 94 | 
 95 |     Can you find information about [topic] on [crawled site]?
 96 | 
 97 | Troubleshooting
 98 | ---------------
 99 | 
100 | - If Claude doesn't show MCP tools after restart, verify your configuration file is correctly formatted
101 | - Ensure Python and mcp-server-webcrawl are properly installed, and on PATH or using absolute paths
102 | - Check that your crawl directory path in the configuration is correct
103 | - Remember that the first time you use a function, Claude will ask for permission
104 | - Indexing for file-based archives (wget included) requires build time on the first search; the time depends on archive size
105 | 
106 | For more details, including API documentation and other crawler options, visit the `mcp-server-webcrawl documentation <https://github.com/pragmar/mcp-server-webcrawl>`_.
```
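
Before pointing ``--datasrc`` at the archive folder, a quick sanity check can confirm the layout looks right. The sketch below assumes each per-host directory created by ``wget --mirror`` (e.g. ``example.com/``) corresponds to one crawled site; that mapping is an assumption, not something the guide states:

```python
from pathlib import Path

def list_wget_sites(datasrc: str) -> list[str]:
    """List per-host directories under a wget --mirror target directory."""
    root = Path(datasrc)
    # wget --mirror creates one directory named after each crawled host
    return sorted(p.name for p in root.iterdir() if p.is_dir())

# example (placeholder path):
# print(list_wget_sites("/path/to/wget/archives/"))
```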

--------------------------------------------------------------------------------
/docs/_static/css/badge_only.css:
--------------------------------------------------------------------------------

```css
1 | .clearfix{*zoom:1}.clearfix:after,.clearfix:before{display:table;content:""}.clearfix:after{clear:both}@font-face{font-family:FontAwesome;font-style:normal;font-weight:400;src:url(fonts/fontawesome-webfont.eot?674f50d287a8c48dc19ba404d20fe713?#iefix) format("embedded-opentype"),url(fonts/fontawesome-webfont.woff2?af7ae505a9eed503f8b8e6982036873e) format("woff2"),url(fonts/fontawesome-webfont.woff?fee66e712a8a08eef5805a46892932ad) format("woff"),url(fonts/fontawesome-webfont.ttf?b06871f281fee6b241d60582ae9369b9) format("truetype"),url(fonts/fontawesome-webfont.svg?912ec66d7572ff821749319396470bde#FontAwesome) format("svg")}.fa:before{font-family:FontAwesome;font-style:normal;font-weight:400;line-height:1}.fa:before,a .fa{text-decoration:inherit}.fa:before,a .fa,li .fa{display:inline-block}li .fa-large:before{width:1.875em}ul.fas{list-style-type:none;margin-left:2em;text-indent:-.8em}ul.fas li .fa{width:.8em}ul.fas li .fa-large:before{vertical-align:baseline}.fa-book:before,.icon-book:before{content:"\f02d"}.fa-caret-down:before,.icon-caret-down:before{content:"\f0d7"}.fa-caret-up:before,.icon-caret-up:before{content:"\f0d8"}.fa-caret-left:before,.icon-caret-left:before{content:"\f0d9"}.fa-caret-right:before,.icon-caret-right:before{content:"\f0da"}.rst-versions{position:fixed;bottom:0;left:0;width:300px;color:#fcfcfc;background:#1f1d1d;font-family:Lato,proxima-nova,Helvetica Neue,Arial,sans-serif;z-index:400}.rst-versions a{color:#2980b9;text-decoration:none}.rst-versions .rst-badge-small{display:none}.rst-versions .rst-current-version{padding:12px;background-color:#272525;display:block;text-align:right;font-size:90%;cursor:pointer;color:#27ae60}.rst-versions .rst-current-version:after{clear:both;content:"";display:block}.rst-versions .rst-current-version .fa{color:#fcfcfc}.rst-versions .rst-current-version .fa-book,.rst-versions .rst-current-version .icon-book{float:left}.rst-versions .rst-current-version.rst-out-of-date{background-color:#e74c3c;color:#fff}.rst-versions .rst-current-version.rst-active-old-version{background-color:#f1c40f;color:#000}.rst-versions.shift-up{height:auto;max-height:100%;overflow-y:scroll}.rst-versions.shift-up .rst-other-versions{display:block}.rst-versions .rst-other-versions{font-size:90%;padding:12px;color:grey;display:none}.rst-versions .rst-other-versions hr{display:block;height:1px;border:0;margin:20px 0;padding:0;border-top:1px solid #413d3d}.rst-versions .rst-other-versions dd{display:inline-block;margin:0}.rst-versions .rst-other-versions dd a{display:inline-block;padding:6px;color:#fcfcfc}.rst-versions .rst-other-versions .rtd-current-item{font-weight:700}.rst-versions.rst-badge{width:auto;bottom:20px;right:20px;left:auto;border:none;max-width:300px;max-height:90%}.rst-versions.rst-badge .fa-book,.rst-versions.rst-badge .icon-book{float:none;line-height:30px}.rst-versions.rst-badge.shift-up .rst-current-version{text-align:right}.rst-versions.rst-badge.shift-up .rst-current-version .fa-book,.rst-versions.rst-badge.shift-up .rst-current-version .icon-book{float:left}.rst-versions.rst-badge>.rst-current-version{width:auto;height:30px;line-height:30px;padding:0 6px;display:block;text-align:center}@media screen and (max-width:768px){.rst-versions{width:85%;display:none}.rst-versions.shift{display:block}}#flyout-search-form{padding:6px}
```

--------------------------------------------------------------------------------
/src/mcp_server_webcrawl/crawlers/interrobot/crawler.py:
--------------------------------------------------------------------------------

```python
 1 | 
 2 | from pathlib import Path
 3 | 
 4 | from mcp.types import Tool
 5 | 
 6 | from mcp_server_webcrawl.models.sites import SiteResult
 7 | from mcp_server_webcrawl.models.resources import (
 8 |     RESOURCES_FIELDS_DEFAULT,
 9 |     RESOURCES_FIELDS_BASE,
10 |     RESOURCES_DEFAULT_SORT_MAPPING,
11 |     RESOURCES_FIELDS_OPTIONS,
12 | )
13 | from mcp_server_webcrawl.crawlers.base.crawler import BaseCrawler
14 | from mcp_server_webcrawl.crawlers.interrobot.adapter import (
15 |     get_sites,
16 |     get_resources,
17 |     INTERROBOT_RESOURCE_FIELD_MAPPING,
18 |     INTERROBOT_SITE_FIELD_MAPPING,
19 |     INTERROBOT_SITE_FIELD_REQUIRED,
20 | )
21 | from mcp_server_webcrawl.utils.tools import get_crawler_tools
22 | from mcp_server_webcrawl.utils.logger import get_logger
23 | 
24 | logger = get_logger()
25 | 
26 | 
27 | class InterroBotCrawler(BaseCrawler):
28 |     """
29 |     A crawler implementation for InterroBot data sources.
30 |     Provides functionality for accessing and searching web content from InterroBot.
31 |     """
32 | 
33 |     def __init__(
34 |         self,
35 |         datasrc: Path,
36 |     ) -> None:
37 |         """
38 |         Initialize the InterroBotCrawler with a data source path and required adapter functions.
39 | 
40 |         Args:
41 |             datasrc: Path to the data source
42 |         """
43 |         super().__init__(datasrc, get_sites, get_resources, resource_field_mapping=INTERROBOT_RESOURCE_FIELD_MAPPING)
44 |         assert datasrc.is_file() and datasrc.suffix == ".db", f"{self.__class__.__name__} datasrc must be a db file"
45 | 
46 |     async def mcp_list_tools(self) -> list[Tool]:
47 |         """
48 |         List available tools for this crawler.
49 | 
50 |         Returns:
51 |             List of Tool objects
52 |         """
53 |         # get the default crawler tools, then override necessary fields
54 |         all_sites: list[SiteResult] = self._adapter_get_sites(self._datasrc)
55 |         all_sites_ids: list[int] = [s.id for s in all_sites if s is not None and isinstance(s.id, int)]
56 |         default_tools: list[Tool] = get_crawler_tools(sites=all_sites)
57 |         assert len(default_tools) == 2, "expected exactly 2 Tools: sites and resources"
58 | 
59 |         # the get_crawler_tools defaults can be replaced or extended; here they are overridden
60 |         # in place, ad hoc, rather than threading override parameters through get_crawler_tools
61 |         default_sites_tool: Tool
62 |         default_resources_tool: Tool
63 |         default_sites_tool, default_resources_tool = default_tools
64 |         sites_field_options: list[str] = list(set(INTERROBOT_SITE_FIELD_MAPPING.keys()) - set(INTERROBOT_SITE_FIELD_REQUIRED))
65 |         dst_props: dict = default_sites_tool.inputSchema["properties"]
66 |         dst_props["fields"]["items"]["enum"] = sites_field_options
67 | 
68 |         resources_sort_options: list[str] = list(RESOURCES_DEFAULT_SORT_MAPPING.keys())
69 |         all_sites_display: str = ", ".join([f"{s.name} (site: {s.id})" for s in all_sites])
70 | 
71 |         drt_props: dict = default_resources_tool.inputSchema["properties"]
72 |         drt_props["fields"]["items"]["enum"] = RESOURCES_FIELDS_OPTIONS
73 |         drt_props["sort"]["enum"] = resources_sort_options
74 |         drt_props["sites"]["items"]["enum"] = all_sites_ids
75 |         drt_props["sites"]["description"] = ("Optional "
76 |                 "list of project IDs to filter search results to a specific site. In 95% "
77 |                 "of scenarios, you'd filter to only one site, but multi-site filtering is offered "
78 |                 f"for advanced search scenarios. Available sites include {all_sites_display}.")
79 | 
80 |         return [default_sites_tool, default_resources_tool]
81 | 
```
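
A minimal usage sketch for `InterroBotCrawler`, assuming a valid InterroBot `.db` file path (placeholder below); `mcp_list_tools` is async, so it is driven with `asyncio` here:

```python
import asyncio
from pathlib import Path

from mcp_server_webcrawl.crawlers.interrobot.crawler import InterroBotCrawler

async def show_tools(db_path: str) -> None:
    # the constructor asserts the path is an existing .db file
    crawler = InterroBotCrawler(Path(db_path))
    tools = await crawler.mcp_list_tools()
    for tool in tools:
        # each Tool carries the JSON input schema the LLM sees (sites, resources)
        print(tool.name)

# example (placeholder path):
# asyncio.run(show_tools("/path/to/interrobot.v2.db"))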

--------------------------------------------------------------------------------
/src/mcp_server_webcrawl/utils/parsetab.py:
--------------------------------------------------------------------------------

```python
 1 | 
 2 | # parsetab.py
 3 | # This file is automatically generated. Do not edit.
 4 | # pylint: disable=W,C,R
 5 | _tabversion = '3.10'
 6 | 
 7 | _lr_method = 'LALR'
 8 | 
 9 | _lr_signature = 'rightNOTleftANDleftORAND COLON COMPARATOR COMP_OP FIELD LPAREN NOT OR QUOTED_STRING RPAREN TERM URL_FIELD WILDCARD\n        query : expression\n        \n        expression : expression AND expression\n                    | expression OR expression\n                    | expression NOT expression\n        \n        expression : NOT expression\n        \n        expression : LPAREN expression RPAREN\n        \n        expression : URL_FIELD\n        \n        value : TERM\n              | WILDCARD\n              | QUOTED_STRING\n        \n        expression : value\n        \n        expression : FIELD COLON COMP_OP value\n                | FIELD COLON value\n                | FIELD COMPARATOR value\n        '
10 |     
11 | _lr_action_items = {'NOT':([0,2,3,4,5,6,8,9,10,11,12,13,14,15,18,19,20,21,23,24,25,],[3,13,3,3,-7,-11,-8,-9,-10,3,3,3,13,13,-2,-3,13,-6,-13,-14,-12,]),'LPAREN':([0,3,4,11,12,13,],[4,4,4,4,4,4,]),'URL_FIELD':([0,3,4,11,12,13,],[5,5,5,5,5,5,]),'FIELD':([0,3,4,11,12,13,],[7,7,7,7,7,7,]),'TERM':([0,3,4,11,12,13,16,17,22,],[8,8,8,8,8,8,8,8,8,]),'WILDCARD':([0,3,4,11,12,13,16,17,22,],[9,9,9,9,9,9,9,9,9,]),'QUOTED_STRING':([0,3,4,11,12,13,16,17,22,],[10,10,10,10,10,10,10,10,10,]),'$end':([1,2,5,6,8,9,10,14,18,19,20,21,23,24,25,],[0,-1,-7,-11,-8,-9,-10,-5,-2,-3,-4,-6,-13,-14,-12,]),'AND':([2,5,6,8,9,10,14,15,18,19,20,21,23,24,25,],[11,-7,-11,-8,-9,-10,11,11,-2,-3,11,-6,-13,-14,-12,]),'OR':([2,5,6,8,9,10,14,15,18,19,20,21,23,24,25,],[12,-7,-11,-8,-9,-10,12,12,12,-3,12,-6,-13,-14,-12,]),'RPAREN':([5,6,8,9,10,14,15,18,19,20,21,23,24,25,],[-7,-11,-8,-9,-10,-5,21,-2,-3,-4,-6,-13,-14,-12,]),'COLON':([7,],[16,]),'COMPARATOR':([7,],[17,]),'COMP_OP':([16,],[22,]),}
12 | 
13 | _lr_action = {}
14 | for _k, _v in _lr_action_items.items():
15 |    for _x,_y in zip(_v[0],_v[1]):
16 |       if not _x in _lr_action:  _lr_action[_x] = {}
17 |       _lr_action[_x][_k] = _y
18 | del _lr_action_items
19 | 
20 | _lr_goto_items = {'query':([0,],[1,]),'expression':([0,3,4,11,12,13,],[2,14,15,18,19,20,]),'value':([0,3,4,11,12,13,16,17,22,],[6,6,6,6,6,6,23,24,25,]),}
21 | 
22 | _lr_goto = {}
23 | for _k, _v in _lr_goto_items.items():
24 |    for _x, _y in zip(_v[0], _v[1]):
25 |        if not _x in _lr_goto: _lr_goto[_x] = {}
26 |        _lr_goto[_x][_k] = _y
27 | del _lr_goto_items
28 | _lr_productions = [
29 |   ("S' -> query","S'",1,None,None,None),
30 |   ('query -> expression','query',1,'p_query','search.py',317),
31 |   ('expression -> expression AND expression','expression',3,'p_expression_binary','search.py',403),
32 |   ('expression -> expression OR expression','expression',3,'p_expression_binary','search.py',404),
33 |   ('expression -> expression NOT expression','expression',3,'p_expression_binary','search.py',405),
34 |   ('expression -> NOT expression','expression',2,'p_expression_not','search.py',462),
35 |   ('expression -> LPAREN expression RPAREN','expression',3,'p_expression_group','search.py',477),
36 |   ('expression -> URL_FIELD','expression',1,'p_expression_url_field','search.py',483),
37 |   ('value -> TERM','value',1,'p_value','search.py',503),
38 |   ('value -> WILDCARD','value',1,'p_value','search.py',504),
39 |   ('value -> QUOTED_STRING','value',1,'p_value','search.py',505),
40 |   ('expression -> value','expression',1,'p_expression_term','search.py',519),
41 |   ('expression -> FIELD COLON COMP_OP value','expression',4,'p_expression_field_search','search.py',532),
42 |   ('expression -> FIELD COLON value','expression',3,'p_expression_field_search','search.py',533),
43 |   ('expression -> FIELD COMPARATOR value','expression',3,'p_expression_field_search','search.py',534),
44 | ]
45 | 
```
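
The table above is PLY's generated LALR table for the search query grammar; `_lr_signature` spells out the productions (boolean AND/OR/NOT, grouping, field expressions with `:` and comparators, terms, wildcards, quoted strings). A few illustrative query strings inferred from those productions; the field names and comparator syntax are assumptions, not defined by the table itself:

```python
# illustrative queries for the grammar above; field names (content, status, url)
# and the ">=" comparator form are assumptions for the FIELD/URL_FIELD/COMP_OP
# tokens, not taken from the generated table
EXAMPLE_QUERIES = [
    "privacy",                        # bare TERM
    '"privacy policy"',               # QUOTED_STRING
    "privac*",                        # WILDCARD
    "content: policy",                # FIELD COLON value
    "status: >=400",                  # FIELD COLON COMP_OP value
    "NOT (privacy OR terms)",         # NOT expression with grouping
    "policy AND url: example.com",    # boolean AND combining expressions
]
```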

--------------------------------------------------------------------------------
/docs/_sources/guides/siteone.rst.txt:
--------------------------------------------------------------------------------

```
  1 | SiteOne MCP Setup Guide
  2 | =======================
  3 | 
  4 | Instructions for setting up `mcp-server-webcrawl <https://pragmar.com/mcp-server-webcrawl/>`_ with SiteOne crawler.
  5 | This allows your LLM (e.g. Claude Desktop) to search content and metadata from websites you've crawled using SiteOne.
  6 | 
  7 | .. raw:: html
  8 | 
  9 |    <iframe width="560" height="315" src="https://www.youtube.com/embed/JOGRYbo6WwI" frameborder="0" allowfullscreen></iframe>
 10 | 
 11 | Follow along with the video, or the step-action guide below.
 12 | 
 13 | Requirements
 14 | ------------
 15 | 
 16 | Before you begin, ensure you have:
 17 | 
 18 | - `Claude Desktop <https://claude.ai/download>`_ installed
 19 | - `Python <https://python.org>`_ 3.10 or later installed
 20 | - `SiteOne Crawler <https://crawler.siteone.io>`_ installed
 21 | - Basic familiarity with command line interfaces
 22 | 
 23 | What is SiteOne?
 24 | ----------------
 25 | 
 26 | SiteOne is a GUI crawler that offers:
 27 | 
 28 | - User-friendly desktop interface for setting up and managing crawls
 29 | - Offline website generation capabilities
 30 | - Comprehensive crawl reporting
 31 | - Intuitive controls for non-technical users
 32 | 
 33 | Installation Steps
 34 | ------------------
 35 | 
 36 | 1. Install mcp-server-webcrawl
 37 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 38 | 
 39 | Open your terminal or command line and install the package::
 40 | 
 41 |     pip install mcp-server-webcrawl
 42 | 
 43 | Verify installation was successful::
 44 | 
 45 |     mcp-server-webcrawl --help
 46 | 
 47 | 2. Create Crawls with SiteOne
 48 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 49 | 
 50 | 1. Open SiteOne Crawler application
 51 | 2. Enter a URL to crawl (e.g., example.com)
 52 | 3. **Important**: Check the "Generate offline website" option (this is required for MCP integration)
 53 | 4. Click the start button to begin crawling
 54 | 5. Repeat for additional sites as needed (e.g., pragmar.com)
 55 | 6. Note the directory where SiteOne is storing the generated offline content (this is shown in the application)
 56 | 
 57 | 3. Configure Claude Desktop
 58 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~
 59 | 
 60 | 1. Open Claude Desktop
 61 | 2. Go to **File → Settings → Developer → Edit Config**
 62 | 3. Add the following configuration (modify paths as needed):
 63 | 
 64 | .. code-block:: json
 65 | 
 66 |     {
 67 |       "mcpServers": {
 68 |         "webcrawl": {
 69 |           "command": "/path/to/mcp-server-webcrawl",
 70 |           "args": ["--crawler", "siteone", "--datasrc",
 71 |             "/path/to/siteone/archives/"]
 72 |         }
 73 |       }
 74 |     }
 75 | 
 76 | .. note::
 77 |    - On Windows, use ``"mcp-server-webcrawl"`` as the command
 78 |    - On macOS, use the absolute path (output of ``which mcp-server-webcrawl``)
 79 |    - Change ``/path/to/siteone/archives/`` to the actual path where SiteOne stores offline website content
 80 | 
 81 | 4. Save the file and **completely exit** Claude Desktop (not just close the window)
 82 | 5. Restart Claude Desktop
 83 | 
 84 | 4. Verify and Use
 85 | ~~~~~~~~~~~~~~~~~
 86 | 
 87 | 1. In Claude Desktop, you should now see MCP tools available under Search and Tools
 88 | 2. Ask Claude to list your crawled sites::
 89 | 
 90 |     Can you list the crawled sites available?
 91 | 
 92 | 3. Try searching content from your crawls::
 93 | 
 94 |     Can you find information about [topic] on [crawled site]?
 95 | 
 96 | 4. Explore specific topics on your crawled sites::
 97 | 
 98 |     I'm interested in [keyword] in [crawled domain]. Can you tell me about it?
 99 | 
100 | Troubleshooting
101 | ---------------
102 | 
103 | - If Claude doesn't show MCP tools after restart, verify your configuration file is correctly formatted
104 | - Ensure Python and mcp-server-webcrawl are properly installed
105 | - Check that your SiteOne archives path in the configuration is correct
106 | - Make sure the "Generate offline website" option was checked when creating crawls
107 | - Verify that each crawl completed successfully and files were saved to the expected location
108 | - Remember that the first time you use a function, Claude will ask for permission
109 | 
110 | For more details, including API documentation and other crawler options, visit the `mcp-server-webcrawl documentation <https://github.com/pragmar/mcp-server-webcrawl>`_.
```

--------------------------------------------------------------------------------
/docs/_sources/guides/warc.rst.txt:
--------------------------------------------------------------------------------

```
  1 | WARC MCP Setup Guide
  2 | ====================
  3 | 
  4 | Instructions for setting up `mcp-server-webcrawl <https://pragmar.com/mcp-server-webcrawl/>`_ with
  5 | `WARC <https://en.wikipedia.org/wiki/WARC_\(file_format\)>`_ files to allow your LLM (e.g.
  6 | Claude Desktop) to search content and metadata from websites you've archived in WARC format.
  7 | 
  8 | .. raw:: html
  9 | 
 10 |    <iframe width="560" height="315" src="https://www.youtube.com/embed/fx-4WZu-UT8" frameborder="0" allowfullscreen></iframe>
 11 | 
 12 | Follow along with the video, or the step-action guide below.
 13 | 
 14 | Requirements
 15 | ------------
 16 | 
 17 | Before you begin, ensure you have:
 18 | 
 19 | - `Claude Desktop <https://claude.ai/download>`_ installed
 20 | - `Python <https://python.org>`_ 3.10 or later installed
 21 | - Basic familiarity with command line interfaces
 22 | - wget installed (macOS users can install via Homebrew, Windows users need WSL/Ubuntu)
 23 | 
 24 | What are WARC Files?
 25 | --------------------
 26 | 
 27 | WARC files are single-file archives that store complete crawl data including:
 28 | 
 29 | - HTTP status codes
 30 | - HTTP headers
 31 | - Response content
 32 | 
 33 | Compared to wget running in mirror mode:
 34 | 
 35 | - **WARC**: More comprehensive (preserves status codes and headers) but slower crawling
 36 | - **wget mirror**: Faster crawling but doesn't preserve status codes or headers
 37 | 
 38 | Installation Steps
 39 | ------------------
 40 | 
 41 | 1. Install mcp-server-webcrawl
 42 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 43 | 
 44 | Open your terminal or command line and install the package::
 45 | 
 46 |     pip install mcp-server-webcrawl
 47 | 
 48 | Verify installation was successful::
 49 | 
 50 |     mcp-server-webcrawl --help
 51 | 
 52 | 2. Configure Claude Desktop
 53 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~
 54 | 
 55 | 1. Open Claude Desktop
 56 | 2. Go to **File → Settings → Developer → Edit Config**
 57 | 3. Add the following configuration (modify paths as needed):
 58 | 
 59 | .. code-block:: json
 60 | 
 61 |     {
 62 |       "mcpServers": {
 63 |         "webcrawl": {
 64 |           "command": "/path/to/mcp-server-webcrawl",
 65 |           "args": ["--crawler", "warc", "--datasrc",
 66 |             "/path/to/warc/archives/"]
 67 |         }
 68 |       }
 69 |     }
 70 | 
 71 | .. note::
 72 |    - On Windows, use ``"mcp-server-webcrawl"`` as the command
 73 |    - On macOS, use the absolute path (output of ``which mcp-server-webcrawl``)
 74 |    - Change ``/path/to/warc/archives/`` to your actual directory path where WARC files are stored
 75 | 
 76 | 4. Save the file and **completely exit** Claude Desktop (not just close the window)
 77 | 5. Restart Claude Desktop
 78 | 
 79 | 3. Create WARC Files with Wget
 80 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 81 | 
 82 | 1. Open Terminal (macOS) or Ubuntu/WSL (Windows)
 83 | 2. Navigate to your target directory for storing WARC files
 84 | 3. Run wget with WARC options:
 85 | 
 86 | .. code-block:: bash
 87 | 
 88 |     # basic WARC capture
 89 |     wget --warc-file=example --recursive https://example.com
 90 | 
 91 |     # more comprehensive capture with page requisites (CSS, images, etc.)
 92 |     wget --warc-file=example --recursive --page-requisites https://example.com
 93 | 
 94 | Your WARC files will be created with a .warc.gz extension in your current directory.
 95 | 
 96 | 4. Verify and Use
 97 | ~~~~~~~~~~~~~~~~~
 98 | 
 99 | 1. In Claude Desktop, you should now see MCP tools available under Search and Tools
100 | 2. Ask Claude to list your crawled sites::
101 | 
102 |     Can you list the crawled sites available?
103 | 
104 | 3. Try searching content from your crawls::
105 | 
106 |     Can you find information about [topic] on [crawled site]?
107 | 
108 | Troubleshooting
109 | ---------------
110 | 
111 | - If Claude doesn't show MCP tools after restart, verify your configuration file is correctly formatted
112 | - Ensure Python and mcp-server-webcrawl are properly installed
113 | - Check that your WARC directory path in the configuration is correct
114 | - Make sure your WARC files have the correct extension (typically .warc.gz)
115 | - Remember that the first time you use each function, Claude will ask for permission
116 | - For large WARC files, initial indexing may take some time
117 | 
118 | For more details, including API documentation and other crawler options, visit the `mcp-server-webcrawl documentation <https://github.com/pragmar/mcp-server-webcrawl>`_.
```

--------------------------------------------------------------------------------
/src/mcp_server_webcrawl/extras/regex.py:
--------------------------------------------------------------------------------

```python
  1 | import re
  2 | 
  3 | from functools import lru_cache
  4 | from typing import Final
  5 | from logging import Logger
  6 | 
  7 | from mcp_server_webcrawl.utils.logger import get_logger
  8 | 
  9 | __REGEX_PATTERNS_REGEX_HAZARDS: Final[list[str]] = [
 10 |     r"\([^)]*\*[^)]*\+",                   # (.*)*+, (.+)*+, etc.
 11 |     r"\([^)]*\+[^)]*\*",                   # (.+)*., (.*)++, etc.
 12 |     r"\([^)]*\+[^)]*\+",                   # (.+)+, (.++)+ etc.
 13 |     r"\([^)]*\*[^)]*\*",                   # (.*)*, (.**) etc.
 14 |     r"\.\*.*\.\*",                         # .*.* patterns
 15 |     r"\.\+.*\.\+",                         # .+.+ patterns
 16 |     r"\([^)]*\?\)\*",                      # (a?)* patterns
 17 |     r"\([^)]*\?\)\+",                      # (a?)+ patterns
 18 |     r"\([^)]*[*+?][^)]*[*+?][^)]*\)[*+]",  # 2+ quantifiers inside, then quantifier outside
 19 | ]
 20 | 
 21 | logger: Logger = get_logger()
 22 | 
 23 | @lru_cache(maxsize=None)
 24 | def __get_compiled_hazard_patterns():
 25 |     """
 26 |     Lazy load compiled patterns
 27 |     """
 28 |     compiled_patterns = []
 29 |     for hazard in __REGEX_PATTERNS_REGEX_HAZARDS:
 30 |         try:
 31 |             compiled_patterns.append(re.compile(hazard))
 32 |         except re.error as ex:
 33 |             logger.warning(f"Invalid hazard pattern {hazard}: {ex}")
 34 |             continue
 35 |     return compiled_patterns
 36 | 
 37 | def __regex_is_hazardous(pattern: str) -> bool:
 38 |     """
 39 |     Check if a regex pattern might cause catastrophic backtracking
 40 |     or otherwise unacceptable performance over up to 100 HTML files
 41 |     """
 42 | 
 43 |     compiled_hazards = __get_compiled_hazard_patterns()
 44 | 
 45 |     for hazard_pattern in compiled_hazards:
 46 |         try:
 47 |             if hazard_pattern.search(pattern):
 48 |                 logger.error(f"hazardous regex discarded {pattern} matched {hazard_pattern.pattern}")
 49 |                 return True
 50 |         except re.error as ex:
 51 |             logger.warning(f"Error checking hazard pattern {hazard_pattern.pattern}: {ex}")
 52 |             continue
 53 | 
 54 |     return False
 55 | 
 56 | def get_regex(headers: str, content: str, patterns: list[str]) -> list[dict[str, str | int]]:
 57 |     """
 58 |     Takes headers and content and gets regex matches
 59 | 
 60 |     Arguments:
 61 |         headers: The headers to search
 62 |         content: The content to search
 63 |         patterns: The regex patterns
 64 | 
 65 |     Returns:
 66 |         A list of dicts, with selector, value, groups, position info, and source
 67 |     """
 68 | 
 69 |     if not isinstance(content, str):
 70 |         content = ""
 71 |     if not isinstance(headers, str):
 72 |         headers = ""
 73 | 
 74 |     if not isinstance(patterns, list) or not all(isinstance(item, str) for item in patterns):
 75 |         raise ValueError("patterns must be a list of strings")
 76 | 
 77 |     results = []
 78 | 
 79 |     if content == "" and headers == "":
 80 |         return results
 81 | 
 82 |     re_patterns = []
 83 |     for pattern in patterns:
 84 |         if __regex_is_hazardous(pattern):
 85 |             logger.warning(f"Hazardous regex pattern '{pattern}'")
 86 |             continue
 87 | 
 88 |         try:
 89 |             re_pattern = re.compile(pattern)
 90 |             re_patterns.append(re_pattern)
 91 |         except re.error as ex:
 92 |             logger.warning(f"Invalid regex pattern '{pattern}': {ex}")
 93 |             continue
 94 | 
 95 |     # search headers and content
 96 |     search_targets = [("headers", headers), ("content", content)]
 97 | 
 98 |     for re_pattern in re_patterns:
 99 |         for source_name, search_text in search_targets:
100 |             if not search_text:
101 |                 continue
102 | 
103 |             for match in re_pattern.finditer(search_text):
104 |                 regex_hit: dict[str, str | int] = {
105 |                     "selector": re_pattern.pattern,
106 |                     "value": match.group(0),
107 |                     "source": source_name  # headers or content
108 |                 }
109 | 
110 |                 if match.groups():
111 |                     for i, group in enumerate(match.groups(), 1):
112 |                         if group is not None:
113 |                             regex_hit[f"group_{i}"] = group
114 | 
115 |                 regex_hit["start"] = match.start()
116 |                 regex_hit["end"] = match.end()
117 |                 results.append(regex_hit)
118 | 
119 |     return results
```
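
A small usage sketch for `get_regex`, assuming it is imported from this module; the sample headers, content, and patterns are made up for illustration, and the result keys mirror the function above (`selector`, `value`, `source`, `group_N`, `start`, `end`):

```python
from mcp_server_webcrawl.extras.regex import get_regex

headers = "Content-Type: text/html\nX-Backend: app-01"
content = "<a href='mailto:info@example.com'>Contact</a>"

hits = get_regex(
    headers,
    content,
    [r"Content-Type:\s*([\w/]+)", r"[\w.+-]+@[\w.-]+\.\w+"],
)
for hit in hits:
    # e.g. {"selector": ..., "value": "Content-Type: text/html", "source": "headers", ...}
    print(hit["source"], hit["value"], hit.get("group_1"))
```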

--------------------------------------------------------------------------------
/docs/_sources/guides/interrobot.rst.txt:
--------------------------------------------------------------------------------

```
  1 | InterroBot MCP Setup Guide
  2 | ==========================
  3 | 
  4 | Instructions for setting up `mcp-server-webcrawl <https://pragmar.com/mcp-server-webcrawl/>`_ with InterroBot.
  5 | This allows your LLM (e.g. Claude Desktop) to search content and metadata from websites you've crawled with InterroBot.
  6 | 
  7 | .. raw:: html
  8 | 
  9 |    <iframe width="560" height="315" src="https://www.youtube.com/embed/55y8oKWXJLs" frameborder="0" allowfullscreen></iframe>
 10 | 
 11 | Follow along with the video, or the step-action guide below.
 12 | 
 13 | Requirements
 14 | ------------
 15 | 
 16 | Before you begin, ensure you have:
 17 | 
 18 | - `Claude Desktop <https://claude.ai/download>`_ installed
 19 | - `Python <https://python.org>`_ 3.10 or later installed
 20 | - `InterroBot <https://interro.bot>`_ installed
 21 | - Basic familiarity with command line interfaces
 22 | 
 23 | What is InterroBot?
 24 | -------------------
 25 | 
 26 | InterroBot is a commercial web crawler and analyzer that works seamlessly with mcp-server-webcrawl, providing several advantages:
 27 | 
 28 | - User-friendly graphical interface for managing crawls
 29 | - Comprehensive data collection including page content and metadata
 30 | - Natively indexed, no first search build lag
 31 | - Cross-platform \(Windows, macOS, Android\)
 32 | 
 33 | Installation Steps
 34 | ------------------
 35 | 
 36 | 1. Install mcp-server-webcrawl
 37 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 38 | 
 39 | Open your terminal or command line and install the package::
 40 | 
 41 |     pip install mcp-server-webcrawl
 42 | 
 43 | Verify installation was successful::
 44 | 
 45 |     mcp-server-webcrawl --help
 46 | 
 47 | 2. Create Crawls with InterroBot
 48 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 49 | 
 50 | 1. Open InterroBot
 51 | 2. For a new project, you'll see an empty project screen
 52 | 3. Add websites to crawl by entering URLs (e.g., example.com, pragmar.com)
 53 | 4. Wait for the crawling to complete (typically takes a few seconds to minutes depending on site size)
 54 | 5. Note the location of your InterroBot database file, which will be needed for configuration. It is available in InterroBot options, under the Advanced section:
 55 |    - On Windows: Typically in ``[homedir]/Documents/InterroBot/interrobot.v2.db``
 56 |    - On macOS: Path can be found in InterroBot settings page
 57 | 
 58 | 3. Configure Claude Desktop
 59 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~
 60 | 
 61 | 1. Open Claude Desktop
 62 | 2. Go to **File → Settings → Developer → Edit Config**
 63 | 3. Add the following configuration (modify paths as needed):
 64 | 
 65 | .. code-block:: json
 66 | 
 67 |     {
 68 |       "mcpServers": {
 69 |         "webcrawl": {
 70 |           "command": "/path/to/mcp-server-webcrawl",
 71 |           "args": ["--crawler", "interrobot", "--datasrc",
 72 |             "[homedir]/Documents/InterroBot/interrobot.v2.db"]
 73 |         }
 74 |       }
 75 |     }
 76 | 
 77 | .. note::
 78 |    - On Windows, use ``"mcp-server-webcrawl"`` as the command
 79 |    - On macOS, use the absolute path (output of ``which mcp-server-webcrawl``)
 80 |    - Replace ``[homedir]/Documents/InterroBot/interrobot.v2.db`` with the actual path to your InterroBot database file, available in InterroBot options
 81 | 
 82 | 4. Save the file and **completely exit** Claude Desktop (not just close the window)
 83 | 5. Restart Claude Desktop
 84 | 
 85 | 4. Verify and Use
 86 | ~~~~~~~~~~~~~~~~~
 87 | 
 88 | 1. In Claude Desktop, you should now see MCP tools available under Search and Tools
 89 | 2. Ask Claude to list your crawled sites::
 90 | 
 91 |     Can you list the crawled sites available?
 92 | 
 93 | 3. Try searching content from your crawls::
 94 | 
 95 |     Can you find information about [topic] on [crawled site]?
 96 | 
 97 | 4. Explore specific capabilities, such as generating site reports::
 98 | 
 99 |     Can you give me a file type summary for [crawled site]? Which types of files are there, page count, etc.
100 | 
101 | Troubleshooting
102 | ---------------
103 | 
104 | - If Claude doesn't show MCP tools after restart, verify your configuration file is correctly formatted
105 | - Ensure Python and mcp-server-webcrawl are properly installed
106 | - Check that your InterroBot database path in the configuration is correct
107 | - Make sure InterroBot has successfully completed crawling the websites
108 | - Remember that the first time you use a function, Claude will ask for permission
109 | - For large websites with many pages, search queries might take longer to process initially
110 | 
111 | For more details, including API documentation and other crawler options, visit the `mcp-server-webcrawl documentation <https://github.com/pragmar/mcp-server-webcrawl>`_.
```

--------------------------------------------------------------------------------
/src/mcp_server_webcrawl/templates/tests.py:
--------------------------------------------------------------------------------

```python
 1 | import re
 2 | import unittest
 3 | 
 4 | from importlib import resources
 5 | from urllib.request import urlopen
 6 | from mcp_server_webcrawl.utils.logger import get_logger
 7 | from mcp_server_webcrawl.extras.markdown import get_markdown
 8 | 
 9 | logger = get_logger()
10 | 
11 | class TemplateTests(unittest.TestCase):
12 |     """
13 |     Test suite for the custom HTML to markdown converter.
14 |     Why custom? It's a bit faster, and that is the only reason.
15 |     Maximum load is 100 transforms (one per result, for a maximum of
16 |     100 results), so speed matters. The default result set is 20.
17 |     This converter does a few things differently to tailor output to
18 |     LLM interaction:
19 |     * aggressively removes images (html2text selectively renders them)
20 |     * links with block descendants render like a <p>
21 |         (html2text treats them as <a><br>)
22 |     """
23 | 
24 |     def setUp(self):
25 |         """
26 |         Set up the test environment with fixture data.
27 |         """
28 |         super().setUp()
29 | 
30 |     def test_core_html(self):
31 |         core_html: str = resources.read_text("mcp_server_webcrawl.templates", "tests_core.html")
32 |         markdown = get_markdown(core_html)
33 | 
34 |         # h1-6
35 |         self.assertIn("# Lorem Ipsum Dolor Sit Amet", markdown)
36 |         self.assertIn("## Consectetur Adipiscing Elit", markdown)
37 |         self.assertIn("### Nemo Enim Ipsam Voluptatem", markdown)
38 |         self.assertIn("#### Sed Quia Non Numquam", markdown)
39 |         self.assertIn("##### Nisi Ut Aliquid Ex Ea", markdown)
40 |         self.assertIn("###### At Vero Eos Et Accusamus", markdown)
41 | 
42 |         # no content loss - key phrases should be preserved
43 |         self.assertIn("Lorem ipsum dolor sit amet", markdown)
44 |         self.assertIn("Definition List Example", markdown)
45 |         self.assertIn("More Text Elements", markdown)
46 | 
47 |         # inline formatting (proper spacing)
48 |         self.assertIn("amet, **consectetur adipiscing elit**. Sed", markdown)
49 |         self.assertIn("laborum. **Sed ut perspiciatis** unde", markdown)
50 |         self.assertIn("consequat. *Duis aute irure dolor* in", markdown)
51 |         self.assertIn("laudantium. *Totam rem aperiam*, eaque", markdown)
52 | 
53 |         # link formatting (proper spacing)
54 |         self.assertIn("veniam, quis nostrud exercitation ullamco", markdown)  # Fragment links as plain text
55 |         self.assertIn("and a link back to top. Nam", markdown)
56 | 
57 |         # list formatting
58 |         self.assertIn("* Similique sunt in culpa", markdown)
59 |         self.assertIn("1. Temporibus autem quibusdam", markdown)
60 | 
61 |         # dl/dt
62 |         self.assertIn("**Lorem Ipsum**", markdown)
63 |         self.assertIn("    Dolor sit amet, consectetur adipiscing elit", markdown)
64 |         self.assertIn("**Ut Enim**", markdown)
65 |         self.assertIn("    Ad minim veniam, quis nostrud exercitation", markdown)
66 |         self.assertIn("**Duis Aute**", markdown)
67 |         self.assertIn("    Irure dolor in reprehenderit in voluptate", markdown)
68 | 
69 |         # table structure
70 |         self.assertIn("| Lorem | Ipsum | Dolor | Sit |", markdown)
71 |         self.assertIn("|---|---|---|---|", markdown)
72 |         self.assertIn("| Consectetur | Adipiscing | Elit | Sed |", markdown)
73 | 
74 |         # code formatting
75 |         self.assertIn("Here we have some `inline code` and", markdown)
76 |         self.assertIn("```\nfunction lorem() {\n    return \"ipsum dolor sit amet\";\n}\n```", markdown)
77 | 
78 |         # blockquotes
79 |         self.assertIn("> \"Sed ut perspiciatis unde omnis iste natus", markdown)
80 | 
81 |         # horizontal rule
82 |         self.assertIn("---", markdown)
83 | 
84 |         # no double spacing for inline elements
85 |         self.assertNotIn("**  ", markdown)  # No double spaces after bold
86 |         self.assertNotIn("  **", markdown)  # No double spaces before bold
87 |         self.assertNotIn("*  ", markdown)   # No double spaces after emphasis
88 |         self.assertNotIn("  *", markdown)   # No double spaces before emphasis
89 | 
90 |         # structural integrity - count major elements
91 |         heading_count = len(re.findall(r"^#{1,6} ", markdown, re.MULTILINE))
92 |         self.assertEqual(heading_count, 11, "Should have exactly 11 headings")
93 |         table_count = len(re.findall(r"^\|.*\|$", markdown, re.MULTILINE))
94 |         self.assertGreater(table_count, 5, "Should have multiple table rows")
95 | 
96 | 
```
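
A tiny interactive sketch of the converter under test, assuming `get_markdown` is imported as in the test module above; the expected output is inferred from the assertions, not captured from an actual run:

```python
from mcp_server_webcrawl.extras.markdown import get_markdown

html = "<h1>Lorem Ipsum</h1><p>Dolor <strong>sit</strong> amet.</p>"
print(get_markdown(html))
# Expected shape (based on the assertions above):
# # Lorem Ipsum
#
# Dolor **sit** amet.
```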

--------------------------------------------------------------------------------
/docs/_sources/prompts.rst.txt:
--------------------------------------------------------------------------------

```
 1 | 
 2 | Prompt Routines
 3 | ===============
 4 | 
 5 | **mcp-server-webcrawl** provides the toolkit necessary to search web crawl data freestyle, figuring it out as you go, reacting to each query. This is what it was designed for.
 6 | 
 7 | It is also capable of running routines (as prompts). You can write these yourself, or use the ones provided. These prompts are **copy and paste**, used as raw Markdown. They are enabled by the advanced search provided to the LLM; queries and logic can be embedded in a procedural set of instructions, or even an input loop, as is the case with the Gopher Interface.
 8 | 
 9 | If you want to shortcut the site selection (one less query), paste the Markdown and, in the same request, type "run pasted for [site name or URL]." The LLM will figure it out. When pasted without additional context, you will be prompted to select a site (if none is in context).
10 | 
11 | +---------------------------+-------------------+------------+-------------------------------------------------------------------------+
12 | | Prompt                    | Download          | Category   | Description                                                             |
13 | +===========================+===================+============+=========================================================================+
14 | | 🔍 **SEO Audit**          | `auditseo.md`_    | audit      | Technical SEO (search engine optimization) analysis. Covers the         |
15 | |                           |                   |            | basics, with options to dive deeper.                                    |
16 | +---------------------------+-------------------+------------+-------------------------------------------------------------------------+
17 | | 🔗 **404 Audit**          | `audit404.md`_    | audit      | Broken link detection and pattern analysis. Not only finds issues,      |
18 | |                           |                   |            | but suggests fixes.                                                     |
19 | +---------------------------+-------------------+------------+-------------------------------------------------------------------------+
20 | | ⚡ **Performance Audit**  | `auditperf.md`_   | audit      | Website speed and optimization analysis. Real talk.                     |
21 | +---------------------------+-------------------+------------+-------------------------------------------------------------------------+
22 | | 📁 **File Audit**         | `auditfiles.md`_  | audit      | File organization and asset analysis. Discover the composition of       |
23 | |                           |                   |            | your website.                                                           |
24 | +---------------------------+-------------------+------------+-------------------------------------------------------------------------+
25 | | 🌐 **Gopher Interface**   | `gopher.md`_      | interface  | An old-fashioned search interface inspired by the Gopher clients of     |
26 | |                           |                   |            | yesteryear.                                                             |
27 | +---------------------------+-------------------+------------+-------------------------------------------------------------------------+
28 | | ⚙️ **Search Test**        | `testsearch.md`_  | self-test  | A battery of tests to check for Boolean logical inconsistencies in      |
29 | |                           |                   |            | the search query parser and subsequent FTS5 conversion.                 |
30 | +---------------------------+-------------------+------------+-------------------------------------------------------------------------+
31 | 
32 | .. _auditseo.md: https://raw.githubusercontent.com/pragmar/mcp-server-webcrawl/master/prompts/auditseo.md
33 | .. _audit404.md: https://raw.githubusercontent.com/pragmar/mcp-server-webcrawl/master/prompts/audit404.md
34 | .. _auditperf.md: https://raw.githubusercontent.com/pragmar/mcp-server-webcrawl/master/prompts/auditperf.md
35 | .. _auditfiles.md: https://raw.githubusercontent.com/pragmar/mcp-server-webcrawl/master/prompts/auditfiles.md
36 | .. _gopher.md: https://raw.githubusercontent.com/pragmar/mcp-server-webcrawl/master/prompts/gopher.md
37 | .. _testsearch.md: https://raw.githubusercontent.com/pragmar/mcp-server-webcrawl/master/prompts/testsearch.md
```

--------------------------------------------------------------------------------
/docs/_static/_sphinx_javascript_frameworks_compat.js:
--------------------------------------------------------------------------------

```javascript
  1 | /* Compatibility shim for jQuery and underscore.js.
  2 |  *
  3 |  * Copyright Sphinx contributors
  4 |  * Released under the two clause BSD licence
  5 |  */
  6 | 
  7 | /**
  8 |  * small helper function to urldecode strings
  9 |  *
 10 |  * See https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/decodeURIComponent#Decoding_query_parameters_from_a_URL
 11 |  */
 12 | jQuery.urldecode = function(x) {
 13 |     if (!x) {
 14 |         return x
 15 |     }
 16 |     return decodeURIComponent(x.replace(/\+/g, ' '));
 17 | };
 18 | 
 19 | /**
 20 |  * small helper function to urlencode strings
 21 |  */
 22 | jQuery.urlencode = encodeURIComponent;
 23 | 
 24 | /**
 25 |  * This function returns the parsed url parameters of the
 26 |  * current request. Multiple values per key are supported,
 27 |  * it will always return arrays of strings for the value parts.
 28 |  */
 29 | jQuery.getQueryParameters = function(s) {
 30 |     if (typeof s === 'undefined')
 31 |         s = document.location.search;
 32 |     var parts = s.substr(s.indexOf('?') + 1).split('&');
 33 |     var result = {};
 34 |     for (var i = 0; i < parts.length; i++) {
 35 |         var tmp = parts[i].split('=', 2);
 36 |         var key = jQuery.urldecode(tmp[0]);
 37 |         var value = jQuery.urldecode(tmp[1]);
 38 |         if (key in result)
 39 |             result[key].push(value);
 40 |         else
 41 |             result[key] = [value];
 42 |     }
 43 |     return result;
 44 | };
 45 | 
 46 | /**
 47 |  * highlight a given string on a jquery object by wrapping it in
 48 |  * span elements with the given class name.
 49 |  */
 50 | jQuery.fn.highlightText = function(text, className) {
 51 |     function highlight(node, addItems) {
 52 |         if (node.nodeType === 3) {
 53 |             var val = node.nodeValue;
 54 |             var pos = val.toLowerCase().indexOf(text);
 55 |             if (pos >= 0 &&
 56 |                 !jQuery(node.parentNode).hasClass(className) &&
 57 |                 !jQuery(node.parentNode).hasClass("nohighlight")) {
 58 |                 var span;
 59 |                 var isInSVG = jQuery(node).closest("body, svg, foreignObject").is("svg");
 60 |                 if (isInSVG) {
 61 |                     span = document.createElementNS("http://www.w3.org/2000/svg", "tspan");
 62 |                 } else {
 63 |                     span = document.createElement("span");
 64 |                     span.className = className;
 65 |                 }
 66 |                 span.appendChild(document.createTextNode(val.substr(pos, text.length)));
 67 |                 node.parentNode.insertBefore(span, node.parentNode.insertBefore(
 68 |                     document.createTextNode(val.substr(pos + text.length)),
 69 |                     node.nextSibling));
 70 |                 node.nodeValue = val.substr(0, pos);
 71 |                 if (isInSVG) {
 72 |                     var rect = document.createElementNS("http://www.w3.org/2000/svg", "rect");
 73 |                     var bbox = node.parentElement.getBBox();
 74 |                     rect.x.baseVal.value = bbox.x;
 75 |                     rect.y.baseVal.value = bbox.y;
 76 |                     rect.width.baseVal.value = bbox.width;
 77 |                     rect.height.baseVal.value = bbox.height;
 78 |                     rect.setAttribute('class', className);
 79 |                     addItems.push({
 80 |                         "parent": node.parentNode,
 81 |                         "target": rect});
 82 |                 }
 83 |             }
 84 |         }
 85 |         else if (!jQuery(node).is("button, select, textarea")) {
 86 |             jQuery.each(node.childNodes, function() {
 87 |                 highlight(this, addItems);
 88 |             });
 89 |         }
 90 |     }
 91 |     var addItems = [];
 92 |     var result = this.each(function() {
 93 |         highlight(this, addItems);
 94 |     });
 95 |     for (var i = 0; i < addItems.length; ++i) {
 96 |         jQuery(addItems[i].parent).before(addItems[i].target);
 97 |     }
 98 |     return result;
 99 | };
100 | 
101 | /*
102 |  * backward compatibility for jQuery.browser
103 |  * This will be supported until firefox bug is fixed.
104 |  */
105 | if (!jQuery.browser) {
106 |     jQuery.uaMatch = function(ua) {
107 |         ua = ua.toLowerCase();
108 | 
109 |         var match = /(chrome)[ \/]([\w.]+)/.exec(ua) ||
110 |             /(webkit)[ \/]([\w.]+)/.exec(ua) ||
111 |             /(opera)(?:.*version|)[ \/]([\w.]+)/.exec(ua) ||
112 |             /(msie) ([\w.]+)/.exec(ua) ||
113 |             ua.indexOf("compatible") < 0 && /(mozilla)(?:.*? rv:([\w.]+)|)/.exec(ua) ||
114 |             [];
115 | 
116 |         return {
117 |             browser: match[ 1 ] || "",
118 |             version: match[ 2 ] || "0"
119 |         };
120 |     };
121 |     jQuery.browser = {};
122 |     jQuery.browser[jQuery.uaMatch(navigator.userAgent).browser] = true;
123 | }
124 | 
```

--------------------------------------------------------------------------------
/docs/search.html:
--------------------------------------------------------------------------------

```html
  1 | 
  2 | 
  3 | <!DOCTYPE html>
  4 | <html class="writer-html5" lang="en" data-content_root="./">
  5 | <head>
  6 |   <meta charset="utf-8" />
  7 |   <meta name="viewport" content="width=device-width, initial-scale=1.0" />
  8 |   <title>Search &mdash; mcp-server-webcrawl  documentation</title>
  9 |       <link rel="stylesheet" type="text/css" href="_static/pygments.css?v=80d5e7a1" />
 10 |       <link rel="stylesheet" type="text/css" href="_static/css/theme.css?v=e59714d7" />
 11 | 
 12 |   
 13 |     
 14 |       <script src="_static/jquery.js?v=5d32c60e"></script>
 15 |       <script src="_static/_sphinx_javascript_frameworks_compat.js?v=2cd50e6c"></script>
 16 |       <script src="_static/documentation_options.js?v=5929fcd5"></script>
 17 |       <script src="_static/doctools.js?v=888ff710"></script>
 18 |       <script src="_static/sphinx_highlight.js?v=dc90522c"></script>
 19 |     <script src="_static/js/theme.js"></script>
 20 |     <script src="_static/searchtools.js"></script>
 21 |     <script src="_static/language_data.js"></script>
 22 |     <link rel="index" title="Index" href="genindex.html" />
 23 |     <link rel="search" title="Search" href="#" /> 
 24 | </head>
 25 | 
 26 | <body class="wy-body-for-nav"> 
 27 |   <div class="wy-grid-for-nav">
 28 |     <nav data-toggle="wy-nav-shift" class="wy-nav-side">
 29 |       <div class="wy-side-scroll">
 30 |         <div class="wy-side-nav-search" >
 31 | 
 32 |           
 33 |           
 34 |           <a href="index.html" class="icon icon-home">
 35 |             mcp-server-webcrawl
 36 |           </a>
 37 | <div role="search">
 38 |   <form id="rtd-search-form" class="wy-form" action="#" method="get">
 39 |     <input type="text" name="q" placeholder="Search docs" aria-label="Search docs" />
 40 |     <input type="hidden" name="check_keywords" value="yes" />
 41 |     <input type="hidden" name="area" value="default" />
 42 |   </form>
 43 | </div>
 44 |         </div><div class="wy-menu wy-menu-vertical" data-spy="affix" role="navigation" aria-label="Navigation menu">
 45 |               <p class="caption" role="heading"><span class="caption-text">Contents:</span></p>
 46 | <ul>
 47 | <li class="toctree-l1"><a class="reference internal" href="installation.html">Installation</a></li>
 48 | <li class="toctree-l1"><a class="reference internal" href="guides.html">Setup Guides</a></li>
 49 | <li class="toctree-l1"><a class="reference internal" href="usage.html">Usage</a></li>
 50 | <li class="toctree-l1"><a class="reference internal" href="prompts.html">Prompt Routines</a></li>
 51 | <li class="toctree-l1"><a class="reference internal" href="interactive.html">Interactive Mode</a></li>
 52 | <li class="toctree-l1"><a class="reference internal" href="modules.html">mcp_server_webcrawl</a></li>
 53 | </ul>
 54 | 
 55 |         </div>
 56 |       </div>
 57 |     </nav>
 58 | 
 59 |     <section data-toggle="wy-nav-shift" class="wy-nav-content-wrap"><nav class="wy-nav-top" aria-label="Mobile navigation menu" >
 60 |           <i data-toggle="wy-nav-top" class="fa fa-bars"></i>
 61 |           <a href="index.html">mcp-server-webcrawl</a>
 62 |       </nav>
 63 | 
 64 |       <div class="wy-nav-content">
 65 |         <div class="rst-content">
 66 |           <div role="navigation" aria-label="Page navigation">
 67 |   <ul class="wy-breadcrumbs">
 68 |       <li><a href="index.html" class="icon icon-home" aria-label="Home"></a></li>
 69 |       <li class="breadcrumb-item active">Search</li>
 70 |       <li class="wy-breadcrumbs-aside">
 71 |       </li>
 72 |   </ul>
 73 |   <hr/>
 74 | </div>
 75 |           <div role="main" class="document" itemscope="itemscope" itemtype="http://schema.org/Article">
 76 |            <div itemprop="articleBody">
 77 |              
 78 |   <noscript>
 79 |   <div id="fallback" class="admonition warning">
 80 |     <p class="last">
 81 |       Please activate JavaScript to enable the search functionality.
 82 |     </p>
 83 |   </div>
 84 |   </noscript>
 85 | 
 86 |   
 87 |   <div id="search-results">
 88 |   
 89 |   </div>
 90 | 
 91 |            </div>
 92 |           </div>
 93 |           <footer>
 94 | 
 95 |   <hr/>
 96 | 
 97 |   <div role="contentinfo">
 98 |     <p>&#169; Copyright 2025, pragmar.</p>
 99 |   </div>
100 | 
101 |   Built with <a href="https://www.sphinx-doc.org/">Sphinx</a> using a
102 |     <a href="https://github.com/readthedocs/sphinx_rtd_theme">theme</a>
103 |     provided by <a href="https://readthedocs.org">Read the Docs</a>.
104 |    
105 | 
106 | </footer>
107 |         </div>
108 |       </div>
109 |     </section>
110 |   </div>
111 |   <script>
112 |       jQuery(function () {
113 |           SphinxRtdTheme.Navigation.enable(true);
114 |       });
115 |   </script>
116 |   <script>
117 |     jQuery(function() { Search.loadIndex("searchindex.js"); });
118 |   </script>
119 |   
120 |   <script id="searchindexloader"></script>
121 |    
122 | 
123 | 
124 | </body>
125 | </html>
```

--------------------------------------------------------------------------------
/docs/_static/doctools.js:
--------------------------------------------------------------------------------

```javascript
  1 | /*
  2 |  * doctools.js
  3 |  * ~~~~~~~~~~~
  4 |  *
  5 |  * Base JavaScript utilities for all Sphinx HTML documentation.
  6 |  *
  7 |  * :copyright: Copyright 2007-2023 by the Sphinx team, see AUTHORS.
  8 |  * :license: BSD, see LICENSE for details.
  9 |  *
 10 |  */
 11 | "use strict";
 12 | 
 13 | const BLACKLISTED_KEY_CONTROL_ELEMENTS = new Set([
 14 |   "TEXTAREA",
 15 |   "INPUT",
 16 |   "SELECT",
 17 |   "BUTTON",
 18 | ]);
 19 | 
 20 | const _ready = (callback) => {
 21 |   if (document.readyState !== "loading") {
 22 |     callback();
 23 |   } else {
 24 |     document.addEventListener("DOMContentLoaded", callback);
 25 |   }
 26 | };
 27 | 
 28 | /**
 29 |  * Small JavaScript module for the documentation.
 30 |  */
 31 | const Documentation = {
 32 |   init: () => {
 33 |     Documentation.initDomainIndexTable();
 34 |     Documentation.initOnKeyListeners();
 35 |   },
 36 | 
 37 |   /**
 38 |    * i18n support
 39 |    */
 40 |   TRANSLATIONS: {},
 41 |   PLURAL_EXPR: (n) => (n === 1 ? 0 : 1),
 42 |   LOCALE: "unknown",
 43 | 
 44 |   // gettext and ngettext don't access this so that the functions
 45 |   // can safely bound to a different name (_ = Documentation.gettext)
 46 |   gettext: (string) => {
 47 |     const translated = Documentation.TRANSLATIONS[string];
 48 |     switch (typeof translated) {
 49 |       case "undefined":
 50 |         return string; // no translation
 51 |       case "string":
 52 |         return translated; // translation exists
 53 |       default:
 54 |         return translated[0]; // (singular, plural) translation tuple exists
 55 |     }
 56 |   },
 57 | 
 58 |   ngettext: (singular, plural, n) => {
 59 |     const translated = Documentation.TRANSLATIONS[singular];
 60 |     if (typeof translated !== "undefined")
 61 |       return translated[Documentation.PLURAL_EXPR(n)];
 62 |     return n === 1 ? singular : plural;
 63 |   },
 64 | 
 65 |   addTranslations: (catalog) => {
 66 |     Object.assign(Documentation.TRANSLATIONS, catalog.messages);
 67 |     Documentation.PLURAL_EXPR = new Function(
 68 |       "n",
 69 |       `return (${catalog.plural_expr})`
 70 |     );
 71 |     Documentation.LOCALE = catalog.locale;
 72 |   },
 73 | 
 74 |   /**
 75 |    * helper function to focus on search bar
 76 |    */
 77 |   focusSearchBar: () => {
 78 |     document.querySelectorAll("input[name=q]")[0]?.focus();
 79 |   },
 80 | 
 81 |   /**
 82 |    * Initialise the domain index toggle buttons
 83 |    */
 84 |   initDomainIndexTable: () => {
 85 |     const toggler = (el) => {
 86 |       const idNumber = el.id.substr(7);
 87 |       const toggledRows = document.querySelectorAll(`tr.cg-${idNumber}`);
 88 |       if (el.src.substr(-9) === "minus.png") {
 89 |         el.src = `${el.src.substr(0, el.src.length - 9)}plus.png`;
 90 |         toggledRows.forEach((el) => (el.style.display = "none"));
 91 |       } else {
 92 |         el.src = `${el.src.substr(0, el.src.length - 8)}minus.png`;
 93 |         toggledRows.forEach((el) => (el.style.display = ""));
 94 |       }
 95 |     };
 96 | 
 97 |     const togglerElements = document.querySelectorAll("img.toggler");
 98 |     togglerElements.forEach((el) =>
 99 |       el.addEventListener("click", (event) => toggler(event.currentTarget))
100 |     );
101 |     togglerElements.forEach((el) => (el.style.display = ""));
102 |     if (DOCUMENTATION_OPTIONS.COLLAPSE_INDEX) togglerElements.forEach(toggler);
103 |   },
104 | 
105 |   initOnKeyListeners: () => {
106 |     // only install a listener if it is really needed
107 |     if (
108 |       !DOCUMENTATION_OPTIONS.NAVIGATION_WITH_KEYS &&
109 |       !DOCUMENTATION_OPTIONS.ENABLE_SEARCH_SHORTCUTS
110 |     )
111 |       return;
112 | 
113 |     document.addEventListener("keydown", (event) => {
114 |       // bail for input elements
115 |       if (BLACKLISTED_KEY_CONTROL_ELEMENTS.has(document.activeElement.tagName)) return;
116 |       // bail with special keys
117 |       if (event.altKey || event.ctrlKey || event.metaKey) return;
118 | 
119 |       if (!event.shiftKey) {
120 |         switch (event.key) {
121 |           case "ArrowLeft":
122 |             if (!DOCUMENTATION_OPTIONS.NAVIGATION_WITH_KEYS) break;
123 | 
124 |             const prevLink = document.querySelector('link[rel="prev"]');
125 |             if (prevLink && prevLink.href) {
126 |               window.location.href = prevLink.href;
127 |               event.preventDefault();
128 |             }
129 |             break;
130 |           case "ArrowRight":
131 |             if (!DOCUMENTATION_OPTIONS.NAVIGATION_WITH_KEYS) break;
132 | 
133 |             const nextLink = document.querySelector('link[rel="next"]');
134 |             if (nextLink && nextLink.href) {
135 |               window.location.href = nextLink.href;
136 |               event.preventDefault();
137 |             }
138 |             break;
139 |         }
140 |       }
141 | 
142 |       // some keyboard layouts may need Shift to get /
143 |       switch (event.key) {
144 |         case "/":
145 |           if (!DOCUMENTATION_OPTIONS.ENABLE_SEARCH_SHORTCUTS) break;
146 |           Documentation.focusSearchBar();
147 |           event.preventDefault();
148 |       }
149 |     });
150 |   },
151 | };
152 | 
153 | // quick alias for translations
154 | const _ = Documentation.gettext;
155 | 
156 | _ready(Documentation.init);
157 | 
```

--------------------------------------------------------------------------------
/docs/_sources/guides/httrack.rst.txt:
--------------------------------------------------------------------------------

```
  1 | HTTrack MCP Setup Guide
  2 | ========================
  3 | 
  4 | Instructions for setting up `mcp-server-webcrawl <https://pragmar.com/mcp-server-webcrawl/>`_ with `HTTrack Website Copier <https://www.httrack.com/>`_.
  5 | This allows your LLM (e.g. Claude Desktop) to search content and metadata from websites you've mirrored using HTTrack.
  6 | 
  7 | .. raw:: html
  8 | 
  9 |    <iframe width="560" height="315" src="https://www.youtube.com/embed/HAVfvmrZjRk" frameborder="0" allowfullscreen></iframe>
 10 | 
 11 | Follow along with the video, or the step-action guide below.
 12 | 
 13 | Requirements
 14 | ------------
 15 | 
 16 | Before you begin, ensure you have:
 17 | 
 18 | - `Claude Desktop <https://claude.ai/download>`_ installed
 19 | - `Python <https://python.org>`_ 3.10 or later installed
 20 | - `HTTrack Website Copier <https://www.httrack.com/>`_ installed
 21 | - Basic familiarity with command line interfaces
 22 | 
 23 | What is HTTrack?
 24 | ----------------
 25 | 
 26 | HTTrack is a well-established open source website mirror tool that offers:
 27 | 
 28 | - Complete website mirroring with organized project directories
 29 | - User-friendly wizard-style interface for setup
 30 | - Comprehensive content capture including HTML, CSS, images, and other assets
 31 | - Ability to manage multiple site mirrors efficiently
 32 | - Cross-platform support (Windows, macOS, Linux)
 33 | 
 34 | Installation Steps
 35 | ------------------
 36 | 
 37 | 1. Install mcp-server-webcrawl
 38 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 39 | 
 40 | Open your terminal or command line and install the package::
 41 | 
 42 |     pip install mcp-server-webcrawl
 43 | 
 44 | Verify installation was successful::
 45 | 
 46 |     mcp-server-webcrawl --help
 47 | 
 48 | 2. Create Website Mirrors with HTTrack
 49 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 50 | 
 51 | 1. Open HTTrack Website Copier application
 52 | 2. Create a new project (e.g., "example") and specify where to save it
 53 | 3. Add the URL you want to mirror (e.g., https://example.com)
 54 | 4. Use the wizard interface to configure your crawling options
 55 | 5. Start the mirroring process and wait for completion
 56 | 6. Repeat for additional sites as needed (e.g., create another project for pragmar.com)
 57 | 
 58 | HTTrack will create organized project directories under your specified location (typically "My Web Sites" on Windows or "websites" on macOS/Linux). Each project contains the complete website mirror with all HTML files, images, CSS, and other assets properly organized.
 59 | 
 60 | 3. Configure Claude Desktop
 61 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~
 62 | 
 63 | 1. Open Claude Desktop
 64 | 2. Go to **File → Settings → Developer → Edit Config**
 65 | 3. Add the following configuration (modify paths as needed):
 66 | 
 67 | .. code-block:: json
 68 | 
 69 |     {
 70 |       "mcpServers": {
 71 |         "webcrawl": {
 72 |           "command": "/path/to/mcp-server-webcrawl",
 73 |           "args": ["--crawler", "httrack", "--datasrc",
 74 |             "/path/to/httrack/projects/"]
 75 |         }
 76 |       }
 77 |     }
 78 | 
 79 | .. note::
 80 |    - On Windows, use ``"mcp-server-webcrawl"`` as the command; on macOS/Linux, use the absolute path (output of ``which mcp-server-webcrawl``)
 81 |    - The datasrc path should point to your HTTrack project directory containing all your mirrored sites (typically ``~/websites`` on macOS/Linux, or ``My Web Sites`` on Windows)
 82 | 
 83 | 4. Save the file and **completely exit** Claude Desktop (not just close the window)
 84 | 5. Restart Claude Desktop
 85 | 
 86 | 4. Verify and Use
 87 | ~~~~~~~~~~~~~~~~~
 88 | 
 89 | 1. In Claude Desktop, you should now see MCP tools available under Search and Tools
 90 | 2. Ask Claude to list your crawled sites::
 91 | 
 92 |     Can you list the crawled sites available?
 93 | 
 94 | 3. Try searching content from your crawls::
 95 | 
 96 |     Can you find information about [topic] on [crawled site]?
 97 | 
 98 | 4. Conduct content audits and SEO analysis::
 99 | 
100 |     Can you analyze the content structure and SEO elements for [crawled site]?
101 | 
102 | Troubleshooting
103 | ---------------
104 | 
105 | - If Claude doesn't show MCP tools after restart, verify your configuration file is correctly formatted
106 | - Ensure Python and mcp-server-webcrawl are properly installed
107 | - Check that your HTTrack project directory path in the configuration is correct
108 | - Make sure HTTrack has successfully completed mirroring the websites and created the project directories
109 | - Remember that the first time you use a function, Claude will ask for permission
110 | - For large websites, initial indexing may take some time during the first search
111 | 
112 | HTTrack's project structure makes it easy to manage multiple site mirrors, and combined with mcp-server-webcrawl it supports content analysis, SEO audits, and searchable archives.
113 | 
114 | For more details, including API documentation and other crawler options, visit the `mcp-server-webcrawl documentation <https://github.com/pragmar/mcp-server-webcrawl>`_.
```
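
As a quick sanity check of the HTTrack setup described above, the server can be run directly against the same datasrc before editing the Claude Desktop config. This is a minimal sketch, not part of the guide: the `~/websites` path is an assumption based on HTTrack's typical macOS/Linux default, and only the `--crawler`/`--datasrc` arguments come from the configuration shown above.

```bash
# Hedged sketch: verify the command path and the HTTrack datasrc before
# wiring them into Claude Desktop. Paths are illustrative.
which mcp-server-webcrawl      # absolute command path for the JSON config (macOS/Linux)
ls ~/websites                  # each HTTrack project should appear as its own directory

# Start the server against the HTTrack projects directory; it should launch
# without errors and wait for an MCP client on stdio. Ctrl+C to exit.
mcp-server-webcrawl --crawler httrack --datasrc ~/websites
```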

--------------------------------------------------------------------------------
/docs/_sources/guides/katana.rst.txt:
--------------------------------------------------------------------------------

```
  1 | Katana MCP Setup Guide
  2 | ======================
  3 | 
  4 | Instructions for setting up `mcp-server-webcrawl <https://pragmar.com/mcp-server-webcrawl/>`_ with `Katana <https://github.com/projectdiscovery/katana>`_ crawler.
  5 | This allows your LLM (e.g. Claude Desktop) to search content and metadata from websites you've crawled using Katana.
  6 | 
  7 | .. raw:: html
  8 | 
  9 |    <iframe width="560" height="315" src="https://www.youtube.com/embed/sOMaojm0R0Y" frameborder="0" allowfullscreen></iframe>
 10 | 
 11 | Follow along with the video, or the step-action guide below.
 12 | 
 13 | Requirements
 14 | ------------
 15 | 
 16 | Before you begin, ensure you have:
 17 | 
 18 | - `Claude Desktop <https://claude.ai/download>`_ installed
 19 | - `Python <https://python.org>`_ 3.10 or later installed
 20 | - `Go programming language <https://go.dev/doc/install>`_ installed
 21 | - `Katana crawler <https://github.com/projectdiscovery/katana>`_ installed
 22 | 
 23 | - Basic familiarity with command line interfaces
 24 | 
 25 | What is Katana?
 26 | ---------------
 27 | 
 28 | Katana is an open-source web crawler from Project Discovery that offers:
 29 | 
 30 | - Fast and efficient web crawling capabilities
 31 | - Command-line interface for flexibility and automation
 32 | - Highly configurable crawling parameters
 33 | - Ability to store complete HTTP responses for analysis
 34 | 
 35 | Installation Steps
 36 | ------------------
 37 | 
 38 | 1. Install mcp-server-webcrawl
 39 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 40 | 
 41 | Open your terminal or command line and install the package::
 42 | 
 43 |     pip install mcp-server-webcrawl
 44 | 
 45 | Verify installation was successful::
 46 | 
 47 |     mcp-server-webcrawl --help
 48 | 
 49 | 2. Install and Run Katana
 50 | ~~~~~~~~~~~~~~~~~~~~~~~~~
 51 | 
 52 | 1. Verify Go is installed and on your PATH::
 53 | 
 54 |     go version
 55 | 
 56 | 2. Install Katana using Go::
 57 | 
 58 |     go install github.com/projectdiscovery/katana/cmd/katana@latest
 59 | 
 60 | 3. Create a directory for your crawls and run Katana with storage options::
 61 | 
 62 |     # Create a directory for storing crawl archives
 63 |     mkdir archives
 64 | 
 65 |     # Run Katana with storage options
 66 |     katana -u https://example.com -store-response -store-response-dir archives/example.com/
 67 | 
 68 | 4. Repeat for additional websites as needed::
 69 | 
 70 |     katana -u https://pragmar.com -store-response -store-response-dir archives/pragmar.com/
 71 | 
 72 | In this case, the ./archives directory is the datasrc. Katana creates a
 73 | separate directory for each unique host inside the specified
 74 | -store-response-dir, so a nested path like example.com/example.com is
 75 | expected. Sites with external dependencies will branch out into additional
 76 | host directories within the -store-response-dir, and they remain searchable
 77 | as a single site.
 78 | 
 79 | 3. Configure Claude Desktop
 80 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~
 81 | 
 82 | 1. Open Claude Desktop
 83 | 2. Go to **File → Settings → Developer → Edit Config**
 84 | 3. Add the following configuration (modify paths as needed):
 85 | 
 86 | .. code-block:: json
 87 | 
 88 |     {
 89 |       "mcpServers": {
 90 |         "webcrawl": {
 91 |           "command": "/path/to/mcp-server-webcrawl",
 92 |           "args": ["--crawler", "katana", "--datasrc",
 93 |             "/path/to/katana/crawls/"]
 94 |         }
 95 |       }
 96 |     }
 97 | 
 98 | .. note::
 99 |    - On Windows, use ``"mcp-server-webcrawl"`` as the command
100 |    - On macOS, use the absolute path (output of ``which mcp-server-webcrawl``)
101 |    - Change ``/path/to/katana/crawls/`` to the actual path where you stored your Katana crawls
102 | 
103 | 4. Save the file and **completely exit** Claude Desktop (not just close the window)
104 | 5. Restart Claude Desktop
105 | 
106 | 4. Verify and Use
107 | ~~~~~~~~~~~~~~~~~
108 | 
109 | 1. In Claude Desktop, you should now see MCP tools available under Search and Tools
110 | 2. Ask Claude to list your crawled sites::
111 | 
112 |     Can you list the crawled sites available?
113 | 
114 | 3. Try searching content from your crawls::
115 | 
116 |     Can you find information about [topic] on [crawled site]?
117 | 
118 | 4. Try specialized searches that use Katana's comprehensive data collection::
119 | 
120 |     Can you find all the help pages on this site and tell me how they're different?
121 | 
122 | Troubleshooting
123 | ---------------
124 | 
125 | - If Claude doesn't show MCP tools after restart, verify your configuration file is correctly formatted
126 | - Ensure Python and mcp-server-webcrawl are properly installed
127 | - Check that your Katana crawls directory path in the configuration is correct
128 | - Make sure the ``-store-response`` flag was used during crawling, as this is required to save content
129 | - Verify that each crawl completed successfully and files were saved to the expected location
130 | - Remember that the first time you use a function, Claude will ask for permission
131 | 
132 | For more details, including API documentation and other crawler options, visit the `mcp-server-webcrawl documentation <https://github.com/pragmar/mcp-server-webcrawl>`_.
```
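
The crawl-then-search workflow above can be scripted when archiving several sites into one datasrc. The sketch below is illustrative rather than part of the guide: the site list and directory names are assumptions, and only the `katana` flags (`-u`, `-store-response`, `-store-response-dir`) and the `--crawler`/`--datasrc` arguments are taken from the steps above.

```bash
# Hedged sketch: crawl several sites into a single Katana datasrc,
# then point mcp-server-webcrawl at it. Names and paths are illustrative.
mkdir -p archives
for site in example.com pragmar.com; do
    katana -u "https://${site}" -store-response -store-response-dir "archives/${site}/"
done

ls archives/example.com/    # expect a per-host subdirectory, e.g. example.com/
mcp-server-webcrawl --crawler katana --datasrc ./archives
```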