This is page 2 of 33. Use http://codebase.md/pragmar/mcp_server_webcrawl/crawlers/siteone/prompts.html?page={x} to view the full context.

# Directory Structure

```
├── .gitignore
├── CONTRIBUTING.md
├── docs
│   ├── _images
│   │   ├── interactive.document.webp
│   │   ├── interactive.search.webp
│   │   └── mcpswc.svg
│   ├── _modules
│   │   ├── index.html
│   │   ├── mcp_server_webcrawl
│   │   │   ├── crawlers
│   │   │   │   ├── archivebox
│   │   │   │   │   ├── adapter.html
│   │   │   │   │   ├── crawler.html
│   │   │   │   │   └── tests.html
│   │   │   │   ├── base
│   │   │   │   │   ├── adapter.html
│   │   │   │   │   ├── api.html
│   │   │   │   │   ├── crawler.html
│   │   │   │   │   ├── indexed.html
│   │   │   │   │   └── tests.html
│   │   │   │   ├── httrack
│   │   │   │   │   ├── adapter.html
│   │   │   │   │   ├── crawler.html
│   │   │   │   │   └── tests.html
│   │   │   │   ├── interrobot
│   │   │   │   │   ├── adapter.html
│   │   │   │   │   ├── crawler.html
│   │   │   │   │   └── tests.html
│   │   │   │   ├── katana
│   │   │   │   │   ├── adapter.html
│   │   │   │   │   ├── crawler.html
│   │   │   │   │   └── tests.html
│   │   │   │   ├── siteone
│   │   │   │   │   ├── adapter.html
│   │   │   │   │   ├── crawler.html
│   │   │   │   │   └── tests.html
│   │   │   │   ├── warc
│   │   │   │   │   ├── adapter.html
│   │   │   │   │   ├── crawler.html
│   │   │   │   │   └── tests.html
│   │   │   │   └── wget
│   │   │   │       ├── adapter.html
│   │   │   │       ├── crawler.html
│   │   │   │       └── tests.html
│   │   │   ├── crawlers.html
│   │   │   ├── extras
│   │   │   │   ├── markdown.html
│   │   │   │   ├── regex.html
│   │   │   │   ├── snippets.html
│   │   │   │   ├── thumbnails.html
│   │   │   │   └── xpath.html
│   │   │   ├── interactive
│   │   │   │   ├── highlights.html
│   │   │   │   ├── search.html
│   │   │   │   ├── session.html
│   │   │   │   └── ui.html
│   │   │   ├── main.html
│   │   │   ├── models
│   │   │   │   ├── resources.html
│   │   │   │   └── sites.html
│   │   │   ├── templates
│   │   │   │   └── tests.html
│   │   │   ├── utils
│   │   │   │   ├── blobs.html
│   │   │   │   ├── cli.html
│   │   │   │   ├── logger.html
│   │   │   │   ├── querycache.html
│   │   │   │   ├── server.html
│   │   │   │   └── tools.html
│   │   │   └── utils.html
│   │   └── re.html
│   ├── _sources
│   │   ├── guides
│   │   │   ├── archivebox.rst.txt
│   │   │   ├── httrack.rst.txt
│   │   │   ├── interrobot.rst.txt
│   │   │   ├── katana.rst.txt
│   │   │   ├── siteone.rst.txt
│   │   │   ├── warc.rst.txt
│   │   │   └── wget.rst.txt
│   │   ├── guides.rst.txt
│   │   ├── index.rst.txt
│   │   ├── installation.rst.txt
│   │   ├── interactive.rst.txt
│   │   ├── mcp_server_webcrawl.crawlers.archivebox.rst.txt
│   │   ├── mcp_server_webcrawl.crawlers.base.rst.txt
│   │   ├── mcp_server_webcrawl.crawlers.httrack.rst.txt
│   │   ├── mcp_server_webcrawl.crawlers.interrobot.rst.txt
│   │   ├── mcp_server_webcrawl.crawlers.katana.rst.txt
│   │   ├── mcp_server_webcrawl.crawlers.rst.txt
│   │   ├── mcp_server_webcrawl.crawlers.siteone.rst.txt
│   │   ├── mcp_server_webcrawl.crawlers.warc.rst.txt
│   │   ├── mcp_server_webcrawl.crawlers.wget.rst.txt
│   │   ├── mcp_server_webcrawl.extras.rst.txt
│   │   ├── mcp_server_webcrawl.interactive.rst.txt
│   │   ├── mcp_server_webcrawl.models.rst.txt
│   │   ├── mcp_server_webcrawl.rst.txt
│   │   ├── mcp_server_webcrawl.templates.rst.txt
│   │   ├── mcp_server_webcrawl.utils.rst.txt
│   │   ├── modules.rst.txt
│   │   ├── prompts.rst.txt
│   │   └── usage.rst.txt
│   ├── _static
│   │   ├── _sphinx_javascript_frameworks_compat.js
│   │   ├── basic.css
│   │   ├── css
│   │   │   ├── badge_only.css
│   │   │   ├── fonts
│   │   │   │   ├── fontawesome-webfont.eot
│   │   │   │   ├── fontawesome-webfont.svg
│   │   │   │   ├── fontawesome-webfont.ttf
│   │   │   │   ├── fontawesome-webfont.woff
│   │   │   │   ├── fontawesome-webfont.woff2
│   │   │   │   ├── lato-bold-italic.woff
│   │   │   │   ├── lato-bold-italic.woff2
│   │   │   │   ├── lato-bold.woff
│   │   │   │   ├── lato-bold.woff2
│   │   │   │   ├── lato-normal-italic.woff
│   │   │   │   ├── lato-normal-italic.woff2
│   │   │   │   ├── lato-normal.woff
│   │   │   │   ├── lato-normal.woff2
│   │   │   │   ├── Roboto-Slab-Bold.woff
│   │   │   │   ├── Roboto-Slab-Bold.woff2
│   │   │   │   ├── Roboto-Slab-Regular.woff
│   │   │   │   └── Roboto-Slab-Regular.woff2
│   │   │   └── theme.css
│   │   ├── doctools.js
│   │   ├── documentation_options.js
│   │   ├── file.png
│   │   ├── fonts
│   │   │   ├── Lato
│   │   │   │   ├── lato-bold.eot
│   │   │   │   ├── lato-bold.ttf
│   │   │   │   ├── lato-bold.woff
│   │   │   │   ├── lato-bold.woff2
│   │   │   │   ├── lato-bolditalic.eot
│   │   │   │   ├── lato-bolditalic.ttf
│   │   │   │   ├── lato-bolditalic.woff
│   │   │   │   ├── lato-bolditalic.woff2
│   │   │   │   ├── lato-italic.eot
│   │   │   │   ├── lato-italic.ttf
│   │   │   │   ├── lato-italic.woff
│   │   │   │   ├── lato-italic.woff2
│   │   │   │   ├── lato-regular.eot
│   │   │   │   ├── lato-regular.ttf
│   │   │   │   ├── lato-regular.woff
│   │   │   │   └── lato-regular.woff2
│   │   │   └── RobotoSlab
│   │   │       ├── roboto-slab-v7-bold.eot
│   │   │       ├── roboto-slab-v7-bold.ttf
│   │   │       ├── roboto-slab-v7-bold.woff
│   │   │       ├── roboto-slab-v7-bold.woff2
│   │   │       ├── roboto-slab-v7-regular.eot
│   │   │       ├── roboto-slab-v7-regular.ttf
│   │   │       ├── roboto-slab-v7-regular.woff
│   │   │       └── roboto-slab-v7-regular.woff2
│   │   ├── images
│   │   │   ├── interactive.document.png
│   │   │   ├── interactive.document.webp
│   │   │   ├── interactive.search.png
│   │   │   ├── interactive.search.webp
│   │   │   └── mcpswc.svg
│   │   ├── jquery.js
│   │   ├── js
│   │   │   ├── badge_only.js
│   │   │   ├── theme.js
│   │   │   └── versions.js
│   │   ├── language_data.js
│   │   ├── minus.png
│   │   ├── plus.png
│   │   ├── pygments.css
│   │   ├── searchtools.js
│   │   └── sphinx_highlight.js
│   ├── .buildinfo
│   ├── .nojekyll
│   ├── genindex.html
│   ├── guides
│   │   ├── archivebox.html
│   │   ├── httrack.html
│   │   ├── interrobot.html
│   │   ├── katana.html
│   │   ├── siteone.html
│   │   ├── warc.html
│   │   └── wget.html
│   ├── guides.html
│   ├── index.html
│   ├── installation.html
│   ├── interactive.html
│   ├── mcp_server_webcrawl.crawlers.archivebox.html
│   ├── mcp_server_webcrawl.crawlers.base.html
│   ├── mcp_server_webcrawl.crawlers.html
│   ├── mcp_server_webcrawl.crawlers.httrack.html
│   ├── mcp_server_webcrawl.crawlers.interrobot.html
│   ├── mcp_server_webcrawl.crawlers.katana.html
│   ├── mcp_server_webcrawl.crawlers.siteone.html
│   ├── mcp_server_webcrawl.crawlers.warc.html
│   ├── mcp_server_webcrawl.crawlers.wget.html
│   ├── mcp_server_webcrawl.extras.html
│   ├── mcp_server_webcrawl.html
│   ├── mcp_server_webcrawl.interactive.html
│   ├── mcp_server_webcrawl.models.html
│   ├── mcp_server_webcrawl.templates.html
│   ├── mcp_server_webcrawl.utils.html
│   ├── modules.html
│   ├── objects.inv
│   ├── prompts.html
│   ├── py-modindex.html
│   ├── search.html
│   ├── searchindex.js
│   └── usage.html
├── LICENSE
├── MANIFEST.in
├── prompts
│   ├── audit404.md
│   ├── auditfiles.md
│   ├── auditperf.md
│   ├── auditseo.md
│   ├── gopher.md
│   ├── README.md
│   └── testsearch.md
├── pyproject.toml
├── README.md
├── setup.py
├── sphinx
│   ├── _static
│   │   └── images
│   │       ├── interactive.document.png
│   │       ├── interactive.document.webp
│   │       ├── interactive.search.png
│   │       ├── interactive.search.webp
│   │       └── mcpswc.svg
│   ├── _templates
│   │   └── layout.html
│   ├── conf.py
│   ├── guides
│   │   ├── archivebox.rst
│   │   ├── httrack.rst
│   │   ├── interrobot.rst
│   │   ├── katana.rst
│   │   ├── siteone.rst
│   │   ├── warc.rst
│   │   └── wget.rst
│   ├── guides.rst
│   ├── index.rst
│   ├── installation.rst
│   ├── interactive.rst
│   ├── make.bat
│   ├── Makefile
│   ├── mcp_server_webcrawl.crawlers.archivebox.rst
│   ├── mcp_server_webcrawl.crawlers.base.rst
│   ├── mcp_server_webcrawl.crawlers.httrack.rst
│   ├── mcp_server_webcrawl.crawlers.interrobot.rst
│   ├── mcp_server_webcrawl.crawlers.katana.rst
│   ├── mcp_server_webcrawl.crawlers.rst
│   ├── mcp_server_webcrawl.crawlers.siteone.rst
│   ├── mcp_server_webcrawl.crawlers.warc.rst
│   ├── mcp_server_webcrawl.crawlers.wget.rst
│   ├── mcp_server_webcrawl.extras.rst
│   ├── mcp_server_webcrawl.interactive.rst
│   ├── mcp_server_webcrawl.models.rst
│   ├── mcp_server_webcrawl.rst
│   ├── mcp_server_webcrawl.templates.rst
│   ├── mcp_server_webcrawl.utils.rst
│   ├── modules.rst
│   ├── prompts.rst
│   ├── readme.txt
│   └── usage.rst
└── src
    └── mcp_server_webcrawl
        ├── __init__.py
        ├── crawlers
        │   ├── __init__.py
        │   ├── archivebox
        │   │   ├── __init__.py
        │   │   ├── adapter.py
        │   │   ├── crawler.py
        │   │   └── tests.py
        │   ├── base
        │   │   ├── __init__.py
        │   │   ├── adapter.py
        │   │   ├── api.py
        │   │   ├── crawler.py
        │   │   ├── indexed.py
        │   │   └── tests.py
        │   ├── httrack
        │   │   ├── __init__.py
        │   │   ├── adapter.py
        │   │   ├── crawler.py
        │   │   └── tests.py
        │   ├── interrobot
        │   │   ├── __init__.py
        │   │   ├── adapter.py
        │   │   ├── crawler.py
        │   │   └── tests.py
        │   ├── katana
        │   │   ├── __init__.py
        │   │   ├── adapter.py
        │   │   ├── crawler.py
        │   │   └── tests.py
        │   ├── siteone
        │   │   ├── __init__.py
        │   │   ├── adapter.py
        │   │   ├── crawler.py
        │   │   └── tests.py
        │   ├── warc
        │   │   ├── __init__.py
        │   │   ├── adapter.py
        │   │   ├── crawler.py
        │   │   └── tests.py
        │   └── wget
        │       ├── __init__.py
        │       ├── adapter.py
        │       ├── crawler.py
        │       └── tests.py
        ├── extras
        │   ├── __init__.py
        │   ├── markdown.py
        │   ├── regex.py
        │   ├── snippets.py
        │   ├── thumbnails.py
        │   └── xpath.py
        ├── interactive
        │   ├── __init__.py
        │   ├── highlights.py
        │   ├── search.py
        │   ├── session.py
        │   ├── ui.py
        │   └── views
        │       ├── base.py
        │       ├── document.py
        │       ├── help.py
        │       ├── requirements.py
        │       ├── results.py
        │       └── searchform.py
        ├── main.py
        ├── models
        │   ├── __init__.py
        │   ├── base.py
        │   ├── resources.py
        │   └── sites.py
        ├── settings.py
        ├── templates
        │   ├── __init__.py
        │   ├── markdown.xslt
        │   ├── tests_core.html
        │   └── tests.py
        └── utils
            ├── __init__.py
            ├── cli.py
            ├── logger.py
            ├── parser.py
            ├── parsetab.py
            ├── search.py
            ├── server.py
            ├── tests.py
            └── tools.py
```

# Files

--------------------------------------------------------------------------------
/src/mcp_server_webcrawl/templates/tests_core.html:
--------------------------------------------------------------------------------

```html
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="utf-8">
    <title>HTML Basic Page</title>
    <!--
    tests depend on this file being stable; don't update it without also planning to update tests.py
    this file is transformed to Markdown to verify transform integrity
    -->
    <style>
        body { font-family: Georgia, serif; line-height: 1.6; margin: 2em; background: #fafafa; }
        h1 { color: #333; border-bottom: 2px solid #666; }
        h2 { color: #555; margin-top: 2em; }
        h3 { color: #777; }
        h4, h5, h6 { color: #888; }
        p { margin-bottom: 1em; }
        a { color: #0066cc; text-decoration: underline; }
        a:hover { color: #004499; }
        em { font-style: italic; color: #666; }
        strong, b { font-weight: bold; }
        i { font-style: italic; }
        ul, ol { margin: 1em 0; padding-left: 2em; }
        li { margin-bottom: 0.5em; }
        table { border-collapse: collapse; width: 100%; margin: 1em 0; }
        th, td { border: 1px solid #ccc; padding: 0.5em; text-align: left; }
        th { background: #f0f0f0; font-weight: bold; }
        blockquote { margin: 1em 2em; padding-left: 1em; border-left: 3px solid #ccc; font-style: italic; }
        code { background: #f5f5f5; padding: 0.2em 0.4em; font-family: monospace; }
        pre { background: #f5f5f5; padding: 1em; overflow-x: auto; }
        hr { border: none; border-top: 1px solid #ccc; margin: 2em 0; }
        dl { margin: 1em 0; }
        dt { font-weight: bold; margin-top: 0.5em; }
        dd { margin-left: 2em; margin-bottom: 0.5em; }
    </style>
</head>
<body>
    <h1>Lorem Ipsum Dolor Sit Amet</h1>
    <p>Lorem ipsum dolor sit amet, <strong>consectetur adipiscing elit</strong>. Sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, <a href="#nowhere">quis nostrud exercitation</a> ullamco laboris nisi ut aliquip ex ea commodo consequat. <em>Duis aute irure dolor</em> in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur.</p>

    <h2>Consectetur Adipiscing Elit</h2>
    <p>Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum. <b>Sed ut perspiciatis</b> unde omnis iste natus error sit voluptatem accusantium doloremque laudantium. <i>Totam rem aperiam</i>, eaque ipsa quae ab illo inventore veritatis et quasi architecto beatae vitae dicta sunt explicabo.</p>

    <h3>Nemo Enim Ipsam Voluptatem</h3>
    <p>Quia voluptas sit aspernatur aut odit aut fugit, sed quia consequuntur magni dolores eos qui ratione voluptatem sequi nesciunt. Neque porro quisquam est, qui dolorem ipsum quia dolor sit amet, consectetur, adipisci velit.</p>

    <h4>Sed Quia Non Numquam</h4>
    <p>Eius modi tempora incidunt ut labore et dolore magnam aliquam quaerat voluptatem. Ut enim ad minima veniam, quis nostrum exercitationem ullam corporis suscipit laboriosam.</p>

    <h5>Nisi Ut Aliquid Ex Ea</h5>
    <p>Commodi consequatur? Quis autem vel eum iure reprehenderit qui in ea voluptate velit esse quam nihil molestiae consequatur, vel illum qui dolorem eum fugiat quo voluptas nulla pariatur?</p>

    <h6>At Vero Eos Et Accusamus</h6>
    <p>Et iusto odio dignissimos ducimus qui blanditiis praesentium voluptatum deleniti atque corrupti quos dolores et quas molestias excepturi sint occaecati cupiditate non provident.</p>

    <hr>

    <h2>Unordered List Example</h2>
    <ul>
        <li>Similique sunt in culpa qui officia deserunt</li>
        <li>Mollitia animi, id est laborum et dolorum fuga</li>
        <li>Et harum quidem rerum facilis est et expedita distinctio</li>
        <li>Nam libero tempore, cum soluta nobis est eligendi optio</li>
        <li>Cumque nihil impedit quo minus id quod maxime</li>
    </ul>

    <h2>Ordered List Example</h2>
    <ol>
        <li>Temporibus autem quibusdam et aut officiis debitis</li>
        <li>Aut reiciendis voluptatibus maiores alias consequatur</li>
        <li>Aut perferendis doloribus asperiores repellat</li>
        <li>Itaque earum rerum hic tenetur a sapiente delectus</li>
        <li>Ut aut reiciendis voluptatibus maiores alias</li>
    </ol>

    <h2>Definition List Example</h2>
    <dl>
        <dt>Lorem Ipsum</dt>
        <dd>Dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua.</dd>

        <dt>Ut Enim</dt>
        <dd>Ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat.</dd>

        <dt>Duis Aute</dt>
        <dd>Irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur.</dd>
    </dl>

    <h2>Table Example</h2>
    <table>
        <thead>
            <tr>
                <th>Lorem</th>
                <th>Ipsum</th>
                <th>Dolor</th>
                <th>Sit</th>
            </tr>
        </thead>
        <tbody>
            <tr>
                <td>Consectetur</td>
                <td>Adipiscing</td>
                <td>Elit</td>
                <td>Sed</td>
            </tr>
            <tr>
                <td>Eiusmod</td>
                <td>Tempor</td>
                <td>Incididunt</td>
                <td>Labore</td>
            </tr>
            <tr>
                <td>Dolore</td>
                <td>Magna</td>
                <td>Aliqua</td>
                <td>Enim</td>
            </tr>
            <tr>
                <td>Minim</td>
                <td>Veniam</td>
                <td>Quis</td>
                <td>Nostrud</td>
            </tr>
        </tbody>
    </table>

    <h2>More Text Elements</h2>
    <p>Here we have some <code>inline code</code> and a longer code block below:</p>

    <pre><code>function lorem() {
    return "ipsum dolor sit amet";
}</code></pre>

    <blockquote>
        <p>"Sed ut perspiciatis unde omnis iste natus error sit voluptatem accusantium doloremque laudantium, totam rem aperiam, eaque ipsa quae ab illo inventore veritatis."</p>
    </blockquote>

    <p>Final paragraph with mixed formatting: <strong>bold text</strong>, <em>emphasized text</em>, <i>italic text</i>, <b>more bold</b>, and a <a href="#top">link back to top</a>. Nam libero tempore, cum soluta nobis est eligendi optio cumque nihil impedit quo minus id quod maxime placeat facere possimus.</p>
</body>
</html>
```
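
The comment at the top of this fixture notes that it is converted to Markdown to verify transform integrity. A minimal sketch of such a conversion, assuming the `markdown.xslt` template under `src/mcp_server_webcrawl/templates/` is the stylesheet in question and using lxml; whether `templates/tests.py` actually applies the XSLT this way is an assumption:

```python
from pathlib import Path

from lxml import etree, html

# paths follow the repository tree above; adjust to your checkout
xslt_path = Path("src/mcp_server_webcrawl/templates/markdown.xslt")
html_path = Path("src/mcp_server_webcrawl/templates/tests_core.html")

# compile the stylesheet and run the fixture through it
transform = etree.XSLT(etree.parse(str(xslt_path)))
document = html.parse(str(html_path))  # lenient HTML parser, tolerates the fixture as-is
markdown_text = str(transform(document))
print(markdown_text[:200])
```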

--------------------------------------------------------------------------------
/src/mcp_server_webcrawl/interactive/highlights.py:
--------------------------------------------------------------------------------

```python
import re
import curses

from dataclasses import dataclass
from typing import List

from mcp_server_webcrawl.interactive.ui import safe_addstr

@dataclass
class HighlightSpan:
    """
    Represents a highlight span in text
    """
    start: int
    end: int
    text: str

    def __str__(self) -> str:
        return f"[{self.start}:{self.end} '{self.text}']"


class HighlightProcessor:
    """
    Shared highlight processing utilities
    """

    QUOTED_PHRASE_PATTERN = re.compile(r'"([^"]+)"')
    WORD_PATTERN = re.compile(r"\b\w+\b")
    SNIPPET_MARKER_PATTERN = re.compile(r"\*\*([a-zA-Z\-_' ]+)\*\*")
    IGNORE_WORDS = {"AND", "OR", "NOT", "and", "or", "not", "type", "status", "size", "url", "id"}

    @staticmethod
    def extract_search_terms(query: str) -> List[str]:
        """
        Extract search terms from query, handling quoted phrases and individual keywords.
        """
        if not query or not query.strip():
            return []

        search_terms = []
        for match in HighlightProcessor.QUOTED_PHRASE_PATTERN.finditer(query):
            phrase = match.group(1).strip()
            if phrase:
                search_terms.append(phrase)

        remaining_query = HighlightProcessor.QUOTED_PHRASE_PATTERN.sub('', query)

        # extract individual words
        for match in HighlightProcessor.WORD_PATTERN.finditer(remaining_query):
            word = match.group().strip()
            if word and word not in HighlightProcessor.IGNORE_WORDS and len(word) > 2:
                search_terms.append(word)

        return search_terms

    @staticmethod
    def find_highlights_in_text(text: str, search_terms: List[str]) -> List[HighlightSpan]:
        """
        Find all highlight spans in text for the given search terms.
        """
        if not text or not search_terms:
            return []

        highlights = []
        escaped_terms = [re.escape(term.strip("\"'")) for term in search_terms]
        pattern = re.compile(rf"\b({'|'.join(escaped_terms)})\b", re.IGNORECASE)

        for match in pattern.finditer(text):
            span = HighlightSpan(
                start=match.start(),
                end=match.end(),
                text=match.group()
            )
            highlights.append(span)

        return HighlightProcessor.merge_overlapping_highlights(highlights, text)

    @staticmethod
    def extract_snippet_highlights(snippet_text: str) -> tuple[str, List[HighlightSpan]]:
        """
        Extract highlights from snippet text with **markers**, returning clean text and highlights.
        """
        if not snippet_text:
            return "", []

        normalized_text = re.sub(r"\s+", " ", snippet_text.strip())

        clean_text = ""
        highlights = []
        last_end = 0

        for match in HighlightProcessor.SNIPPET_MARKER_PATTERN.finditer(normalized_text):
            # text before this match
            clean_text += normalized_text[last_end:match.start()]

            # highlighted text (without markers)
            highlight_text = match.group(1)
            highlight_start = len(clean_text)
            clean_text += highlight_text
            highlight_end = len(clean_text)

            span: HighlightSpan = HighlightSpan(
                start=highlight_start,
                end=highlight_end,
                text=highlight_text
            )
            highlights.append(span)
            last_end = match.end()

        # remaining text
        clean_text += normalized_text[last_end:]

        return clean_text.strip(), highlights

    @staticmethod
    def merge_overlapping_highlights(highlights: List[HighlightSpan], text: str) -> List[HighlightSpan]:
        """Merge overlapping or adjacent highlight spans."""
        if not highlights:
            return []

        # sort by start position
        sorted_highlights = sorted(highlights, key=lambda h: h.start)
        merged = []

        for highlight in sorted_highlights:
            if not merged:
                merged.append(highlight)
            else:
                last = merged[-1]
                if highlight.start <= last.end:
                    # overlapping/adjacent - merge them
                    end = max(last.end, highlight.end)
                    merged_text = text[last.start:end]
                    merged[-1] = HighlightSpan(
                        start=last.start,
                        end=end,
                        text=merged_text
                    )
                else:
                    merged.append(highlight)

        return merged

    @staticmethod
    def render_text_with_highlights(
        stdscr: curses.window,
        text: str,
        highlights: List[HighlightSpan],
        x: int,
        y: int,
        max_width: int,
        normal_style: int,
        hit_style: int
    ) -> None:
        """
        Render text with highlights applied.
        """
        if not text.strip():
            return

        display_text: str = text[:max_width] if len(text) > max_width else text
        visible_highlights: list[HighlightSpan] = [h for h in highlights if h.start < len(display_text)]
        current_x: int = x
        pos: int = 0

        try:
            for highlight in visible_highlights:
                # text before highlight
                if highlight.start > pos:
                    text_before: str = display_text[pos:highlight.start]
                    safe_addstr(stdscr, y, current_x, text_before, normal_style)
                    current_x += len(text_before)
                    pos = highlight.start

                # highlighted text
                highlight_end: int = min(highlight.end, len(display_text))
                highlighted_text: str = display_text[highlight.start:highlight_end]
                if current_x + len(highlighted_text) <= x + max_width:
                    safe_addstr(stdscr, y, current_x, highlighted_text, hit_style)
                    current_x += len(highlighted_text)
                pos = highlight_end

            # remaining text
            if pos < len(display_text):
                remaining_text: str = display_text[pos:]
                remaining_width: int = max_width - (current_x - x)
                if remaining_width > 0:
                    safe_addstr(stdscr, y, current_x, remaining_text[:remaining_width], normal_style)

        except curses.error:
            pass

```
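
A minimal usage sketch of `HighlightProcessor` outside of curses rendering (the import path follows the tree above; note the module imports `curses`, so this assumes a platform where curses is available, and `render_text_with_highlights` is omitted because it needs a curses window):

```python
from mcp_server_webcrawl.interactive.highlights import HighlightProcessor

# quoted phrases are kept whole; boolean operators and short words are dropped
terms = HighlightProcessor.extract_search_terms('"lorem ipsum" dolor AND sit')
# -> ['lorem ipsum', 'dolor', 'sit']

text = "Lorem ipsum dolor sit amet, consectetur adipiscing elit."
for span in HighlightProcessor.find_highlights_in_text(text, terms):
    print(span)  # [0:11 'Lorem ipsum'], [12:17 'dolor'], [18:21 'sit']

# snippet markers (**term**) are stripped and returned as spans over the clean text
clean, hits = HighlightProcessor.extract_snippet_highlights("Sed ut **perspiciatis** unde")
print(clean)  # "Sed ut perspiciatis unde"
print(hits)   # [HighlightSpan(start=7, end=19, text='perspiciatis')]
```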

--------------------------------------------------------------------------------
/docs/guides.html:
--------------------------------------------------------------------------------

```html


<!DOCTYPE html>
<html class="writer-html5" lang="en" data-content_root="./">
<head>
  <meta charset="utf-8" /><meta name="viewport" content="width=device-width, initial-scale=1" />

  <meta name="viewport" content="width=device-width, initial-scale=1.0" />
  <title>Setup Guides &mdash; mcp-server-webcrawl  documentation</title>
      <link rel="stylesheet" type="text/css" href="_static/pygments.css?v=80d5e7a1" />
      <link rel="stylesheet" type="text/css" href="_static/css/theme.css?v=e59714d7" />

  
      <script src="_static/jquery.js?v=5d32c60e"></script>
      <script src="_static/_sphinx_javascript_frameworks_compat.js?v=2cd50e6c"></script>
      <script src="_static/documentation_options.js?v=5929fcd5"></script>
      <script src="_static/doctools.js?v=888ff710"></script>
      <script src="_static/sphinx_highlight.js?v=dc90522c"></script>
    <script src="_static/js/theme.js"></script>
    <link rel="index" title="Index" href="genindex.html" />
    <link rel="search" title="Search" href="search.html" />
    <link rel="next" title="ArchiveBox MCP Setup Guide" href="guides/archivebox.html" />
    <link rel="prev" title="Installation" href="installation.html" /> 
</head>

<body class="wy-body-for-nav"> 
  <div class="wy-grid-for-nav">
    <nav data-toggle="wy-nav-shift" class="wy-nav-side">
      <div class="wy-side-scroll">
        <div class="wy-side-nav-search" >

          
          
          <a href="index.html" class="icon icon-home">
            mcp-server-webcrawl
          </a>
<div role="search">
  <form id="rtd-search-form" class="wy-form" action="search.html" method="get">
    <input type="text" name="q" placeholder="Search docs" aria-label="Search docs" />
    <input type="hidden" name="check_keywords" value="yes" />
    <input type="hidden" name="area" value="default" />
  </form>
</div>
        </div><div class="wy-menu wy-menu-vertical" data-spy="affix" role="navigation" aria-label="Navigation menu">
              <p class="caption" role="heading"><span class="caption-text">Contents:</span></p>
<ul class="current">
<li class="toctree-l1"><a class="reference internal" href="installation.html">Installation</a></li>
<li class="toctree-l1 current"><a class="current reference internal" href="#">Setup Guides</a><ul>
<li class="toctree-l2"><a class="reference internal" href="guides/archivebox.html">ArchiveBox MCP Setup Guide</a></li>
<li class="toctree-l2"><a class="reference internal" href="guides/httrack.html">HTTrack MCP Setup Guide</a></li>
<li class="toctree-l2"><a class="reference internal" href="guides/interrobot.html">InterroBot MCP Setup Guide</a></li>
<li class="toctree-l2"><a class="reference internal" href="guides/katana.html">Katana MCP Setup Guide</a></li>
<li class="toctree-l2"><a class="reference internal" href="guides/siteone.html">SiteOne MCP Setup Guide</a></li>
<li class="toctree-l2"><a class="reference internal" href="guides/warc.html">WARC MCP Setup Guide</a></li>
<li class="toctree-l2"><a class="reference internal" href="guides/wget.html">wget MCP Setup Guide</a></li>
</ul>
</li>
<li class="toctree-l1"><a class="reference internal" href="usage.html">Usage</a></li>
<li class="toctree-l1"><a class="reference internal" href="prompts.html">Prompt Routines</a></li>
<li class="toctree-l1"><a class="reference internal" href="modules.html">mcp_server_webcrawl</a></li>
</ul>

        </div>
      </div>
    </nav>

    <section data-toggle="wy-nav-shift" class="wy-nav-content-wrap"><nav class="wy-nav-top" aria-label="Mobile navigation menu" >
          <i data-toggle="wy-nav-top" class="fa fa-bars"></i>
          <a href="index.html">mcp-server-webcrawl</a>
      </nav>

      <div class="wy-nav-content">
        <div class="rst-content">
          <div role="navigation" aria-label="Page navigation">
  <ul class="wy-breadcrumbs">
      <li><a href="index.html" class="icon icon-home" aria-label="Home"></a></li>
      <li class="breadcrumb-item active">Setup Guides</li>
      <li class="wy-breadcrumbs-aside">
            <a href="_sources/guides.rst.txt" rel="nofollow"> View page source</a>
      </li>
  </ul>
  <hr/>
</div>
          <div role="main" class="document" itemscope="itemscope" itemtype="http://schema.org/Article">
           <div itemprop="articleBody">
             
  <section id="setup-guides">
<h1>Setup Guides<a class="headerlink" href="#setup-guides" title="Link to this heading"></a></h1>
<p>This section contains detailed setup guides for mcp-server-webcrawl in various environments and configurations.</p>
<div class="toctree-wrapper compound">
<p class="caption" role="heading"><span class="caption-text">Available Guides:</span></p>
<ul>
<li class="toctree-l1"><a class="reference internal" href="guides/archivebox.html">ArchiveBox MCP Setup Guide</a></li>
<li class="toctree-l1"><a class="reference internal" href="guides/httrack.html">HTTrack MCP Setup Guide</a></li>
<li class="toctree-l1"><a class="reference internal" href="guides/interrobot.html">InterroBot MCP Setup Guide</a></li>
<li class="toctree-l1"><a class="reference internal" href="guides/katana.html">Katana MCP Setup Guide</a></li>
<li class="toctree-l1"><a class="reference internal" href="guides/siteone.html">SiteOne MCP Setup Guide</a></li>
<li class="toctree-l1"><a class="reference internal" href="guides/warc.html">WARC MCP Setup Guide</a></li>
<li class="toctree-l1"><a class="reference internal" href="guides/wget.html">wget MCP Setup Guide</a></li>
</ul>
</div>
</section>


           </div>
          </div>
          <footer><div class="rst-footer-buttons" role="navigation" aria-label="Footer">
        <a href="installation.html" class="btn btn-neutral float-left" title="Installation" accesskey="p" rel="prev"><span class="fa fa-arrow-circle-left" aria-hidden="true"></span> Previous</a>
        <a href="guides/archivebox.html" class="btn btn-neutral float-right" title="ArchiveBox MCP Setup Guide" accesskey="n" rel="next">Next <span class="fa fa-arrow-circle-right" aria-hidden="true"></span></a>
    </div>

  <hr/>

  <div role="contentinfo">
    <p>&#169; Copyright 2025, pragmar.</p>
  </div>

  Built with <a href="https://www.sphinx-doc.org/">Sphinx</a> using a
    <a href="https://github.com/readthedocs/sphinx_rtd_theme">theme</a>
    provided by <a href="https://readthedocs.org">Read the Docs</a>.
   

</footer>
        </div>
      </div>
    </section>
  </div>
  <script>
      jQuery(function () {
          SphinxRtdTheme.Navigation.enable(true);
      });
  </script> 

</body>
</html>
```

--------------------------------------------------------------------------------
/src/mcp_server_webcrawl/models/resources.py:
--------------------------------------------------------------------------------

```python
from enum import Enum
from typing import Final
from datetime import datetime

from mcp_server_webcrawl.models.base import BaseModel, METADATA_VALUE_TYPE
from mcp_server_webcrawl.utils import to_isoformat_zulu

RESOURCES_TOOL_NAME: Final[str] = "webcrawl_search"
RESOURCE_EXTRAS_ALLOWED: Final[set[str]] = {"markdown", "snippets", "regex", "thumbnails", "xpath"}
RESOURCES_LIMIT_DEFAULT: Final[int] = 20
RESOURCES_LIMIT_MAX: Final[int] = 100

RESOURCES_FIELDS_BASE: Final[list[str]] = ["id", "url", "site", "type", "status"]
RESOURCES_FIELDS_DEFAULT: Final[list[str]] = RESOURCES_FIELDS_BASE + ["created", "modified"]
RESOURCES_FIELDS_OPTIONS: Final[list[str]] = ["created", "modified", "size", "headers", "content"]

RESOURCES_DEFAULT_FIELD_MAPPING: Final[dict[str, str]] = {
    "id": "ResourcesFullText.Id",
    "site": "ResourcesFullText.Project",
    "created": "Resources.Created",
    "modified": "Resources.Modified",
    "url": "ResourcesFullText.Url",
    "status": "Resources.Status",
    "size": "Resources.Size",
    "type": "ResourcesFullText.Type",
    "headers": "ResourcesFullText.Headers",
    "content": "ResourcesFullText.Content",
    "time": "Resources.Time",
    "fulltext": "ResourcesFullText",
}
RESOURCES_DEFAULT_SORT_MAPPING: Final[dict[str, tuple[str, str]]] = {
    "+id": ("Resources.Id", "ASC"),
    "-id": ("Resources.Id", "DESC"),
    "+url": ("ResourcesFullText.Url", "ASC"),
    "-url": ("ResourcesFullText.Url", "DESC"),
    "+status": ("Resources.Status", "ASC"),
    "-status": ("Resources.Status", "DESC"),
    "+size": ("Resources.Size", "ASC"),
    "-size": ("Resources.Size", "DESC"),
    "?": ("Resources.Id", "RANDOM")
}

class ResourceResultType(Enum):
    """
    Enum representing different types of web resources.
    """
    UNDEFINED = ""
    PAGE = "html"
    FRAME = "iframe"
    IMAGE = "img"
    AUDIO = "audio"
    VIDEO = "video"
    FONT = "font"
    CSS = "style"
    SCRIPT = "script"
    FEED = "rss"
    TEXT = "text"
    PDF = "pdf"
    DOC = "doc"
    OTHER = "other"

    @classmethod
    def values(cls) -> list[str]:
        """
        Return all values of the enum as a list.
        """
        return [member.value for member in cls]

    @classmethod
    def to_int_map(cls):
        """
        Return a dictionary mapping each enum value to its integer position.

        Returns:
            dict: a dictionary with enum values as keys and their ordinal positions as values.
        """
        return {member.value: i for i, member in enumerate(cls)}

# if types stored as ints within db
RESOURCES_ENUMERATED_TYPE_MAPPING: Final[dict[int, ResourceResultType]] = {
    0: ResourceResultType.UNDEFINED,
    1: ResourceResultType.PAGE,
    2: ResourceResultType.OTHER,
    3: ResourceResultType.FEED,
    4: ResourceResultType.FRAME,
    5: ResourceResultType.OTHER,
    6: ResourceResultType.IMAGE,
    7: ResourceResultType.AUDIO,
    8: ResourceResultType.VIDEO,
    9: ResourceResultType.FONT,
    10: ResourceResultType.CSS,
    11: ResourceResultType.SCRIPT,
    12: ResourceResultType.OTHER,
    13: ResourceResultType.TEXT,
    14: ResourceResultType.PDF,
    15: ResourceResultType.DOC
}

class ResourceResult(BaseModel):
    """
    Represents a web resource result from a crawl operation.
    """
    def __init__(
        self,
        id: int,
        url: str,
        site: int | None = None,
        crawl: int | None = None,
        type: ResourceResultType = ResourceResultType.UNDEFINED,
        name: str | None = None,
        headers: str | None = None,
        content: str | None = None,
        created: datetime | None = None,
        modified: datetime | None = None,
        status: int | None = None,
        size: int | None = None,
        time: int | None = None,
        metadata: dict[str, METADATA_VALUE_TYPE] | None = None,
    ):
        """
        Initialize a ResourceResult instance.

        Args:
            id: resource identifier
            url: resource URL
            site: site identifier the resource belongs to
            crawl: crawl identifier the resource was found in
            type: type of resource
            name: resource name
            headers: HTTP headers
            content: resource content
            created: creation timestamp
            modified: last modification timestamp
            status: HTTP status code
            size: size in bytes
            time: response time in milliseconds
            metadata: additional metadata for the resource
        """
        self.id = id
        self.url = url
        self.site = site
        self.crawl = crawl
        self.type = type
        self.name = name
        self.headers = headers
        self.content = content
        self.created = created
        self.modified = modified
        self.status = status
        self.size = size  # in bytes
        self.time = time  # in millis
        self.metadata = metadata  # reserved

        # set externally
        self.__extras: dict[str, str] = {}

    def to_dict(self) -> dict[str, METADATA_VALUE_TYPE]:
        """
        Convert the object to a dictionary suitable for JSON serialization.
        """
        result: dict[str, METADATA_VALUE_TYPE] = {
            "id": self.id,
            "url": self.url,
            "site": self.site,
            "crawl": self.crawl,
            "type": self.type.value if self.type else None,
            "name": self.name,
            "headers": self.headers,
            "content": self.content,
            "created": to_isoformat_zulu(self.created) if self.created else None,
            "modified": to_isoformat_zulu(self.modified) if self.modified else None,
            "status": self.status,
            "size": self.size,
            "time": self.time,
            "metadata": self.metadata  # reserved
        }
        if self.__extras:
            result["extras"] = {k: v for k, v in self.__extras.items()}

        return {k: v for k, v in result.items() if v is not None and not (k == "metadata" and v == {})}

    def set_extra(self, extra_name: str, extra_value: str | None | list[str] | list[dict[str, str | int | float]]) -> None:
        assert extra_name in RESOURCE_EXTRAS_ALLOWED, f"Unexpected extra requested. {extra_name}"
        self.__extras[extra_name] = extra_value

    def get_extra(self, extra_name: str) -> str | None | list[str] | list[dict[str, str | int | float]]:
        assert extra_name in RESOURCE_EXTRAS_ALLOWED, f"Unexpected extra requested. {extra_name}"
        if extra_name in self.__extras:
            return self.__extras[extra_name]
        else:
            return None

```
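
A minimal construction sketch for `ResourceResult`, following the signature above; the field values are illustrative only:

```python
from datetime import datetime, timezone

from mcp_server_webcrawl.models.resources import ResourceResult, ResourceResultType

resource = ResourceResult(
    id=1,
    url="https://example.com/",
    site=1,
    type=ResourceResultType.PAGE,
    status=200,
    size=18304,
    created=datetime(2025, 1, 1, tzinfo=timezone.utc),
)

# extras are validated against RESOURCE_EXTRAS_ALLOWED ("markdown", "snippets", ...)
resource.set_extra("markdown", "# Example\n\nConverted page content...")

# None-valued fields are dropped from the serialized form
print(resource.to_dict())
```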

--------------------------------------------------------------------------------
/docs/_modules/mcp_server_webcrawl/crawlers/katana/crawler.html:
--------------------------------------------------------------------------------

```html


<!DOCTYPE html>
<html class="writer-html5" lang="en" data-content_root="../../../../">
<head>
  <meta charset="utf-8" />
  <meta name="viewport" content="width=device-width, initial-scale=1.0" />
  <title>mcp_server_webcrawl.crawlers.katana.crawler &mdash; mcp-server-webcrawl  documentation</title>
      <link rel="stylesheet" type="text/css" href="../../../../_static/pygments.css?v=80d5e7a1" />
      <link rel="stylesheet" type="text/css" href="../../../../_static/css/theme.css?v=e59714d7" />

  
      <script src="../../../../_static/jquery.js?v=5d32c60e"></script>
      <script src="../../../../_static/_sphinx_javascript_frameworks_compat.js?v=2cd50e6c"></script>
      <script src="../../../../_static/documentation_options.js?v=5929fcd5"></script>
      <script src="../../../../_static/doctools.js?v=888ff710"></script>
      <script src="../../../../_static/sphinx_highlight.js?v=dc90522c"></script>
    <script src="../../../../_static/js/theme.js"></script>
    <link rel="index" title="Index" href="../../../../genindex.html" />
    <link rel="search" title="Search" href="../../../../search.html" /> 
</head>

<body class="wy-body-for-nav"> 
  <div class="wy-grid-for-nav">
    <nav data-toggle="wy-nav-shift" class="wy-nav-side">
      <div class="wy-side-scroll">
        <div class="wy-side-nav-search" >

          
          
          <a href="../../../../index.html" class="icon icon-home">
            mcp-server-webcrawl
          </a>
<div role="search">
  <form id="rtd-search-form" class="wy-form" action="../../../../search.html" method="get">
    <input type="text" name="q" placeholder="Search docs" aria-label="Search docs" />
    <input type="hidden" name="check_keywords" value="yes" />
    <input type="hidden" name="area" value="default" />
  </form>
</div>
        </div><div class="wy-menu wy-menu-vertical" data-spy="affix" role="navigation" aria-label="Navigation menu">
              <p class="caption" role="heading"><span class="caption-text">Contents:</span></p>
<ul>
<li class="toctree-l1"><a class="reference internal" href="../../../../installation.html">Installation</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../../guides.html">Setup Guides</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../../usage.html">Usage</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../../modules.html">mcp_server_webcrawl</a></li>
</ul>

        </div>
      </div>
    </nav>

    <section data-toggle="wy-nav-shift" class="wy-nav-content-wrap"><nav class="wy-nav-top" aria-label="Mobile navigation menu" >
          <i data-toggle="wy-nav-top" class="fa fa-bars"></i>
          <a href="../../../../index.html">mcp-server-webcrawl</a>
      </nav>

      <div class="wy-nav-content">
        <div class="rst-content">
          <div role="navigation" aria-label="Page navigation">
  <ul class="wy-breadcrumbs">
      <li><a href="../../../../index.html" class="icon icon-home" aria-label="Home"></a></li>
          <li class="breadcrumb-item"><a href="../../../index.html">Module code</a></li>
          <li class="breadcrumb-item"><a href="../../crawlers.html">mcp_server_webcrawl.crawlers</a></li>
      <li class="breadcrumb-item active">mcp_server_webcrawl.crawlers.katana.crawler</li>
      <li class="wy-breadcrumbs-aside">
      </li>
  </ul>
  <hr/>
</div>
          <div role="main" class="document" itemscope="itemscope" itemtype="http://schema.org/Article">
           <div itemprop="articleBody">
             
  <h1>Source code for mcp_server_webcrawl.crawlers.katana.crawler</h1><div class="highlight"><pre>
<span></span><span class="kn">from</span> <span class="nn">pathlib</span> <span class="kn">import</span> <span class="n">Path</span>

<span class="kn">from</span> <span class="nn">mcp_server_webcrawl.crawlers.base.indexed</span> <span class="kn">import</span> <span class="n">IndexedCrawler</span>
<span class="kn">from</span> <span class="nn">mcp_server_webcrawl.crawlers.katana.adapter</span> <span class="kn">import</span> <span class="n">get_sites</span><span class="p">,</span> <span class="n">get_resources</span>
<span class="kn">from</span> <span class="nn">mcp_server_webcrawl.utils.logger</span> <span class="kn">import</span> <span class="n">get_logger</span>

<span class="n">logger</span> <span class="o">=</span> <span class="n">get_logger</span><span class="p">()</span>

<div class="viewcode-block" id="KatanaCrawler">
<a class="viewcode-back" href="../../../../mcp_server_webcrawl.crawlers.katana.html#mcp_server_webcrawl.crawlers.katana.crawler.KatanaCrawler">[docs]</a>
<span class="k">class</span> <span class="nc">KatanaCrawler</span><span class="p">(</span><span class="n">IndexedCrawler</span><span class="p">):</span>
<span class="w">    </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd">    A crawler implementation for HTTP text files.</span>
<span class="sd">    Provides functionality for accessing and searching web content from captured HTTP exchanges.</span>
<span class="sd">    &quot;&quot;&quot;</span>

<div class="viewcode-block" id="KatanaCrawler.__init__">
<a class="viewcode-back" href="../../../../mcp_server_webcrawl.crawlers.katana.html#mcp_server_webcrawl.crawlers.katana.crawler.KatanaCrawler.__init__">[docs]</a>
    <span class="k">def</span> <span class="fm">__init__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">datasrc</span><span class="p">:</span> <span class="n">Path</span><span class="p">):</span>
<span class="w">        </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd">        Initialize the HTTP text crawler with a data source directory.</span>

<span class="sd">        Args:</span>
<span class="sd">            datasrc: The input argument as Path; it must be a directory containing</span>
<span class="sd">                subdirectories with HTTP text files</span>
<span class="sd">        &quot;&quot;&quot;</span>
        <span class="nb">super</span><span class="p">()</span><span class="o">.</span><span class="fm">__init__</span><span class="p">(</span><span class="n">datasrc</span><span class="p">,</span> <span class="n">get_sites</span><span class="p">,</span> <span class="n">get_resources</span><span class="p">)</span></div>
</div>

</pre></div>

           </div>
          </div>
          <footer>

  <hr/>

  <div role="contentinfo">
    <p>&#169; Copyright 2025, pragmar.</p>
  </div>

  Built with <a href="https://www.sphinx-doc.org/">Sphinx</a> using a
    <a href="https://github.com/readthedocs/sphinx_rtd_theme">theme</a>
    provided by <a href="https://readthedocs.org">Read the Docs</a>.
   

</footer>
        </div>
      </div>
    </section>
  </div>
  <script>
      jQuery(function () {
          SphinxRtdTheme.Navigation.enable(true);
      });
  </script> 

</body>
</html>
```

--------------------------------------------------------------------------------
/docs/_static/js/versions.js:
--------------------------------------------------------------------------------

```javascript
const themeFlyoutDisplay = "hidden";
const themeVersionSelector = true;
const themeLanguageSelector = true;

if (themeFlyoutDisplay === "attached") {
  function renderLanguages(config) {
    if (!config.projects.translations.length) {
      return "";
    }

    // Insert the current language to the options on the selector
    let languages = config.projects.translations.concat(config.projects.current);
    languages = languages.sort((a, b) => a.language.name.localeCompare(b.language.name));

    const languagesHTML = `
      <dl>
        <dt>Languages</dt>
        ${languages
          .map(
            (translation) => `
        <dd ${translation.slug == config.projects.current.slug ? 'class="rtd-current-item"' : ""}>
          <a href="${translation.urls.documentation}">${translation.language.code}</a>
        </dd>
        `,
          )
          .join("\n")}
      </dl>
    `;
    return languagesHTML;
  }

  function renderVersions(config) {
    if (!config.versions.active.length) {
      return "";
    }
    const versionsHTML = `
      <dl>
        <dt>Versions</dt>
        ${config.versions.active
          .map(
            (version) => `
        <dd ${version.slug === config.versions.current.slug ? 'class="rtd-current-item"' : ""}>
          <a href="${version.urls.documentation}">${version.slug}</a>
        </dd>
        `,
          )
          .join("\n")}
      </dl>
    `;
    return versionsHTML;
  }

  function renderDownloads(config) {
    if (!Object.keys(config.versions.current.downloads).length) {
      return "";
    }
    const downloadsNameDisplay = {
      pdf: "PDF",
      epub: "Epub",
      htmlzip: "HTML",
    };

    const downloadsHTML = `
      <dl>
        <dt>Downloads</dt>
        ${Object.entries(config.versions.current.downloads)
          .map(
            ([name, url]) => `
          <dd>
            <a href="${url}">${downloadsNameDisplay[name]}</a>
          </dd>
        `,
          )
          .join("\n")}
      </dl>
    `;
    return downloadsHTML;
  }

  document.addEventListener("readthedocs-addons-data-ready", function (event) {
    const config = event.detail.data();

    const flyout = `
      <div class="rst-versions" data-toggle="rst-versions" role="note">
        <span class="rst-current-version" data-toggle="rst-current-version">
          <span class="fa fa-book"> Read the Docs</span>
          v: ${config.versions.current.slug}
          <span class="fa fa-caret-down"></span>
        </span>
        <div class="rst-other-versions">
          <div class="injected">
            ${renderLanguages(config)}
            ${renderVersions(config)}
            ${renderDownloads(config)}
            <dl>
              <dt>On Read the Docs</dt>
              <dd>
                <a href="${config.projects.current.urls.home}">Project Home</a>
              </dd>
              <dd>
                <a href="${config.projects.current.urls.builds}">Builds</a>
              </dd>
              <dd>
                <a href="${config.projects.current.urls.downloads}">Downloads</a>
              </dd>
            </dl>
            <dl>
              <dt>Search</dt>
              <dd>
                <form id="flyout-search-form">
                  <input
                    class="wy-form"
                    type="text"
                    name="q"
                    aria-label="Search docs"
                    placeholder="Search docs"
                    />
                </form>
              </dd>
            </dl>
            <hr />
            <small>
              <span>Hosted by <a href="https://about.readthedocs.org/?utm_source=&utm_content=flyout">Read the Docs</a></span>
            </small>
          </div>
        </div>
    `;

    // Inject the generated flyout into the body HTML element.
    document.body.insertAdjacentHTML("beforeend", flyout);

    // Trigger the Read the Docs Addons Search modal when clicking on the "Search docs" input from inside the flyout.
    document
      .querySelector("#flyout-search-form")
      .addEventListener("focusin", () => {
        const event = new CustomEvent("readthedocs-search-show");
        document.dispatchEvent(event);
      });
  })
}

if (themeLanguageSelector || themeVersionSelector) {
  function onSelectorSwitch(event) {
    const option = event.target.selectedIndex;
    const item = event.target.options[option];
    window.location.href = item.dataset.url;
  }

  document.addEventListener("readthedocs-addons-data-ready", function (event) {
    const config = event.detail.data();

    const versionSwitch = document.querySelector(
      "div.switch-menus > div.version-switch",
    );
    if (themeVersionSelector) {
      let versions = config.versions.active;
      if (config.versions.current.hidden || config.versions.current.type === "external") {
        versions.unshift(config.versions.current);
      }
      const versionSelect = `
    <select>
      ${versions
        .map(
          (version) => `
        <option
  value="${version.slug}"
  ${config.versions.current.slug === version.slug ? 'selected="selected"' : ""}
              data-url="${version.urls.documentation}">
              ${version.slug}
          </option>`,
        )
        .join("\n")}
    </select>
  `;

      versionSwitch.innerHTML = versionSelect;
      versionSwitch.firstElementChild.addEventListener("change", onSelectorSwitch);
    }

    const languageSwitch = document.querySelector(
      "div.switch-menus > div.language-switch",
    );

    if (themeLanguageSelector) {
      if (config.projects.translations.length) {
        // Add the current language to the options on the selector
        let languages = config.projects.translations.concat(
          config.projects.current,
        );
        languages = languages.sort((a, b) =>
          a.language.name.localeCompare(b.language.name),
        );

        const languageSelect = `
      <select>
        ${languages
          .map(
            (language) => `
              <option
                  value="${language.language.code}"
                  ${config.projects.current.slug === language.slug ? 'selected="selected"' : ""}
                  data-url="${language.urls.documentation}">
                  ${language.language.name}
              </option>`,
          )
          .join("\n")}
       </select>
    `;

        languageSwitch.innerHTML = languageSelect;
        languageSwitch.firstElementChild.addEventListener("change", onSelectorSwitch);
      }
      else {
        languageSwitch.remove();
      }
    }
  });
}

document.addEventListener("readthedocs-addons-data-ready", function (event) {
  // Trigger the Read the Docs Addons Search modal when clicking on "Search docs" input from the topnav.
  document
    .querySelector("[role='search'] input")
    .addEventListener("focusin", () => {
      const event = new CustomEvent("readthedocs-search-show");
      document.dispatchEvent(event);
    });
});
```

--------------------------------------------------------------------------------
/docs/_modules/mcp_server_webcrawl/main.html:
--------------------------------------------------------------------------------

```html


<!DOCTYPE html>
<html class="writer-html5" lang="en" data-content_root="../../">
<head>
  <meta charset="utf-8" />
  <meta name="viewport" content="width=device-width, initial-scale=1.0" />
  <title>mcp_server_webcrawl.main &mdash; mcp-server-webcrawl  documentation</title>
      <link rel="stylesheet" type="text/css" href="../../_static/pygments.css?v=80d5e7a1" />
      <link rel="stylesheet" type="text/css" href="../../_static/css/theme.css?v=e59714d7" />

  
      <script src="../../_static/jquery.js?v=5d32c60e"></script>
      <script src="../../_static/_sphinx_javascript_frameworks_compat.js?v=2cd50e6c"></script>
      <script src="../../_static/documentation_options.js?v=5929fcd5"></script>
      <script src="../../_static/doctools.js?v=888ff710"></script>
      <script src="../../_static/sphinx_highlight.js?v=dc90522c"></script>
    <script src="../../_static/js/theme.js"></script>
    <link rel="index" title="Index" href="../../genindex.html" />
    <link rel="search" title="Search" href="../../search.html" /> 
</head>

<body class="wy-body-for-nav"> 
  <div class="wy-grid-for-nav">
    <nav data-toggle="wy-nav-shift" class="wy-nav-side">
      <div class="wy-side-scroll">
        <div class="wy-side-nav-search" >

          
          
          <a href="../../index.html" class="icon icon-home">
            mcp-server-webcrawl
          </a>
<div role="search">
  <form id="rtd-search-form" class="wy-form" action="../../search.html" method="get">
    <input type="text" name="q" placeholder="Search docs" aria-label="Search docs" />
    <input type="hidden" name="check_keywords" value="yes" />
    <input type="hidden" name="area" value="default" />
  </form>
</div>
        </div><div class="wy-menu wy-menu-vertical" data-spy="affix" role="navigation" aria-label="Navigation menu">
              <p class="caption" role="heading"><span class="caption-text">Contents:</span></p>
<ul>
<li class="toctree-l1"><a class="reference internal" href="../../installation.html">Installation</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../guides.html">Setup Guides</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../usage.html">Usage</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../modules.html">mcp_server_webcrawl</a></li>
</ul>

        </div>
      </div>
    </nav>

    <section data-toggle="wy-nav-shift" class="wy-nav-content-wrap"><nav class="wy-nav-top" aria-label="Mobile navigation menu" >
          <i data-toggle="wy-nav-top" class="fa fa-bars"></i>
          <a href="../../index.html">mcp-server-webcrawl</a>
      </nav>

      <div class="wy-nav-content">
        <div class="rst-content">
          <div role="navigation" aria-label="Page navigation">
  <ul class="wy-breadcrumbs">
      <li><a href="../../index.html" class="icon icon-home" aria-label="Home"></a></li>
          <li class="breadcrumb-item"><a href="../index.html">Module code</a></li>
      <li class="breadcrumb-item active">mcp_server_webcrawl.main</li>
      <li class="wy-breadcrumbs-aside">
      </li>
  </ul>
  <hr/>
</div>
          <div role="main" class="document" itemscope="itemscope" itemtype="http://schema.org/Article">
           <div itemprop="articleBody">
             
  <h1>Source code for mcp_server_webcrawl.main</h1><div class="highlight"><pre>
<span></span><span class="kn">from</span> <span class="nn">pathlib</span> <span class="kn">import</span> <span class="n">Path</span>

<span class="kn">from</span> <span class="nn">mcp.server.stdio</span> <span class="kn">import</span> <span class="n">stdio_server</span>

<span class="kn">from</span> <span class="nn">mcp_server_webcrawl.crawlers.base.crawler</span> <span class="kn">import</span> <span class="n">BaseCrawler</span>
<span class="kn">from</span> <span class="nn">mcp_server_webcrawl.utils.logger</span> <span class="kn">import</span> <span class="n">get_logger</span><span class="p">,</span> <span class="n">initialize_logger</span>
<span class="kn">from</span> <span class="nn">mcp_server_webcrawl.utils.server</span> <span class="kn">import</span> <span class="n">initialize_mcp_server</span>

<span class="n">logger</span> <span class="o">=</span> <span class="n">get_logger</span><span class="p">()</span>

<div class="viewcode-block" id="main">
<a class="viewcode-back" href="../../mcp_server_webcrawl.html#mcp_server_webcrawl.main.main">[docs]</a>
<span class="k">async</span> <span class="k">def</span> <span class="nf">main</span><span class="p">(</span><span class="n">crawler</span><span class="p">:</span> <span class="n">BaseCrawler</span><span class="p">,</span> <span class="n">datasrc</span><span class="p">:</span> <span class="n">Path</span><span class="p">):</span>
    <span class="n">initialize_logger</span><span class="p">()</span>
    <span class="n">initialize_mcp_server</span><span class="p">()</span>
    <span class="k">async</span> <span class="k">with</span> <span class="n">stdio_server</span><span class="p">()</span> <span class="k">as</span> <span class="p">(</span><span class="n">read_stream</span><span class="p">,</span> <span class="n">write_stream</span><span class="p">):</span>
        <span class="n">crawler</span> <span class="o">=</span> <span class="n">crawler</span><span class="p">(</span><span class="n">datasrc</span><span class="p">)</span>
        <span class="n">logger</span><span class="o">.</span><span class="n">info</span><span class="p">(</span><span class="sa">f</span><span class="s2">&quot;MCP webcrawl server initialized with adapter </span><span class="si">{</span><span class="n">crawler</span><span class="o">.</span><span class="vm">__class__</span><span class="o">.</span><span class="vm">__name__</span><span class="si">}</span><span class="s2">&quot;</span><span class="p">)</span>
        <span class="n">logger</span><span class="o">.</span><span class="n">info</span><span class="p">(</span><span class="sa">f</span><span class="s2">&quot;datasrc: </span><span class="si">{</span><span class="n">datasrc</span><span class="o">.</span><span class="n">absolute</span><span class="p">()</span><span class="si">}</span><span class="s2">&quot;</span><span class="p">)</span>
        <span class="k">await</span> <span class="n">crawler</span><span class="o">.</span><span class="n">serve</span><span class="p">(</span><span class="n">read_stream</span><span class="p">,</span> <span class="n">write_stream</span><span class="p">)</span>
        <span class="n">logger</span><span class="o">.</span><span class="n">info</span><span class="p">(</span><span class="s2">&quot;MCP webcrawl server exited&quot;</span><span class="p">)</span></div>

</pre></div>

           </div>
          </div>
          <footer>

  <hr/>

  <div role="contentinfo">
    <p>&#169; Copyright 2025, pragmar.</p>
  </div>

  Built with <a href="https://www.sphinx-doc.org/">Sphinx</a> using a
    <a href="https://github.com/readthedocs/sphinx_rtd_theme">theme</a>
    provided by <a href="https://readthedocs.org">Read the Docs</a>.
   

</footer>
        </div>
      </div>
    </section>
  </div>
  <script>
      jQuery(function () {
          SphinxRtdTheme.Navigation.enable(true);
      });
  </script> 

</body>
</html>
```
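
The source above shows that `main()` receives a crawler class and a `Path`; it instantiates the class itself inside the stdio server context (`crawler = crawler(datasrc)`). A minimal sketch of driving it, assuming the `WgetCrawler` class shown later in this dump and a placeholder archive path:

```python
# Hedged sketch: invoking main() with a crawler class, per the source above.
# The archive path is a placeholder; point it at a real wget capture directory.
import asyncio
from pathlib import Path

from mcp_server_webcrawl.crawlers.wget.crawler import WgetCrawler
from mcp_server_webcrawl.main import main

if __name__ == "__main__":
    # main() calls crawler(datasrc) internally, so the class itself is passed
    asyncio.run(main(WgetCrawler, Path("/path/to/wget/archives")))
```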

--------------------------------------------------------------------------------
/docs/installation.html:
--------------------------------------------------------------------------------

```html


<!DOCTYPE html>
<html class="writer-html5" lang="en" data-content_root="./">
<head>
  <meta charset="utf-8" /><meta name="viewport" content="width=device-width, initial-scale=1" />

  <meta name="viewport" content="width=device-width, initial-scale=1.0" />
  <title>Installation &mdash; mcp-server-webcrawl  documentation</title>
      <link rel="stylesheet" type="text/css" href="_static/pygments.css?v=80d5e7a1" />
      <link rel="stylesheet" type="text/css" href="_static/css/theme.css?v=e59714d7" />


      <script src="_static/jquery.js?v=5d32c60e"></script>
      <script src="_static/_sphinx_javascript_frameworks_compat.js?v=2cd50e6c"></script>
      <script src="_static/documentation_options.js?v=5929fcd5"></script>
      <script src="_static/doctools.js?v=888ff710"></script>
      <script src="_static/sphinx_highlight.js?v=dc90522c"></script>
    <script src="_static/js/theme.js"></script>
    <link rel="index" title="Index" href="genindex.html" />
    <link rel="search" title="Search" href="search.html" />
    <link rel="next" title="Setup Guides" href="guides.html" />
    <link rel="prev" title="mcp-server-webcrawl" href="index.html" />
</head>

<body class="wy-body-for-nav">
  <div class="wy-grid-for-nav">
    <nav data-toggle="wy-nav-shift" class="wy-nav-side">
      <div class="wy-side-scroll">
        <div class="wy-side-nav-search" >



          <a href="index.html" class="icon icon-home">
            mcp-server-webcrawl
          </a>
<div role="search">
  <form id="rtd-search-form" class="wy-form" action="search.html" method="get">
    <input type="text" name="q" placeholder="Search docs" aria-label="Search docs" />
    <input type="hidden" name="check_keywords" value="yes" />
    <input type="hidden" name="area" value="default" />
  </form>
</div>
        </div><div class="wy-menu wy-menu-vertical" data-spy="affix" role="navigation" aria-label="Navigation menu">
              <p class="caption" role="heading"><span class="caption-text">Contents:</span></p>
<ul class="current">
<li class="toctree-l1 current"><a class="current reference internal" href="#">Installation</a><ul>
<li class="toctree-l2"><a class="reference internal" href="#requirements">Requirements</a></li>
<li class="toctree-l2"><a class="reference internal" href="#mcp-configuration">MCP Configuration</a></li>
<li class="toctree-l2"><a class="reference internal" href="#multiple-configurations">Multiple Configurations</a></li>
<li class="toctree-l2"><a class="reference internal" href="#references">References</a></li>
</ul>
</li>
<li class="toctree-l1"><a class="reference internal" href="guides.html">Setup Guides</a></li>
<li class="toctree-l1"><a class="reference internal" href="usage.html">Usage</a></li>
<li class="toctree-l1"><a class="reference internal" href="prompts.html">Prompt Routines</a></li>
<li class="toctree-l1"><a class="reference internal" href="interactive.html">Interactive Mode</a></li>
<li class="toctree-l1"><a class="reference internal" href="modules.html">mcp_server_webcrawl</a></li>
</ul>

        </div>
      </div>
    </nav>

    <section data-toggle="wy-nav-shift" class="wy-nav-content-wrap"><nav class="wy-nav-top" aria-label="Mobile navigation menu" >
          <i data-toggle="wy-nav-top" class="fa fa-bars"></i>
          <a href="index.html">mcp-server-webcrawl</a>
      </nav>

      <div class="wy-nav-content">
        <div class="rst-content">
          <div role="navigation" aria-label="Page navigation">
  <ul class="wy-breadcrumbs">
      <li><a href="index.html" class="icon icon-home" aria-label="Home"></a></li>
      <li class="breadcrumb-item active">Installation</li>
      <li class="wy-breadcrumbs-aside">
            <a href="_sources/installation.rst.txt" rel="nofollow"> View page source</a>
      </li>
  </ul>
  <hr/>
</div>
          <div role="main" class="document" itemscope="itemscope" itemtype="http://schema.org/Article">
           <div itemprop="articleBody">

  <section id="installation">
<h1>Installation<a class="headerlink" href="#installation" title="Link to this heading"></a></h1>
<p>Install the package via pip:</p>
<div class="highlight-bash notranslate"><div class="highlight"><pre><span></span>pip<span class="w"> </span>install<span class="w"> </span>mcp-server-webcrawl
</pre></div>
</div>
<section id="requirements">
<h2>Requirements<a class="headerlink" href="#requirements" title="Link to this heading"></a></h2>
<p>To use mcp-server-webcrawl effectively, you need:</p>
<ul class="simple">
<li><p>An MCP-capable LLM host such as Claude Desktop [1]</p></li>
<li><p>Python [2] installed on your command line interface</p></li>
<li><p>Basic familiarity with running Python packages</p></li>
</ul>
<p>After ensuring these prerequisites are met, run the pip install command above to add the package to your environment.</p>
</section>
<section id="mcp-configuration">
<h2>MCP Configuration<a class="headerlink" href="#mcp-configuration" title="Link to this heading"></a></h2>
<p>To enable your LLM host to access your web crawl data, you’ll need to add an MCP server configuration. From Claude’s developer settings, locate the MCP configuration section and add the appropriate configuration for your crawler type.</p>
<p>Setup guides and videos are available for each supported crawler:</p>
<ul class="simple">
<li><p><a class="reference internal" href="guides/archivebox.html"><span class="doc">ArchiveBox</span></a></p></li>
<li><p><a class="reference internal" href="guides/httrack.html"><span class="doc">HTTrack</span></a></p></li>
<li><p><a class="reference internal" href="guides/interrobot.html"><span class="doc">InterroBot</span></a></p></li>
<li><p><a class="reference internal" href="guides/katana.html"><span class="doc">Katana</span></a></p></li>
<li><p><a class="reference internal" href="guides/siteone.html"><span class="doc">SiteOne</span></a></p></li>
<li><p><a class="reference internal" href="guides/warc.html"><span class="doc">WARC</span></a></p></li>
<li><p><a class="reference internal" href="guides/wget.html"><span class="doc">Wget</span></a></p></li>
</ul>
</section>

<section id="references">
<h2>References<a class="headerlink" href="#references" title="Link to this heading"></a></h2>
<p>[1] Claude Desktop: <a class="reference external" href="https://claude.ai">https://claude.ai</a>
[2] Python: <a class="reference external" href="https://python.org">https://python.org</a></p>
</section>
</section>


           </div>
          </div>
          <footer><div class="rst-footer-buttons" role="navigation" aria-label="Footer">
        <a href="index.html" class="btn btn-neutral float-left" title="mcp-server-webcrawl" accesskey="p" rel="prev"><span class="fa fa-arrow-circle-left" aria-hidden="true"></span> Previous</a>
        <a href="guides.html" class="btn btn-neutral float-right" title="Setup Guides" accesskey="n" rel="next">Next <span class="fa fa-arrow-circle-right" aria-hidden="true"></span></a>
    </div>

  <hr/>

  <div role="contentinfo">
    <p>&#169; Copyright 2025, pragmar.</p>
  </div>

  Built with <a href="https://www.sphinx-doc.org/">Sphinx</a> using a
    <a href="https://github.com/readthedocs/sphinx_rtd_theme">theme</a>
    provided by <a href="https://readthedocs.org">Read the Docs</a>.


</footer>
        </div>
      </div>
    </section>
  </div>
  <script>
      jQuery(function () {
          SphinxRtdTheme.Navigation.enable(true);
      });
  </script>

</body>
</html>
```

--------------------------------------------------------------------------------
/docs/_modules/mcp_server_webcrawl/crawlers/warc/crawler.html:
--------------------------------------------------------------------------------

```html


<!DOCTYPE html>
<html class="writer-html5" lang="en" data-content_root="../../../../">
<head>
  <meta charset="utf-8" />
  <meta name="viewport" content="width=device-width, initial-scale=1.0" />
  <title>mcp_server_webcrawl.crawlers.warc.crawler &mdash; mcp-server-webcrawl  documentation</title>
      <link rel="stylesheet" type="text/css" href="../../../../_static/pygments.css?v=80d5e7a1" />
      <link rel="stylesheet" type="text/css" href="../../../../_static/css/theme.css?v=e59714d7" />

  
      <script src="../../../../_static/jquery.js?v=5d32c60e"></script>
      <script src="../../../../_static/_sphinx_javascript_frameworks_compat.js?v=2cd50e6c"></script>
      <script src="../../../../_static/documentation_options.js?v=5929fcd5"></script>
      <script src="../../../../_static/doctools.js?v=888ff710"></script>
      <script src="../../../../_static/sphinx_highlight.js?v=dc90522c"></script>
    <script src="../../../../_static/js/theme.js"></script>
    <link rel="index" title="Index" href="../../../../genindex.html" />
    <link rel="search" title="Search" href="../../../../search.html" /> 
</head>

<body class="wy-body-for-nav"> 
  <div class="wy-grid-for-nav">
    <nav data-toggle="wy-nav-shift" class="wy-nav-side">
      <div class="wy-side-scroll">
        <div class="wy-side-nav-search" >

          
          
          <a href="../../../../index.html" class="icon icon-home">
            mcp-server-webcrawl
          </a>
<div role="search">
  <form id="rtd-search-form" class="wy-form" action="../../../../search.html" method="get">
    <input type="text" name="q" placeholder="Search docs" aria-label="Search docs" />
    <input type="hidden" name="check_keywords" value="yes" />
    <input type="hidden" name="area" value="default" />
  </form>
</div>
        </div><div class="wy-menu wy-menu-vertical" data-spy="affix" role="navigation" aria-label="Navigation menu">
              <p class="caption" role="heading"><span class="caption-text">Contents:</span></p>
<ul>
<li class="toctree-l1"><a class="reference internal" href="../../../../installation.html">Installation</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../../guides.html">Setup Guides</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../../usage.html">Usage</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../../modules.html">mcp_server_webcrawl</a></li>
</ul>

        </div>
      </div>
    </nav>

    <section data-toggle="wy-nav-shift" class="wy-nav-content-wrap"><nav class="wy-nav-top" aria-label="Mobile navigation menu" >
          <i data-toggle="wy-nav-top" class="fa fa-bars"></i>
          <a href="../../../../index.html">mcp-server-webcrawl</a>
      </nav>

      <div class="wy-nav-content">
        <div class="rst-content">
          <div role="navigation" aria-label="Page navigation">
  <ul class="wy-breadcrumbs">
      <li><a href="../../../../index.html" class="icon icon-home" aria-label="Home"></a></li>
          <li class="breadcrumb-item"><a href="../../../index.html">Module code</a></li>
          <li class="breadcrumb-item"><a href="../../crawlers.html">mcp_server_webcrawl.crawlers</a></li>
      <li class="breadcrumb-item active">mcp_server_webcrawl.crawlers.warc.crawler</li>
      <li class="wy-breadcrumbs-aside">
      </li>
  </ul>
  <hr/>
</div>
          <div role="main" class="document" itemscope="itemscope" itemtype="http://schema.org/Article">
           <div itemprop="articleBody">
             
  <h1>Source code for mcp_server_webcrawl.crawlers.warc.crawler</h1><div class="highlight"><pre>
<span></span><span class="kn">from</span> <span class="nn">pathlib</span> <span class="kn">import</span> <span class="n">Path</span>

<span class="kn">from</span> <span class="nn">mcp_server_webcrawl.crawlers.base.indexed</span> <span class="kn">import</span> <span class="n">IndexedCrawler</span>
<span class="kn">from</span> <span class="nn">mcp_server_webcrawl.crawlers.warc.adapter</span> <span class="kn">import</span> <span class="n">get_sites</span><span class="p">,</span> <span class="n">get_resources</span>
<span class="kn">from</span> <span class="nn">mcp_server_webcrawl.utils.logger</span> <span class="kn">import</span> <span class="n">get_logger</span>

<span class="n">logger</span> <span class="o">=</span> <span class="n">get_logger</span><span class="p">()</span>

<div class="viewcode-block" id="WarcCrawler">
<a class="viewcode-back" href="../../../../mcp_server_webcrawl.crawlers.warc.html#mcp_server_webcrawl.crawlers.warc.crawler.WarcCrawler">[docs]</a>
<span class="k">class</span> <span class="nc">WarcCrawler</span><span class="p">(</span><span class="n">IndexedCrawler</span><span class="p">):</span>
<span class="w">    </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd">    A crawler implementation for WARC (Web ARChive) files.</span>
<span class="sd">    Provides functionality for accessing and searching web archive content.</span>
<span class="sd">    &quot;&quot;&quot;</span>

<div class="viewcode-block" id="WarcCrawler.__init__">
<a class="viewcode-back" href="../../../../mcp_server_webcrawl.crawlers.warc.html#mcp_server_webcrawl.crawlers.warc.crawler.WarcCrawler.__init__">[docs]</a>
    <span class="k">def</span> <span class="fm">__init__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">datasrc</span><span class="p">:</span> <span class="n">Path</span><span class="p">):</span>
<span class="w">        </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd">        Initialize the WARC crawler with a data source directory.</span>
<span class="sd">        Supported file types: .txt, .warc, and .warc.gz</span>

<span class="sd">        Args:</span>
<span class="sd">            datasrc: the input argument as Path, must be a directory containing WARC files</span>


<span class="sd">        Raises:</span>
<span class="sd">            AssertionError: If datasrc is None or not a directory</span>
<span class="sd">        &quot;&quot;&quot;</span>
        <span class="k">assert</span> <span class="n">datasrc</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span><span class="p">,</span> <span class="sa">f</span><span class="s2">&quot;WarcCrawler needs a datasrc, regardless of action&quot;</span>
        <span class="k">assert</span> <span class="n">datasrc</span><span class="o">.</span><span class="n">is_dir</span><span class="p">(),</span> <span class="s2">&quot;WarcCrawler datasrc must be a directory&quot;</span>
        <span class="nb">super</span><span class="p">()</span><span class="o">.</span><span class="fm">__init__</span><span class="p">(</span><span class="n">datasrc</span><span class="p">,</span> <span class="n">get_sites</span><span class="p">,</span> <span class="n">get_resources</span><span class="p">)</span></div>
</div>

</pre></div>

           </div>
          </div>
          <footer>

  <hr/>

  <div role="contentinfo">
    <p>&#169; Copyright 2025, pragmar.</p>
  </div>

  Built with <a href="https://www.sphinx-doc.org/">Sphinx</a> using a
    <a href="https://github.com/readthedocs/sphinx_rtd_theme">theme</a>
    provided by <a href="https://readthedocs.org">Read the Docs</a>.
   

</footer>
        </div>
      </div>
    </section>
  </div>
  <script>
      jQuery(function () {
          SphinxRtdTheme.Navigation.enable(true);
      });
  </script> 

</body>
</html>
```

--------------------------------------------------------------------------------
/docs/_modules/mcp_server_webcrawl/crawlers/wget/crawler.html:
--------------------------------------------------------------------------------

```html


<!DOCTYPE html>
<html class="writer-html5" lang="en" data-content_root="../../../../">
<head>
  <meta charset="utf-8" />
  <meta name="viewport" content="width=device-width, initial-scale=1.0" />
  <title>mcp_server_webcrawl.crawlers.wget.crawler &mdash; mcp-server-webcrawl  documentation</title>
      <link rel="stylesheet" type="text/css" href="../../../../_static/pygments.css?v=80d5e7a1" />
      <link rel="stylesheet" type="text/css" href="../../../../_static/css/theme.css?v=e59714d7" />

  
      <script src="../../../../_static/jquery.js?v=5d32c60e"></script>
      <script src="../../../../_static/_sphinx_javascript_frameworks_compat.js?v=2cd50e6c"></script>
      <script src="../../../../_static/documentation_options.js?v=5929fcd5"></script>
      <script src="../../../../_static/doctools.js?v=888ff710"></script>
      <script src="../../../../_static/sphinx_highlight.js?v=dc90522c"></script>
    <script src="../../../../_static/js/theme.js"></script>
    <link rel="index" title="Index" href="../../../../genindex.html" />
    <link rel="search" title="Search" href="../../../../search.html" /> 
</head>

<body class="wy-body-for-nav"> 
  <div class="wy-grid-for-nav">
    <nav data-toggle="wy-nav-shift" class="wy-nav-side">
      <div class="wy-side-scroll">
        <div class="wy-side-nav-search" >

          
          
          <a href="../../../../index.html" class="icon icon-home">
            mcp-server-webcrawl
          </a>
<div role="search">
  <form id="rtd-search-form" class="wy-form" action="../../../../search.html" method="get">
    <input type="text" name="q" placeholder="Search docs" aria-label="Search docs" />
    <input type="hidden" name="check_keywords" value="yes" />
    <input type="hidden" name="area" value="default" />
  </form>
</div>
        </div><div class="wy-menu wy-menu-vertical" data-spy="affix" role="navigation" aria-label="Navigation menu">
              <p class="caption" role="heading"><span class="caption-text">Contents:</span></p>
<ul>
<li class="toctree-l1"><a class="reference internal" href="../../../../installation.html">Installation</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../../guides.html">Setup Guides</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../../usage.html">Usage</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../../modules.html">mcp_server_webcrawl</a></li>
</ul>

        </div>
      </div>
    </nav>

    <section data-toggle="wy-nav-shift" class="wy-nav-content-wrap"><nav class="wy-nav-top" aria-label="Mobile navigation menu" >
          <i data-toggle="wy-nav-top" class="fa fa-bars"></i>
          <a href="../../../../index.html">mcp-server-webcrawl</a>
      </nav>

      <div class="wy-nav-content">
        <div class="rst-content">
          <div role="navigation" aria-label="Page navigation">
  <ul class="wy-breadcrumbs">
      <li><a href="../../../../index.html" class="icon icon-home" aria-label="Home"></a></li>
          <li class="breadcrumb-item"><a href="../../../index.html">Module code</a></li>
          <li class="breadcrumb-item"><a href="../../crawlers.html">mcp_server_webcrawl.crawlers</a></li>
      <li class="breadcrumb-item active">mcp_server_webcrawl.crawlers.wget.crawler</li>
      <li class="wy-breadcrumbs-aside">
      </li>
  </ul>
  <hr/>
</div>
          <div role="main" class="document" itemscope="itemscope" itemtype="http://schema.org/Article">
           <div itemprop="articleBody">
             
  <h1>Source code for mcp_server_webcrawl.crawlers.wget.crawler</h1><div class="highlight"><pre>
<span></span><span class="kn">from</span> <span class="nn">pathlib</span> <span class="kn">import</span> <span class="n">Path</span>

<span class="kn">from</span> <span class="nn">mcp_server_webcrawl.crawlers.base.indexed</span> <span class="kn">import</span> <span class="n">IndexedCrawler</span>
<span class="kn">from</span> <span class="nn">mcp_server_webcrawl.crawlers.wget.adapter</span> <span class="kn">import</span> <span class="n">get_sites</span><span class="p">,</span> <span class="n">get_resources</span>
<span class="kn">from</span> <span class="nn">mcp_server_webcrawl.utils.logger</span> <span class="kn">import</span> <span class="n">get_logger</span>

<span class="n">logger</span> <span class="o">=</span> <span class="n">get_logger</span><span class="p">()</span>

<div class="viewcode-block" id="WgetCrawler">
<a class="viewcode-back" href="../../../../mcp_server_webcrawl.crawlers.wget.html#mcp_server_webcrawl.crawlers.wget.crawler.WgetCrawler">[docs]</a>
<span class="k">class</span> <span class="nc">WgetCrawler</span><span class="p">(</span><span class="n">IndexedCrawler</span><span class="p">):</span>
<span class="w">    </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd">    A crawler implementation for wget captured sites.</span>
<span class="sd">    Provides functionality for accessing and searching web content from wget captures.</span>
<span class="sd">    &quot;&quot;&quot;</span>

<div class="viewcode-block" id="WgetCrawler.__init__">
<a class="viewcode-back" href="../../../../mcp_server_webcrawl.crawlers.wget.html#mcp_server_webcrawl.crawlers.wget.crawler.WgetCrawler.__init__">[docs]</a>
    <span class="k">def</span> <span class="fm">__init__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">datasrc</span><span class="p">:</span> <span class="n">Path</span><span class="p">):</span>
<span class="w">        </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd">        Initialize the wget crawler with a data source directory.</span>

<span class="sd">        Args:</span>
<span class="sd">            datasrc: the input argument as Path, it must be a directory containing</span>
<span class="sd">                wget captures organized as subdirectories</span>

<span class="sd">        Raises:</span>
<span class="sd">            AssertionError: If datasrc is None or not a directory</span>
<span class="sd">        &quot;&quot;&quot;</span>
        <span class="k">assert</span> <span class="n">datasrc</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span><span class="p">,</span> <span class="sa">f</span><span class="s2">&quot;WgetCrawler needs a datasrc, regardless of action&quot;</span>
        <span class="k">assert</span> <span class="n">datasrc</span><span class="o">.</span><span class="n">is_dir</span><span class="p">(),</span> <span class="s2">&quot;WgetCrawler datasrc must be a directory&quot;</span>

        <span class="nb">super</span><span class="p">()</span><span class="o">.</span><span class="fm">__init__</span><span class="p">(</span><span class="n">datasrc</span><span class="p">,</span> <span class="n">get_sites</span><span class="p">,</span> <span class="n">get_resources</span><span class="p">)</span></div>
</div>

</pre></div>

           </div>
          </div>
          <footer>

  <hr/>

  <div role="contentinfo">
    <p>&#169; Copyright 2025, pragmar.</p>
  </div>

  Built with <a href="https://www.sphinx-doc.org/">Sphinx</a> using a
    <a href="https://github.com/readthedocs/sphinx_rtd_theme">theme</a>
    provided by <a href="https://readthedocs.org">Read the Docs</a>.
   

</footer>
        </div>
      </div>
    </section>
  </div>
  <script>
      jQuery(function () {
          SphinxRtdTheme.Navigation.enable(true);
      });
  </script> 

</body>
</html>
```

--------------------------------------------------------------------------------
/docs/interactive.html:
--------------------------------------------------------------------------------

```html


<!DOCTYPE html>
<html class="writer-html5" lang="en" data-content_root="./">
<head>
  <meta charset="utf-8" /><meta name="viewport" content="width=device-width, initial-scale=1" />

  <meta name="viewport" content="width=device-width, initial-scale=1.0" />
  <title>Interactive Mode &mdash; mcp-server-webcrawl  documentation</title>
      <link rel="stylesheet" type="text/css" href="_static/pygments.css?v=80d5e7a1" />
      <link rel="stylesheet" type="text/css" href="_static/css/theme.css?v=e59714d7" />

  
      <script src="_static/jquery.js?v=5d32c60e"></script>
      <script src="_static/_sphinx_javascript_frameworks_compat.js?v=2cd50e6c"></script>
      <script src="_static/documentation_options.js?v=5929fcd5"></script>
      <script src="_static/doctools.js?v=888ff710"></script>
      <script src="_static/sphinx_highlight.js?v=dc90522c"></script>
    <script src="_static/js/theme.js"></script>
    <link rel="index" title="Index" href="genindex.html" />
    <link rel="search" title="Search" href="search.html" />
    <link rel="next" title="mcp_server_webcrawl" href="modules.html" />
    <link rel="prev" title="Prompt Routines" href="prompts.html" /> 
</head>

<body class="wy-body-for-nav"> 
  <div class="wy-grid-for-nav">
    <nav data-toggle="wy-nav-shift" class="wy-nav-side">
      <div class="wy-side-scroll">
        <div class="wy-side-nav-search" >

          
          
          <a href="index.html" class="icon icon-home">
            mcp-server-webcrawl
          </a>
<div role="search">
  <form id="rtd-search-form" class="wy-form" action="search.html" method="get">
    <input type="text" name="q" placeholder="Search docs" aria-label="Search docs" />
    <input type="hidden" name="check_keywords" value="yes" />
    <input type="hidden" name="area" value="default" />
  </form>
</div>
        </div><div class="wy-menu wy-menu-vertical" data-spy="affix" role="navigation" aria-label="Navigation menu">
              <p class="caption" role="heading"><span class="caption-text">Contents:</span></p>
<ul class="current">
<li class="toctree-l1"><a class="reference internal" href="installation.html">Installation</a></li>
<li class="toctree-l1"><a class="reference internal" href="guides.html">Setup Guides</a></li>
<li class="toctree-l1"><a class="reference internal" href="usage.html">Usage</a></li>
<li class="toctree-l1"><a class="reference internal" href="prompts.html">Prompt Routines</a></li>
<li class="toctree-l1 current"><a class="current reference internal" href="#">Interactive Mode</a><ul>
<li class="toctree-l2"><a class="reference internal" href="#usage">Usage</a></li>
<li class="toctree-l2"><a class="reference internal" href="#screencaps">Screencaps</a></li>
</ul>
</li>
<li class="toctree-l1"><a class="reference internal" href="modules.html">mcp_server_webcrawl</a></li>
</ul>

        </div>
      </div>
    </nav>

    <section data-toggle="wy-nav-shift" class="wy-nav-content-wrap"><nav class="wy-nav-top" aria-label="Mobile navigation menu" >
          <i data-toggle="wy-nav-top" class="fa fa-bars"></i>
          <a href="index.html">mcp-server-webcrawl</a>
      </nav>

      <div class="wy-nav-content">
        <div class="rst-content">
          <div role="navigation" aria-label="Page navigation">
  <ul class="wy-breadcrumbs">
      <li><a href="index.html" class="icon icon-home" aria-label="Home"></a></li>
      <li class="breadcrumb-item active">Interactive Mode</li>
      <li class="wy-breadcrumbs-aside">
            <a href="_sources/interactive.rst.txt" rel="nofollow"> View page source</a>
      </li>
  </ul>
  <hr/>
</div>
          <div role="main" class="document" itemscope="itemscope" itemtype="http://schema.org/Article">
           <div itemprop="articleBody">
             
  <section id="interactive-mode">
<h1>Interactive Mode<a class="headerlink" href="#interactive-mode" title="Link to this heading"></a></h1>
<p><strong>No AI, just classic Boolean search of your web-archives in a terminal.</strong></p>
<p>mcp-server-webcrawl can double as a terminal search for your web archives. You can run it against your local archives, but it gets more interesting when you realize you can ssh into any remote host and view archives sitting on that host. No downloads, syncs, multifactor logins, or other common drudgery required. With interactive mode, you can be in and searching a crawl sitting on a remote server in no time at all.</p>
<iframe width="560" height="315" style="display: block;margin-bottom:1rem;" src="https://www.youtube.com/embed/8kNkP-zNzs4" frameborder="0" allowfullscreen></iframe><section id="usage">
<h2>Usage<a class="headerlink" href="#usage" title="Link to this heading"></a></h2>
<p>Interactive mode exposes the mcp-server-webcrawl search layer as a terminal UI (TUI), bypassing MCP/AI altogether. Core field and Boolean search are supported, along with the human-friendly aspects of the search interface, such as result snippets.</p>
<p>You launch interactive mode from the terminal, using the --interactive command line argument.</p>
<div class="highlight-bash notranslate"><div class="highlight"><pre><span></span>mcp-server-webcrawl<span class="w"> </span>--crawler<span class="w"> </span>wget<span class="w"> </span>--datasrc<span class="w"> </span>/path/to/datasrc<span class="w"> </span>--interactive
<span class="c1"># or manually enter crawler and datasrc in the UI</span>
mcp-server-webcrawl<span class="w"> </span>--interactive
</pre></div>
</div>
</section>
<section id="screencaps">
<h2>Screencaps<a class="headerlink" href="#screencaps" title="Link to this heading"></a></h2>
<figure class="align-center" id="id1">
<a class="reference internal image-reference" href="_images/interactive.search.webp"><img alt="mcp-server-webcrawl in --interactive mode heading" src="_images/interactive.search.webp" style="width: 100%;" /></a>
<figcaption>
<p><span class="caption-text">Search view, showing snippets with “Solar Eclipse” highlights</span><a class="headerlink" href="#id1" title="Link to this image"></a></p>
</figcaption>
</figure>
<figure class="align-center" id="id2">
<a class="reference internal image-reference" href="_images/interactive.document.webp"><img alt="mcp-server-webcrawl in --interactive mode heading" src="_images/interactive.document.webp" style="width: 100%;" /></a>
<figcaption>
<p><span class="caption-text">Document presentated in in Markdown, with raw and HTTP headers views available.</span><a class="headerlink" href="#id2" title="Link to this image"></a></p>
</figcaption>
</figure>
</section>
</section>


           </div>
          </div>
          <footer><div class="rst-footer-buttons" role="navigation" aria-label="Footer">
        <a href="prompts.html" class="btn btn-neutral float-left" title="Prompt Routines" accesskey="p" rel="prev"><span class="fa fa-arrow-circle-left" aria-hidden="true"></span> Previous</a>
        <a href="modules.html" class="btn btn-neutral float-right" title="mcp_server_webcrawl" accesskey="n" rel="next">Next <span class="fa fa-arrow-circle-right" aria-hidden="true"></span></a>
    </div>

  <hr/>

  <div role="contentinfo">
    <p>&#169; Copyright 2025, pragmar.</p>
  </div>

  Built with <a href="https://www.sphinx-doc.org/">Sphinx</a> using a
    <a href="https://github.com/readthedocs/sphinx_rtd_theme">theme</a>
    provided by <a href="https://readthedocs.org">Read the Docs</a>.
   

</footer>
        </div>
      </div>
    </section>
  </div>
  <script>
      jQuery(function () {
          SphinxRtdTheme.Navigation.enable(true);
      });
  </script> 

</body>
</html>
```

--------------------------------------------------------------------------------
/prompts/audit404.md:
--------------------------------------------------------------------------------

```markdown
# Webcrawl 404 Audit Instructions

## Query Sequence

### 1. Identify Target Domain & Homepage

**FIRST:** Get available sites and let user choose:
```
webcrawl_sites() - get all available domains
```

**THEN:** Find homepage with sorted URL approach:
```
query: type: html AND url: [target_site_domain]
limit: 1
sites: [target_site_id]
sort: +url
```

**NEXT:** Extract the exact domain (e.g. `example.com`) from the homepage URL. You will use this domain string in all subsequent queries to filter results to on-site pages and, using Boolean logic (NOT), to extract "all other 404s" separately.

### 2. Get Segmented 404s

All on-site 404s:
```
query: status:404 AND url: example.com
limit: 100
sites: [target_site_id]
```

All off-site 404s (outlinks, generally):
```
query: status:404 AND NOT url: example.com
limit: 100
sites: [target_site_id]
```

Note the total count from results metadata to understand scale. **If 100+ errors**, run additional queries prioritizing onsite 404s with offset: 0, 100, 200, 300... until all are captured or you gather 400 total results. Ask the user for permission for more if you think it'd be helpful and there is an end in sight.

### 3. Group URLs by Domain/Subdomain Patterns
- Identify main domain vs subdomains (e.g., `example.com` vs `corp.example.com`)
- Check for legacy HTTP domains vs HTTPS
- Count occurrences of each domain type

### 4. Identify Structural Patterns
Look for these common failure types:

**Pagination Issues:**
- URLs containing `page=`, `search_page=`, `/p/`, `offset=`
- Others, you will know when you see them
- Usually indicates pagination system generating invalid page numbers

**API Endpoint Failures:**
- URLs with `/api/`, `/wp-json/`, `/rest/`, `/oembed/`
- Others, you will know when you see them
- Often configuration or authentication issues

**Legacy Infrastructure:**
- HTTP vs HTTPS mismatches
- Old directory structures no longer supported
- Retired subdomains or CDN endpoints

**Media/Asset Problems:**
- File extensions (.m4r, .pdf, .jpg, .mp4)
- `/multimedia/`, `/images/`, `/downloads/` paths
- Missing files from content migrations

**Content Management Issues:**
- Similar path structures suggesting bulk content moves
- Deleted pages without proper redirects
- URL structure changes without migration planning

### 5. Calculate Pattern Distribution
- Count URLs in each pattern category
- Calculate percentage of total 404s for each theme
- Identify the dominant failure mode (usually 50%+ of errors)

### 6. Offer Advanced Analysis or Tool Research

After completing the main audit report, offer the user two additional options:
- **Detailed Analysis:** More comprehensive investigation of specific 404 patterns or high-impact broken pages
- **Tool Research:** Research and recommend specific tools to address identified 404 problems and implement monitoring

## Pattern Analysis Method

## Reporting Template

### 📊 Summary Metrics

| Metric | Value | Grade Threshold |
|--------|-------|----------------|
| **Total 404s** | X out of Y pages | A: <0.5% \| B: 0.5-1% \| C: 1-2% \| D: 2-3% \| F: >3% |
| **Error Rate** | Z% | [Calculated Grade] |
| **Site Health** | [Assessment] | Based on error distribution |

### 🔍 Pattern Distribution Analysis

| Pattern Type | Count | % of Total | Priority | Root Cause | Recommended Fix |
|--------------|-------|------------|----------|------------|-----------------|
| [Pattern Name] | X | Y% | Critical/High/Medium/Low | [Technical explanation] | [Specific action] |
| [Pattern Name] | X | Y% | Critical/High/Medium/Low | [Technical explanation] | [Specific action] |
| [Pattern Name] | X | Y% | Critical/High/Medium/Low | [Technical explanation] | [Specific action] |

### 🔧 Technical Impact Assessment

| Domain/Subdomain | 404 Count | Error Type | Business Impact | Fix Complexity |
|------------------|-----------|------------|-----------------|----------------|
| [main_domain] | X | [Pattern] | [SEO/UX/Revenue] | [Simple/Complex] |
| [subdomain] | X | [Pattern] | [SEO/UX/Revenue] | [Simple/Complex] |
| [external] | X | [Pattern] | [SEO/UX/Revenue] | [Simple/Complex] |

### ⚡ Impact Priority Assessment

| Priority Level | Criteria | Example Issues |
|----------------|----------|----------------|
| **🚨 Critical** | Core functionality, revenue impact | Payment pages, login systems |
| **🔴 High** | Major SEO/UX degradation | Product pages, main navigation |
| **🟡 Medium** | Internal links, historical content | Blog archives, old campaigns |
| **🟢 Low** | Edge cases, rarely accessed | Test pages, admin tools |

### 🎯 Quick Win Opportunities

| Fix Type | Effort Level | Impact | Implementation Method |
|----------|--------------|--------|----------------------|
| **Simple redirects** | Low | High | 301 redirects for obvious replacements |
| **HTTPS upgrades** | Low | Medium | Automatic HTTP→HTTPS redirect rules |
| **Config fixes** | Medium | High | Server/CDN configuration updates |
| **Asset cleanup** | Medium | Medium | Remove/replace broken media references |

### 🛠️ Solution Stack Reference

#### Monitoring & Detection Tools

| Tool Category | Recommended Solution | Use Case | Integration Complexity |
|---------------|---------------------|----------|----------------------|
| **Search Monitoring** | Google Search Console | Track SERP 404s, set alerts | Simple |
| **Site Crawling** | Screaming Frog SEO Spider | Comprehensive link analysis | Medium |
| **Automated Monitoring** | Dead Link Checker, Pingdom | Ongoing 404 detection | Medium |
| **Log Analysis** | GoAccess, AWStats | Server-level 404 pattern analysis | Complex |

#### Redirect Management Options

| Platform | Tool | Strengths | Best For |
|----------|------|-----------|----------|
| **WordPress** | Redirection Plugin | User-friendly interface | Content sites |
| **CDN Level** | Cloudflare Page Rules | Global, cached redirects | High-traffic sites |
| **Server Level** | Nginx/Apache rewrites | Maximum performance | Technical teams |
| **Bulk Operations** | CSV redirect generators | Mass URL migrations | Large site moves |

## What's Next?

The audit results give you a clear picture of what you're dealing with - whether it's a few simple redirects, a pattern of broken external links, or something more complex like a pagination system gone wrong. Most 404 issues fall into predictable patterns that have standard solutions.

**Ready to dive deeper?** I can help you:
- **Create detailed fix strategies** - Let's prioritize your specific 404 patterns and map out exactly how to address them, including timeline recommendations and implementation approaches
- **Expand the analysis** - Examine more URLs, analyze referrer patterns to see how users find these broken links, or investigate when the breaks started happening
- **Research implementation tools** - Find the right redirect management, monitoring, or automated testing solutions that fit your technical stack and team workflow

**What would be most helpful for your next steps?**

## Methodology

You will review this web project from the perspective of an accomplished but patient web developer. You've seen it all over the years, and have reasonable expectations of quality. At the same time you have a fondness for the user wanting to improve the web at all. It's a noble pursuit that you can encourage without being overbearing. Nobody wants a scolding or patronizing AI. It's a fine line to walk, but you somehow manage it well. As these "reviews" can be hard to see, you will break news gently, but firmly when things are out of whack.

Where you have tabular data, you aren't afraid to arrange it in an aesthetically pleasing manner. You will prefer tables above unordered lists. Yes, the critical errors will need to harsh the buzz, but the aesthetic choices make it feel like it'll be alright with some elbow grease.
```
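
The grouping and percentage math described in steps 3-5 of the audit above is simple enough to sketch; this is an illustrative aid, not part of the prompt file, and the URLs are hypothetical:

```python
# Hedged sketch of the steps 3-5 arithmetic: group 404 URLs by host and
# compute each host's share of the total. Standard library only; URLs are made up.
from collections import Counter
from urllib.parse import urlparse

not_found_urls = [
    "https://example.com/blog/?page=12",
    "https://corp.example.com/api/v1/users",
    "http://example.com/downloads/old.pdf",
]

by_host = Counter(urlparse(url).netloc for url in not_found_urls)
total = sum(by_host.values())
for host, count in by_host.most_common():
    print(f"{host}: {count} ({count / total:.0%} of 404s)")
```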

--------------------------------------------------------------------------------
/docs/_modules/mcp_server_webcrawl/crawlers/siteone/crawler.html:
--------------------------------------------------------------------------------

```html


<!DOCTYPE html>
<html class="writer-html5" lang="en" data-content_root="../../../../">
<head>
  <meta charset="utf-8" />
  <meta name="viewport" content="width=device-width, initial-scale=1.0" />
  <title>mcp_server_webcrawl.crawlers.siteone.crawler &mdash; mcp-server-webcrawl  documentation</title>
      <link rel="stylesheet" type="text/css" href="../../../../_static/pygments.css?v=80d5e7a1" />
      <link rel="stylesheet" type="text/css" href="../../../../_static/css/theme.css?v=e59714d7" />

  
      <script src="../../../../_static/jquery.js?v=5d32c60e"></script>
      <script src="../../../../_static/_sphinx_javascript_frameworks_compat.js?v=2cd50e6c"></script>
      <script src="../../../../_static/documentation_options.js?v=5929fcd5"></script>
      <script src="../../../../_static/doctools.js?v=888ff710"></script>
      <script src="../../../../_static/sphinx_highlight.js?v=dc90522c"></script>
    <script src="../../../../_static/js/theme.js"></script>
    <link rel="index" title="Index" href="../../../../genindex.html" />
    <link rel="search" title="Search" href="../../../../search.html" /> 
</head>

<body class="wy-body-for-nav"> 
  <div class="wy-grid-for-nav">
    <nav data-toggle="wy-nav-shift" class="wy-nav-side">
      <div class="wy-side-scroll">
        <div class="wy-side-nav-search" >

          
          
          <a href="../../../../index.html" class="icon icon-home">
            mcp-server-webcrawl
          </a>
<div role="search">
  <form id="rtd-search-form" class="wy-form" action="../../../../search.html" method="get">
    <input type="text" name="q" placeholder="Search docs" aria-label="Search docs" />
    <input type="hidden" name="check_keywords" value="yes" />
    <input type="hidden" name="area" value="default" />
  </form>
</div>
        </div><div class="wy-menu wy-menu-vertical" data-spy="affix" role="navigation" aria-label="Navigation menu">
              <p class="caption" role="heading"><span class="caption-text">Contents:</span></p>
<ul>
<li class="toctree-l1"><a class="reference internal" href="../../../../installation.html">Installation</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../../guides.html">Setup Guides</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../../usage.html">Usage</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../../modules.html">mcp_server_webcrawl</a></li>
</ul>

        </div>
      </div>
    </nav>

    <section data-toggle="wy-nav-shift" class="wy-nav-content-wrap"><nav class="wy-nav-top" aria-label="Mobile navigation menu" >
          <i data-toggle="wy-nav-top" class="fa fa-bars"></i>
          <a href="../../../../index.html">mcp-server-webcrawl</a>
      </nav>

      <div class="wy-nav-content">
        <div class="rst-content">
          <div role="navigation" aria-label="Page navigation">
  <ul class="wy-breadcrumbs">
      <li><a href="../../../../index.html" class="icon icon-home" aria-label="Home"></a></li>
          <li class="breadcrumb-item"><a href="../../../index.html">Module code</a></li>
          <li class="breadcrumb-item"><a href="../../crawlers.html">mcp_server_webcrawl.crawlers</a></li>
      <li class="breadcrumb-item active">mcp_server_webcrawl.crawlers.siteone.crawler</li>
      <li class="wy-breadcrumbs-aside">
      </li>
  </ul>
  <hr/>
</div>
          <div role="main" class="document" itemscope="itemscope" itemtype="http://schema.org/Article">
           <div itemprop="articleBody">
             
  <h1>Source code for mcp_server_webcrawl.crawlers.siteone.crawler</h1><div class="highlight"><pre>
<span></span><span class="kn">from</span> <span class="nn">pathlib</span> <span class="kn">import</span> <span class="n">Path</span>

<span class="kn">from</span> <span class="nn">mcp_server_webcrawl.crawlers.base.indexed</span> <span class="kn">import</span> <span class="n">IndexedCrawler</span>
<span class="kn">from</span> <span class="nn">mcp_server_webcrawl.crawlers.siteone.adapter</span> <span class="kn">import</span> <span class="n">get_sites</span><span class="p">,</span> <span class="n">get_resources</span>
<span class="kn">from</span> <span class="nn">mcp_server_webcrawl.utils.logger</span> <span class="kn">import</span> <span class="n">get_logger</span>

<span class="n">logger</span> <span class="o">=</span> <span class="n">get_logger</span><span class="p">()</span>

<div class="viewcode-block" id="SiteOneCrawler">
<a class="viewcode-back" href="../../../../mcp_server_webcrawl.crawlers.siteone.html#mcp_server_webcrawl.crawlers.siteone.crawler.SiteOneCrawler">[docs]</a>
<span class="k">class</span> <span class="nc">SiteOneCrawler</span><span class="p">(</span><span class="n">IndexedCrawler</span><span class="p">):</span>
<span class="w">    </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd">    A crawler implementation for SiteOne captured sites.</span>
<span class="sd">    Provides functionality for accessing and searching web content from SiteOne captures.</span>
<span class="sd">    SiteOne merges a wget archive with a custom SiteOne generated log to aquire more</span>
<span class="sd">    fields than wget can alone.</span>
<span class="sd">    &quot;&quot;&quot;</span>

<div class="viewcode-block" id="SiteOneCrawler.__init__">
<a class="viewcode-back" href="../../../../mcp_server_webcrawl.crawlers.siteone.html#mcp_server_webcrawl.crawlers.siteone.crawler.SiteOneCrawler.__init__">[docs]</a>
    <span class="k">def</span> <span class="fm">__init__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">datasrc</span><span class="p">:</span> <span class="n">Path</span><span class="p">):</span>
<span class="w">        </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd">        Initialize the SiteOne crawler with a data source directory.</span>

<span class="sd">        Args:</span>
<span class="sd">            datasrc: The input argument as Path, it must be a directory containing</span>
<span class="sd">                SiteOne captures organized as subdirectories</span>

<span class="sd">        Raises:</span>
<span class="sd">            AssertionError: If datasrc is None or not a directory</span>
<span class="sd">        &quot;&quot;&quot;</span>
        <span class="k">assert</span> <span class="n">datasrc</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span><span class="p">,</span> <span class="sa">f</span><span class="s2">&quot;SiteOneCrawler needs a datasrc, regardless of action&quot;</span>
        <span class="k">assert</span> <span class="n">datasrc</span><span class="o">.</span><span class="n">is_dir</span><span class="p">(),</span> <span class="s2">&quot;SiteOneCrawler datasrc must be a directory&quot;</span>

        <span class="nb">super</span><span class="p">()</span><span class="o">.</span><span class="fm">__init__</span><span class="p">(</span><span class="n">datasrc</span><span class="p">,</span> <span class="n">get_sites</span><span class="p">,</span> <span class="n">get_resources</span><span class="p">)</span></div>
</div>

</pre></div>

           </div>
          </div>
          <footer>

  <hr/>

  <div role="contentinfo">
    <p>&#169; Copyright 2025, pragmar.</p>
  </div>

  Built with <a href="https://www.sphinx-doc.org/">Sphinx</a> using a
    <a href="https://github.com/readthedocs/sphinx_rtd_theme">theme</a>
    provided by <a href="https://readthedocs.org">Read the Docs</a>.
   

</footer>
        </div>
      </div>
    </section>
  </div>
  <script>
      jQuery(function () {
          SphinxRtdTheme.Navigation.enable(true);
      });
  </script> 

</body>
</html>
```

--------------------------------------------------------------------------------
/src/mcp_server_webcrawl/crawlers/wget/adapter.py:
--------------------------------------------------------------------------------

```python
import os
import sqlite3
import traceback
import re

from datetime import timezone
from contextlib import closing
from datetime import datetime
from pathlib import Path

from mcp_server_webcrawl.crawlers.base.adapter import (
    BaseManager,
    IndexState,
    IndexStatus,
    SitesGroup,
    INDEXED_BATCH_SIZE,
    INDEXED_RESOURCE_DEFAULT_PROTOCOL,
    INDEXED_TYPE_MAPPING,
    INDEXED_IGNORE_DIRECTORIES,
)
from mcp_server_webcrawl.crawlers.base.indexed import IndexedManager
from mcp_server_webcrawl.models.resources import (
    ResourceResult,
    ResourceResultType,
    RESOURCES_LIMIT_DEFAULT,
)
from mcp_server_webcrawl.models.sites import (
    SiteResult,
)
from mcp_server_webcrawl.utils.logger import get_logger

logger = get_logger()


class WgetManager(IndexedManager):
    """
    Manages wget directory data in in-memory SQLite databases.
    Provides connection pooling and caching for efficient access.
    """

    def __init__(self) -> None:
        """Initialize the wget manager with empty cache and statistics."""
        super().__init__()

    def _load_site_data(self, connection: sqlite3.Connection, directory: Path,
        site_id: int, index_state: IndexState = None) -> None:
        """
        Load a wget directory into the database with parallel processing and batch SQL insertions.

        Args:
            connection: SQLite connection
            directory: path to the wget directory
            site_id: id for the site
            index_state: IndexState object for tracking progress
        """
        if not directory.exists() or not directory.is_dir():
            logger.error(f"Directory not found or not a directory: {directory}")
            return

        if index_state is not None:
            index_state.set_status(IndexStatus.INDEXING)

        # collect files to process
        file_paths = []
        for root, _, files in os.walk(directory):
            for filename in files:
                if filename == "robots.txt":
                    continue

                rel_path = Path(root).relative_to(directory)
                ignore_file = False
                for ignore_dir in INDEXED_IGNORE_DIRECTORIES:
                    if ignore_dir in str(rel_path):
                        ignore_file = True
                        break

                if not ignore_file:
                    file_paths.append(Path(root) / filename)

        # each crawler is a little different
        with closing(connection.cursor()) as cursor:
            for i in range(0, len(file_paths), INDEXED_BATCH_SIZE):
                if index_state is not None and index_state.is_timeout():
                    index_state.set_status(IndexStatus.PARTIAL)
                    return

                batch_file_paths: list[Path] = file_paths[i:i+INDEXED_BATCH_SIZE]
                batch_file_contents = BaseManager.read_files(batch_file_paths)
                batch_insert_resource_results: list[ResourceResult] = []
                for file_path, content in batch_file_contents.items():
                    try:
                        result: ResourceResult = self._prepare_wget_record(file_path, site_id, directory, content)
                        if result:
                            batch_insert_resource_results.append(result)
                            if index_state is not None:
                                index_state.increment_processed()
                    except Exception as ex:
                        logger.error(f"Error processing file {file_path}: {ex}\n{traceback.format_exc()}")

                self._execute_batch_insert(connection, cursor, batch_insert_resource_results)

            if index_state is not None and index_state.status == IndexStatus.INDEXING:
                index_state.set_status(IndexStatus.COMPLETE)

    def _prepare_wget_record(self, file_path: Path, site_id: int, base_dir: Path, content: str = None) -> ResourceResult | None:
        """
        Prepare a record for batch insertion from a wget file.

        Args:
            file_path: path to the wget file
            site_id: id for the site
            base_dir: base directory for the wget capture
            content: optional pre-loaded file content

        Returns:
            ResourceResult ready for batch insertion, or None if processing fails
        """
        try:
            relative_path = file_path.relative_to(base_dir)
            url = f"{INDEXED_RESOURCE_DEFAULT_PROTOCOL}{base_dir.name}/{str(relative_path).replace(os.sep, '/')}"

            # wget is creating ./index.html from ./ in most cases. eliminate it to preserve homepage sort
            # which is way more important than the (wget manufactured) filename reference
            url = re.sub(r"/index\.html($|\?)", r"/\1", url)

            decruftified_path = BaseManager.decruft_path(str(file_path))
            extension = Path(decruftified_path).suffix.lower()
            resource_type = INDEXED_TYPE_MAPPING.get(extension, ResourceResultType.OTHER)
            file_stat = file_path.stat()
            file_size = file_stat.st_size
            file_created = datetime.fromtimestamp(file_stat.st_ctime, tz=timezone.utc)
            file_modified = datetime.fromtimestamp(file_stat.st_mtime, tz=timezone.utc)

            # use pre-loaded content if available, otherwise rely on read_file_contents
            file_content = content
            if file_content is None:
                file_content = BaseManager.read_file_contents(file_path, resource_type)

            return ResourceResult(
                id=BaseManager.string_to_id(url),
                site=site_id,
                created=file_created,
                modified=file_modified,
                url=url,
                type=resource_type,
                status=200,
                headers=BaseManager.get_basic_headers(file_size, resource_type, file_path),
                content=file_content,
                size=file_size,
                time=0,
            )
        except Exception as ex:
            logger.error(f"Error preparing record for file {file_path}: {ex}")
            return None


manager: WgetManager = WgetManager()

def get_sites(
        datasrc: Path,
        ids: list[int] | None = None,
        fields: list[str] | None = None
    ) -> list[SiteResult]:
    """
    List site directories in the datasrc directory as sites.

    Args:
        datasrc: path to the directory containing site subdirectories
        ids: optional list of site IDs to filter by
        fields: optional list of fields to include in the response

    Returns:
        List of SiteResult objects, one for each site directory

    Notes:
        Returns an empty list if the datasrc directory doesn't exist.
    """
    return manager.get_sites_for_directories(datasrc, ids, fields)

def get_resources(
    datasrc: Path,
    sites: list[int] | None = None,
    query: str = "",
    fields: list[str] | None = None,
    sort: str | None = None,
    limit: int = RESOURCES_LIMIT_DEFAULT,
    offset: int = 0,
) -> tuple[list[ResourceResult], int, IndexState]:
    """
    Get resources from wget directories using in-memory SQLite.

    Args:
        datasrc: path to the directory containing wget captures
        sites: optional list of site IDs to filter by
        query: search query string
        fields: optional list of fields to include in response
        sort: sort order for results
        limit: maximum number of results to return
        offset: number of results to skip for pagination

    Returns:
        Tuple of (list of ResourceResult objects, total count, index state)
    """
    sites_results: list[SiteResult] = get_sites(datasrc=datasrc, ids=sites)
    assert sites_results, "At least one site is required to search"
    site_paths = [site.path for site in sites_results]
    sites_group = SitesGroup(datasrc, sites, site_paths)
    return manager.get_resources_for_sites_group(sites_group, query, fields, sort, limit, offset)

```
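
A minimal usage sketch (not part of the repository) for the wget adapter's module-level API shown above; the `datasrc` path is an assumption, standing in for any directory that holds one wget-mirrored site per subdirectory:

```python
# Hypothetical usage of the wget adapter's get_sites/get_resources functions.
# The /path/to/wget/captures location is illustrative, not shipped with the repo.
from pathlib import Path

from mcp_server_webcrawl.crawlers.wget.adapter import get_sites, get_resources

datasrc = Path("/path/to/wget/captures")  # one subdirectory per mirrored site

# list site directories as SiteResult objects
sites = get_sites(datasrc)
for site in sites:
    print(site.id, site.path)

# search resources in the first site; returns (results, total count, index state).
# get_resources asserts at least one matching site, so sites must be non-empty.
results, total, index_state = get_resources(
    datasrc,
    sites=[sites[0].id],
    query="contact",
    limit=10,
)
for resource in results:
    print(resource.status, resource.url)
```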

--------------------------------------------------------------------------------
/docs/_modules/mcp_server_webcrawl/utils.html:
--------------------------------------------------------------------------------

```html


<!DOCTYPE html>
<html class="writer-html5" lang="en" data-content_root="../../">
<head>
  <meta charset="utf-8" />
  <meta name="viewport" content="width=device-width, initial-scale=1.0" />
  <title>mcp_server_webcrawl.utils &mdash; mcp-server-webcrawl  documentation</title>
      <link rel="stylesheet" type="text/css" href="../../_static/pygments.css?v=80d5e7a1" />
      <link rel="stylesheet" type="text/css" href="../../_static/css/theme.css?v=e59714d7" />

  
      <script src="../../_static/jquery.js?v=5d32c60e"></script>
      <script src="../../_static/_sphinx_javascript_frameworks_compat.js?v=2cd50e6c"></script>
      <script src="../../_static/documentation_options.js?v=5929fcd5"></script>
      <script src="../../_static/doctools.js?v=888ff710"></script>
      <script src="../../_static/sphinx_highlight.js?v=dc90522c"></script>
    <script src="../../_static/js/theme.js"></script>
    <link rel="index" title="Index" href="../../genindex.html" />
    <link rel="search" title="Search" href="../../search.html" /> 
</head>

<body class="wy-body-for-nav"> 
  <div class="wy-grid-for-nav">
    <nav data-toggle="wy-nav-shift" class="wy-nav-side">
      <div class="wy-side-scroll">
        <div class="wy-side-nav-search" >

          
          
          <a href="../../index.html" class="icon icon-home">
            mcp-server-webcrawl
          </a>
<div role="search">
  <form id="rtd-search-form" class="wy-form" action="../../search.html" method="get">
    <input type="text" name="q" placeholder="Search docs" aria-label="Search docs" />
    <input type="hidden" name="check_keywords" value="yes" />
    <input type="hidden" name="area" value="default" />
  </form>
</div>
        </div><div class="wy-menu wy-menu-vertical" data-spy="affix" role="navigation" aria-label="Navigation menu">
              <p class="caption" role="heading"><span class="caption-text">Contents:</span></p>
<ul>
<li class="toctree-l1"><a class="reference internal" href="../../installation.html">Installation</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../guides.html">Setup Guides</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../usage.html">Usage</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../modules.html">mcp_server_webcrawl</a></li>
</ul>

        </div>
      </div>
    </nav>

    <section data-toggle="wy-nav-shift" class="wy-nav-content-wrap"><nav class="wy-nav-top" aria-label="Mobile navigation menu" >
          <i data-toggle="wy-nav-top" class="fa fa-bars"></i>
          <a href="../../index.html">mcp-server-webcrawl</a>
      </nav>

      <div class="wy-nav-content">
        <div class="rst-content">
          <div role="navigation" aria-label="Page navigation">
  <ul class="wy-breadcrumbs">
      <li><a href="../../index.html" class="icon icon-home" aria-label="Home"></a></li>
          <li class="breadcrumb-item"><a href="../index.html">Module code</a></li>
      <li class="breadcrumb-item active">mcp_server_webcrawl.utils</li>
      <li class="wy-breadcrumbs-aside">
      </li>
  </ul>
  <hr/>
</div>
          <div role="main" class="document" itemscope="itemscope" itemtype="http://schema.org/Article">
           <div itemprop="articleBody">
             
  <h1>Source code for mcp_server_webcrawl.utils</h1><div class="highlight"><pre>
<span></span><span class="kn">import</span> <span class="nn">re</span>

<span class="kn">from</span> <span class="nn">datetime</span> <span class="kn">import</span> <span class="n">datetime</span>

<div class="viewcode-block" id="to_isoformat_zulu">
<a class="viewcode-back" href="../../mcp_server_webcrawl.utils.html#mcp_server_webcrawl.utils.to_isoformat_zulu">[docs]</a>
<span class="k">def</span> <span class="nf">to_isoformat_zulu</span><span class="p">(</span><span class="n">dt</span><span class="p">:</span> <span class="n">datetime</span><span class="p">):</span>
<span class="w">    </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd">    Convert datetime to iso Z.</span>

<span class="sd">    python&lt;=3.10 struggles with Z and fractions of seconds, will</span>
<span class="sd">    throw. smooth out the iso string, second precision isn&#39;t key here</span>
<span class="sd">    &quot;&quot;&quot;</span>
    <span class="k">return</span> <span class="n">dt</span><span class="o">.</span><span class="n">isoformat</span><span class="p">()</span><span class="o">.</span><span class="n">replace</span><span class="p">(</span><span class="s2">&quot;+00:00&quot;</span><span class="p">,</span> <span class="s2">&quot;Z&quot;</span><span class="p">)</span></div>


<div class="viewcode-block" id="from_isoformat_zulu">
<a class="viewcode-back" href="../../mcp_server_webcrawl.utils.html#mcp_server_webcrawl.utils.from_isoformat_zulu">[docs]</a>
<span class="k">def</span> <span class="nf">from_isoformat_zulu</span><span class="p">(</span><span class="n">dt_string</span><span class="p">:</span> <span class="nb">str</span> <span class="o">|</span> <span class="kc">None</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">datetime</span><span class="p">:</span>
<span class="w">    </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd">    Convert ISO string to datetime.</span>

<span class="sd">    python&lt;=3.10 struggles with Z and fractions of seconds, will</span>
<span class="sd">    throw. smooth out the iso string, second precision isn&#39;t key here</span>
<span class="sd">    &quot;&quot;&quot;</span>

    <span class="k">if</span> <span class="ow">not</span> <span class="n">dt_string</span><span class="p">:</span>
        <span class="k">return</span> <span class="kc">None</span>
    <span class="n">dt_string</span> <span class="o">=</span> <span class="n">dt_string</span><span class="o">.</span><span class="n">replace</span><span class="p">(</span><span class="s2">&quot;Z&quot;</span><span class="p">,</span> <span class="s2">&quot;+00:00&quot;</span><span class="p">)</span>
    <span class="n">match</span> <span class="o">=</span> <span class="n">re</span><span class="o">.</span><span class="n">match</span><span class="p">(</span><span class="sa">r</span><span class="s2">&quot;(.*\.\d</span><span class="si">{6}</span><span class="s2">)\d*([-+]\d</span><span class="si">{2}</span><span class="s2">:\d</span><span class="si">{2}</span><span class="s2">|$)&quot;</span><span class="p">,</span> <span class="n">dt_string</span><span class="p">)</span>
    <span class="k">if</span> <span class="n">match</span><span class="p">:</span>
        <span class="n">dt_string</span> <span class="o">=</span> <span class="n">match</span><span class="o">.</span><span class="n">group</span><span class="p">(</span><span class="mi">1</span><span class="p">)</span> <span class="o">+</span> <span class="p">(</span><span class="n">match</span><span class="o">.</span><span class="n">group</span><span class="p">(</span><span class="mi">2</span><span class="p">)</span> <span class="ow">or</span> <span class="s2">&quot;&quot;</span><span class="p">)</span>
    <span class="k">return</span> <span class="n">datetime</span><span class="o">.</span><span class="n">fromisoformat</span><span class="p">(</span><span class="n">dt_string</span><span class="p">)</span></div>

</pre></div>

           </div>
          </div>
          <footer>

  <hr/>

  <div role="contentinfo">
    <p>&#169; Copyright 2025, pragmar.</p>
  </div>

  Built with <a href="https://www.sphinx-doc.org/">Sphinx</a> using a
    <a href="https://github.com/readthedocs/sphinx_rtd_theme">theme</a>
    provided by <a href="https://readthedocs.org">Read the Docs</a>.
   

</footer>
        </div>
      </div>
    </section>
  </div>
  <script>
      jQuery(function () {
          SphinxRtdTheme.Navigation.enable(true);
      });
  </script> 

</body>
</html>
```
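
The rendered source above shows `to_isoformat_zulu` and `from_isoformat_zulu` smoothing Zulu-suffixed ISO strings so that Python <= 3.10's `fromisoformat` does not choke on `Z` or extra fractional digits. A small round-trip sketch using those helpers; the sample timestamps are illustrative only:

```python
# Round-trip sketch for the zulu helpers rendered above.
from datetime import datetime, timezone

from mcp_server_webcrawl.utils import to_isoformat_zulu, from_isoformat_zulu

dt = datetime(2025, 1, 2, 3, 4, 5, 123456, tzinfo=timezone.utc)

zulu = to_isoformat_zulu(dt)        # "2025-01-02T03:04:05.123456Z"
parsed = from_isoformat_zulu(zulu)  # back to an aware UTC datetime
assert parsed == dt

# fractional digits beyond microsecond precision are trimmed before parsing
assert from_isoformat_zulu("2025-01-02T03:04:05.1234567890Z") == dt
```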

--------------------------------------------------------------------------------
/docs/_modules/mcp_server_webcrawl/crawlers/httrack/crawler.html:
--------------------------------------------------------------------------------

```html


<!DOCTYPE html>
<html class="writer-html5" lang="en" data-content_root="../../../../">
<head>
  <meta charset="utf-8" />
  <meta name="viewport" content="width=device-width, initial-scale=1.0" />
  <title>mcp_server_webcrawl.crawlers.httrack.crawler &mdash; mcp-server-webcrawl  documentation</title>
      <link rel="stylesheet" type="text/css" href="../../../../_static/pygments.css?v=80d5e7a1" />
      <link rel="stylesheet" type="text/css" href="../../../../_static/css/theme.css?v=e59714d7" />

  
      <script src="../../../../_static/jquery.js?v=5d32c60e"></script>
      <script src="../../../../_static/_sphinx_javascript_frameworks_compat.js?v=2cd50e6c"></script>
      <script src="../../../../_static/documentation_options.js?v=5929fcd5"></script>
      <script src="../../../../_static/doctools.js?v=888ff710"></script>
      <script src="../../../../_static/sphinx_highlight.js?v=dc90522c"></script>
    <script src="../../../../_static/js/theme.js"></script>
    <link rel="index" title="Index" href="../../../../genindex.html" />
    <link rel="search" title="Search" href="../../../../search.html" /> 
</head>

<body class="wy-body-for-nav"> 
  <div class="wy-grid-for-nav">
    <nav data-toggle="wy-nav-shift" class="wy-nav-side">
      <div class="wy-side-scroll">
        <div class="wy-side-nav-search" >

          
          
          <a href="../../../../index.html" class="icon icon-home">
            mcp-server-webcrawl
          </a>
<div role="search">
  <form id="rtd-search-form" class="wy-form" action="../../../../search.html" method="get">
    <input type="text" name="q" placeholder="Search docs" aria-label="Search docs" />
    <input type="hidden" name="check_keywords" value="yes" />
    <input type="hidden" name="area" value="default" />
  </form>
</div>
        </div><div class="wy-menu wy-menu-vertical" data-spy="affix" role="navigation" aria-label="Navigation menu">
              <p class="caption" role="heading"><span class="caption-text">Contents:</span></p>
<ul>
<li class="toctree-l1"><a class="reference internal" href="../../../../installation.html">Installation</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../../guides.html">Setup Guides</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../../usage.html">Usage</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../../prompts.html">Prompt Routines</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../../modules.html">mcp_server_webcrawl</a></li>
</ul>

        </div>
      </div>
    </nav>

    <section data-toggle="wy-nav-shift" class="wy-nav-content-wrap"><nav class="wy-nav-top" aria-label="Mobile navigation menu" >
          <i data-toggle="wy-nav-top" class="fa fa-bars"></i>
          <a href="../../../../index.html">mcp-server-webcrawl</a>
      </nav>

      <div class="wy-nav-content">
        <div class="rst-content">
          <div role="navigation" aria-label="Page navigation">
  <ul class="wy-breadcrumbs">
      <li><a href="../../../../index.html" class="icon icon-home" aria-label="Home"></a></li>
          <li class="breadcrumb-item"><a href="../../../index.html">Module code</a></li>
          <li class="breadcrumb-item"><a href="../../crawlers.html">mcp_server_webcrawl.crawlers</a></li>
      <li class="breadcrumb-item active">mcp_server_webcrawl.crawlers.httrack.crawler</li>
      <li class="wy-breadcrumbs-aside">
      </li>
  </ul>
  <hr/>
</div>
          <div role="main" class="document" itemscope="itemscope" itemtype="http://schema.org/Article">
           <div itemprop="articleBody">
             
  <h1>Source code for mcp_server_webcrawl.crawlers.httrack.crawler</h1><div class="highlight"><pre>
<span></span><span class="kn">from</span> <span class="nn">pathlib</span> <span class="kn">import</span> <span class="n">Path</span>

<span class="kn">from</span> <span class="nn">mcp_server_webcrawl.crawlers.base.indexed</span> <span class="kn">import</span> <span class="n">IndexedCrawler</span>
<span class="kn">from</span> <span class="nn">mcp_server_webcrawl.crawlers.httrack.adapter</span> <span class="kn">import</span> <span class="n">get_sites</span><span class="p">,</span> <span class="n">get_resources</span>
<span class="kn">from</span> <span class="nn">mcp_server_webcrawl.utils.logger</span> <span class="kn">import</span> <span class="n">get_logger</span>

<span class="n">logger</span> <span class="o">=</span> <span class="n">get_logger</span><span class="p">()</span>

<div class="viewcode-block" id="HtTrackCrawler">
<a class="viewcode-back" href="../../../../mcp_server_webcrawl.crawlers.httrack.html#mcp_server_webcrawl.crawlers.httrack.crawler.HtTrackCrawler">[docs]</a>
<span class="k">class</span> <span class="nc">HtTrackCrawler</span><span class="p">(</span><span class="n">IndexedCrawler</span><span class="p">):</span>
<span class="w">    </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd">    A crawler implementation for HTTrack captured sites.</span>
<span class="sd">    Provides functionality for accessing and searching web content from HTTrack projects.</span>
<span class="sd">    HTTrack creates offline mirrors of websites with preserved directory structure</span>
<span class="sd">    and metadata in hts-log.txt files.</span>
<span class="sd">    &quot;&quot;&quot;</span>

<div class="viewcode-block" id="HtTrackCrawler.__init__">
<a class="viewcode-back" href="../../../../mcp_server_webcrawl.crawlers.httrack.html#mcp_server_webcrawl.crawlers.httrack.crawler.HtTrackCrawler.__init__">[docs]</a>
    <span class="k">def</span> <span class="fm">__init__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">datasrc</span><span class="p">:</span> <span class="n">Path</span><span class="p">):</span>
<span class="w">        </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd">        Initialize the HTTrack crawler with a data source directory.</span>

<span class="sd">        Args:</span>
<span class="sd">            datasrc: The input argument as Path, it must be a directory containing</span>
<span class="sd">                HTTrack project directories, each potentially containing multiple domains</span>

<span class="sd">        Raises:</span>
<span class="sd">            AssertionError: If datasrc is None or not a directory</span>
<span class="sd">        &quot;&quot;&quot;</span>
        <span class="k">assert</span> <span class="n">datasrc</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span><span class="p">,</span> <span class="sa">f</span><span class="s2">&quot;HtTrackCrawler needs a datasrc, regardless of action&quot;</span>
        <span class="k">assert</span> <span class="n">datasrc</span><span class="o">.</span><span class="n">is_dir</span><span class="p">(),</span> <span class="s2">&quot;HtTrackCrawler datasrc must be a directory&quot;</span>

        <span class="nb">super</span><span class="p">()</span><span class="o">.</span><span class="fm">__init__</span><span class="p">(</span><span class="n">datasrc</span><span class="p">,</span> <span class="n">get_sites</span><span class="p">,</span> <span class="n">get_resources</span><span class="p">)</span></div>
</div>

</pre></div>

           </div>
          </div>
          <footer>

  <hr/>

  <div role="contentinfo">
    <p>&#169; Copyright 2025, pragmar.</p>
  </div>

  Built with <a href="https://www.sphinx-doc.org/">Sphinx</a> using a
    <a href="https://github.com/readthedocs/sphinx_rtd_theme">theme</a>
    provided by <a href="https://readthedocs.org">Read the Docs</a>.
   

</footer>
        </div>
      </div>
    </section>
  </div>
  <script>
      jQuery(function () {
          SphinxRtdTheme.Navigation.enable(true);
      });
  </script> 

</body>
</html>
```
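
The `HtTrackCrawler` shown above only validates its `datasrc` and wires the HTTrack adapter's `get_sites`/`get_resources` into `IndexedCrawler`. A minimal instantiation sketch, assuming a hypothetical directory of HTTrack projects:

```python
# Hypothetical instantiation; /path/to/httrack/projects is an assumption.
from pathlib import Path

from mcp_server_webcrawl.crawlers.httrack.crawler import HtTrackCrawler

datasrc = Path("/path/to/httrack/projects")  # one HTTrack project per subdirectory
crawler = HtTrackCrawler(datasrc)            # asserts datasrc is an existing directory
```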

--------------------------------------------------------------------------------
/docs/prompts.html:
--------------------------------------------------------------------------------

```html


<!DOCTYPE html>
<html class="writer-html5" lang="en" data-content_root="./">
<head>
  <meta charset="utf-8" /><meta name="viewport" content="width=device-width, initial-scale=1" />

  <meta name="viewport" content="width=device-width, initial-scale=1.0" />
  <title>Prompt Routines &mdash; mcp-server-webcrawl  documentation</title>
      <link rel="stylesheet" type="text/css" href="_static/pygments.css?v=80d5e7a1" />
      <link rel="stylesheet" type="text/css" href="_static/css/theme.css?v=e59714d7" />

  
      <script src="_static/jquery.js?v=5d32c60e"></script>
      <script src="_static/_sphinx_javascript_frameworks_compat.js?v=2cd50e6c"></script>
      <script src="_static/documentation_options.js?v=5929fcd5"></script>
      <script src="_static/doctools.js?v=888ff710"></script>
      <script src="_static/sphinx_highlight.js?v=dc90522c"></script>
    <script src="_static/js/theme.js"></script>
    <link rel="index" title="Index" href="genindex.html" />
    <link rel="search" title="Search" href="search.html" />
    <link rel="next" title="mcp_server_webcrawl" href="modules.html" />
    <link rel="prev" title="Usage" href="usage.html" /> 
</head>

<body class="wy-body-for-nav"> 
  <div class="wy-grid-for-nav">
    <nav data-toggle="wy-nav-shift" class="wy-nav-side">
      <div class="wy-side-scroll">
        <div class="wy-side-nav-search" >

          
          
          <a href="index.html" class="icon icon-home">
            mcp-server-webcrawl
          </a>
<div role="search">
  <form id="rtd-search-form" class="wy-form" action="search.html" method="get">
    <input type="text" name="q" placeholder="Search docs" aria-label="Search docs" />
    <input type="hidden" name="check_keywords" value="yes" />
    <input type="hidden" name="area" value="default" />
  </form>
</div>
        </div><div class="wy-menu wy-menu-vertical" data-spy="affix" role="navigation" aria-label="Navigation menu">
              <p class="caption" role="heading"><span class="caption-text">Contents:</span></p>
<ul class="current">
<li class="toctree-l1"><a class="reference internal" href="installation.html">Installation</a></li>
<li class="toctree-l1"><a class="reference internal" href="guides.html">Setup Guides</a></li>
<li class="toctree-l1"><a class="reference internal" href="usage.html">Usage</a></li>
<li class="toctree-l1 current"><a class="current reference internal" href="#">Prompt Routines</a></li>
<li class="toctree-l1"><a class="reference internal" href="modules.html">mcp_server_webcrawl</a></li>
</ul>

        </div>
      </div>
    </nav>

    <section data-toggle="wy-nav-shift" class="wy-nav-content-wrap"><nav class="wy-nav-top" aria-label="Mobile navigation menu" >
          <i data-toggle="wy-nav-top" class="fa fa-bars"></i>
          <a href="index.html">mcp-server-webcrawl</a>
      </nav>

      <div class="wy-nav-content">
        <div class="rst-content">
          <div role="navigation" aria-label="Page navigation">
  <ul class="wy-breadcrumbs">
      <li><a href="index.html" class="icon icon-home" aria-label="Home"></a></li>
      <li class="breadcrumb-item active">Prompt Routines</li>
      <li class="wy-breadcrumbs-aside">
            <a href="_sources/prompts.rst.txt" rel="nofollow"> View page source</a>
      </li>
  </ul>
  <hr/>
</div>
          <div role="main" class="document" itemscope="itemscope" itemtype="http://schema.org/Article">
           <div itemprop="articleBody">
             
  <section id="prompt-routines">
<h1>Prompt Routines<a class="headerlink" href="#prompt-routines" title="Link to this heading"></a></h1>
<p><strong>mcp-server-webcrawl</strong> provides the toolkit necessary to search web crawl data freestyle, figuring it out as you go, reacting to each query. This is what it was designed for.</p>
<p>It is also capable of running routines (as prompts). You can write these yourself, or use the ones provided. These prompts are <strong>copy and paste</strong>, used as raw Markdown. They are enabled by the advanced search exposed to the LLM; queries and logic can be embedded in a procedural set of instructions, or even an input loop, as is the case with the Gopher interface.</p>
<p>To shortcut site selection (one less query), paste the Markdown and, in the same request, type “run pasted for [site name or URL].” It will figure it out. When pasted without additional context, you will be prompted to select a site (if no site is already in context).</p>
<table class="docutils align-default">
<thead>
<tr class="row-odd"><th class="head"><p>Prompt</p></th>
<th class="head"><p>Download</p></th>
<th class="head"><p>Category</p></th>
<th class="head"><p>Description</p></th>
</tr>
</thead>
<tbody>
<tr class="row-even"><td><p>🔍 <strong>SEO Audit</strong></p></td>
<td><p><a class="reference external" href="https://raw.githubusercontent.com/pragmar/mcp-server-webcrawl/master/prompts/auditseo.md">auditseo.md</a></p></td>
<td><p>audit</p></td>
<td><p>Technical SEO (search engine optimization) analysis. Covers the
basics, with options to dive deeper.</p></td>
</tr>
<tr class="row-odd"><td><p>🔗 <strong>404 Audit</strong></p></td>
<td><p><a class="reference external" href="https://raw.githubusercontent.com/pragmar/mcp-server-webcrawl/master/prompts/audit404.md">audit404.md</a></p></td>
<td><p>audit</p></td>
<td><p>Broken link detection and pattern analysis. Not only finds issues,
but suggests fixes.</p></td>
</tr>
<tr class="row-even"><td><p>⚡ <strong>Performance Audit</strong></p></td>
<td><p><a class="reference external" href="https://raw.githubusercontent.com/pragmar/mcp-server-webcrawl/master/prompts/auditperf.md">auditperf.md</a></p></td>
<td><p>audit</p></td>
<td><p>Website speed and optimization analysis. Real talk.</p></td>
</tr>
<tr class="row-odd"><td><p>📁 <strong>File Audit</strong></p></td>
<td><p><a class="reference external" href="https://raw.githubusercontent.com/pragmar/mcp-server-webcrawl/master/prompts/auditfiles.md">auditfiles.md</a></p></td>
<td><p>audit</p></td>
<td><p>File organization and asset analysis. Discover the composition of
your website.</p></td>
</tr>
<tr class="row-even"><td><p>🌐 <strong>Gopher Interface</strong></p></td>
<td><p><a class="reference external" href="https://raw.githubusercontent.com/pragmar/mcp-server-webcrawl/master/prompts/gopher.md">gopher.md</a></p></td>
<td><p>interface</p></td>
<td><p>An old-fashioned search interface inspired by the Gopher clients of
yesteryear.</p></td>
</tr>
<tr class="row-odd"><td><p>⚙️ <strong>Search Test</strong></p></td>
<td><p><a class="reference external" href="https://raw.githubusercontent.com/pragmar/mcp-server-webcrawl/master/prompts/testsearch.md">testsearch.md</a></p></td>
<td><p>self-test</p></td>
<td><p>A battery of tests to check for Boolean logical inconsistencies in
the search query parser and subsequent FTS5 conversion.</p></td>
</tr>
</tbody>
</table>
</section>


           </div>
          </div>
          <footer><div class="rst-footer-buttons" role="navigation" aria-label="Footer">
        <a href="usage.html" class="btn btn-neutral float-left" title="Usage" accesskey="p" rel="prev"><span class="fa fa-arrow-circle-left" aria-hidden="true"></span> Previous</a>
        <a href="modules.html" class="btn btn-neutral float-right" title="mcp_server_webcrawl" accesskey="n" rel="next">Next <span class="fa fa-arrow-circle-right" aria-hidden="true"></span></a>
    </div>

  <hr/>

  <div role="contentinfo">
    <p>&#169; Copyright 2025, pragmar.</p>
  </div>

  Built with <a href="https://www.sphinx-doc.org/">Sphinx</a> using a
    <a href="https://github.com/readthedocs/sphinx_rtd_theme">theme</a>
    provided by <a href="https://readthedocs.org">Read the Docs</a>.
   

</footer>
        </div>
      </div>
    </section>
  </div>
  <script>
      jQuery(function () {
          SphinxRtdTheme.Navigation.enable(true);
      });
  </script> 

</body>
</html>
```

--------------------------------------------------------------------------------
/docs/_modules/mcp_server_webcrawl/crawlers/archivebox/crawler.html:
--------------------------------------------------------------------------------

```html


<!DOCTYPE html>
<html class="writer-html5" lang="en" data-content_root="../../../../">
<head>
  <meta charset="utf-8" />
  <meta name="viewport" content="width=device-width, initial-scale=1.0" />
  <title>mcp_server_webcrawl.crawlers.archivebox.crawler &mdash; mcp-server-webcrawl  documentation</title>
      <link rel="stylesheet" type="text/css" href="../../../../_static/pygments.css?v=80d5e7a1" />
      <link rel="stylesheet" type="text/css" href="../../../../_static/css/theme.css?v=e59714d7" />

  
      <script src="../../../../_static/jquery.js?v=5d32c60e"></script>
      <script src="../../../../_static/_sphinx_javascript_frameworks_compat.js?v=2cd50e6c"></script>
      <script src="../../../../_static/documentation_options.js?v=5929fcd5"></script>
      <script src="../../../../_static/doctools.js?v=888ff710"></script>
      <script src="../../../../_static/sphinx_highlight.js?v=dc90522c"></script>
    <script src="../../../../_static/js/theme.js"></script>
    <link rel="index" title="Index" href="../../../../genindex.html" />
    <link rel="search" title="Search" href="../../../../search.html" /> 
</head>

<body class="wy-body-for-nav"> 
  <div class="wy-grid-for-nav">
    <nav data-toggle="wy-nav-shift" class="wy-nav-side">
      <div class="wy-side-scroll">
        <div class="wy-side-nav-search" >

          
          
          <a href="../../../../index.html" class="icon icon-home">
            mcp-server-webcrawl
          </a>
<div role="search">
  <form id="rtd-search-form" class="wy-form" action="../../../../search.html" method="get">
    <input type="text" name="q" placeholder="Search docs" aria-label="Search docs" />
    <input type="hidden" name="check_keywords" value="yes" />
    <input type="hidden" name="area" value="default" />
  </form>
</div>
        </div><div class="wy-menu wy-menu-vertical" data-spy="affix" role="navigation" aria-label="Navigation menu">
              <p class="caption" role="heading"><span class="caption-text">Contents:</span></p>
<ul>
<li class="toctree-l1"><a class="reference internal" href="../../../../installation.html">Installation</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../../guides.html">Setup Guides</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../../usage.html">Usage</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../../prompts.html">Prompt Routines</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../../modules.html">mcp_server_webcrawl</a></li>
</ul>

        </div>
      </div>
    </nav>

    <section data-toggle="wy-nav-shift" class="wy-nav-content-wrap"><nav class="wy-nav-top" aria-label="Mobile navigation menu" >
          <i data-toggle="wy-nav-top" class="fa fa-bars"></i>
          <a href="../../../../index.html">mcp-server-webcrawl</a>
      </nav>

      <div class="wy-nav-content">
        <div class="rst-content">
          <div role="navigation" aria-label="Page navigation">
  <ul class="wy-breadcrumbs">
      <li><a href="../../../../index.html" class="icon icon-home" aria-label="Home"></a></li>
          <li class="breadcrumb-item"><a href="../../../index.html">Module code</a></li>
          <li class="breadcrumb-item"><a href="../../crawlers.html">mcp_server_webcrawl.crawlers</a></li>
      <li class="breadcrumb-item active">mcp_server_webcrawl.crawlers.archivebox.crawler</li>
      <li class="wy-breadcrumbs-aside">
      </li>
  </ul>
  <hr/>
</div>
          <div role="main" class="document" itemscope="itemscope" itemtype="http://schema.org/Article">
           <div itemprop="articleBody">
             
  <h1>Source code for mcp_server_webcrawl.crawlers.archivebox.crawler</h1><div class="highlight"><pre>
<span></span><span class="kn">from</span> <span class="nn">pathlib</span> <span class="kn">import</span> <span class="n">Path</span>

<span class="kn">from</span> <span class="nn">mcp_server_webcrawl.crawlers.base.indexed</span> <span class="kn">import</span> <span class="n">IndexedCrawler</span>
<span class="kn">from</span> <span class="nn">mcp_server_webcrawl.crawlers.archivebox.adapter</span> <span class="kn">import</span> <span class="n">get_sites</span><span class="p">,</span> <span class="n">get_resources</span>
<span class="kn">from</span> <span class="nn">mcp_server_webcrawl.utils.logger</span> <span class="kn">import</span> <span class="n">get_logger</span>

<span class="n">logger</span> <span class="o">=</span> <span class="n">get_logger</span><span class="p">()</span>

<div class="viewcode-block" id="ArchiveBoxCrawler">
<a class="viewcode-back" href="../../../../mcp_server_webcrawl.crawlers.archivebox.html#mcp_server_webcrawl.crawlers.archivebox.crawler.ArchiveBoxCrawler">[docs]</a>
<span class="k">class</span> <span class="nc">ArchiveBoxCrawler</span><span class="p">(</span><span class="n">IndexedCrawler</span><span class="p">):</span>
<span class="w">    </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd">    A crawler implementation for ArchiveBox archived sites.</span>
<span class="sd">    Provides functionality for accessing and searching web content from ArchiveBox archives.</span>
<span class="sd">    ArchiveBox creates single-URL archives with metadata stored in JSON files</span>
<span class="sd">    and HTML content preserved in index.html files.</span>
<span class="sd">    &quot;&quot;&quot;</span>

<div class="viewcode-block" id="ArchiveBoxCrawler.__init__">
<a class="viewcode-back" href="../../../../mcp_server_webcrawl.crawlers.archivebox.html#mcp_server_webcrawl.crawlers.archivebox.crawler.ArchiveBoxCrawler.__init__">[docs]</a>
    <span class="k">def</span> <span class="fm">__init__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">datasrc</span><span class="p">:</span> <span class="n">Path</span><span class="p">):</span>
<span class="w">        </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd">        Initialize the ArchiveBox crawler with a data source directory.</span>

<span class="sd">        Args:</span>
<span class="sd">            datasrc: The input argument as Path, it must be a directory containing</span>
<span class="sd">                ArchiveBox archive directories, each containing individual URL entries</span>

<span class="sd">        Raises:</span>
<span class="sd">            AssertionError: If datasrc is None or not a directory</span>
<span class="sd">        &quot;&quot;&quot;</span>
        <span class="k">assert</span> <span class="n">datasrc</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span><span class="p">,</span> <span class="sa">f</span><span class="s2">&quot;ArchiveBoxCrawler needs a datasrc, regardless of action&quot;</span>
        <span class="k">assert</span> <span class="n">datasrc</span><span class="o">.</span><span class="n">is_dir</span><span class="p">(),</span> <span class="s2">&quot;ArchiveBoxCrawler datasrc must be a directory&quot;</span>

        <span class="nb">super</span><span class="p">()</span><span class="o">.</span><span class="fm">__init__</span><span class="p">(</span><span class="n">datasrc</span><span class="p">,</span> <span class="n">get_sites</span><span class="p">,</span> <span class="n">get_resources</span><span class="p">)</span></div>
</div>

</pre></div>

           </div>
          </div>
          <footer>

  <hr/>

  <div role="contentinfo">
    <p>&#169; Copyright 2025, pragmar.</p>
  </div>

  Built with <a href="https://www.sphinx-doc.org/">Sphinx</a> using a
    <a href="https://github.com/readthedocs/sphinx_rtd_theme">theme</a>
    provided by <a href="https://readthedocs.org">Read the Docs</a>.
   

</footer>
        </div>
      </div>
    </section>
  </div>
  <script>
      jQuery(function () {
          SphinxRtdTheme.Navigation.enable(true);
      });
  </script> 

</body>
</html>
```

--------------------------------------------------------------------------------
/src/mcp_server_webcrawl/crawlers/katana/adapter.py:
--------------------------------------------------------------------------------

```python
import re
import sqlite3

from itertools import chain
from contextlib import closing
from pathlib import Path

from datetime import datetime, timezone

from mcp_server_webcrawl.crawlers.base.adapter import (
    IndexState,
    IndexStatus,
    BaseManager,
    SitesGroup,
    INDEXED_BATCH_SIZE,
)
from mcp_server_webcrawl.crawlers.base.indexed import IndexedManager
from mcp_server_webcrawl.models.resources import (
    ResourceResult,
    ResourceResultType,
    RESOURCES_LIMIT_DEFAULT,
)
from mcp_server_webcrawl.models.sites import (
    SiteResult,
)
from mcp_server_webcrawl.utils.logger import get_logger

logger = get_logger()

KATANA_REGEX_HTTP_STATUS = re.compile(r"HTTP/\d\.\d\s+(\d+)")
KATANA_REGEX_CONTENT_TYPE = re.compile(r"Content-Type:\s*([^\r\n;]+)", re.IGNORECASE)

class KatanaManager(IndexedManager):
    """
    Manages HTTP text files in in-memory SQLite databases.
    Provides connection pooling and caching for efficient access.
    """

    def __init__(self) -> None:
        """Initialize the HTTP text manager with empty cache and statistics."""
        super().__init__()

    def _load_site_data(self, connection: sqlite3.Connection, directory: Path,
            site_id: int, index_state: IndexState = None) -> None:
        """
        Load a site directory of HTTP text files into the database with parallel reading
        and batch SQL insertions.

        Args:
            connection: SQLite connection
            directory: path to the site directory
            site_id: ID for the site
            index_state: tracker for FTS indexing status
        """

        if not directory.exists() or not directory.is_dir():
            logger.error(f"Directory not found or not a directory: {directory}")
            return

        if index_state is not None:
            index_state.set_status(IndexStatus.INDEXING)

        file_paths = list(chain(
            directory.glob("*.txt"),
            directory.glob("*/*.txt")  # katana stores offsite assets under hostname
        ))

        with closing(connection.cursor()) as cursor:
            for i in range(0, len(file_paths), INDEXED_BATCH_SIZE):
                if index_state is not None and index_state.is_timeout():
                    index_state.set_status(IndexStatus.PARTIAL)
                    return

                batch_file_paths: list[Path] = file_paths[i:i+INDEXED_BATCH_SIZE]
                batch_file_contents = BaseManager.read_files(batch_file_paths)
                batch_insert_resource_results: list[ResourceResult] = []
                for file_path, content in batch_file_contents.items():
                    # avoid readme in repo, katana crawl files should be named 9080ef8...
                    if file_path.name.lower().endswith("readme.txt"):
                        continue
                    try:
                        record = self._prepare_katana_record(file_path, site_id, content)
                        if record:
                            batch_insert_resource_results.append(record)
                            if index_state is not None:
                                index_state.increment_processed()
                    except Exception as ex:
                        logger.error(f"Error processing file {file_path}: {ex}")

                self._execute_batch_insert(connection, cursor, batch_insert_resource_results)

            if index_state is not None and index_state.status == IndexStatus.INDEXING:
                index_state.set_status(IndexStatus.COMPLETE)

    def _prepare_katana_record(self, file_path: Path, site_id: int, content: str) -> ResourceResult | None:
        """
        Prepare a record for batch insertion.

        Args:
            file_path: path to the Katana crawl file record
            site_id: ID for the site
            content: loaded file content

        Returns:
            ResourceResult object ready for insertion, or None if processing fails
        """
        if file_path.is_file():
            file_stat = file_path.stat()
            # the HTTP Last-Modified header is rarely reliable here; use filesystem timestamps
            file_created = datetime.fromtimestamp(file_stat.st_ctime, tz=timezone.utc)
            file_modified = datetime.fromtimestamp(file_stat.st_mtime, tz=timezone.utc)
        else:
            file_created = None
            file_modified = None

        # crawl format: <url>\n\n<request>\n\n<headers>...<response>
        parts: list[str] = content.split("\n\n", 2)
        if len(parts) < 3:
            logger.warning(f"Invalid HTTP text format in file {file_path}")
            return None

        url: str = parts[0].strip()
        response_data: str = parts[2].strip()

        try:
            response_parts: list[str] = response_data.split("\n\n", 1)
            headers: str = response_parts[0].strip()
            body: str = response_parts[1].strip() if len(response_parts) > 1 else ""

            if "Transfer-Encoding: chunked" in headers:
                body = body.split("\n", 1)[1].strip()   # remove hex prefix
                body = body.rsplit("\n0", 1)[0].strip() # remove trailing "0" terminator

            # status from the first line of headers
            status_match: re.Match | None = KATANA_REGEX_HTTP_STATUS.search(headers.split("\n", 2)[0])
            status_code: int = int(status_match.group(1)) if status_match else 0

            content_type_match = KATANA_REGEX_CONTENT_TYPE.search(headers)
            content_type = content_type_match.group(1).strip() if content_type_match else ""
            resource_type = self._determine_resource_type(content_type)
            content_size = len(body)
            resource_id = BaseManager.string_to_id(url)

            return ResourceResult(
                id=resource_id,
                site=site_id,
                created=file_created,
                modified=file_modified,
                url=url,
                type=resource_type,
                headers=headers,
                content=body if self._is_text_content(content_type) else None,
                status=status_code,
                size=content_size,
                time=0  # time not available in file or Katana index
            )

        except Exception as ex:
            logger.error(f"Error processing HTTP response in file {file_path}: {ex}")
            return None

manager: KatanaManager = KatanaManager()

def get_sites(
        datasrc: Path,
        ids: list[int] | None = None,
        fields: list[str] | None = None
    ) -> list[SiteResult]:
    """
    List site directories in the datasrc directory as sites.

    Args:
        datasrc: path to the directory containing site subdirectories
        ids: optional list of site IDs to filter by
        fields: optional list of fields to include in the response

    Returns:
        List of SiteResult objects, one for each site directory

    Notes:
        Returns an empty list if the datasrc directory doesn't exist.
    """
    return manager.get_sites_for_directories(datasrc, ids, fields)

def get_resources(
    datasrc: Path,
    ids: list[int] | None = None,
    sites: list[int] | None = None,
    query: str = "",
    types: list[ResourceResultType] | None = None,
    fields: list[str] | None = None,
    statuses: list[int] | None = None,
    sort: str | None = None,
    limit: int = RESOURCES_LIMIT_DEFAULT,
    offset: int = 0,
) -> tuple[list[ResourceResult], int, IndexState]:
    """
    Get resources from Katana crawl directories using in-memory SQLite.

    Args:
        datasrc: path to the directory containing Katana crawl captures
        ids: optional list of resource IDs to filter by
        sites: optional list of site IDs to filter by
        query: search query string
        types: optional list of resource types to filter by
        fields: optional list of fields to include in response
        statuses: optional list of HTTP status codes to filter by
        sort: sort order for results
        limit: maximum number of results to return
        offset: number of results to skip for pagination

    Returns:
        Tuple of (list of ResourceResult objects, total count, index state)
    """
    sites_results: list[SiteResult] = get_sites(datasrc=datasrc, ids=sites)
    assert sites_results, "At least one site is required to search"
    site_paths = [site.path for site in sites_results]
    sites_group = SitesGroup(datasrc, sites, site_paths)
    return manager.get_resources_for_sites_group(sites_group, query, fields, sort, limit, offset)

```
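
A standalone sketch of the record layout `_prepare_katana_record` expects (URL, request, and response separated by blank lines), using a synthetic crawl file body; the sample content is illustrative, not taken from a real Katana run:

```python
# Synthetic example of the <url>\n\n<request>\n\n<response> layout parsed above.
import re

sample = (
    "https://example.com/\n\n"
    "GET / HTTP/1.1\nHost: example.com\n\n"
    "HTTP/1.1 200 OK\nContent-Type: text/html; charset=utf-8\n\n"
    "<html><body>hello</body></html>"
)

# split into url / request / response, then response into headers / body
url, _request, response = sample.split("\n\n", 2)
headers, body = response.split("\n\n", 1)

# same regexes as the adapter: status from the first header line, type from Content-Type
status = int(re.search(r"HTTP/\d\.\d\s+(\d+)", headers.split("\n", 1)[0]).group(1))
content_type = re.search(r"Content-Type:\s*([^\r\n;]+)", headers, re.IGNORECASE).group(1).strip()

print(url, status, content_type)  # https://example.com/ 200 text/html
print(body)                       # <html><body>hello</body></html>
```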

--------------------------------------------------------------------------------
/docs/mcp_server_webcrawl.templates.html:
--------------------------------------------------------------------------------

```html


<!DOCTYPE html>
<html class="writer-html5" lang="en" data-content_root="./">
<head>
  <meta charset="utf-8" /><meta name="viewport" content="width=device-width, initial-scale=1" />

  <meta name="viewport" content="width=device-width, initial-scale=1.0" />
  <title>mcp_server_webcrawl.templates package &mdash; mcp-server-webcrawl  documentation</title>
      <link rel="stylesheet" type="text/css" href="_static/pygments.css?v=80d5e7a1" />
      <link rel="stylesheet" type="text/css" href="_static/css/theme.css?v=e59714d7" />

  
      <script src="_static/jquery.js?v=5d32c60e"></script>
      <script src="_static/_sphinx_javascript_frameworks_compat.js?v=2cd50e6c"></script>
      <script src="_static/documentation_options.js?v=5929fcd5"></script>
      <script src="_static/doctools.js?v=888ff710"></script>
      <script src="_static/sphinx_highlight.js?v=dc90522c"></script>
    <script src="_static/js/theme.js"></script>
    <link rel="index" title="Index" href="genindex.html" />
    <link rel="search" title="Search" href="search.html" />
    <link rel="next" title="mcp_server_webcrawl.utils package" href="mcp_server_webcrawl.utils.html" />
    <link rel="prev" title="mcp_server_webcrawl.models package" href="mcp_server_webcrawl.models.html" /> 
</head>

<body class="wy-body-for-nav"> 
  <div class="wy-grid-for-nav">
    <nav data-toggle="wy-nav-shift" class="wy-nav-side">
      <div class="wy-side-scroll">
        <div class="wy-side-nav-search" >

          
          
          <a href="index.html" class="icon icon-home">
            mcp-server-webcrawl
          </a>
<div role="search">
  <form id="rtd-search-form" class="wy-form" action="search.html" method="get">
    <input type="text" name="q" placeholder="Search docs" aria-label="Search docs" />
    <input type="hidden" name="check_keywords" value="yes" />
    <input type="hidden" name="area" value="default" />
  </form>
</div>
        </div><div class="wy-menu wy-menu-vertical" data-spy="affix" role="navigation" aria-label="Navigation menu">
              <p class="caption" role="heading"><span class="caption-text">Contents:</span></p>
<ul class="current">
<li class="toctree-l1"><a class="reference internal" href="installation.html">Installation</a></li>
<li class="toctree-l1"><a class="reference internal" href="guides.html">Setup Guides</a></li>
<li class="toctree-l1"><a class="reference internal" href="usage.html">Usage</a></li>
<li class="toctree-l1"><a class="reference internal" href="prompts.html">Prompt Routines</a></li>
<li class="toctree-l1"><a class="reference internal" href="interactive.html">Interactive Mode</a></li>
<li class="toctree-l1 current"><a class="reference internal" href="modules.html">mcp_server_webcrawl</a><ul class="current">
<li class="toctree-l2 current"><a class="reference internal" href="mcp_server_webcrawl.html">mcp_server_webcrawl package</a></li>
</ul>
</li>
</ul>

        </div>
      </div>
    </nav>

    <section data-toggle="wy-nav-shift" class="wy-nav-content-wrap"><nav class="wy-nav-top" aria-label="Mobile navigation menu" >
          <i data-toggle="wy-nav-top" class="fa fa-bars"></i>
          <a href="index.html">mcp-server-webcrawl</a>
      </nav>

      <div class="wy-nav-content">
        <div class="rst-content">
          <div role="navigation" aria-label="Page navigation">
  <ul class="wy-breadcrumbs">
      <li><a href="index.html" class="icon icon-home" aria-label="Home"></a></li>
          <li class="breadcrumb-item"><a href="modules.html">mcp_server_webcrawl</a></li>
          <li class="breadcrumb-item"><a href="mcp_server_webcrawl.html">mcp_server_webcrawl package</a></li>
      <li class="breadcrumb-item active">mcp_server_webcrawl.templates package</li>
      <li class="wy-breadcrumbs-aside">
            <a href="_sources/mcp_server_webcrawl.templates.rst.txt" rel="nofollow"> View page source</a>
      </li>
  </ul>
  <hr/>
</div>
          <div role="main" class="document" itemscope="itemscope" itemtype="http://schema.org/Article">
           <div itemprop="articleBody">
             
  <section id="mcp-server-webcrawl-templates-package">
<h1>mcp_server_webcrawl.templates package<a class="headerlink" href="#mcp-server-webcrawl-templates-package" title="Link to this heading"></a></h1>
<section id="submodules">
<h2>Submodules<a class="headerlink" href="#submodules" title="Link to this heading"></a></h2>
</section>
<section id="module-mcp_server_webcrawl.templates.tests">
<span id="mcp-server-webcrawl-templates-tests-module"></span><h2>mcp_server_webcrawl.templates.tests module<a class="headerlink" href="#module-mcp_server_webcrawl.templates.tests" title="Link to this heading"></a></h2>
<dl class="py class">
<dt class="sig sig-object py" id="mcp_server_webcrawl.templates.tests.TemplateTests">
<em class="property"><span class="pre">class</span><span class="w"> </span></em><span class="sig-name descname"><span class="pre">TemplateTests</span></span><a class="reference internal" href="_modules/mcp_server_webcrawl/templates/tests.html#TemplateTests"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#mcp_server_webcrawl.templates.tests.TemplateTests" title="Link to this definition"></a></dt>
<dd><p>Bases: <code class="xref py py-class docutils literal notranslate"><span class="pre">TestCase</span></code></p>
<p>Test suite for the custom HTML to markdown converter.
Why custom? It’s a bit faster, that is the only reason.
Maximum load is 100 transforms (1 per result for a max result
of 100), so speed matters. The default result set is 20.
This converter does a few things differently to tailor to LLM
interaction.
* aggressively removes images (html2text selectively renders)
* links with block descendants will render like a &lt;p&gt;</p>
<blockquote>
<div><p>(html2text treats as &lt;a&gt;&lt;br&gt;)</p>
</div></blockquote>
<p>Create an instance of the class that will use the named test
method when executed. Raises a ValueError if the instance does
not have a method with the specified name.</p>
<dl class="py method">
<dt class="sig sig-object py" id="mcp_server_webcrawl.templates.tests.TemplateTests.setUp">
<span class="sig-name descname"><span class="pre">setUp</span></span><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="reference internal" href="_modules/mcp_server_webcrawl/templates/tests.html#TemplateTests.setUp"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#mcp_server_webcrawl.templates.tests.TemplateTests.setUp" title="Link to this definition"></a></dt>
<dd><p>Set up the test environment with fixture data.</p>
</dd></dl>

<dl class="py method">
<dt class="sig sig-object py" id="mcp_server_webcrawl.templates.tests.TemplateTests.test_core_html">
<span class="sig-name descname"><span class="pre">test_core_html</span></span><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="reference internal" href="_modules/mcp_server_webcrawl/templates/tests.html#TemplateTests.test_core_html"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#mcp_server_webcrawl.templates.tests.TemplateTests.test_core_html" title="Link to this definition"></a></dt>
<dd></dd></dl>

</dd></dl>

</section>
<section id="module-mcp_server_webcrawl.templates">
<span id="module-contents"></span><h2>Module contents<a class="headerlink" href="#module-mcp_server_webcrawl.templates" title="Link to this heading"></a></h2>
</section>
</section>


           </div>
          </div>
          <footer><div class="rst-footer-buttons" role="navigation" aria-label="Footer">
        <a href="mcp_server_webcrawl.models.html" class="btn btn-neutral float-left" title="mcp_server_webcrawl.models package" accesskey="p" rel="prev"><span class="fa fa-arrow-circle-left" aria-hidden="true"></span> Previous</a>
        <a href="mcp_server_webcrawl.utils.html" class="btn btn-neutral float-right" title="mcp_server_webcrawl.utils package" accesskey="n" rel="next">Next <span class="fa fa-arrow-circle-right" aria-hidden="true"></span></a>
    </div>

  <hr/>

  <div role="contentinfo">
    <p>&#169; Copyright 2025, pragmar.</p>
  </div>

  Built with <a href="https://www.sphinx-doc.org/">Sphinx</a> using a
    <a href="https://github.com/readthedocs/sphinx_rtd_theme">theme</a>
    provided by <a href="https://readthedocs.org">Read the Docs</a>.
   

</footer>
        </div>
      </div>
    </section>
  </div>
  <script>
      jQuery(function () {
          SphinxRtdTheme.Navigation.enable(true);
      });
  </script> 

</body>
</html>
```

--------------------------------------------------------------------------------
/docs/_modules/index.html:
--------------------------------------------------------------------------------

```html


<!DOCTYPE html>
<html class="writer-html5" lang="en" data-content_root="../">
<head>
  <meta charset="utf-8" />
  <meta name="viewport" content="width=device-width, initial-scale=1.0" />
  <title>Overview: module code &mdash; mcp-server-webcrawl  documentation</title>
      <link rel="stylesheet" type="text/css" href="../_static/pygments.css?v=80d5e7a1" />
      <link rel="stylesheet" type="text/css" href="../_static/css/theme.css?v=e59714d7" />

  
      <script src="../_static/jquery.js?v=5d32c60e"></script>
      <script src="../_static/_sphinx_javascript_frameworks_compat.js?v=2cd50e6c"></script>
      <script src="../_static/documentation_options.js?v=5929fcd5"></script>
      <script src="../_static/doctools.js?v=888ff710"></script>
      <script src="../_static/sphinx_highlight.js?v=dc90522c"></script>
    <script src="../_static/js/theme.js"></script>
    <link rel="index" title="Index" href="../genindex.html" />
    <link rel="search" title="Search" href="../search.html" /> 
</head>

<body class="wy-body-for-nav"> 
  <div class="wy-grid-for-nav">
    <nav data-toggle="wy-nav-shift" class="wy-nav-side">
      <div class="wy-side-scroll">
        <div class="wy-side-nav-search" >

          
          
          <a href="../index.html" class="icon icon-home">
            mcp-server-webcrawl
          </a>
<div role="search">
  <form id="rtd-search-form" class="wy-form" action="../search.html" method="get">
    <input type="text" name="q" placeholder="Search docs" aria-label="Search docs" />
    <input type="hidden" name="check_keywords" value="yes" />
    <input type="hidden" name="area" value="default" />
  </form>
</div>
        </div><div class="wy-menu wy-menu-vertical" data-spy="affix" role="navigation" aria-label="Navigation menu">
              <p class="caption" role="heading"><span class="caption-text">Contents:</span></p>
<ul>
<li class="toctree-l1"><a class="reference internal" href="../installation.html">Installation</a></li>
<li class="toctree-l1"><a class="reference internal" href="../guides.html">Setup Guides</a></li>
<li class="toctree-l1"><a class="reference internal" href="../usage.html">Usage</a></li>
<li class="toctree-l1"><a class="reference internal" href="../prompts.html">Prompt Routines</a></li>
<li class="toctree-l1"><a class="reference internal" href="../interactive.html">Interactive Mode</a></li>
<li class="toctree-l1"><a class="reference internal" href="../modules.html">mcp_server_webcrawl</a></li>
</ul>

        </div>
      </div>
    </nav>

    <section data-toggle="wy-nav-shift" class="wy-nav-content-wrap"><nav class="wy-nav-top" aria-label="Mobile navigation menu" >
          <i data-toggle="wy-nav-top" class="fa fa-bars"></i>
          <a href="../index.html">mcp-server-webcrawl</a>
      </nav>

      <div class="wy-nav-content">
        <div class="rst-content">
          <div role="navigation" aria-label="Page navigation">
  <ul class="wy-breadcrumbs">
      <li><a href="../index.html" class="icon icon-home" aria-label="Home"></a></li>
      <li class="breadcrumb-item active">Overview: module code</li>
      <li class="wy-breadcrumbs-aside">
      </li>
  </ul>
  <hr/>
</div>
          <div role="main" class="document" itemscope="itemscope" itemtype="http://schema.org/Article">
           <div itemprop="articleBody">
             
  <h1>All modules for which code is available</h1>
<ul><li><a href="mcp_server_webcrawl/crawlers.html">mcp_server_webcrawl.crawlers</a></li>
<ul><li><a href="mcp_server_webcrawl/crawlers/archivebox/adapter.html">mcp_server_webcrawl.crawlers.archivebox.adapter</a></li>
<li><a href="mcp_server_webcrawl/crawlers/archivebox/crawler.html">mcp_server_webcrawl.crawlers.archivebox.crawler</a></li>
<li><a href="mcp_server_webcrawl/crawlers/archivebox/tests.html">mcp_server_webcrawl.crawlers.archivebox.tests</a></li>
<li><a href="mcp_server_webcrawl/crawlers/base/adapter.html">mcp_server_webcrawl.crawlers.base.adapter</a></li>
<li><a href="mcp_server_webcrawl/crawlers/base/api.html">mcp_server_webcrawl.crawlers.base.api</a></li>
<li><a href="mcp_server_webcrawl/crawlers/base/crawler.html">mcp_server_webcrawl.crawlers.base.crawler</a></li>
<li><a href="mcp_server_webcrawl/crawlers/base/indexed.html">mcp_server_webcrawl.crawlers.base.indexed</a></li>
<li><a href="mcp_server_webcrawl/crawlers/base/tests.html">mcp_server_webcrawl.crawlers.base.tests</a></li>
<li><a href="mcp_server_webcrawl/crawlers/httrack/adapter.html">mcp_server_webcrawl.crawlers.httrack.adapter</a></li>
<li><a href="mcp_server_webcrawl/crawlers/httrack/crawler.html">mcp_server_webcrawl.crawlers.httrack.crawler</a></li>
<li><a href="mcp_server_webcrawl/crawlers/httrack/tests.html">mcp_server_webcrawl.crawlers.httrack.tests</a></li>
<li><a href="mcp_server_webcrawl/crawlers/interrobot/adapter.html">mcp_server_webcrawl.crawlers.interrobot.adapter</a></li>
<li><a href="mcp_server_webcrawl/crawlers/interrobot/crawler.html">mcp_server_webcrawl.crawlers.interrobot.crawler</a></li>
<li><a href="mcp_server_webcrawl/crawlers/interrobot/tests.html">mcp_server_webcrawl.crawlers.interrobot.tests</a></li>
<li><a href="mcp_server_webcrawl/crawlers/katana/adapter.html">mcp_server_webcrawl.crawlers.katana.adapter</a></li>
<li><a href="mcp_server_webcrawl/crawlers/katana/crawler.html">mcp_server_webcrawl.crawlers.katana.crawler</a></li>
<li><a href="mcp_server_webcrawl/crawlers/katana/tests.html">mcp_server_webcrawl.crawlers.katana.tests</a></li>
<li><a href="mcp_server_webcrawl/crawlers/siteone/adapter.html">mcp_server_webcrawl.crawlers.siteone.adapter</a></li>
<li><a href="mcp_server_webcrawl/crawlers/siteone/crawler.html">mcp_server_webcrawl.crawlers.siteone.crawler</a></li>
<li><a href="mcp_server_webcrawl/crawlers/siteone/tests.html">mcp_server_webcrawl.crawlers.siteone.tests</a></li>
<li><a href="mcp_server_webcrawl/crawlers/warc/adapter.html">mcp_server_webcrawl.crawlers.warc.adapter</a></li>
<li><a href="mcp_server_webcrawl/crawlers/warc/crawler.html">mcp_server_webcrawl.crawlers.warc.crawler</a></li>
<li><a href="mcp_server_webcrawl/crawlers/warc/tests.html">mcp_server_webcrawl.crawlers.warc.tests</a></li>
<li><a href="mcp_server_webcrawl/crawlers/wget/adapter.html">mcp_server_webcrawl.crawlers.wget.adapter</a></li>
<li><a href="mcp_server_webcrawl/crawlers/wget/crawler.html">mcp_server_webcrawl.crawlers.wget.crawler</a></li>
<li><a href="mcp_server_webcrawl/crawlers/wget/tests.html">mcp_server_webcrawl.crawlers.wget.tests</a></li>
</ul><li><a href="mcp_server_webcrawl/extras/markdown.html">mcp_server_webcrawl.extras.markdown</a></li>
<li><a href="mcp_server_webcrawl/extras/regex.html">mcp_server_webcrawl.extras.regex</a></li>
<li><a href="mcp_server_webcrawl/extras/snippets.html">mcp_server_webcrawl.extras.snippets</a></li>
<li><a href="mcp_server_webcrawl/extras/thumbnails.html">mcp_server_webcrawl.extras.thumbnails</a></li>
<li><a href="mcp_server_webcrawl/extras/xpath.html">mcp_server_webcrawl.extras.xpath</a></li>
<li><a href="mcp_server_webcrawl/interactive/highlights.html">mcp_server_webcrawl.interactive.highlights</a></li>
<li><a href="mcp_server_webcrawl/interactive/search.html">mcp_server_webcrawl.interactive.search</a></li>
<li><a href="mcp_server_webcrawl/interactive/session.html">mcp_server_webcrawl.interactive.session</a></li>
<li><a href="mcp_server_webcrawl/interactive/ui.html">mcp_server_webcrawl.interactive.ui</a></li>
<li><a href="mcp_server_webcrawl/main.html">mcp_server_webcrawl.main</a></li>
<li><a href="mcp_server_webcrawl/models/resources.html">mcp_server_webcrawl.models.resources</a></li>
<li><a href="mcp_server_webcrawl/models/sites.html">mcp_server_webcrawl.models.sites</a></li>
<li><a href="mcp_server_webcrawl/templates/tests.html">mcp_server_webcrawl.templates.tests</a></li>
<li><a href="mcp_server_webcrawl/utils.html">mcp_server_webcrawl.utils</a></li>
<ul><li><a href="mcp_server_webcrawl/utils/cli.html">mcp_server_webcrawl.utils.cli</a></li>
<li><a href="mcp_server_webcrawl/utils/logger.html">mcp_server_webcrawl.utils.logger</a></li>
<li><a href="mcp_server_webcrawl/utils/server.html">mcp_server_webcrawl.utils.server</a></li>
<li><a href="mcp_server_webcrawl/utils/tools.html">mcp_server_webcrawl.utils.tools</a></li>
</ul><li><a href="namedtuple_InputRadioState.html">namedtuple_InputRadioState</a></li>
</ul>

           </div>
          </div>
          <footer>

  <hr/>

  <div role="contentinfo">
    <p>&#169; Copyright 2025, pragmar.</p>
  </div>

  Built with <a href="https://www.sphinx-doc.org/">Sphinx</a> using a
    <a href="https://github.com/readthedocs/sphinx_rtd_theme">theme</a>
    provided by <a href="https://readthedocs.org">Read the Docs</a>.
   

</footer>
        </div>
      </div>
    </section>
  </div>
  <script>
      jQuery(function () {
          SphinxRtdTheme.Navigation.enable(true);
      });
  </script> 

</body>
</html>
```

--------------------------------------------------------------------------------
/src/mcp_server_webcrawl/crawlers/httrack/tests.py:
--------------------------------------------------------------------------------

```python
from mcp_server_webcrawl.crawlers.httrack.crawler import HtTrackCrawler
from mcp_server_webcrawl.crawlers.httrack.adapter import HtTrackManager
from mcp_server_webcrawl.crawlers.base.tests import BaseCrawlerTests
from mcp_server_webcrawl.crawlers import get_fixture_directory
from mcp_server_webcrawl.utils.logger import get_logger

logger = get_logger()

# Compute fixture site IDs using the same hash function as the adapter
EXAMPLE_SITE_ID = HtTrackManager.string_to_id("example")
PRAGMAR_SITE_ID = HtTrackManager.string_to_id("pragmar")

class HtTrackTests(BaseCrawlerTests):
    """
    Test suite for the HTTrack crawler implementation.
    Uses all wrapped test methods from BaseCrawlerTests plus HTTrack-specific features.
    """

    def setUp(self):
        """
        Set up the test environment with fixture data.
        """
        super().setUp()
        self._datasrc = get_fixture_directory() / "httrack"

    def test_httrack_pulse(self):
        """
        Test basic crawler initialization.
        """
        crawler = HtTrackCrawler(self._datasrc)
        self.assertIsNotNone(crawler)
        self.assertTrue(self._datasrc.is_dir())

    def test_httrack_sites(self):
        """
        Test site retrieval API functionality.
        """
        crawler = HtTrackCrawler(self._datasrc)
        self.run_pragmar_site_tests(crawler, PRAGMAR_SITE_ID)

    def test_httrack_search(self):
        """
        Test boolean search functionality.
        """
        crawler = HtTrackCrawler(self._datasrc)
        self.run_pragmar_search_tests(crawler, PRAGMAR_SITE_ID)

    def test_httrack_resources(self):
        """
        Test resource retrieval API functionality with various arguments.
        """
        crawler = HtTrackCrawler(self._datasrc)
        self.run_sites_resources_tests(crawler, PRAGMAR_SITE_ID, EXAMPLE_SITE_ID)

    def test_httrack_images(self):
        """
        Test HTTrack image handling and thumbnails.
        """
        crawler = HtTrackCrawler(self._datasrc)
        self.run_pragmar_image_tests(crawler, PRAGMAR_SITE_ID)

    def test_httrack_sorts(self):
        """
        Test random sort functionality using the sort argument.
        """
        crawler = HtTrackCrawler(self._datasrc)
        self.run_pragmar_sort_tests(crawler, PRAGMAR_SITE_ID)

    def test_httrack_content_parsing(self):
        """
        Test content type detection and parsing.
        """
        crawler = HtTrackCrawler(self._datasrc)
        self.run_pragmar_content_tests(crawler, PRAGMAR_SITE_ID, False)

    def test_httrack_tokenizer(self):
        """
        Test HTTrack-specific tokenizer functionality for hyphenated terms.
        """
        crawler = HtTrackCrawler(self._datasrc)
        self.run_pragmar_tokenizer_tests(crawler, PRAGMAR_SITE_ID)

    def test_httrack_log_parsing_features(self):
        """
        Test HTTrack-specific features related to hts-log.txt parsing.
        """
        crawler = HtTrackCrawler(self._datasrc)

        # Test that 404 errors from log are properly indexed
        error_resources = crawler.get_resources_api(
            sites=[PRAGMAR_SITE_ID],
            query="status: 404"
        )
        if error_resources.total > 0:
            for resource in error_resources._results:
                self.assertEqual(resource.status, 404, "404 status should be preserved from log parsing")

        # Test that redirects are properly indexed
        redirect_resources = crawler.get_resources_api(
            sites=[PRAGMAR_SITE_ID],
            query="status: 302"
        )
        if redirect_resources.total > 0:
            for resource in redirect_resources._results:
                self.assertEqual(resource.status, 302, "Redirect status should be detected from log")

        # Test successful resources default to 200
        success_resources = crawler.get_resources_api(
            sites=[PRAGMAR_SITE_ID],
            query="status: 200",
            limit=5
        )
        self.assertTrue(success_resources.total > 0, "Should have successful resources with status 200")
        for resource in success_resources._results:
            self.assertEqual(resource.status, 200)

    def test_httrack_url_reconstruction(self):
        """
        Test HTTrack URL reconstruction from project and domain structure.
        """
        crawler = HtTrackCrawler(self._datasrc)

        # Get all resources to test URL patterns
        all_resources = crawler.get_resources_api(
            sites=[PRAGMAR_SITE_ID],
            limit=10
        )
        self.assertTrue(all_resources.total > 0, "Should have resources with reconstructed URLs")

        for resource in all_resources._results:
            # URLs should be properly formatted
            self.assertTrue(resource.url.startswith("https://"),
                          f"URL should start with https://: {resource.url}")

            # URLs should not contain file system artifacts
            self.assertNotIn("\\", resource.url, "URLs should not contain backslashes")
            self.assertNotIn("hts-", resource.url, "URLs should not contain HTTrack artifacts")

    def test_httrack_domain_detection(self):
        """
        Test HTTrack domain directory detection and multi-domain handling.
        """
        crawler = HtTrackCrawler(self._datasrc)
        sites_result = crawler.get_sites_api()
        self.assertTrue(sites_result.total > 0, "Should detect HTTrack project directories as sites")

        specific_site = crawler.get_sites_api(ids=[PRAGMAR_SITE_ID])
        if specific_site.total > 0:
            site_data = specific_site._results[0].to_dict()
            self.assertIn("urls", site_data, "Site should have URLs")
            self.assertTrue(len(site_data["urls"]) > 0, "Site should have at least one valid URL")

    def test_httrack_file_exclusion(self):
        """
        Test that HTTrack-generated files are properly excluded.
        """
        crawler = HtTrackCrawler(self._datasrc)

        # Search for any resources that might be HTTrack artifacts
        all_resources = crawler.get_resources_api(
            sites=[PRAGMAR_SITE_ID],
            query="",
            limit=50
        )

        for resource in all_resources._results:
            # Should not find project-level index.html (HTTrack-generated)
            if resource.url.endswith("/index.html"):
                # This should be domain-level index.html, not project-level
                self.assertNotEqual(resource.url, "https://pragmar/index.html",
                                  "Should not index project-level HTTrack-generated index.html")

            # Should not find hts-log.txt as a resource
            self.assertNotIn("hts-log.txt", resource.url, "Should not index hts-log.txt as resource")
            self.assertNotIn("hts-cache", resource.url, "Should not index hts-cache contents as resources")

    def test_httrack_advanced_features(self):
        """
        Test HTTrack-specific advanced features not covered by base tests.
        """
        crawler = HtTrackCrawler(self._datasrc)

        # Test field retrieval with HTTrack-specific metadata
        field_resources = crawler.get_resources_api(
            sites=[PRAGMAR_SITE_ID],
            query="type: html",
            fields=["content", "headers", "created", "modified"],
            limit=3
        )

        if field_resources.total > 0:
            resource_dict = field_resources._results[0].to_dict()

            # Test timestamps from file system
            self.assertIn("created", resource_dict, "Should have created timestamp from file stat")
            self.assertIn("modified", resource_dict, "Should have modified timestamp from file stat")

            # Test headers generation
            if "headers" in resource_dict and resource_dict["headers"]:
                headers = resource_dict["headers"]
                self.assertIn("Content-Type:", headers, "Should have generated Content-Type header")
                self.assertIn("Content-Length:", headers, "Should have generated Content-Length header")

        # Test that resources have proper size information
        size_resources = crawler.get_resources_api(
            sites=[PRAGMAR_SITE_ID],
            fields=["size"],
            limit=5
        )

        if size_resources.total > 0:
            for resource in size_resources._results:
                resource_dict = resource.to_dict()
                self.assertIn("size", resource_dict, "Resource should have size field")
                self.assertGreaterEqual(resource_dict["size"], 0, "Size should be non-negative")

    def test_report(self):
        """
        Run test report, save to data directory.
        """
        crawler = HtTrackCrawler(self._datasrc)
        logger.info(self.run_pragmar_report(crawler, PRAGMAR_SITE_ID, "HTTrack"))
```
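
A minimal sketch (not part of the repository file above) of how this HTTrack suite might be invoked on its own with Python's built-in unittest runner; it assumes `mcp_server_webcrawl` is importable and the `httrack` fixture directory referenced in `setUp` is available.

```python
# Minimal sketch, not from the repository: run the HTTrack test case directly.
# Assumes mcp_server_webcrawl is installed and its httrack fixture data exists.
import unittest

from mcp_server_webcrawl.crawlers.httrack.tests import HtTrackTests

if __name__ == "__main__":
    suite = unittest.defaultTestLoader.loadTestsFromTestCase(HtTrackTests)
    unittest.TextTestRunner(verbosity=2).run(suite)
```

Equivalently, `python -m unittest mcp_server_webcrawl.crawlers.httrack.tests` should discover the same cases, under the same assumptions about the environment.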