This is page 3 of 33. Use http://codebase.md/pragmar/mcp_server_webcrawl?page={x} to view the full context.
# Directory Structure
```
├── .gitignore
├── CONTRIBUTING.md
├── docs
│ ├── _images
│ │ ├── interactive.document.webp
│ │ ├── interactive.search.webp
│ │ └── mcpswc.svg
│ ├── _modules
│ │ ├── index.html
│ │ ├── mcp_server_webcrawl
│ │ │ ├── crawlers
│ │ │ │ ├── archivebox
│ │ │ │ │ ├── adapter.html
│ │ │ │ │ ├── crawler.html
│ │ │ │ │ └── tests.html
│ │ │ │ ├── base
│ │ │ │ │ ├── adapter.html
│ │ │ │ │ ├── api.html
│ │ │ │ │ ├── crawler.html
│ │ │ │ │ ├── indexed.html
│ │ │ │ │ └── tests.html
│ │ │ │ ├── httrack
│ │ │ │ │ ├── adapter.html
│ │ │ │ │ ├── crawler.html
│ │ │ │ │ └── tests.html
│ │ │ │ ├── interrobot
│ │ │ │ │ ├── adapter.html
│ │ │ │ │ ├── crawler.html
│ │ │ │ │ └── tests.html
│ │ │ │ ├── katana
│ │ │ │ │ ├── adapter.html
│ │ │ │ │ ├── crawler.html
│ │ │ │ │ └── tests.html
│ │ │ │ ├── siteone
│ │ │ │ │ ├── adapter.html
│ │ │ │ │ ├── crawler.html
│ │ │ │ │ └── tests.html
│ │ │ │ ├── warc
│ │ │ │ │ ├── adapter.html
│ │ │ │ │ ├── crawler.html
│ │ │ │ │ └── tests.html
│ │ │ │ └── wget
│ │ │ │ ├── adapter.html
│ │ │ │ ├── crawler.html
│ │ │ │ └── tests.html
│ │ │ ├── crawlers.html
│ │ │ ├── extras
│ │ │ │ ├── markdown.html
│ │ │ │ ├── regex.html
│ │ │ │ ├── snippets.html
│ │ │ │ ├── thumbnails.html
│ │ │ │ └── xpath.html
│ │ │ ├── interactive
│ │ │ │ ├── highlights.html
│ │ │ │ ├── search.html
│ │ │ │ ├── session.html
│ │ │ │ └── ui.html
│ │ │ ├── main.html
│ │ │ ├── models
│ │ │ │ ├── resources.html
│ │ │ │ └── sites.html
│ │ │ ├── templates
│ │ │ │ └── tests.html
│ │ │ ├── utils
│ │ │ │ ├── blobs.html
│ │ │ │ ├── cli.html
│ │ │ │ ├── logger.html
│ │ │ │ ├── querycache.html
│ │ │ │ ├── server.html
│ │ │ │ └── tools.html
│ │ │ └── utils.html
│ │ └── re.html
│ ├── _sources
│ │ ├── guides
│ │ │ ├── archivebox.rst.txt
│ │ │ ├── httrack.rst.txt
│ │ │ ├── interrobot.rst.txt
│ │ │ ├── katana.rst.txt
│ │ │ ├── siteone.rst.txt
│ │ │ ├── warc.rst.txt
│ │ │ └── wget.rst.txt
│ │ ├── guides.rst.txt
│ │ ├── index.rst.txt
│ │ ├── installation.rst.txt
│ │ ├── interactive.rst.txt
│ │ ├── mcp_server_webcrawl.crawlers.archivebox.rst.txt
│ │ ├── mcp_server_webcrawl.crawlers.base.rst.txt
│ │ ├── mcp_server_webcrawl.crawlers.httrack.rst.txt
│ │ ├── mcp_server_webcrawl.crawlers.interrobot.rst.txt
│ │ ├── mcp_server_webcrawl.crawlers.katana.rst.txt
│ │ ├── mcp_server_webcrawl.crawlers.rst.txt
│ │ ├── mcp_server_webcrawl.crawlers.siteone.rst.txt
│ │ ├── mcp_server_webcrawl.crawlers.warc.rst.txt
│ │ ├── mcp_server_webcrawl.crawlers.wget.rst.txt
│ │ ├── mcp_server_webcrawl.extras.rst.txt
│ │ ├── mcp_server_webcrawl.interactive.rst.txt
│ │ ├── mcp_server_webcrawl.models.rst.txt
│ │ ├── mcp_server_webcrawl.rst.txt
│ │ ├── mcp_server_webcrawl.templates.rst.txt
│ │ ├── mcp_server_webcrawl.utils.rst.txt
│ │ ├── modules.rst.txt
│ │ ├── prompts.rst.txt
│ │ └── usage.rst.txt
│ ├── _static
│ │ ├── _sphinx_javascript_frameworks_compat.js
│ │ ├── basic.css
│ │ ├── css
│ │ │ ├── badge_only.css
│ │ │ ├── fonts
│ │ │ │ ├── fontawesome-webfont.eot
│ │ │ │ ├── fontawesome-webfont.svg
│ │ │ │ ├── fontawesome-webfont.ttf
│ │ │ │ ├── fontawesome-webfont.woff
│ │ │ │ ├── fontawesome-webfont.woff2
│ │ │ │ ├── lato-bold-italic.woff
│ │ │ │ ├── lato-bold-italic.woff2
│ │ │ │ ├── lato-bold.woff
│ │ │ │ ├── lato-bold.woff2
│ │ │ │ ├── lato-normal-italic.woff
│ │ │ │ ├── lato-normal-italic.woff2
│ │ │ │ ├── lato-normal.woff
│ │ │ │ ├── lato-normal.woff2
│ │ │ │ ├── Roboto-Slab-Bold.woff
│ │ │ │ ├── Roboto-Slab-Bold.woff2
│ │ │ │ ├── Roboto-Slab-Regular.woff
│ │ │ │ └── Roboto-Slab-Regular.woff2
│ │ │ └── theme.css
│ │ ├── doctools.js
│ │ ├── documentation_options.js
│ │ ├── file.png
│ │ ├── fonts
│ │ │ ├── Lato
│ │ │ │ ├── lato-bold.eot
│ │ │ │ ├── lato-bold.ttf
│ │ │ │ ├── lato-bold.woff
│ │ │ │ ├── lato-bold.woff2
│ │ │ │ ├── lato-bolditalic.eot
│ │ │ │ ├── lato-bolditalic.ttf
│ │ │ │ ├── lato-bolditalic.woff
│ │ │ │ ├── lato-bolditalic.woff2
│ │ │ │ ├── lato-italic.eot
│ │ │ │ ├── lato-italic.ttf
│ │ │ │ ├── lato-italic.woff
│ │ │ │ ├── lato-italic.woff2
│ │ │ │ ├── lato-regular.eot
│ │ │ │ ├── lato-regular.ttf
│ │ │ │ ├── lato-regular.woff
│ │ │ │ └── lato-regular.woff2
│ │ │ └── RobotoSlab
│ │ │ ├── roboto-slab-v7-bold.eot
│ │ │ ├── roboto-slab-v7-bold.ttf
│ │ │ ├── roboto-slab-v7-bold.woff
│ │ │ ├── roboto-slab-v7-bold.woff2
│ │ │ ├── roboto-slab-v7-regular.eot
│ │ │ ├── roboto-slab-v7-regular.ttf
│ │ │ ├── roboto-slab-v7-regular.woff
│ │ │ └── roboto-slab-v7-regular.woff2
│ │ ├── images
│ │ │ ├── interactive.document.png
│ │ │ ├── interactive.document.webp
│ │ │ ├── interactive.search.png
│ │ │ ├── interactive.search.webp
│ │ │ └── mcpswc.svg
│ │ ├── jquery.js
│ │ ├── js
│ │ │ ├── badge_only.js
│ │ │ ├── theme.js
│ │ │ └── versions.js
│ │ ├── language_data.js
│ │ ├── minus.png
│ │ ├── plus.png
│ │ ├── pygments.css
│ │ ├── searchtools.js
│ │ └── sphinx_highlight.js
│ ├── .buildinfo
│ ├── .nojekyll
│ ├── genindex.html
│ ├── guides
│ │ ├── archivebox.html
│ │ ├── httrack.html
│ │ ├── interrobot.html
│ │ ├── katana.html
│ │ ├── siteone.html
│ │ ├── warc.html
│ │ └── wget.html
│ ├── guides.html
│ ├── index.html
│ ├── installation.html
│ ├── interactive.html
│ ├── mcp_server_webcrawl.crawlers.archivebox.html
│ ├── mcp_server_webcrawl.crawlers.base.html
│ ├── mcp_server_webcrawl.crawlers.html
│ ├── mcp_server_webcrawl.crawlers.httrack.html
│ ├── mcp_server_webcrawl.crawlers.interrobot.html
│ ├── mcp_server_webcrawl.crawlers.katana.html
│ ├── mcp_server_webcrawl.crawlers.siteone.html
│ ├── mcp_server_webcrawl.crawlers.warc.html
│ ├── mcp_server_webcrawl.crawlers.wget.html
│ ├── mcp_server_webcrawl.extras.html
│ ├── mcp_server_webcrawl.html
│ ├── mcp_server_webcrawl.interactive.html
│ ├── mcp_server_webcrawl.models.html
│ ├── mcp_server_webcrawl.templates.html
│ ├── mcp_server_webcrawl.utils.html
│ ├── modules.html
│ ├── objects.inv
│ ├── prompts.html
│ ├── py-modindex.html
│ ├── search.html
│ ├── searchindex.js
│ └── usage.html
├── LICENSE
├── MANIFEST.in
├── prompts
│ ├── audit404.md
│ ├── auditfiles.md
│ ├── auditperf.md
│ ├── auditseo.md
│ ├── gopher.md
│ ├── README.md
│ └── testsearch.md
├── pyproject.toml
├── README.md
├── setup.py
├── sphinx
│ ├── _static
│ │ └── images
│ │ ├── interactive.document.png
│ │ ├── interactive.document.webp
│ │ ├── interactive.search.png
│ │ ├── interactive.search.webp
│ │ └── mcpswc.svg
│ ├── _templates
│ │ └── layout.html
│ ├── conf.py
│ ├── guides
│ │ ├── archivebox.rst
│ │ ├── httrack.rst
│ │ ├── interrobot.rst
│ │ ├── katana.rst
│ │ ├── siteone.rst
│ │ ├── warc.rst
│ │ └── wget.rst
│ ├── guides.rst
│ ├── index.rst
│ ├── installation.rst
│ ├── interactive.rst
│ ├── make.bat
│ ├── Makefile
│ ├── mcp_server_webcrawl.crawlers.archivebox.rst
│ ├── mcp_server_webcrawl.crawlers.base.rst
│ ├── mcp_server_webcrawl.crawlers.httrack.rst
│ ├── mcp_server_webcrawl.crawlers.interrobot.rst
│ ├── mcp_server_webcrawl.crawlers.katana.rst
│ ├── mcp_server_webcrawl.crawlers.rst
│ ├── mcp_server_webcrawl.crawlers.siteone.rst
│ ├── mcp_server_webcrawl.crawlers.warc.rst
│ ├── mcp_server_webcrawl.crawlers.wget.rst
│ ├── mcp_server_webcrawl.extras.rst
│ ├── mcp_server_webcrawl.interactive.rst
│ ├── mcp_server_webcrawl.models.rst
│ ├── mcp_server_webcrawl.rst
│ ├── mcp_server_webcrawl.templates.rst
│ ├── mcp_server_webcrawl.utils.rst
│ ├── modules.rst
│ ├── prompts.rst
│ ├── readme.txt
│ └── usage.rst
└── src
└── mcp_server_webcrawl
├── __init__.py
├── crawlers
│ ├── __init__.py
│ ├── archivebox
│ │ ├── __init__.py
│ │ ├── adapter.py
│ │ ├── crawler.py
│ │ └── tests.py
│ ├── base
│ │ ├── __init__.py
│ │ ├── adapter.py
│ │ ├── api.py
│ │ ├── crawler.py
│ │ ├── indexed.py
│ │ └── tests.py
│ ├── httrack
│ │ ├── __init__.py
│ │ ├── adapter.py
│ │ ├── crawler.py
│ │ └── tests.py
│ ├── interrobot
│ │ ├── __init__.py
│ │ ├── adapter.py
│ │ ├── crawler.py
│ │ └── tests.py
│ ├── katana
│ │ ├── __init__.py
│ │ ├── adapter.py
│ │ ├── crawler.py
│ │ └── tests.py
│ ├── siteone
│ │ ├── __init__.py
│ │ ├── adapter.py
│ │ ├── crawler.py
│ │ └── tests.py
│ ├── warc
│ │ ├── __init__.py
│ │ ├── adapter.py
│ │ ├── crawler.py
│ │ └── tests.py
│ └── wget
│ ├── __init__.py
│ ├── adapter.py
│ ├── crawler.py
│ └── tests.py
├── extras
│ ├── __init__.py
│ ├── markdown.py
│ ├── regex.py
│ ├── snippets.py
│ ├── thumbnails.py
│ └── xpath.py
├── interactive
│ ├── __init__.py
│ ├── highlights.py
│ ├── search.py
│ ├── session.py
│ ├── ui.py
│ └── views
│ ├── base.py
│ ├── document.py
│ ├── help.py
│ ├── requirements.py
│ ├── results.py
│ └── searchform.py
├── main.py
├── models
│ ├── __init__.py
│ ├── base.py
│ ├── resources.py
│ └── sites.py
├── settings.py
├── templates
│ ├── __init__.py
│ ├── markdown.xslt
│ ├── tests_core.html
│ └── tests.py
└── utils
├── __init__.py
├── cli.py
├── logger.py
├── parser.py
├── parsetab.py
├── search.py
├── server.py
├── tests.py
└── tools.py
```
# Files
--------------------------------------------------------------------------------
/src/mcp_server_webcrawl/crawlers/interrobot/adapter.py:
--------------------------------------------------------------------------------
```python
import re
import sqlite3
import traceback
from contextlib import closing
from logging import Logger
from pathlib import Path
from typing import Final
from urllib.parse import urlparse
from mcp_server_webcrawl.crawlers.base.adapter import IndexState, IndexStatus, BaseManager, SitesGroup
from mcp_server_webcrawl.models.resources import ResourceResult, RESOURCES_LIMIT_DEFAULT
from mcp_server_webcrawl.models.sites import SiteResult, SiteType
from mcp_server_webcrawl.utils import from_isoformat_zulu
from mcp_server_webcrawl.utils.logger import get_logger
# maybe dedupe with near match RESOURCES version
INTERROBOT_RESOURCE_FIELD_MAPPING: Final[dict[str, str]] = {
"id": "ResourcesFullText.Id",
"site": "ResourcesFullText.Project",
"created": "Resources.Created",
"modified": "Resources.Modified",
"url": "ResourcesFullText.Url",
"status": "ResourcesFullText.Status",
"size": "Resources.Size",
"type": "ResourcesFullText.Type",
"headers": "ResourcesFullText.Headers",
"content": "ResourcesFullText.Content",
"time": "ResourcesFullText.Time"
}
INTERROBOT_SITE_FIELD_REQUIRED: Final[set[str]] = set(["id", "name", "type", "urls"])
# legit different from default version (extra robots)
INTERROBOT_SITE_FIELD_MAPPING: Final[dict[str, str]] = {
"id": "Project.Id",
"name": "Project.Name",
"type": "Project.Type",
"urls": "Project.Urls",
"created": "Project.Created",
"modified": "Project.Modified",
}
logger: Logger = get_logger()
class InterroBotManager(BaseManager):
"""
Manages connections to InterroBot SQLite databases on disk.
Provides connection pooling and caching for efficient access.
"""
def __init__(self) -> None:
"""Initialize the HTTP text manager with empty cache and statistics."""
super().__init__()
def get_connection(self, group: SitesGroup) -> tuple[sqlite3.Connection | None, IndexState]:
"""
Get a database connection for the sites in the group.
Args:
group: Group of sites to connect to
Returns:
Tuple of (SQLite connection to the InterroBot database, or None if the connection failed,
IndexState associated with this connection)
"""
index_state = IndexState()
index_state.set_status(IndexStatus.REMOTE)
connection: sqlite3.Connection | None = None
try:
# note: the caller is responsible for wrapping this connection in closing()
connection = sqlite3.connect(group.datasrc)
except sqlite3.Error as ex:
logger.error(f"SQLite error reading database: {ex}\n{traceback.format_exc()}")
except (FileNotFoundError, PermissionError) as ex:
logger.error(f"Database access error: {group.datasrc}\n{traceback.format_exc()}")
raise
except Exception as ex:
logger.error(f"Unexpected error reading database {group.datasrc}: {ex}\n{traceback.format_exc()}")
raise
return connection, index_state
manager: InterroBotManager = InterroBotManager()
def get_sites(datasrc: Path, ids=None, fields=None) -> list[SiteResult]:
"""
Get sites based on the provided parameters.
Args:
datasrc: path to the database
ids: optional list of site IDs
fields: list of fields to include in response
Returns:
List of SiteResult objects
"""
site_fields_required: list[str] = ["id", "name", "type", "urls"]
site_fields_default: list[str] = site_fields_required + ["created", "modified"]
site_fields_available: list[str] = list(INTERROBOT_SITE_FIELD_MAPPING.keys())
# build query
params: dict[str, int | str] = {}
# these inputs are named parameters
ids_clause: str = ""
if ids and isinstance(ids, list) and len(ids) > 0:
placeholders: list[str] = [f":id{i}" for i in range(len(ids))]
ids_clause: str = f" WHERE Project.Id IN ({','.join(placeholders)})"
params.update({f"id{i}": id_val for i, id_val in enumerate(ids)})
# these inputs are not parameterized
# fields will be returned from database, if found in INTERROBOT_SITE_FIELD_MAPPING
selected_fields = set(site_fields_required)
if fields and isinstance(fields, list):
selected_fields.update(f for f in fields if f in site_fields_available)
else:
selected_fields.update(site_fields_default)
safe_sql_fields = [INTERROBOT_SITE_FIELD_MAPPING[f] for f in selected_fields]
assert all(re.match(r"^[A-Za-z\.]+$", field) for field in safe_sql_fields), "Unknown or unsafe field requested"
safe_sql_fields_joined: str = ", ".join(safe_sql_fields)
statement: str = f"SELECT {safe_sql_fields_joined} FROM Projects AS Project{ids_clause} ORDER BY Project.Name ASC"
sql_results: list[dict[str, int | str | None]] = []
try:
if not statement.strip().upper().startswith("SELECT"):
logger.error("Unauthorized SQL statement")
raise ValueError("Only SELECT queries are permitted")
with closing(sqlite3.connect(datasrc)) as conn:
conn.row_factory = sqlite3.Row
with closing(conn.cursor()) as cursor:
cursor.execute(statement, params or {})
sql_results = [{k.lower(): v for k, v in dict(row).items()} for row in cursor.fetchall()]
except sqlite3.Error as ex:
logger.error(f"SQLite error reading database: {ex}\n{traceback.format_exc()}")
return []
except Exception as ex:
logger.error(f"Database error: {ex}")
return []
results: list[SiteResult] = []
#for row in sql_results:
# results.append(SiteResult(
# path=datasrc,
# id=row.get("id"),
# url=row.get("url", ""),
# created=from_isoformat_zulu(row.get("created")),
# modified=from_isoformat_zulu(row.get("modified")),
# robots=row.get("robotstext"),
# metadata=None,
# ))
for row in sql_results:
urls_list = __urls_from_text(row.get("urls", ""))
site_type: SiteType
db_type = row.get("type")
if db_type == 1:
site_type = SiteType.CRAWLED_URL
elif db_type == 2:
site_type = SiteType.CRAWLED_LIST
else:
site_type = SiteType.UNDEFINED
results.append(SiteResult(
path=datasrc,
id=row.get("id"),
name=row.get("name"), # NEW: directly from DB
type=site_type, # NEW: from DB (needs mapping)
urls=urls_list, # CHANGED: split into list
created=from_isoformat_zulu(row.get("created")),
modified=from_isoformat_zulu(row.get("modified")),
robots=None, # Removed - not in new model
metadata=None,
))
return results
def __urls_from_text(urls: str) -> list[str]:
urls_list = []
if urls:
for url in urls.split('\n'):
url = url.strip()
if url:
try:
parsed = urlparse(url)
if parsed.scheme:
urls_list.append(url)
except Exception:
continue
return urls_list
def get_resources(
datasrc: Path,
sites: list[int] | None = None,
query: str = "",
fields: list[str] | None = None,
sort: str | None = None,
limit: int = RESOURCES_LIMIT_DEFAULT,
offset: int = 0,
) -> tuple[list[ResourceResult], int, IndexState]:
"""
Get resources from an InterroBot database.
Args:
datasrc: path to the InterroBot SQLite database
sites: optional list of site IDs to filter by
query: search query string
fields: optional list of fields to include in response
sort: sort order for results
limit: maximum number of results to return
offset: number of results to skip for pagination
Returns:
Tuple of (list of ResourceResult objects, total count, IndexState)
"""
sites_results: list[SiteResult] = get_sites(datasrc=datasrc, ids=sites)
assert sites_results, "At least one site is required to search"
site_paths = [site.path for site in sites_results]
sites_group = SitesGroup(datasrc, sites, site_paths)
# InterroBot uses ints in place of strings
swap_values = {
"type" : {
"": 0, # UNDEFINED
"html": 1, # PAGE
"other": 2, # OTHER (could also be 5 or 12 depending on context)
"rss": 3, # FEED
"iframe": 4, # FRAME
"img": 6, # IMAGE
"audio": 7, # AUDIO
"video": 8, # VIDEO
"font": 9, # FONT
"style": 10, # CSS
"script": 11, # SCRIPT
"text": 13, # TEXT
"pdf": 14, # PDF
"doc": 15 # DOC
}
}
return manager.get_resources_for_sites_group(sites_group, query, fields, sort, limit, offset, swap_values)
```
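The module-level functions above follow the shared adapter contract: `get_sites` enumerates InterroBot projects and `get_resources` delegates the query to the manager. A minimal usage sketch, assuming an InterroBot SQLite database exists at the (hypothetical) path below; the query string is illustrative only:

```python
from pathlib import Path

from mcp_server_webcrawl.crawlers.interrobot.adapter import get_resources, get_sites

# hypothetical location of an InterroBot database
datasrc = Path("~/Documents/InterroBot/interrobot.db").expanduser()

# list crawled projects (sites) with the default field set
sites = get_sites(datasrc)
for site in sites:
    print(site.id, site.name, site.type)

if sites:
    # full-text search within one site; returns results, total count, and index state
    results, total, index_state = get_resources(
        datasrc,
        sites=[sites[0].id],
        query="privacy",
        limit=10,
    )
    print(f"{total} matches, showing {len(results)}")
```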
--------------------------------------------------------------------------------
/src/mcp_server_webcrawl/extras/snippets.py:
--------------------------------------------------------------------------------
```python
import re
import lxml.html
from lxml import etree
from lxml.etree import ParserError
from logging import Logger
from typing import Final
from mcp_server_webcrawl.utils.logger import get_logger
from mcp_server_webcrawl.utils.search import SearchQueryParser
MAX_SNIPPETS_MATCHED_COUNT: Final[int] = 15
MAX_SNIPPETS_RETURNED_COUNT: Final[int] = 3
MAX_SNIPPETS_CONTEXT_SIZE: Final[int] = 48
__RE_SNIPPET_START_TRIM: Final[re.Pattern] = re.compile(r"^[^\w\[]+")
__RE_SNIPPET_END_TRIM: Final[re.Pattern] = re.compile(r"[^\w\]]+$")
logger: Logger = get_logger()
class SnippetContentExtractor:
"""
lxml-based HTML parser for extracting different types of content from HTML.
Content separates into components: text, markup, attributes (values), and comments.
These can be prioritized in search so that text is the displayed hit over noisier
types.
"""
PRIORITY_ORDER: list[str] = ["url", "document_text", "document_attributes",
"document_comments", "headers", "document_markup"]
__RE_SPLIT: re.Pattern = re.compile(r"[\s_]+|(?<!\w)-(?!\w)")
__RE_WHITESPACE: re.Pattern = re.compile(r"\s+")
__MAX_CONTENT_BYTES: int = 2 * 1024 * 1024 # 2MB
def __init__(self, url: str, headers: str, content: str):
self.__document: lxml.html.HtmlElement | None = None
self.url: str = url
self.content: str = ""
# headers one liner to facilitate snippet
self.headers: str = re.sub(r"\s+", " ", headers).strip()
self.document_text: str = ""
self.document_markup: str = ""
self.document_attributes: str = ""
self.document_comments: str = ""
if len(content) > self.__MAX_CONTENT_BYTES:
# ignore large files, slow
return
else:
self.content = content
load_success: bool = self.__load_content()
if load_success:
_ = self.__extract()
else:
self.document_text = self.__normalize_whitespace(self.content)
def __load_content(self) -> bool:
"""
Load content string into lxml doc.
"""
if not self.content or not self.content.strip():
return False
try:
self.__document = lxml.html.fromstring(self.content.encode("utf-8"))
return True
except (ParserError, ValueError, UnicodeDecodeError):
try:
wrapped_content = f"<html><body>{self.content}</body></html>"
self.__document = lxml.html.fromstring(wrapped_content.encode("utf-8"))
return True
except (ParserError, ValueError, UnicodeDecodeError):
return False
def __extract(self) -> bool:
"""
Extract content from lxml doc.
"""
if self.__document is None:
return False
text_values = []
markup_values = []
attribute_values = []
comment_values = []
element: lxml.html.HtmlElement | None = None
for element in self.__document.iter():
# HTML outliers
if element.tag is etree.Comment or element.tag is etree.ProcessingInstruction:
if element.text is not None:
comment_values.append(str(element.text.strip()))
# avoid regular element text processing
continue
if element.tag is etree.Entity or element.tag is etree.CDATA:
if element.text is not None:
text_values.append(str(element.text.strip()))
continue
# HTML tags and attributes
if element.tag:
markup_values.append(element.tag)
if element.tag in ("script", "style"):
continue
if element.text:
text_values.append(element.text.strip())
if element.tail:
text_values.append(element.tail.strip())
for attr_name, attr_value in element.attrib.items():
markup_values.append(attr_name)
if attr_value:
values = [v for v in self.__RE_SPLIT.split(attr_value) if v]
attribute_values.extend(values)
self.document_text = self.__normalize_values(text_values)
self.document_markup = self.__normalize_values(markup_values)
self.document_attributes = self.__normalize_values(attribute_values)
self.document_comments = self.__normalize_values(comment_values)
return True
def __normalize_values(self, values: list[str]) -> str:
"""
Concatenate values and normalize whitespace for list of values.
"""
text = " ".join([value for value in values if value])
return self.__normalize_whitespace(text)
def __normalize_whitespace(self, text: str) -> str:
"""
Normalize whitespace using pre-compiled pattern.
"""
return self.__RE_WHITESPACE.sub(" ", text).strip()
def get_snippets(url: str, headers: str, content: str, query: str) -> str | None:
"""
Takes a query and content, reduces the HTML to text content and extracts hits
as excerpts of text.
Args:
url: The resource URL, also searched for term matches
headers: Header content to search
content: The HTML or text content to search in
query: The search query string
Returns:
A string of snippets with context around matched terms, separated by " ... " or None
"""
if query in (None, ""):
return None
url = url or ""
content = content or ""
headers = headers or ""
search_terms_parser = SearchQueryParser()
search_terms: list[str] = search_terms_parser.get_fulltext_terms(query)
if not search_terms:
return None
snippets = []
content_extractor = SnippetContentExtractor(url, headers, content)
# priority order: url, text, attributes, comments, headers, markup
# most interesting to least, as search hits
for group_name in content_extractor.PRIORITY_ORDER:
search_group_text = getattr(content_extractor, group_name)
if not search_group_text:
continue
group_snippets = find_snippets_in_text(search_group_text, search_terms,
max_snippets=MAX_SNIPPETS_MATCHED_COUNT+1, group_name=group_name)
snippets.extend(group_snippets)
if len(snippets) > MAX_SNIPPETS_MATCHED_COUNT:
break
if snippets:
total_snippets = len(snippets)
displayed_snippets = snippets[:MAX_SNIPPETS_RETURNED_COUNT]
result = " ... ".join(displayed_snippets)
if total_snippets > MAX_SNIPPETS_MATCHED_COUNT:
result += f" ... + >{MAX_SNIPPETS_MATCHED_COUNT} more"
elif total_snippets > MAX_SNIPPETS_RETURNED_COUNT:
remaining = total_snippets - MAX_SNIPPETS_RETURNED_COUNT
result += f" ... +{remaining} more"
return result
return None
def find_snippets_in_text(
text: str,
terms: list[str],
max_snippets: int = MAX_SNIPPETS_MATCHED_COUNT,
group_name: str = "") -> list[str]:
"""
Searches for whole-word matches of the given terms in the text and extracts
surrounding context to create highlighted snippets. Each snippet shows the matched term
in context with markdown-style bold highlighting (**term**).
Args:
text: The text to search within
terms: List of search terms to find (case-insensitive, whole words only)
max_snippets: Maximum number of snippets to return (default: MAX_SNIPPETS_MATCHED_COUNT)
group_name: Regex group identifier (reserved for future use)
Returns:
List of unique snippet strings with matched terms highlighted using **bold** markdown.
Each snippet includes surrounding context up to MAX_SNIPPETS_CONTEXT_SIZE characters
on each side of the match. Returns empty list if no matches found or invalid input.
"""
if not text or not terms:
return []
snippets: list[str] = []
seen_snippets: set[str] = set()
text_lower: str = text.lower()
escaped_terms = [re.escape(term) for term in terms]
pattern: str = rf"\b({'|'.join(escaped_terms)})\b"
highlight_patterns: list[tuple[re.Pattern, str]] = [
(re.compile(rf"\b({re.escape(term)})\b",
re.IGNORECASE), term) for term in terms
]
matches = list(re.finditer(pattern, text_lower))
for match in matches:
if len(snippets) >= max_snippets:
break
context_start: int = max(0, match.start() - MAX_SNIPPETS_CONTEXT_SIZE)
context_end: int = min(len(text), match.end() + MAX_SNIPPETS_CONTEXT_SIZE)
if context_start > 0:
while context_start > 0 and text[context_start].isalnum():
context_start -= 1
if context_end < len(text):
while context_end < len(text) and text[context_end].isalnum():
context_end += 1
snippet: str = text[context_start:context_end].strip()
snippet = __RE_SNIPPET_START_TRIM.sub("", snippet)
snippet = __RE_SNIPPET_END_TRIM.sub("", snippet)
highlighted_snippet: str = snippet
for pattern, _ in highlight_patterns:
highlighted_snippet = pattern.sub(r"**\1**", highlighted_snippet)
if highlighted_snippet and highlighted_snippet not in seen_snippets:
seen_snippets.add(highlighted_snippet)
snippets.append(highlighted_snippet)
return snippets
```
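A short sketch of the `get_snippets` entry point above; the URL, headers, HTML, and query are illustrative only, and a single bare word is assumed to parse as a fulltext term:

```python
from mcp_server_webcrawl.extras.snippets import get_snippets

html = """
<html>
  <head><title>Crawler notes</title></head>
  <body>
    <h1>Search indexing</h1>
    <p>The crawler stores headers and content for full-text search.</p>
  </body>
</html>
"""
headers = "HTTP/1.1 200 OK\nContent-Type: text/html"

# matched terms come back bolded (**term**) with surrounding context;
# multiple hits are joined by " ... "
print(get_snippets("https://example.com/notes", headers, html, "crawler"))
```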
--------------------------------------------------------------------------------
/src/mcp_server_webcrawl/interactive/views/base.py:
--------------------------------------------------------------------------------
```python
import re
import curses
from abc import abstractmethod
from typing import TYPE_CHECKING
from mcp_server_webcrawl import __name__ as module_name, __version__ as module_version
from mcp_server_webcrawl.interactive.ui import ThemeDefinition, ViewBounds
from mcp_server_webcrawl.models.resources import ResourceResult
from mcp_server_webcrawl.interactive.ui import safe_addstr
if TYPE_CHECKING:
from mcp_server_webcrawl.interactive.session import InteractiveSession
REGEX_DISPLAY_URL_CLEAN = re.compile(r"^https?://|/$")
OUTER_WIDTH_RIGHT_MARGIN = 1
LAYOUT_FOOTER_SEPARATOR = " | "
LAYOUT_FOOTER_SEPARATOR_LENGTH = len(LAYOUT_FOOTER_SEPARATOR)
MIN_TERMINAL_HEIGHT = 8
MIN_TERMINAL_WIDTH = 40
CONTENT_MARGIN = 4
class BaseCursesView:
"""
Base class for all views with common interface.
"""
def __init__(self, session: 'InteractiveSession'):
self.session = session
self.bounds = ViewBounds(x=0, y=0, width=0, height=0)
self._focused = False
self._selected_index: int = 0
@property
def focused(self) -> bool:
return self._focused
def set_bounds(self, bounds: ViewBounds):
"""
Set the rendering bounds for this view.
Args:
bounds: The ViewBounds object defining the drawing area
"""
self.bounds = bounds
def set_focused(self, focused: bool):
"""
Set the focus state for this view.
Args:
focused: True if this view should be focused, False otherwise
"""
self._focused = focused
@abstractmethod
def render(self, stdscr: curses.window) -> None:
"""
Render the view within its bounds.
Args:
stdscr: The curses window to render on
"""
pass
@abstractmethod
def handle_input(self, key: int) -> bool:
"""
Handle input. Return True if consumed, False to pass through.
Args:
key: The input key code
Returns:
bool: True if input was consumed, False to pass through
"""
pass
def focusable(self) -> bool:
"""
Return True if this view can receive focus.
Returns:
bool: True if this view can receive focus
"""
return True
def draw_outer_footer(self, stdscr: curses.window, text: str) -> None:
"""
Draw context-sensitive help footer with pipe-separated items.
Args:
stdscr: The curses window to draw on
text: The footer text to display (pipe-separated items)
"""
height, width = stdscr.getmaxyx()
footer_line: int = height - 1
footer_line_text: str = BaseCursesView._get_full_width_line(stdscr)
outer_theme_pair: int = self.session.get_theme_color_pair(ThemeDefinition.HEADER_OUTER)
safe_addstr(stdscr, footer_line, 0, footer_line_text, outer_theme_pair)
items = [item.strip() for item in text.split(LAYOUT_FOOTER_SEPARATOR)]
available_width = width - 4 - 2 # 4 for right margin, 2 for left padding
display_text: str = ""
test_text: str = ""
test_text_length: int = 0
for i in range(len(items)):
test_text = LAYOUT_FOOTER_SEPARATOR.join(items[:i+1])
test_text_length = len(test_text)
if test_text_length <= available_width:
display_text = test_text
else:
break
# doesn't fit indicator
display_text_length: int = len(display_text)
if test_text_length > available_width:
display_text += f"{(width - display_text_length - 5) * ' '} »"
if display_text:
outer_header_theme_pair: int = self.session.get_theme_color_pair(ThemeDefinition.HEADER_OUTER)
safe_addstr(stdscr, footer_line, 1, display_text, outer_header_theme_pair)
def draw_outer_header(self, stdscr: curses.window) -> None:
"""
Draw the outer application header with module name and version.
Args:
stdscr: The curses window to draw on
"""
_, width = stdscr.getmaxyx()
style: int = self.session.get_theme_color_pair(ThemeDefinition.HEADER_OUTER)
full_width_line: str = BaseCursesView._get_full_width_line(stdscr)
header_label_text: str = f"{module_name} --interactive"
header_version_text: str = f"v{module_version}"
header_version_x: int = max(0, width - len(header_version_text) - 2)
safe_addstr(stdscr, 0, 0, full_width_line, style)
if len(header_label_text) < width - 2:
safe_addstr(stdscr, 0, 1, header_label_text, style)
if header_version_x > len(header_label_text) + 3:
safe_addstr(stdscr, 0, header_version_x, header_version_text, style)
def draw_inner_footer(self, stdscr: curses.window, bounds: ViewBounds, text: str) -> None:
"""
Draw context-sensitive help footer.
Args:
stdscr: The curses window to draw on
bounds: The view bounds defining the drawing area
text: The footer text to display
"""
footer_y: int = bounds.y + bounds.height - 1
line_of_whitespace: str = self._get_bounded_line()
display_text: str = text or ""
display_text_max: int = len(line_of_whitespace) - 2
if len(display_text) > display_text_max:
display_text = f"{display_text[:display_text_max - 1]}…"
line: str = f" {display_text}".ljust(len(line_of_whitespace))
safe_addstr(stdscr, footer_y, bounds.x, line, self._get_inner_header_style())
def draw_inner_header(self, stdscr: curses.window, bounds: ViewBounds, text: str) -> None:
"""
Draw the inner header for this view section.
Args:
stdscr: The curses window to draw on
bounds: The view bounds defining the drawing area
text: The header text to display
"""
line_of_whitespace: str = self._get_bounded_line()
display_text: str = text or ""
max_text_width: int = len(line_of_whitespace) - 2
if len(display_text) > max_text_width:
display_text = f"{display_text[:max_text_width - 1]}…"
line: str = f" {display_text}".ljust(len(line_of_whitespace))
safe_addstr(stdscr, bounds.y, bounds.x, line, self._get_inner_header_style())
@staticmethod
def _get_full_width_line(stdscr: curses.window) -> str:
"""
Get a line that fills the terminal width.
Args:
stdscr: The curses window to get dimensions from
Returns:
str: A string of spaces filling the terminal width
"""
_, width = stdscr.getmaxyx()
return " " * (width - OUTER_WIDTH_RIGHT_MARGIN)
@staticmethod
def url_for_display(url: str) -> str:
"""
Remove protocol prefix and trailing slash from URL for display.
Args:
url: The URL to clean for display
Returns:
str: The cleaned URL without protocol and trailing slash
"""
return REGEX_DISPLAY_URL_CLEAN.sub("", url)
@staticmethod
def humanized_bytes(result: ResourceResult) -> str:
"""
Convert resource size to human-readable format (B, KB, MB).
Args:
result: The ResourceResult containing size information
Returns:
str: Human-readable size string (e.g., "1.5MB", "512KB", "128B")
"""
display: str = ""
if result is not None:
size: int = result.size
if isinstance(size, int):
if size >= 1024 * 1024:
display = f"{size/(1024*1024):.1f}MB"
elif size >= 1024:
display = f"{size/1024:.1f}KB"
else:
display = f"{size}B"
return display
def _get_inner_header_style(self) -> int:
"""
Get the appropriate header style based on focus state.
Returns:
int: The theme color pair for the header
"""
if self._focused:
return self.session.get_theme_color_pair(ThemeDefinition.HEADER_ACTIVE)
else:
return self.session.get_theme_color_pair(ThemeDefinition.HEADER_INACTIVE)
def _get_input_style(self) -> int:
"""
Get the appropriate input style based on focus and selection state.
Returns:
int: The style attributes for input rendering
"""
if self._focused and self._selected_index == 0:
return curses.A_REVERSE
else:
return self.session.get_theme_color_pair(ThemeDefinition.INACTIVE_QUERY)
def _get_bounded_line(self) -> str:
"""
Get a line of spaces that fits within the view bounds.
Returns:
str: A string of spaces matching the view width
"""
return " " * self.bounds.width
def _renderable(self, stdscr: curses.window) -> bool:
"""
Check if the view can be rendered within the current terminal bounds.
Args:
stdscr: The curses window to check dimensions against
Returns:
bool: True if the view can be rendered, False otherwise
"""
terminal_height, terminal_width = stdscr.getmaxyx()
if self.bounds.y >= terminal_height or self.bounds.x >= terminal_width or self.bounds.width <= 0 or self.bounds.height <= 0:
return False
return True
```
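To make the `BaseCursesView` contract concrete (bounded rendering, input either consumed or passed through), here is a minimal hypothetical subclass; `MessageView` is illustrative and, like the real views, assumes an `InteractiveSession` instance for construction:

```python
import curses

from mcp_server_webcrawl.interactive.ui import safe_addstr
from mcp_server_webcrawl.interactive.views.base import BaseCursesView

class MessageView(BaseCursesView):
    """Hypothetical view that renders a single line of text within its bounds."""

    def __init__(self, session, message: str):
        super().__init__(session)
        self.message = message

    def render(self, stdscr: curses.window) -> None:
        # respect the bounds/size checks provided by the base class
        if not self._renderable(stdscr):
            return
        self.draw_inner_header(stdscr, self.bounds, "message")
        max_width = max(0, self.bounds.width - 2)
        safe_addstr(stdscr, self.bounds.y + 1, self.bounds.x + 1,
                self.message[:max_width], curses.A_NORMAL)

    def handle_input(self, key: int) -> bool:
        # consume nothing; let the session route keys to other views
        return False
```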
--------------------------------------------------------------------------------
/src/mcp_server_webcrawl/crawlers/archivebox/tests.py:
--------------------------------------------------------------------------------
```python
from mcp_server_webcrawl.crawlers.archivebox.crawler import ArchiveBoxCrawler
from mcp_server_webcrawl.crawlers.archivebox.adapter import ArchiveBoxManager
from mcp_server_webcrawl.crawlers.base.tests import BaseCrawlerTests
from mcp_server_webcrawl.crawlers import get_fixture_directory
from mcp_server_webcrawl.utils.logger import get_logger
# calculate ids for ArchiveBox working directories using the same hash function as adapter
EXAMPLE_SITE_ID = ArchiveBoxManager.string_to_id("example")
PRAGMAR_SITE_ID = ArchiveBoxManager.string_to_id("pragmar")
logger = get_logger()
class ArchiveBoxTests(BaseCrawlerTests):
"""
Test suite for the ArchiveBox crawler implementation.
Uses wrapped test methods from BaseCrawlerTests adapted for ArchiveBox's multi-instance structure.
"""
def setUp(self):
"""
Set up the test environment with fixture data.
"""
super().setUp()
self._datasrc = get_fixture_directory() / "archivebox"
def test_archivebox_pulse(self):
"""
Test basic crawler initialization.
"""
crawler = ArchiveBoxCrawler(self._datasrc)
self.assertIsNotNone(crawler)
self.assertTrue(self._datasrc.is_dir())
def test_archivebox_sites(self):
"""
Test site retrieval API functionality.
"""
crawler = ArchiveBoxCrawler(self._datasrc)
# should have multiple sites (example and pragmar working directories)
sites_json = crawler.get_sites_api()
self.assertGreaterEqual(sites_json.total, 2, "ArchiveBox should have multiple working directories as sites")
# test pragmar site specifically
self.run_pragmar_site_tests(crawler, PRAGMAR_SITE_ID)
def test_archivebox_search(self):
"""
Test boolean search functionality.
"""
crawler = ArchiveBoxCrawler(self._datasrc)
self.run_pragmar_search_tests(crawler, PRAGMAR_SITE_ID)
def test_pragmar_tokenizer(self):
"""
Test tokenizer search functionality.
"""
crawler = ArchiveBoxCrawler(self._datasrc)
self.run_pragmar_tokenizer_tests(crawler, PRAGMAR_SITE_ID)
def test_archivebox_resources(self):
"""
Test resource retrieval API functionality with various parameters.
"""
crawler = ArchiveBoxCrawler(self._datasrc)
self.run_sites_resources_tests(crawler, PRAGMAR_SITE_ID, EXAMPLE_SITE_ID)
def test_archivebox_images(self):
"""
Test ArchiveBox image handling and thumbnails.
"""
crawler = ArchiveBoxCrawler(self._datasrc)
self.run_pragmar_image_tests(crawler, PRAGMAR_SITE_ID)
def test_archivebox_sorts(self):
"""
Test random sort functionality using the '?' sort parameter.
"""
crawler = ArchiveBoxCrawler(self._datasrc)
self.run_pragmar_sort_tests(crawler, PRAGMAR_SITE_ID)
def test_archivebox_content_parsing(self):
"""
Test content type detection and parsing for ArchiveBox resources.
"""
crawler = ArchiveBoxCrawler(self._datasrc)
self.run_pragmar_content_tests(crawler, PRAGMAR_SITE_ID, False)
def test_archivebox_url_reconstruction(self):
"""
Test URL reconstruction from ArchiveBox metadata.
"""
crawler = ArchiveBoxCrawler(self._datasrc)
url_resources = crawler.get_resources_api(sites=[PRAGMAR_SITE_ID], limit=20)
self.assertGreater(url_resources.total, 0, "Should have resources with reconstructed URLs")
for resource in url_resources._results:
# URLs should be valid HTTP/HTTPS (except for archivebox:// fallbacks)
self.assertTrue(
resource.url.startswith(('http://', 'https://', 'archivebox://')),
f"URL should have valid scheme: {resource.url}"
)
# should not end with index.html (stripped during reconstruction)
self.assertFalse(
resource.url.endswith('/index.html'),
f"URL should not end with index.html: {resource.url}"
)
def test_archivebox_deduplication(self):
"""
Test resource deduplication across timestamped entries.
"""
crawler = ArchiveBoxCrawler(self._datasrc)
# get all resources from pragmar site
all_resources = crawler.get_resources_api(sites=[PRAGMAR_SITE_ID], limit=100)
self.assertGreater(all_resources.total, 0, "Should have resources")
# check for URL uniqueness (deduplication should ensure unique URLs)
urls_found = [r.url for r in all_resources._results]
unique_urls = set(urls_found)
# should have deduplication working (though some URLs might legitimately appear multiple times
# if they're different resources, like different timestamps of the same page)
self.assertLessEqual(len(unique_urls), len(urls_found),
"URL deduplication should work properly")
def test_archivebox_metadata_parsing(self):
"""
Test JSON metadata parsing from ArchiveBox files.
"""
crawler = ArchiveBoxCrawler(self._datasrc)
# get resources with headers from pragmar site
header_resources = crawler.get_resources_api(
sites=[PRAGMAR_SITE_ID],
fields=["headers"],
limit=10
)
if header_resources.total > 0:
headers_found = 0
for resource in header_resources._results:
resource_dict = resource.to_dict()
if "headers" in resource_dict and resource_dict["headers"]:
headers_found += 1
self.assertIn("HTTP/1.0", resource_dict["headers"],
"Headers should contain HTTP status line")
# at least some resources should have parsed headers
self.assertGreater(headers_found, 0, "Should find resources with parsed headers")
def test_archivebox_timestamped_structure(self):
"""
Test handling of ArchiveBox's timestamped entry structure.
"""
crawler = ArchiveBoxCrawler(self._datasrc)
# get resources with timestamps from pragmar site
timestamp_resources = crawler.get_resources_api(
sites=[PRAGMAR_SITE_ID],
fields=["created", "modified"],
limit=10
)
self.assertGreater(timestamp_resources.total, 0, "Should have timestamped resources")
for resource in timestamp_resources._results:
resource_dict = resource.to_dict()
# should have timestamp information
self.assertIsNotNone(resource_dict.get("created"),
"Should have created timestamp from entry directory")
self.assertIsNotNone(resource_dict.get("modified"),
"Should have modified timestamp from entry directory")
def test_archivebox_error_resilience(self):
"""
Test resilience to malformed JSON and missing files.
"""
crawler = ArchiveBoxCrawler(self._datasrc)
# should continue processing despite any JSON parsing errors
all_resources = crawler.get_resources_api(sites=[PRAGMAR_SITE_ID])
# verify we got some resources despite potential errors
self.assertGreater(all_resources.total, 0,
"Should process entries even with JSON parsing errors")
# verify resources have reasonable defaults
for resource in all_resources._results:
self.assertIsNotNone(resource.url, "URL should always be set")
self.assertIsInstance(resource.status, int, "Status should be integer")
self.assertGreaterEqual(resource.status, 0, "Status should be non-negative")
self.assertLessEqual(resource.status, 599, "Status should be valid HTTP status")
def test_archivebox_multi_site(self):
"""
Test that multiple ArchiveBox working directories are treated as separate sites.
"""
crawler = ArchiveBoxCrawler(self._datasrc)
# get resources from each site separately
example_resources = crawler.get_resources_api(sites=[EXAMPLE_SITE_ID], limit=10)
pragmar_resources = crawler.get_resources_api(
query="url: pragmar.com",
sites=[PRAGMAR_SITE_ID],
limit=10)
# print(example_resources.to_dict())
# print(pragmar_resources.to_dict())
# both sites should have resources
self.assertGreater(example_resources.total, 0, "Example site should have resources")
self.assertGreater(pragmar_resources.total, 0, "Pragmar site should have resources")
# URLs should reflect the appropriate domains
example_urls = [r.url for r in example_resources._results]
pragmar_urls = [r.url for r in pragmar_resources._results]
# verify site separation (pragmar resources should be about pragmar.com)
pragmar_domain_urls = [url for url in pragmar_urls if "pragmar.com" in url]
self.assertGreater(len(pragmar_domain_urls), 0,
"Pragmar site should contain pragmar.com URLs")
def test_report(self):
"""
Run test report for ArchiveBox archive.
"""
crawler = ArchiveBoxCrawler(self._datasrc)
# generate report using pragmar site ID
report = self.run_pragmar_report(crawler, PRAGMAR_SITE_ID, "ArchiveBox")
logger.info(report)
# basic validation that report contains expected content
self.assertIn("ArchiveBox", report, "Report should mention ArchiveBox")
self.assertIn("Total pages:", report, "Report should show page counts")
```
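The suite runs under normal unittest discovery; it can also be driven programmatically, as sketched below, assuming the package and its bundled fixture directory are available:

```python
import unittest

from mcp_server_webcrawl.crawlers.archivebox.tests import ArchiveBoxTests

# build and run only the ArchiveBox suite, with verbose output
suite = unittest.TestLoader().loadTestsFromTestCase(ArchiveBoxTests)
unittest.TextTestRunner(verbosity=2).run(suite)
```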
--------------------------------------------------------------------------------
/docs/index.html:
--------------------------------------------------------------------------------
```html
<!DOCTYPE html>
<html class="writer-html5" lang="en" data-content_root="./">
<head>
<meta charset="utf-8" /><meta name="viewport" content="width=device-width, initial-scale=1" />
<meta name="viewport" content="width=device-width, initial-scale=1.0" />
<title>mcp-server-webcrawl — mcp-server-webcrawl documentation</title>
<link rel="stylesheet" type="text/css" href="_static/pygments.css?v=80d5e7a1" />
<link rel="stylesheet" type="text/css" href="_static/css/theme.css?v=e59714d7" />
<script src="_static/jquery.js?v=5d32c60e"></script>
<script src="_static/_sphinx_javascript_frameworks_compat.js?v=2cd50e6c"></script>
<script src="_static/documentation_options.js?v=5929fcd5"></script>
<script src="_static/doctools.js?v=888ff710"></script>
<script src="_static/sphinx_highlight.js?v=dc90522c"></script>
<script src="_static/js/theme.js"></script>
<link rel="index" title="Index" href="genindex.html" />
<link rel="search" title="Search" href="search.html" />
<link rel="next" title="Installation" href="installation.html" />
</head>
<body class="wy-body-for-nav">
<div class="wy-grid-for-nav">
<nav data-toggle="wy-nav-shift" class="wy-nav-side">
<div class="wy-side-scroll">
<div class="wy-side-nav-search" >
<a href="#" class="icon icon-home">
mcp-server-webcrawl
</a>
<div role="search">
<form id="rtd-search-form" class="wy-form" action="search.html" method="get">
<input type="text" name="q" placeholder="Search docs" aria-label="Search docs" />
<input type="hidden" name="check_keywords" value="yes" />
<input type="hidden" name="area" value="default" />
</form>
</div>
</div><div class="wy-menu wy-menu-vertical" data-spy="affix" role="navigation" aria-label="Navigation menu">
<p class="caption" role="heading"><span class="caption-text">Contents:</span></p>
<ul>
<li class="toctree-l1"><a class="reference internal" href="installation.html">Installation</a></li>
<li class="toctree-l1"><a class="reference internal" href="guides.html">Setup Guides</a></li>
<li class="toctree-l1"><a class="reference internal" href="usage.html">Usage</a></li>
<li class="toctree-l1"><a class="reference internal" href="prompts.html">Prompt Routines</a></li>
<li class="toctree-l1"><a class="reference internal" href="interactive.html">Interactive Mode</a></li>
<li class="toctree-l1"><a class="reference internal" href="modules.html">mcp_server_webcrawl</a></li>
</ul>
</div>
</div>
</nav>
<section data-toggle="wy-nav-shift" class="wy-nav-content-wrap"><nav class="wy-nav-top" aria-label="Mobile navigation menu" >
<i data-toggle="wy-nav-top" class="fa fa-bars"></i>
<a href="#">mcp-server-webcrawl</a>
</nav>
<div class="wy-nav-content">
<div class="rst-content">
<div role="navigation" aria-label="Page navigation">
<ul class="wy-breadcrumbs">
<li><a href="#" class="icon icon-home" aria-label="Home"></a></li>
<li class="breadcrumb-item active">mcp-server-webcrawl</li>
<li class="wy-breadcrumbs-aside">
<a href="_sources/index.rst.txt" rel="nofollow"> View page source</a>
</li>
</ul>
<hr/>
</div>
<div role="main" class="document" itemscope="itemscope" itemtype="http://schema.org/Article">
<div itemprop="articleBody">
<a class="reference internal image-reference" href="_images/mcpswc.svg"><img alt="mcp-server-webcrawl heading" class="align-center" src="_images/mcpswc.svg" width="100%" /></a>
<div style="text-align: center; margin-bottom: 2em;">
<a href="https://pragmar.com/mcp-server-webcrawl/" style="margin: 0 4px;">Website</a> |
<a href="https://github.com/pragmar/mcp-server-webcrawl" style="margin: 0 4px;">Github</a> |
<a href="https://pragmar.github.io/mcp-server-webcrawl/" style="margin: 0 4px;">Docs</a> |
<a href="https://pypi.org/project/mcp-server-webcrawl/" style="margin: 0 4px;">PyPi</a>
</div><section id="mcp-server-webcrawl">
<h1>mcp-server-webcrawl<a class="headerlink" href="#mcp-server-webcrawl" title="Link to this heading"></a></h1>
<p>Advanced search and retrieval for web crawler data. With <strong>mcp-server-webcrawl</strong>, your AI client filters
and analyzes web content under your direction or autonomously. The server includes a full-text search
interface with boolean support, and resource filtering by type, HTTP status, and more.</p>
<p><strong>mcp-server-webcrawl</strong> provides the LLM a complete menu with which to search your web content, and works with
a variety of web crawlers:</p>
<table class="docutils align-default" id="id7">
<caption><span class="caption-text">Supported Crawlers</span><a class="headerlink" href="#id7" title="Link to this table"></a></caption>
<colgroup>
<col style="width: 30.0%" />
<col style="width: 50.0%" />
<col style="width: 20.0%" />
</colgroup>
<thead>
<tr class="row-odd"><th class="head"><p>Crawler/Format</p></th>
<th class="head"><p>Description</p></th>
<th class="head"><p>Setup Guide</p></th>
</tr>
</thead>
<tbody>
<tr class="row-even"><td><p><a class="reference external" href="https://archivebox.io">ArchiveBox</a></p></td>
<td><p>Self-hosted web archiving tool</p></td>
<td><p><a class="reference external" href="https://pragmar.github.io/mcp-server-webcrawl/guides/archivebox.html">Setup Guide</a></p></td>
</tr>
<tr class="row-odd"><td><p><a class="reference external" href="https://www.httrack.com">HTTrack</a></p></td>
<td><p>GUI/CLI website mirroring tool</p></td>
<td><p><a class="reference external" href="https://pragmar.github.io/mcp-server-webcrawl/guides/httrack.html">Setup Guide</a></p></td>
</tr>
<tr class="row-even"><td><p><a class="reference external" href="https://interro.bot">InterroBot</a></p></td>
<td><p>GUI crawler and analyzer</p></td>
<td><p><a class="reference external" href="https://pragmar.github.io/mcp-server-webcrawl/guides/interrobot.html">Setup Guide</a></p></td>
</tr>
<tr class="row-odd"><td><p><a class="reference external" href="https://github.com/projectdiscovery/katana">Katana</a></p></td>
<td><p>CLI security-focused crawler</p></td>
<td><p><a class="reference external" href="https://pragmar.github.io/mcp-server-webcrawl/guides/katana.html">Setup Guide</a></p></td>
</tr>
<tr class="row-even"><td><p><a class="reference external" href="https://crawler.siteone.io">SiteOne</a></p></td>
<td><p>GUI crawler and analyzer</p></td>
<td><p><a class="reference external" href="https://pragmar.github.io/mcp-server-webcrawl/guides/siteone.html">Setup Guide</a></p></td>
</tr>
<tr class="row-odd"><td><p><a class="reference external" href="https://en.wikipedia.org/wiki/WARC_(file_format)">WARC</a></p></td>
<td><p>Standard web archive format</p></td>
<td><p><a class="reference external" href="https://pragmar.github.io/mcp-server-webcrawl/guides/warc.html">Setup Guide</a></p></td>
</tr>
<tr class="row-even"><td><p><a class="reference external" href="https://en.wikipedia.org/wiki/Wget">wget</a></p></td>
<td><p>CLI website mirroring tool</p></td>
<td><p><a class="reference external" href="https://pragmar.github.io/mcp-server-webcrawl/guides/wget.html">Setup Guide</a></p></td>
</tr>
</tbody>
</table>
<p><strong>mcp-server-webcrawl</strong> is free and open source, and requires Claude Desktop, Python (>=3.10). It is installed on the command line, via pip install:</p>
<div class="highlight-bash notranslate"><div class="highlight"><pre><span></span>pip<span class="w"> </span>install<span class="w"> </span>mcp-server-webcrawl
</pre></div>
</div>
<iframe width="560" height="315" style="display: block;margin-bottom:1rem;" src="https://www.youtube.com/embed/Sid-GBxII1o" frameborder="0" allowfullscreen></iframe><div class="toctree-wrapper compound">
<p class="caption" role="heading"><span class="caption-text">Contents:</span></p>
<ul>
<li class="toctree-l1"><a class="reference internal" href="installation.html">Installation</a></li>
<li class="toctree-l1"><a class="reference internal" href="guides.html">Setup Guides</a></li>
<li class="toctree-l1"><a class="reference internal" href="usage.html">Usage</a></li>
<li class="toctree-l1"><a class="reference internal" href="prompts.html">Prompt Routines</a></li>
<li class="toctree-l1"><a class="reference internal" href="interactive.html">Interactive Mode</a></li>
<li class="toctree-l1"><a class="reference internal" href="modules.html">mcp_server_webcrawl</a></li>
</ul>
</div>
<section id="indices-and-tables">
<h2>Indices and tables<a class="headerlink" href="#indices-and-tables" title="Link to this heading"></a></h2>
<ul class="simple">
<li><p><a class="reference internal" href="genindex.html"><span class="std std-ref">Index</span></a></p></li>
<li><p><a class="reference internal" href="py-modindex.html"><span class="std std-ref">Module Index</span></a></p></li>
<li><p><a class="reference internal" href="search.html"><span class="std std-ref">Search Page</span></a></p></li>
</ul>
</section>
</section>
</div>
</div>
<footer><div class="rst-footer-buttons" role="navigation" aria-label="Footer">
<a href="installation.html" class="btn btn-neutral float-right" title="Installation" accesskey="n" rel="next">Next <span class="fa fa-arrow-circle-right" aria-hidden="true"></span></a>
</div>
<hr/>
<div role="contentinfo">
<p>© Copyright 2025, pragmar.</p>
</div>
Built with <a href="https://www.sphinx-doc.org/">Sphinx</a> using a
<a href="https://github.com/readthedocs/sphinx_rtd_theme">theme</a>
provided by <a href="https://readthedocs.org">Read the Docs</a>.
</footer>
</div>
</div>
</section>
</div>
<script>
jQuery(function () {
SphinxRtdTheme.Navigation.enable(true);
});
</script>
</body>
</html>
```
--------------------------------------------------------------------------------
/src/mcp_server_webcrawl/crawlers/warc/adapter.py:
--------------------------------------------------------------------------------
```python
import email.utils
import os
import sqlite3
import warcio
from contextlib import closing
from datetime import datetime, timezone
from pathlib import Path
from typing import Final
from warcio.recordloader import ArcWarcRecord
from mcp_server_webcrawl.crawlers.base.adapter import (
IndexState,
IndexStatus,
SitesGroup,
INDEXED_BATCH_SIZE,
INDEXED_WARC_EXTENSIONS,
)
from mcp_server_webcrawl.crawlers.base.indexed import IndexedManager
from mcp_server_webcrawl.models.resources import (
ResourceResult,
ResourceResultType,
RESOURCES_LIMIT_DEFAULT,
)
from mcp_server_webcrawl.models.sites import (
SiteResult,
SiteType,
SITES_FIELDS_DEFAULT,
SITES_FIELDS_BASE,
)
from mcp_server_webcrawl.utils.logger import get_logger
logger = get_logger()
class WarcManager(IndexedManager):
"""
Manages WARC file data in in-memory SQLite databases.
Provides connection pooling and caching for efficient access.
"""
def __init__(self) -> None:
"""Initialize the WARC manager with empty cache and statistics."""
super().__init__()
def _load_site_data(self, connection: sqlite3.Connection, warc_path: Path,
site_id: int, index_state: IndexState = None) -> None:
"""
Load a WARC file into the database with batch processing for better performance.
Args:
connection: SQLite connection
warc_path: path to the WARC file
site_id: ID for the site
index_state: IndexState object for tracking progress
"""
if not warc_path.exists() or not warc_path.is_file():
logger.error(f"WARC file not found or not a file: {warc_path}")
return
with closing(connection.cursor()) as cursor:
if index_state is not None:
index_state.set_status(IndexStatus.INDEXING)
try:
batch_insert_resource_results: list[ResourceResult] = []
batch_count: int = 0
with open(warc_path, "rb") as stream:
for warc_record in warcio.ArchiveIterator(stream):
if index_state is not None and index_state.is_timeout():
index_state.set_status(IndexStatus.PARTIAL)
# commit current batch and shut it down
if batch_insert_resource_results:
self._execute_batch_insert(connection, cursor, batch_insert_resource_results)
return
if warc_record is not None and warc_record.rec_type == "response":
resource_result: ResourceResult = self._prepare_warc_record(warc_record, site_id)
if resource_result:
batch_insert_resource_results.append(resource_result)
if index_state is not None:
index_state.increment_processed()
batch_count += 1
if batch_count >= INDEXED_BATCH_SIZE:
self._execute_batch_insert(connection, cursor, batch_insert_resource_results)
batch_insert_resource_results = []
batch_count = 0
# batch insert remaining
if batch_insert_resource_results:
self._execute_batch_insert(connection, cursor, batch_insert_resource_results)
if index_state is not None and index_state.status == IndexStatus.INDEXING:
index_state.set_status(IndexStatus.COMPLETE)
except Exception as ex:
logger.error(f"Error processing WARC file {warc_path}: {ex}")
if index_state is not None:
index_state.set_status(IndexStatus.FAILED)
def _prepare_warc_record(self, record: ArcWarcRecord, site_id: int) -> ResourceResult | None:
"""
Prepare a WARC record for batch insertion.
Args:
record: a warcio record object
site_id: ID for the site
Returns:
ResourceResult ready for batch insertion, or None if processing fails
"""
try:
url: str = record.rec_headers.get_header("WARC-Target-URI")
content_type: str = record.http_headers.get_header("Content-Type", "")
status: int = int(record.http_headers.get_statuscode()) or 200
resource_type: ResourceResultType = self._determine_resource_type(content_type)
content: bytes = record.content_stream().read()
content_size: int = len(content)
if self._is_text_content(content_type):
try:
content_str: str = content.decode("utf-8")
except UnicodeDecodeError:
content_str = None
else:
content_str = None
warc_date = record.rec_headers.get_header("WARC-Date")
if warc_date:
try:
file_created = datetime.fromisoformat(warc_date.replace('Z', '+00:00'))
except ValueError:
# Fallback to email date parser
try:
time_tuple = email.utils.parsedate_tz(warc_date)
file_created = datetime.fromtimestamp(email.utils.mktime_tz(time_tuple), tz=timezone.utc)
except (ValueError, TypeError):
file_created = datetime.now(timezone.utc)
else:
file_created = None # don't pretend it is now, ResourceResult can survive
file_modified = file_created # like file stat indexes, these are equivalent
result = ResourceResult(
id=IndexedManager.string_to_id(url),
site=site_id,
created=file_created,
modified=file_modified,
url=url,
type=resource_type,
status=status,
headers=record.http_headers.to_str(),
content=content_str,
size=content_size,
time=0 # time not available
)
return result
except Exception as ex:
logger.error(f"Error processing WARC record for URL {url if 'url' in locals() else 'unknown'}: {ex}")
return None
manager: WarcManager = WarcManager()
def get_sites(
datasrc: Path,
ids: list[int] | None = None,
fields: list[str] | None = None
) -> list[SiteResult]:
"""
List WARC files in the datasrc directory as sites.
Args:
datasrc: path to the directory containing WARC files
ids: optional list of site IDs to filter by
fields: list of fields to include in the response
Returns:
List of SiteResult objects, one for each WARC file
"""
assert datasrc is not None, f"datasrc not provided ({datasrc})"
# nothing can be done, but don't crash the server either, keep chugging along
if not datasrc.exists():
logger.error(f"Directory not found ({datasrc})")
return []
# determine which fields to include
selected_fields: set[str] = set(SITES_FIELDS_BASE)
if fields:
valid_fields: set[str] = set(SITES_FIELDS_DEFAULT)
selected_fields.update(f for f in fields if f in valid_fields)
else:
selected_fields.update(SITES_FIELDS_DEFAULT)
results: list[SiteResult] = []
files_to_check: list[Path] = []
for ext in INDEXED_WARC_EXTENSIONS:
files_to_check.extend(datasrc.glob(f"*{ext}"))
# map of file_id -> file_path for filtering
file_id_map: dict[int, Path] = {WarcManager.string_to_id(str(os.path.basename(f))): f for f in files_to_check if f is not None}
if ids:
file_id_map = {id_val: path for id_val, path in file_id_map.items() if id_val in ids}
for site_id, file_path in sorted(file_id_map.items()):
file_stat = file_path.stat()
created_time: datetime = datetime.fromtimestamp(file_stat.st_ctime)
modified_time: datetime = datetime.fromtimestamp(file_stat.st_mtime)
site: SiteResult = SiteResult(
path=file_path,
id=site_id,
            name=file_path.name,
            type=SiteType.CRAWLED_URL,  # each WARC file is treated as a single-site crawl
            urls=[str(file_path.absolute())],  # the file path stands in for the site URL
created=created_time if "created" in selected_fields else None,
modified=modified_time if "modified" in selected_fields else None,
)
results.append(site)
return results
def get_resources(
datasrc: Path,
sites: list[int] | None = None,
query: str = "",
fields: list[str] | None = None,
sort: str | None = None,
limit: int = RESOURCES_LIMIT_DEFAULT,
offset: int = 0,
) -> tuple[list[ResourceResult], int, IndexState]:
"""
    Get resources from WARC files using in-memory SQLite.
    Args:
        datasrc: path to the directory containing WARC files
sites: optional list of site IDs to filter by
query: search query string
fields: optional list of fields to include in response
sort: sort order for results
limit: maximum number of results to return
offset: number of results to skip for pagination
Returns:
        Tuple of (list of ResourceResult objects, total count, index state)
"""
sites_results: list[SiteResult] = get_sites(datasrc=datasrc, ids=sites)
assert sites_results, "At least one site is required to search"
site_paths = [site.path for site in sites_results]
sites_group = SitesGroup(datasrc, sites, site_paths)
return manager.get_resources_for_sites_group(sites_group, query, fields, sort, limit, offset)
```
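A minimal usage sketch of the adapter's module-level API, assuming it is importable as `mcp_server_webcrawl.crawlers.warc.adapter` and that the WARC directory path below is hypothetical:
```python
# minimal sketch, not part of the package; the datasrc path is hypothetical
from pathlib import Path
from mcp_server_webcrawl.crawlers.warc.adapter import get_sites, get_resources

datasrc = Path("/path/to/warcs")
sites = get_sites(datasrc)  # one SiteResult per WARC file found in the directory
results, total, index_state = get_resources(
    datasrc,
    sites=[site.id for site in sites],
    query="type: html",
    limit=10,
)
print(f"{total} resources indexed, showing {len(results)}")
```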
--------------------------------------------------------------------------------
/src/mcp_server_webcrawl/interactive/search.py:
--------------------------------------------------------------------------------
```python
import hashlib
import threading
from concurrent.futures import ThreadPoolExecutor, Future
from datetime import datetime
from typing import Optional, TYPE_CHECKING
from mcp_server_webcrawl.crawlers.base.crawler import BaseJsonApi
from mcp_server_webcrawl.interactive.ui import UiFocusable, UiState
from mcp_server_webcrawl.models.resources import ResourceResult
if TYPE_CHECKING:
from mcp_server_webcrawl.interactive.session import InteractiveSession
SEARCH_DEBOUNCE_DELAY_SECONDS = 0.2
SEARCH_RESULT_LIMIT: int = 10
class SearchManager:
"""
Manages search operations including async search and debouncing.
Works with session's controlled interface - never touches private state directly.
"""
def __init__(self, session: 'InteractiveSession'):
self.__session: 'InteractiveSession' = session
self.__search_last_state_hash: str = ""
self.__search_timer: Optional[threading.Timer] = None
self.__executor = ThreadPoolExecutor(max_workers=2, thread_name_prefix="SearchManager")
self.__search_lock: threading.RLock = threading.RLock()
self.__search_in_progress: bool = False
self.__active_search_future: Optional[Future] = None
self.__pending_results: Optional[list[ResourceResult]] = None
self.__pending_indexer_status: str = ""
self.__pending_indexer_processed: int = 0
self.__pending_indexer_duration: float = 0
self.__pending_total: int = 0
def autosearch(self, immediate: bool = False) -> None:
"""
Trigger search with optional immediate execution.
Args:
immediate: If True, execute search synchronously without debouncing.
If False, use debounced async execution (default).
"""
current_state_hash: str = self.__get_input_hash()
if not immediate and current_state_hash == self.__search_last_state_hash:
return
self.__search_last_state_hash = current_state_hash
self.cancel_pending()
if immediate:
self.__execute_search_immediate()
else:
self.__search_timer = threading.Timer(SEARCH_DEBOUNCE_DELAY_SECONDS, self.__execute_debounced_search)
self.__search_timer.start()
def cancel_pending(self) -> None:
"""
Cancel any pending search timer.
"""
if self.__search_timer is not None:
self.__search_timer.cancel()
self.__search_timer = None
with self.__search_lock:
if self.__active_search_future is not None:
self.__active_search_future.cancel()
self.__active_search_future = None
def check_pending(self) -> None:
"""
Check if there are pending search results and update the UI.
"""
with self.__search_lock:
if self.__pending_results is not None:
self.__session.results.update(self.__pending_results, self.__pending_total, self.__pending_indexer_status,
self.__pending_indexer_processed, self.__pending_indexer_duration)
self.__pending_results = None
self.__pending_total = 0
self.__pending_indexer_processed = 0
self.__pending_indexer_duration = 0
def cleanup(self) -> None:
"""
Clean up any pending operations.
"""
self.cancel_pending()
self.__executor.shutdown(wait=True)
def has_pending(self) -> bool:
"""
Check if there's a pending debounced search.
"""
return self.__search_timer is not None
def is_searching(self) -> bool:
"""
Check if a search is currently in progress or on a timer.
"""
with self.__search_lock:
return self.__search_in_progress or self.__search_timer is not None
def __background_search(self) -> None:
"""
Execute search in background thread and store results.
"""
with self.__search_lock:
self.__search_in_progress = True
self.__session.searchform.set_search_attempted()
results, total_results, index_status, index_processed_count, index_duration_value = self.__execute_search_query()
self.__set_pending_results(results, total_results, index_status, index_processed_count, index_duration_value, False)
def __build_search_query(self, base_query: str) -> str:
"""
Build the final search query with filter applied (if present).
"""
if self.__session.searchform.filter == "html":
if base_query.strip():
return f"(type: html) AND {base_query}"
else:
return "type: html"
else:
return base_query
def __execute_debounced_search(self) -> None:
"""
Execute search after debounce delay in separate thread.
"""
current_state_hash: str = self.__get_input_hash()
if current_state_hash != self.__search_last_state_hash:
return
# show split view on results
if self.__session.ui_focused == UiFocusable.SEARCH_RESULTS:
self.__session.set_ui_state(UiState.SEARCH_RESULTS, UiFocusable.SEARCH_RESULTS)
else:
self.__session.set_ui_state(UiState.SEARCH_RESULTS, UiFocusable.SEARCH_FORM)
self.__search_timer = None
with self.__search_lock:
self.__active_search_future = self.__executor.submit(self.__background_search)
def __execute_search_immediate(self) -> None:
"""
Execute search immediately on main thread (for ENTER key).
"""
self.__session.searchform.set_search_attempted()
self.__set_pending_results(None, 0, "", -1, -1, False)
self.__session.results.clear()
results, total_results, index_status, index_processed_count, index_duration_value = self.__execute_search_query()
self.__set_pending_results(results, total_results, index_status, index_processed_count, index_duration_value, False)
def __execute_search_query(self) -> tuple[list[ResourceResult], int, str, int, float]:
"""
Centralized search execution logic shared by both sync and async paths.
Returns:
tuple: (results, total_results, index_status, index_processed_count, index_duration_value)
"""
api: BaseJsonApi | None = self.__get_results(offset=self.__session.searchform.offset)
if api is None:
            return [], 0, "", -1, -1  # no API response: empty results, default index metadata
results: list[ResourceResult] = api.get_results()
total_results: int = api.total
index_status: str = ""
index_processed_count: int = -1
index_duration_value: float = -1
if api.meta_index is not None:
if "status" in api.meta_index:
index_status = api.meta_index["status"]
if "processed" in api.meta_index:
index_processed_count = api.meta_index["processed"]
if "duration" in api.meta_index:
index_duration_string: str = api.meta_index["duration"] or ""
if index_duration_string:
try:
dt: datetime = datetime.strptime(index_duration_string, "%H:%M:%S.%f")
index_duration_value = dt.hour * 3600 + dt.minute * 60 + dt.second + dt.microsecond / 1000000
except ValueError:
index_duration_value = 0
return results, total_results, index_status, index_processed_count, index_duration_value
def __get_input_hash(self) -> str:
"""
Generate a hash representing the complete current search state.
"""
query: str = self.__session.searchform.query.strip()
selected_sites = self.__session.searchform.get_selected_sites()
selected_sites_ids: list[int] = [s.id for s in selected_sites]
filter: str = str(self.__session.searchform.filter)
sort: str = str(self.__session.searchform.sort)
offset: int = self.__session.searchform.offset
limit: int = self.__session.searchform.limit
search_state: str = f"{query}|{selected_sites_ids}|{filter}|{offset}|{limit}|{sort}"
return hashlib.md5(search_state.encode()).hexdigest()
def __get_results(self, offset: int = 0) -> BaseJsonApi | None:
"""
Execute search with given offset and return API response object.
Centralizes the API call logic used by both sync and async search paths.
Args:
offset: Starting position for search results pagination
Returns:
BaseJsonApi: API response object containing search results and metadata
"""
selected_site_ids: list[int] = self.__get_selected_site_ids()
query: str = self.__build_search_query(self.__session.searchform.query)
sort: str = self.__session.searchform.sort
query_api: BaseJsonApi = self.__session.crawler.get_resources_api(
sites=selected_site_ids if selected_site_ids else None,
query=query,
fields=["size", "status"],
offset=offset,
limit=SEARCH_RESULT_LIMIT,
extras=["snippets"],
sort=sort
)
return query_api
def __get_selected_site_ids(self) -> list[int]:
"""
Get list of selected site IDs using property access.
"""
selected_sites = self.__session.searchform.get_selected_sites()
return [site.id for site in selected_sites]
def __set_pending_results(self, results, total_results, index_status, index_processed_count, index_duration_value, search_in_progress) -> None:
try:
with self.__search_lock:
self.__pending_results = results
self.__pending_total = total_results
self.__pending_indexer_status = index_status
self.__pending_indexer_processed = index_processed_count
self.__pending_indexer_duration = index_duration_value
self.__search_in_progress = search_in_progress
except Exception as ex:
with self.__search_lock:
self.__session.results.clear()
self.__search_in_progress = False
```
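The `SearchManager` above combines a debounce timer with an MD5 hash of the search state, so unchanged input never re-triggers a query. A standalone sketch of that pattern, using hypothetical names (`Debouncer`, `run_search`) that are not part of the package:
```python
# illustrative sketch of the debounce-by-state-hash pattern; names are hypothetical
import hashlib
import threading

DEBOUNCE_SECONDS = 0.2

class Debouncer:
    def __init__(self, run_search):
        self._run_search = run_search          # callable(query, offset)
        self._timer: threading.Timer | None = None
        self._last_hash: str = ""

    def trigger(self, query: str, offset: int) -> None:
        state_hash = hashlib.md5(f"{query}|{offset}".encode()).hexdigest()
        if state_hash == self._last_hash:
            return                             # nothing changed, keep current results
        self._last_hash = state_hash
        if self._timer is not None:
            self._timer.cancel()               # restart the debounce window
        self._timer = threading.Timer(DEBOUNCE_SECONDS, self._run_search, args=(query, offset))
        self._timer.start()
```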
--------------------------------------------------------------------------------
/src/mcp_server_webcrawl/extras/thumbnails.py:
--------------------------------------------------------------------------------
```python
import os
import aiohttp
import asyncio
import base64
import concurrent
import hashlib
import io
import re
import threading
import traceback
from datetime import datetime, timedelta
from pathlib import Path
from urllib.parse import ParseResult, urlparse
from PIL import Image
from mcp_server_webcrawl.settings import DATA_DIRECTORY
from mcp_server_webcrawl.utils.logger import get_logger
HTTP_THREADS: int = 8
ALLOWED_THUMBNAIL_TYPES = {".jpg", ".jpeg", ".png", ".gif", ".bmp", ".webp"}
MAX_THUMBNAIL_BYTES = 2 * 1024 * 1024 # 2MB cap
logger = get_logger()
class ThumbnailManager:
"""
Manages thumbnail generation and caching for image files and URLs.
"""
def __init__(self):
DATA_DIRECTORY.mkdir(parents=True, exist_ok=True)
assert DATA_DIRECTORY.is_dir(), f"DATA_DIRECTORY {DATA_DIRECTORY} is not a directory"
self.__temp_directory: Path = DATA_DIRECTORY / "thumb"
if not self.__temp_directory.is_dir():
self.__temp_directory.mkdir(parents=True, exist_ok=True)
os.chmod(self.__temp_directory, 0o700)
def __md5(self, path: str) -> str:
return hashlib.md5(path.encode()).hexdigest()
def __is_valid_url(self, path: str) -> tuple[bool, ParseResult | None]:
try:
result = urlparse(path)
return all([result.scheme, result.netloc]), result
        except Exception:
return False, None
def __is_valid_file(self, path: str) -> bool:
return Path(path).is_file()
def __get_temp_file(self, key: str) -> Path:
return self.__temp_directory / f"{key}.webp"
def __get_extension(self, path: str) -> str | None:
ext = Path(path).suffix.lower()
if ext:
return ext
# try to parse extension from the path
is_valid, parsed = self.__is_valid_url(path)
if is_valid:
path_parts = parsed.path.split("/")
if path_parts:
last_part = path_parts[-1]
if "." in last_part:
return "." + last_part.split(".")[-1].lower()
return None
def __is_allowed_type(self, path: str) -> bool:
ext = self.__get_extension(path)
return ext in ALLOWED_THUMBNAIL_TYPES if ext else False
def __clean_thumbs_directory(self):
try:
md5_pattern: re.Pattern = re.compile(r"^[0-9a-f]{32}$")
            cutoff_time: datetime = datetime.now() - timedelta(hours=4)
deleted_count: int = 0
for file_path in self.__temp_directory.glob("*"):
if not file_path.is_file():
continue
                if not md5_pattern.match(file_path.stem):  # cached thumbnails are saved as <md5>.webp
continue
file_mtime = datetime.fromtimestamp(file_path.stat().st_mtime)
if file_mtime < cutoff_time:
file_path.unlink()
deleted_count += 1
logger.info(f"Temporary file cleanup complete: {deleted_count} files deleted")
except Exception as ex:
logger.error(
f"Error during temporary file cleanup: {str(ex)}\n{traceback.format_exc()}"
)
def __check_content_length(self, headers) -> bool:
"""Helper to check if content length is acceptable"""
if "Content-Length" in headers:
content_length = int(headers["Content-Length"])
if content_length > MAX_THUMBNAIL_BYTES:
logger.info(
f"Skipping large file ({content_length} bytes > "
f"{MAX_THUMBNAIL_BYTES} bytes)"
)
return False
return True
async def __fetch_url(
self, session: aiohttp.ClientSession, url: str, key: str
) -> str | None:
temp_file = self.__get_temp_file(key)
try:
# check HEAD to get Content-Length without downloading
async with session.head(url, timeout=1, allow_redirects=True) as head_response:
if head_response.status == 200 and not self.__check_content_length(head_response.headers):
return None
async with session.get(url, timeout=2) as response:
if response.status != 200:
return None
if not self.__check_content_length(response.headers):
return None
# stream the content with a size limit
content = bytearray()
chunk_size = 8192 # 8KB chunks
total_size = 0
async for chunk in response.content.iter_chunked(chunk_size):
total_size += len(chunk)
if total_size > MAX_THUMBNAIL_BYTES:
logger.info(
f"Download exceeded size limit of {MAX_THUMBNAIL_BYTES} bytes "
f"while streaming"
)
return None
content.extend(chunk)
return self.__process_image_data(bytes(content), temp_file)
except (aiohttp.ClientError, asyncio.TimeoutError) as ex:
# http is the wild west, keep chugging
logger.debug(f"HTTP error: {str(ex)}")
return None
def __process_image_data(self, data: bytes, temp_file: Path) -> str | None:
"""Process image data, save to temp file, and return base64 encoding"""
thumbnail = self.__create_webp_thumbnail(data)
if thumbnail is not None:
temp_file.write_bytes(thumbnail)
return base64.b64encode(thumbnail).decode("utf-8")
return None
async def __get_file(self, path: str, key: str) -> str | None:
try:
file_path = Path(path)
content = file_path.read_bytes()
temp_file = self.__get_temp_file(key)
return self.__process_image_data(content, temp_file)
except Exception as ex:
logger.debug(f"File error: {str(ex)}")
return None
async def __process_path(
self,
session: aiohttp.ClientSession,
path: str,
results: dict[str, str | None],
metrics: dict[str, int]
) -> None:
key: str = self.__md5(path)
temp_file: Path = self.__get_temp_file(key)
is_valid_url, _ = self.__is_valid_url(path)
valid_file: bool = self.__is_valid_file(path)
if not (is_valid_url or valid_file) or not self.__is_allowed_type(path):
return
# cache hit
if temp_file.exists():
content: bytes = temp_file.read_bytes()
results[path] = base64.b64encode(content).decode("utf-8")
metrics["total_cached"] += 1
return
result: str | None = await self.__fetch_url(session, path, key) if is_valid_url else await self.__get_file(path, key)
results[path] = result
if result is None:
metrics["total_errors"] += 1
else:
metrics["total_returned"] += 1
async def __get_blobs_async(self, paths: list[str]) -> dict[str, str | None]:
results = {path: None for path in paths}
metrics = {
"total_requested": len(paths),
"total_returned": 0,
"total_errors": 0,
"total_cached": 0
}
async with aiohttp.ClientSession() as session:
# Process tasks in batches of HTTP_THREADS
for i in range(0, len(paths), HTTP_THREADS):
batch_paths = paths[i:i + HTTP_THREADS]
batch_tasks = [
self.__process_path(session, path, results, metrics)
for path in batch_paths
]
await asyncio.gather(*batch_tasks)
logger.info(
f"Found {metrics['total_requested']}, fetched {metrics['total_returned']} "
f"({metrics['total_errors']} errors, {metrics['total_cached']} cached)"
)
return results
def __create_webp_thumbnail(self, image_data: bytes, size: int = 512) -> bytes | None:
img = None
try:
img = Image.open(io.BytesIO(image_data))
width, height = img.size
max_dimension = max(width, height)
if max_dimension > size:
if width > height:
new_width = size
new_height = int(height * (new_width / width))
else:
new_height = size
new_width = int(width * (new_height / height))
img = img.resize((new_width, new_height), Image.LANCZOS)
output = io.BytesIO()
img.save(
output,
format="WEBP",
quality=20,
optimize=True,
method=6 # highest compression
)
return output.getvalue()
except Exception as ex:
logger.error(f"Error creating WebP thumbnail: {str(ex)}\n{traceback.format_exc()}")
return None
finally:
if img is not None:
img.close()
def get_thumbnails(self, paths: list[str]) -> dict[str, str | None]:
"""
Convert URLs or file paths to base64 encoded strings.
Args:
paths: List of URLs or file paths to convert
Returns:
Dictionary mapping paths to their base64 representation or None if failed
"""
assert paths is not None, "paths must be a list[str]"
def run_in_thread():
loop = asyncio.new_event_loop()
asyncio.set_event_loop(loop)
try:
return loop.run_until_complete(self.__get_blobs_async(paths))
finally:
loop.close()
try:
with concurrent.futures.ThreadPoolExecutor() as executor:
future = executor.submit(run_in_thread)
results = future.result(timeout=5)
# start cleanup in a background thread
cleanup_thread = threading.Thread(target=self.__clean_thumbs_directory)
cleanup_thread.daemon = True
cleanup_thread.start()
return results
except Exception as ex:
logger.error(f"Error fetching thumbnails: {ex}\n{traceback.format_exc()}")
return {path: None for path in paths}
```
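Typical use of `ThumbnailManager` is a single call to `get_thumbnails()`; the event loop, caching, and cleanup are handled internally. A minimal sketch, with a hypothetical image path:
```python
# minimal usage sketch; the image path is hypothetical
from mcp_server_webcrawl.extras.thumbnails import ThumbnailManager

manager = ThumbnailManager()
thumbs = manager.get_thumbnails(["/path/to/photo.jpg"])
for path, blob in thumbs.items():
    # blob is a base64-encoded WebP thumbnail, or None if the fetch/convert failed
    print(path, "ok" if blob else "failed")
```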
--------------------------------------------------------------------------------
/prompts/auditseo.md:
--------------------------------------------------------------------------------
```markdown
# Website SEO Audit Test Instructions
## Query Sequence
### 1. Identify Target Domain & Homepage
**FIRST:** Get available sites and let user choose:
```
webcrawl_sites() - get all available domains
```
**THEN:** Find homepage with sorted URL approach:
```
query: type: html AND url: [target_site_domain]
limit: 1
sites: [target_site_id]
fields: ["content"]
sort: +url
```
**Extract exact domain** from homepage URL for filtering (e.g., `example.com`)
### 2. Get Domain-Specific Structure Overview
Use the arguments collected thus far to query content of representative pages. This is a large set, so keep the API fields empty to reduce tokens.
```
query: type: html AND url: [target_site_domain]
limit: 100
sites: [target_site_id]
fields: []
sort: +url
```
**Purpose:** Get homepage first, then analyze remaining 99 pages from the specified domain only to identify page templates and URL patterns.
### 3. Analyze URL Patterns
From the 100 results, put the homepage ID aside, then identify:
- **Directory distribution:** different directories indicate site sections
- **URL patterns:** `/`, `/blog/`, `/blog/post/1/`, `/products/product/`, `/feature/title`, etc.
- **Content types:** articles, directories, categories, profiles, press releases, tools
- **Homepage identification:** Look for root domain URLs or shortest paths
If the 100 results do not feel representative of the website (e.g., 1 homepage and 99 product pages), rerun the same query with random sort (`?`) and a limit of 100 to see a sampled page set, as shown below. Because random sort makes the audit nonrepeatable, it should be used sparingly, and only when necessary to get a more diverse set of page templates.
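For example, the sampled variant keeps the same arguments and swaps in random sort:
```
query: type: html AND url: [target_site_domain]
limit: 100
sites: [target_site_id]
fields: []
sort: ?
```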
### 4. Select Representative Sample
Choose 4 more pages covering a diverse set of templates, identified by unique URL pattern and prioritized by page count.
**Important:** cycle through webpage content one page at a time to prevent hitting response size limits.
```
query: id: [page_id]
fields: ["content"]
limit: 1
sites: [target_site_id]
```
If the first result's page size is manageable, you can try two at a time. But don't get greedy.
```
query: id: [page1_id] OR id: [page2_id]
fields: ["content"]
limit: 2
sites: [target_site_id]
```
**Sample selection strategy:**
- 1 homepage (if identifiable)
- 1-2 category pages (blog, products, news)
- 1-3 detail pages (profiles, archives)
- if limited pages, take what you can get
### 5. Analyze Each Page Type
For each sampled page, extract and analyze using the provided analysis framework.
### 6. Offer to Expand or Edit Selected Reference Pages
Upon audit completion, the user may desire to expand the surface area of the test, or audit specific pages. Give them this opportunity as the final word post-report.
### 7. Offer Advanced Analysis or Tool Research
After completing the main audit report, offer the user two additional options:
- **Detailed Analysis:** More comprehensive investigation of specific SEO issues or page types
- **Tool Research:** Research and recommend specific tools to address identified SEO problems
## SEO Elements Analysis Framework
### Title Tag Analysis
**Extract:** `<title>` content
**Check for:**
- **Length:** 30-60 characters optimal (Google displays ~60)
- **Uniqueness:** No duplicate titles across pages
- **Keyword inclusion:** Primary keywords in first 50 characters
- **Brand consistency:** Proper brand suffix usage (e.g., "- NASA")
- **Descriptiveness:** Clear, specific page purpose
- **Keyword stuffing:** Excessive keyword repetition
### Meta Description Analysis
**Extract:** `<meta name="description" content="...">`
**Check for:**
- **Length:** 120-158 characters optimal
- **Completeness:** No truncated sentences
- **Uniqueness:** No duplicate descriptions
- **Call-to-action:** Encouraging click-through
- **Keyword relevance:** Natural keyword inclusion
- **Missing descriptions:** Pages without meta descriptions
### Header Structure Analysis
**Extract:** `<h1>`, `<h2>`, `<h3>`, `<h4>`, `<h5>`, `<h6>` tags
**Check for:**
- **H1 uniqueness:** Single H1 per page (SEO best practice)
- **H1 relevance:** Matches title tag intent
- **Logical hierarchy:** Proper H1→H2→H3 structure
- **Keyword optimization:** Headers include relevant keywords naturally
- **Length appropriateness:** Headers not too long/short
- **Missing H1:** Pages without primary headers
### Content Quality Indicators
**Analyze for:**
- **Keyword density:** 1-3% for primary keywords (not stuffing)
- **Content length:** Sufficient depth for topic coverage
- **Readability:** Clear, scannable content structure
- **Internal linking:** Proper cross-references to related content on the same site
- **Image alt text:** Descriptive alt attributes (check `<img alt="">`)
- **Duplicate content:** Similar content across multiple pages
### Technical SEO Elements
**Extract and verify:**
- **Canonical URLs:** `<link rel="canonical">`
- **Open Graph tags:** og:title, og:description, og:image
- **Schema markup:** JSON-LD structured data
- **Language declarations:** `<html lang="en">` attributes
- **Mobile viewport:** `<meta name="viewport">` tag
## Common SEO Issues to Identify
### High Priority Issues
1. **Missing H1 tags** or multiple H1s per page
2. **Duplicate title tags** across different pages
3. **Missing meta descriptions** (search engines generate snippets)
4. **Title/description length violations** (truncation in SERPs)
5. **Broken header hierarchy** (H3 before H2, etc.)
### Medium Priority Issues
1. **Generic titles** ("Page Title" or "Untitled")
2. **Keyword stuffing** in titles, descriptions, or headers
3. **Inconsistent brand suffixes** (some pages missing the site's brand suffix, e.g. "- NASA")
4. **Overly long headers** (H1 > 70 characters)
5. **Missing alt text** on images
### Low Priority Issues
1. **Suboptimal keyword placement** in headers
2. **Minor length optimizations** for titles/descriptions
3. **Header structure improvements** (adding H2s for better organization)
## Page Type Categorization
### Homepage/Landing Pages
- **Expectation:** Strong H1, compelling meta description, comprehensive title
- **Common issues:** Generic titles, keyword stuffing attempts
### Mission/Technical Pages
- **Expectation:** Technical accuracy, proper header hierarchy for complex content
- **Common issues:** Missing H1s, overly technical meta descriptions
### Blog/News Articles
- **Expectation:** Date relevance, engaging headlines as H1s
- **Common issues:** Duplicate meta descriptions, poor header structure
### Gallery/Media Pages
- **Expectation:** Descriptive titles, image-focused meta descriptions
- **Common issues:** Generic titles like "Image Gallery", missing alt text
### Documentation Pages
- **Expectation:** Clear navigation headers, searchable content
- **Common issues:** Poor hierarchy, missing descriptions
## Reporting Template
### Executive Summary
- **Total pages analyzed:** X pages across Y page types
- **Overall SEO health:** [A-F grade] based on critical issues and optimization opportunities
- **Critical issues requiring immediate attention:** X issues
- **Priority recommendations:** Top 3 actionable improvements
### Detailed Findings by Element
#### Title Tag Issues
- **Pages with optimal titles (30-60 chars):** X% (Y pages)
- **Pages with missing titles:** X pages
- **Pages with duplicate titles:** X pages
- **Pages with keyword stuffing:** X pages
- **Examples of problematic titles:**
- Too long: `[Example Title That Exceeds 60 Characters And Will Be Truncated In Search Results]`
- Too short: `[NASA]`
- Duplicate: `[Same title found on 3 pages]`
#### Meta Description Issues
- **Pages with optimal descriptions (120-158 chars):** X% (Y pages)
- **Pages missing descriptions:** X pages
- **Pages with duplicate descriptions:** X pages
- **Examples of issues:**
- Truncated: `[Description that cuts off mid-sentence in search...]`
- Missing: `[Page URL with no meta description]`
- Duplicate: `[Same description on X pages]`
#### Header Structure Issues
- **Pages with proper H1:** X% (Y pages)
- **Pages with multiple H1s:** X pages
- **Pages with broken hierarchy:** X pages
- **Pages missing H1:** X pages
- **Examples:**
- Multiple H1s: `[Page with H1 "Mission Overview" and H1 "Technical Details"]`
- Broken hierarchy: `[H1→H3 (missing H2)]`
- Missing H1: `[Page URL starting with H2]`
### Page Type Performance Matrix
| Page Type | Sample Size | Title Issues | Description Issues | Header Issues | Overall Grade |
|-----------|-------------|--------------|-------------------|---------------|---------------|
| Homepage | 1 | 0 | 0 | 0 | A |
| Mission Pages | 3 | 1 | 2 | 1 | B- |
| Blog Articles | 3 | 0 | 1 | 2 | C+ |
| Gallery Pages | 2 | 2 | 2 | 1 | D |
| Documentation | 1 | 0 | 0 | 1 | B |
### Quick Wins for Immediate Impact
- **Template updates:** Fix recurring issues at template level (affects multiple pages instantly)
- **Missing meta descriptions:** Add descriptions to pages without them (immediate SERP improvement)
- **Duplicate title resolution:** Update identical titles to be unique and descriptive
- **H1 hierarchy fixes:** Ensure single H1 per page and proper header structure
## What's Next?
You've got a solid foundation with some clear optimization opportunities ahead. Depending on what the audit uncovered, you might be looking at quick wins like title tag improvements, meta description fixes, or header structure cleanup - the kind of changes that can make a real difference with minimal effort.
**Ready to dive deeper?** I can help you:
- **Focus on specific fixes** - Whether it's duplicate content, missing descriptions, or technical SEO gaps, let's tackle your highest-impact items with detailed implementation steps
- **Expand the audit** - Analyze more pages, a single page, or dive into advanced technical elements
- **Research tools** - Find specific solutions for ongoing SEO concerns or content optimization workflows
**What would be most helpful for your next steps?**
## Methodology
You will review this web project from the perspective of an accomplished but patient web developer. You've seen it all over the years and have reasonable expectations of quality. At the same time, you have a fondness for anyone who wants to improve the web at all. It's a noble pursuit that you can encourage without being overbearing. Nobody wants a scolding or patronizing AI. It's a fine line to walk, but you somehow manage it well. As these "reviews" can be hard to hear, you will break the news gently, but firmly, when things are out of whack.
Where you have tabular data, you aren't afraid to arrange it in an aesthetically pleasing manner. You will prefer tables over unordered lists. Yes, the critical errors will have to harsh the buzz, but the aesthetic choices make it feel like it'll be all right with some elbow grease.
```
--------------------------------------------------------------------------------
/docs/_modules/mcp_server_webcrawl/utils/cli.html:
--------------------------------------------------------------------------------
```html
<!DOCTYPE html>
<html class="writer-html5" lang="en" data-content_root="../../../">
<head>
<meta charset="utf-8" />
<meta name="viewport" content="width=device-width, initial-scale=1.0" />
<title>mcp_server_webcrawl.utils.cli — mcp-server-webcrawl documentation</title>
<link rel="stylesheet" type="text/css" href="../../../_static/pygments.css?v=80d5e7a1" />
<link rel="stylesheet" type="text/css" href="../../../_static/css/theme.css?v=e59714d7" />
<script src="../../../_static/jquery.js?v=5d32c60e"></script>
<script src="../../../_static/_sphinx_javascript_frameworks_compat.js?v=2cd50e6c"></script>
<script src="../../../_static/documentation_options.js?v=5929fcd5"></script>
<script src="../../../_static/doctools.js?v=888ff710"></script>
<script src="../../../_static/sphinx_highlight.js?v=dc90522c"></script>
<script src="../../../_static/js/theme.js"></script>
<link rel="index" title="Index" href="../../../genindex.html" />
<link rel="search" title="Search" href="../../../search.html" />
</head>
<body class="wy-body-for-nav">
<div class="wy-grid-for-nav">
<nav data-toggle="wy-nav-shift" class="wy-nav-side">
<div class="wy-side-scroll">
<div class="wy-side-nav-search" >
<a href="../../../index.html" class="icon icon-home">
mcp-server-webcrawl
</a>
<div role="search">
<form id="rtd-search-form" class="wy-form" action="../../../search.html" method="get">
<input type="text" name="q" placeholder="Search docs" aria-label="Search docs" />
<input type="hidden" name="check_keywords" value="yes" />
<input type="hidden" name="area" value="default" />
</form>
</div>
</div><div class="wy-menu wy-menu-vertical" data-spy="affix" role="navigation" aria-label="Navigation menu">
<p class="caption" role="heading"><span class="caption-text">Contents:</span></p>
<ul>
<li class="toctree-l1"><a class="reference internal" href="../../../installation.html">Installation</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../guides.html">Setup Guides</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../usage.html">Usage</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../prompts.html">Prompt Routines</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../modules.html">mcp_server_webcrawl</a></li>
</ul>
</div>
</div>
</nav>
<section data-toggle="wy-nav-shift" class="wy-nav-content-wrap"><nav class="wy-nav-top" aria-label="Mobile navigation menu" >
<i data-toggle="wy-nav-top" class="fa fa-bars"></i>
<a href="../../../index.html">mcp-server-webcrawl</a>
</nav>
<div class="wy-nav-content">
<div class="rst-content">
<div role="navigation" aria-label="Page navigation">
<ul class="wy-breadcrumbs">
<li><a href="../../../index.html" class="icon icon-home" aria-label="Home"></a></li>
<li class="breadcrumb-item"><a href="../../index.html">Module code</a></li>
<li class="breadcrumb-item"><a href="../utils.html">mcp_server_webcrawl.utils</a></li>
<li class="breadcrumb-item active">mcp_server_webcrawl.utils.cli</li>
<li class="wy-breadcrumbs-aside">
</li>
</ul>
<hr/>
</div>
<div role="main" class="document" itemscope="itemscope" itemtype="http://schema.org/Article">
<div itemprop="articleBody">
<h1>Source code for mcp_server_webcrawl.utils.cli</h1><div class="highlight"><pre>
<span></span><span class="k">def</span> <span class="nf">__cli_apply_color</span><span class="p">(</span><span class="n">text</span><span class="p">:</span> <span class="nb">str</span><span class="p">,</span> <span class="n">code</span><span class="p">:</span> <span class="nb">int</span><span class="p">):</span>
<span class="k">return</span> <span class="sa">f</span><span class="s2">"</span><span class="se">\033</span><span class="s2">[</span><span class="si">{</span><span class="n">code</span><span class="si">}</span><span class="s2">m</span><span class="si">{</span><span class="n">text</span><span class="si">}</span><span class="se">\033</span><span class="s2">[0m"</span>
<span class="k">def</span> <span class="nf">__cli_light_gray</span><span class="p">(</span><span class="n">text</span><span class="p">):</span>
<span class="k">return</span> <span class="n">__cli_apply_color</span><span class="p">(</span><span class="n">text</span><span class="p">,</span> <span class="s2">"38;2;130;130;130"</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">__cli_gold</span><span class="p">(</span><span class="n">text</span><span class="p">):</span>
<span class="k">return</span> <span class="n">__cli_apply_color</span><span class="p">(</span><span class="n">text</span><span class="p">,</span> <span class="s2">"38;2;170;120;0"</span><span class="p">)</span>
<div class="viewcode-block" id="get_help_short_message">
<a class="viewcode-back" href="../../../mcp_server_webcrawl.utils.html#mcp_server_webcrawl.utils.cli.get_help_short_message">[docs]</a>
<span class="k">def</span> <span class="nf">get_help_short_message</span><span class="p">(</span><span class="n">version</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-></span> <span class="nb">str</span><span class="p">:</span>
<span class="k">return</span> <span class="sa">f</span><span class="s2">"""</span><span class="si">{</span><span class="n">__cli_gold</span><span class="p">(</span><span class="s1">'mcp-server-webcrawl'</span><span class="p">)</span><span class="si">}</span><span class="s2"> </span><span class="si">{</span><span class="n">__cli_light_gray</span><span class="p">(</span><span class="s2">"v"</span><span class="w"> </span><span class="o">+</span><span class="w"> </span><span class="n">version</span><span class="w"> </span><span class="o">+</span><span class="w"> </span><span class="s1">', ©2025 MPL2,'</span><span class="p">)</span><span class="si">}</span><span class="s2"> </span><span class="si">{</span><span class="n">__cli_gold</span><span class="p">(</span><span class="s1">'--help'</span><span class="p">)</span><span class="si">}</span><span class="s2"> for more information"""</span></div>
<div class="viewcode-block" id="get_help_long_message">
<a class="viewcode-back" href="../../../mcp_server_webcrawl.utils.html#mcp_server_webcrawl.utils.cli.get_help_long_message">[docs]</a>
<span class="k">def</span> <span class="nf">get_help_long_message</span><span class="p">(</span><span class="n">version</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-></span> <span class="nb">str</span><span class="p">:</span>
<span class="k">return</span> <span class="sa">f</span><span class="s2">"""A server to connect your web crawls/archives to an LLM via MCP (Model Context Protocol).</span>
<span class="s2">Usage: </span><span class="si">{</span><span class="n">__cli_gold</span><span class="p">(</span><span class="s1">'mcp-server-webcrawl'</span><span class="p">)</span><span class="si">}</span><span class="s2"> [-c </span><span class="se">{{</span><span class="s2">wget,warc,interrobot,katana,siteone</span><span class="se">}}</span><span class="s2">] [-d DATASRC]</span>
<span class="s2">Options:</span>
<span class="s2"> -c, --crawler Specify which crawler to use</span>
<span class="s2"> -d, --datasrc Path to datasrc (required unless testing)</span>
<span class="s2"> -h, --help Show this help message and exit</span>
<span class="s2"> -i, --interactive Run interactive terminal search</span>
<span class="s2">Where is my DATASRC?</span>
<span class="s2"> archivebox Directory above one or more archivebox init'ed dirs</span>
<span class="s2"> httrack Projects directory (~/websites/, /My Websites/)</span>
<span class="s2"> interrobot Path to */interrobot.v2.db</span>
<span class="s2"> katana Directory containing the webroot archives</span>
<span class="s2"> siteone Directory containing the webroot archives</span>
<span class="s2"> (requires archive option)</span>
<span class="s2"> warc Directory containing WARC files</span>
<span class="s2"> wget Directory containing the webroot archives</span>
<span class="s2"> [DATASRC]</span>
<span class="s2"> ╭─────────────────────────────────╮</span>
<span class="s2"> ✧───────────────────────✧ ✧───────────────────────✧</span>
<span class="s2"> ╱ example.com (webroot) ╱ ╱ pragmar.com (webroot) ╱</span>
<span class="s2"> ✧───────────────────────✧ ✧───────────────────────✧</span>
<span class="s2">MCP Configuration Example:</span>
<span class="se">{{</span><span class="s2">"mcpServers": </span><span class="se">{{</span>
<span class="s2"> "wget": </span><span class="se">{{</span>
<span class="s2"> "command": "/path/to/mcp-server-webcrawl",</span>
<span class="s2"> "args": ["--crawler", "wget", "--datasrc",</span>
<span class="s2"> "/path/to/archived/hosts/"]</span><span class="se">}}</span>
<span class="s2"> </span><span class="se">}}</span>
<span class="se">}}</span>
<span class="si">{</span><span class="n">__cli_gold</span><span class="p">(</span><span class="s1">'mcp-server-webcrawl'</span><span class="p">)</span><span class="si">}</span><span class="s2"> </span><span class="si">{</span><span class="n">__cli_light_gray</span><span class="p">(</span><span class="s2">"v"</span><span class="w"> </span><span class="o">+</span><span class="w"> </span><span class="n">version</span><span class="w"> </span><span class="o">+</span><span class="w"> </span><span class="s1">', ©2025 MPL2'</span><span class="p">)</span><span class="si">}</span>
<span class="s2">https://github.com/pragmar/mcp-server-webcrawl</span>
<span class="s2">"""</span></div>
</pre></div>
</div>
</div>
<footer>
<hr/>
<div role="contentinfo">
<p>© Copyright 2025, pragmar.</p>
</div>
Built with <a href="https://www.sphinx-doc.org/">Sphinx</a> using a
<a href="https://github.com/readthedocs/sphinx_rtd_theme">theme</a>
provided by <a href="https://readthedocs.org">Read the Docs</a>.
</footer>
</div>
</div>
</section>
</div>
<script>
jQuery(function () {
SphinxRtdTheme.Navigation.enable(true);
});
</script>
</body>
</html>
```
--------------------------------------------------------------------------------
/prompts/auditfiles.md:
--------------------------------------------------------------------------------
```markdown
# Website File Type Audit Instructions
## Query Sequence
### 1. Identify Target Domain & Homepage
**FIRST:** Get available sites and let user choose:
```
webcrawl_sites() - get all available domains
```
**THEN:** Find homepage or directory index with sorted URL approach:
```
query: type: html AND url: [target_site_domain]
limit: 1
sites: [target_site_id]
sort: +url
```
**Extract exact domain** from homepage URL for filtering (e.g., `example.com`)
### 2. Core File Type Analysis
Run separate queries for high-volume file types to get accurate counts and understand scale:
**HTML Pages:**
```
query: type: html
limit: 100
sites: [target_site_id]
fields: ["size"]
```
**Images:**
```
query: type: img
limit: 100
sites: [target_site_id]
fields: ["size"]
```
**JavaScript Files:**
```
query: type: script
limit: 100
sites: [target_site_id]
fields: ["size"]
```
**CSS Stylesheets:**
```
query: type: style
limit: 100
sites: [target_site_id]
fields: ["size"]
```
### 3. Specialized File Types
Combine lower-volume file types in grouped queries:
**Media & Interactive:**
```
query: type: audio OR type: video OR type: iframe OR type: font OR type: text OR type: rss OR type: other
limit: 100
sites: [target_site_id]
fields: ["size"]
sort: +id
```
### 4. Internal vs External Asset Analysis
If any file type shows 100+ results, segment by domain to understand hosting strategy:
**Internal Assets (same domain):**
```
query: type: [file_type] AND url: [target_site_domain]
limit: 100
sites: [target_site_id]
fields: ["size"]
```
**External Assets (CDNs, third-party):**
```
query: type: [file_type] AND NOT url: [target_site_domain]
limit: 100
sites: [target_site_id]
fields: ["size"]
```
**Apply this segmentation to:** HTML, images, scripts, and styles if they exceed 100 results. Note the total result counts, but treat the returned results as a representative sample, since you will not be able to analyze every resource on large sites.
### 5. Asset Distribution Mapping
From the results, extract domain patterns for external assets:
- **CDN domains:** `cdn.`, `static.`, `assets.`, `media.`
- **Third-party services:** Google Fonts, jQuery CDN, analytics
- **Subdomain strategy:** Different subdomains for different asset types
### 6. Offer Advanced Analysis or Tool Research
After completing the main audit report, offer the user two additional options:
- **Detailed Analysis:** More comprehensive investigation of specific file types, asset organization patterns, or optimization opportunities
- **Tool Research:** Research and recommend specific tools to address identified file management and optimization issues
## File Type Analysis Framework
### HTML Analysis
**Metrics to extract:**
- **Total pages:** Count of HTML files (use result totals)
- **Segment by directory/path:** Count files by URL segments
- **URL structure patterns:** Directory organization insights
### Images Analysis
**Metrics to extract:**
- **Total images:** Count and estimated storage impact
- **Format distribution:** JPG, PNG, SVG, GIF, WebP usage
- **Hosting strategy:** Self-hosted vs CDN distribution
- **Directory patterns:** `/images/`, `/media/`, organized structure
- **Optimization indicators:** Large files, legacy formats
### JavaScript Analysis
**Metrics to extract:**
- **Script count:** Total JS files and hosting distribution
- **Library identification:** jQuery, React, analytics scripts
- **Bundle strategy:** Many small files vs consolidated bundles
- **Third-party dependencies:** External library usage
- **Performance patterns:** Blocking vs async loading indicators
### CSS Architecture Analysis
**Metrics to extract:**
- **Stylesheet count:** Total CSS files and organization
- **Framework usage:** Bootstrap, Foundation, custom frameworks
- **Asset delivery:** Inline vs external, CDN usage
- **File size distribution:** Large framework files vs custom styles
### Media & Interactive Content
**Metrics to extract:**
- **Video/Audio:** Count, hosting strategy, streaming vs download
- **Fonts:** Font names and combined size (including italic and bold variants)
- **RSS Feeds:** Check for existence
## Asset Strategy Analysis
### Third-Party CDNs
**Scope:** External domains (cdnjs, jsdelivr, unpkg, Google)
- External dependency management
- Performance vs reliability trade-offs
- Popular library and framework delivery
### Content Distribution Analysis
- **Asset sizes:** Images, scripts, and styles within reasonable range
- **Asset consolidation score:** How well assets are organized
- **Performance optimization:** CDN usage effectiveness
- **Dependency risk:** External service reliability
- **Maintenance complexity:** Multi-domain asset management
## Common File Organization Issues
### High Priority Issues
1. **Oversized assets:** Images >2MB, JS bundles >500KB, CSS files >200KB
2. **Legacy format usage:** GIF animations, uncompressed images, outdated JS libraries
3. **Asset sprawl:** Files scattered across multiple domains without clear strategy
4. **Missing CDN usage:** Large assets served from main domain affecting performance
5. **Orphaned files:** Assets not referenced by any HTML pages
### Medium Priority Issues
1. **Suboptimal file formats:** JPG for graphics, PNG for photos, missing WebP adoption
2. **Bundle fragmentation:** Many small JS/CSS files instead of optimized bundles
3. **Mixed hosting strategy:** Inconsistent use of CDNs vs self-hosting
4. **Outdated dependencies:** Legacy jQuery versions, unused framework components
5. **Poor directory organization:** Assets without logical folder structure
### Low Priority Issues
1. **Minor optimization opportunities:** Slightly oversized images, redundant CSS
2. **Naming convention inconsistencies:** Mixed file naming patterns
3. **Cache header optimization:** Suboptimal asset caching strategies
## Reporting Template
### Executive File Type Summary
| File Type | Internal Count | External Count | Total Count | Primary Hosting | Optimization Status |
|-----------|---------------|----------------|-------------|-----------------|-------------------|
| HTML | X | Y | Z | Main Domain | ✅ Well Organized |
| Images | X | Y | Z | CDN/Mixed | ⚠️ Needs Optimization |
| JavaScript | X | Y | Z | Mixed | ⚠️ Bundle Opportunity |
| CSS | X | Y | Z | Main Domain | ✅ Good Structure |
| Media | X | Y | Z | External | ✅ Proper CDN Use |
| Fonts | X | Y | Z | Google Fonts | ✅ Performance Optimized |
| Other | X | Y | Z | Mixed | ℹ️ Review Needed |
### Asset Architecture Health Score
- **Overall Grade:** [A-F] based on organization and optimization
- **Total Assets:** X files across Y domains
- **Hosting Strategy:** [Optimized | Mixed | Needs Improvement]
- **Performance Impact:** [Low | Medium | High] based on asset distribution
### Detailed File Type Analysis
#### HTML Content Structure
- **Total Pages:** X HTML files
- **Content Freshness:** Y% updated in last 6 months
- **URL Organization:** [Excellent | Good | Needs Structure]
- **Domain Strategy:** [Single | Multi-subdomain | Complex]
**Representative URL Patterns:**
- Root pages: `/`, `/about`, `/contact`
- Content sections: `/blog/`, `/products/`, `/docs/`
- Deep content: `/category/subcategory/page/`
#### Image Asset Distribution
- **Total Images:** X files (estimated Y MB)
- **Format Breakdown:** Z% JPG, W% PNG, V% SVG
- **Hosting Distribution:** U% internal, T% CDN
- **Optimization Opportunities:** S large files identified
**Asset Organization Patterns:**
- Well-organized: `/images/category/filename.ext`
- Mixed organization: Various directory structures
- Needs improvement: Files scattered across domains
#### JavaScript Architecture
- **Total Scripts:** X files
- **Library Dependencies:** jQuery (Y), React (Z), Analytics (W)
- **Bundle Strategy:** [Optimized | Moderate | Fragmented]
- **Third-party Usage:** V% external dependencies
**Performance Indicators:**
- Large bundles: Files >100KB identified
- Legacy libraries: Outdated framework versions
- Loading strategy: Async/defer usage analysis
#### CSS Organization
- **Total Stylesheets:** X files
- **Framework Usage:** Bootstrap, custom themes identified
- **File Size Distribution:** Largest Y KB, average Z KB
- **Delivery Strategy:** [Optimized | Standard | Needs Work]
**Architecture Assessment:**
- Modular approach: Well-separated concerns
- Monolithic: Few large files
- Fragmented: Many small files without clear organization
### Asset Hosting Strategy Analysis
#### Domain Performance Matrix
| Domain Type | Example | Asset Types | Count | Performance Impact | Recommendation |
|-------------|---------|-------------|-------|-------------------|----------------|
| Main Domain | example.com | HTML, some CSS/JS | X | Baseline | Maintain current |
| Asset Subdomain | static.example.com | Images, CSS, JS | Y | Optimized | ✅ Best practice |
| Third-party CDN | cdnjs.cloudflare.com | Libraries | Z | Fast but dependent | Monitor reliability |
| External Services | fonts.googleapis.com | Web fonts | W | Good performance | ✅ Appropriate use |
#### Priority Matrix
1. **Critical (Fix Immediately):** Oversized assets affecting performance, missing critical files
2. **High (Fix This Sprint):** Legacy formats, asset sprawl, poor CDN utilization
3. **Medium (Next Quarter):** Bundle optimization, directory organization, format modernization
4. **Low (Backlog):** Minor optimizations, naming conventions, cache tuning
## What's Next?
Your asset audit reveals optimization opportunities across performance, organization, and maintenance. The biggest wins typically come from addressing oversized assets (images >2MB, JS >500KB), implementing CDN strategies, and consolidating fragmented bundles.
**Ready to optimize?** I can help you:
- **Prioritize critical fixes** - Focus on your highest-impact performance bottlenecks with specific implementation strategies and expected performance gains
- **Research optimization tools** - Find monitoring, bundling, and CDN solutions that fit your development workflow and technical constraints
- **Plan architecture improvements** - Design sustainable asset organization and delivery strategies for long-term maintainability
**What would be most helpful for your next steps?**
## Methodology
You will review this web project from the perspective of an accomplished but patient web developer. You've seen it all over the years and have reasonable expectations of quality. At the same time, you have a fondness for anyone who wants to improve the web at all. It's a noble pursuit that you can encourage without being overbearing. Nobody wants a scolding or patronizing AI. It's a fine line to walk, but you somehow manage it well. As these "reviews" can be hard to hear, you will break the news gently, but firmly, when things are out of whack.
Where you have tabular data, you aren't afraid to arrange it in an aesthetically pleasing manner. You will prefer tables over unordered lists. Yes, the critical errors will have to harsh the buzz, but the aesthetic choices make it feel like it'll be all right with some elbow grease.
```
--------------------------------------------------------------------------------
/src/mcp_server_webcrawl/utils/tests.py:
--------------------------------------------------------------------------------
```python
import unittest
from mcp_server_webcrawl.utils.search import SearchQueryParser, SearchSubquery
class TestSearchQueryParser(unittest.TestCase):
def setUp(self):
"""
Set up a parser instance for each test
"""
self.parser = SearchQueryParser()
def test_simple_term(self):
"""
Simple single term search
"""
query = "hello"
        result: list[SearchSubquery] = self.parser.parse(query)
self.assertEqual(len(result), 1)
self.assertEqual(result[0].field, None)
self.assertEqual(result[0].value, "hello")
self.assertEqual(result[0].type, "term")
self.assertEqual(result[0].operator, None)
def test_quoted_phrase(self):
"""
Quoted phrase search
"""
query = '"hello world"'
        result: list[SearchSubquery] = self.parser.parse(query)
self.assertEqual(len(result), 1)
self.assertEqual(result[0].field, None)
self.assertEqual(result[0].value, "hello world")
self.assertEqual(result[0].type, "phrase")
def test_wildcard_term(self):
"""
Wildcard term search
"""
query = "search*"
        result: list[SearchSubquery] = self.parser.parse(query)
self.assertEqual(len(result), 1)
self.assertEqual(result[0].field, None)
self.assertEqual(result[0].value, "search")
self.assertEqual(result[0].type, "wildcard")
def test_field_term(self):
"""
Field-specific term search
"""
query = "url:example.com"
        result: list[SearchSubquery] = self.parser.parse(query)
self.assertEqual(len(result), 1)
self.assertEqual(result[0].field, "url")
self.assertEqual(result[0].value, "example.com")
self.assertEqual(result[0].type, "term")
def test_field_numeric(self):
"""
Field with numeric value
"""
query = "status:404"
        result: list[SearchSubquery] = self.parser.parse(query)
self.assertEqual(len(result), 1)
self.assertEqual(result[0].field, "status")
self.assertEqual(result[0].value, 404)
self.assertEqual(result[0].type, "term")
def test_field_quoted(self):
"""
Field with quoted value
"""
query = 'content:"hello world"'
        result: list[SearchSubquery] = self.parser.parse(query)
self.assertEqual(len(result), 1)
self.assertEqual(result[0].field, "content")
self.assertEqual(result[0].value, "hello world")
self.assertEqual(result[0].type, "phrase")
def test_field_wildcard(self):
"""
Field with wildcard value
"""
query = "url:example*"
        result: list[SearchSubquery] = self.parser.parse(query)
self.assertEqual(len(result), 1)
self.assertEqual(result[0].field, "url")
self.assertEqual(result[0].value, "example")
self.assertEqual(result[0].type, "wildcard")
def test_simple_and(self):
"""
Simple AND query
"""
query = "hello AND world"
        result: list[SearchSubquery] = self.parser.parse(query)
self.assertEqual(len(result), 2)
self.assertEqual(result[0].value, "hello")
self.assertEqual(result[0].operator, "AND")
self.assertEqual(result[1].value, "world")
self.assertEqual(result[1].operator, None)
def test_simple_or(self):
"""
Simple OR query
"""
query = "hello OR world"
        result: list[SearchSubquery] = self.parser.parse(query)
self.assertEqual(len(result), 2)
self.assertEqual(result[0].value, "hello")
self.assertEqual(result[0].operator, "OR")
self.assertEqual(result[1].value, "world")
self.assertEqual(result[1].operator, None)
def test_simple_not(self):
"""
Simple NOT query
"""
query = "NOT hello"
        result: list[SearchSubquery] = self.parser.parse(query)
self.assertEqual(len(result), 1)
self.assertEqual(result[0].value, "hello")
self.assertTrue('NOT' in result[0].modifiers)
def test_and_with_fields(self):
"""
AND with field specifiers
"""
query = "content:hello AND url:example.com"
        result: list[SearchSubquery] = self.parser.parse(query)
self.assertEqual(len(result), 2)
self.assertEqual(result[0].field, "content")
self.assertEqual(result[0].operator, "AND")
self.assertEqual(result[1].field, "url")
def test_or_with_fields(self):
"""
OR with field specifiers
"""
query = "status:404 OR status:500"
        result: list[SearchSubquery] = self.parser.parse(query)
self.assertEqual(len(result), 2)
self.assertEqual(result[0].field, "status")
self.assertEqual(result[0].value, 404)
self.assertEqual(result[0].operator, "OR")
self.assertEqual(result[1].field, "status")
self.assertEqual(result[1].value, 500)
def test_not_with_field(self):
"""
NOT with field specifier
"""
query = "NOT status:404"
        result: list[SearchSubquery] = self.parser.parse(query)
self.assertEqual(len(result), 1)
self.assertEqual(result[0].field, "status")
self.assertEqual(result[0].value, 404)
self.assertTrue('NOT' in result[0].modifiers)
def test_simple_parentheses(self):
"""
Simple expression with parentheses
"""
query = "(hello AND world)"
        result: list[SearchSubquery] = self.parser.parse(query)
self.assertEqual(len(result), 2)
self.assertEqual(result[0].value, "hello")
self.assertEqual(result[0].operator, "AND")
self.assertEqual(result[1].value, "world")
self.assertEqual(result[1].operator, None)
def test_complex_parentheses(self):
"""
Complex expression with nested parentheses
"""
query = "(hello AND (world OR planet))"
        result: list[SearchSubquery] = self.parser.parse(query)
self.assertEqual(len(result), 3)
self.assertEqual(result[0].value, "hello")
self.assertEqual(result[0].operator, "AND")
self.assertEqual(result[1].value, "world")
self.assertEqual(result[1].operator, "OR")
self.assertEqual(result[2].value, "planet")
self.assertEqual(result[2].operator, None)
def test_mixed_operators(self):
"""
Query with mixed operators
"""
query = "hello AND world OR planet"
        result: list[SearchSubquery] = self.parser.parse(query)
self.assertEqual(len(result), 3)
self.assertEqual(result[0].value, "hello")
self.assertEqual(result[0].operator, "AND")
self.assertEqual(result[1].value, "world")
self.assertEqual(result[1].operator, "OR")
self.assertEqual(result[2].value, "planet")
self.assertEqual(result[2].operator, None)
def test_mixed_with_parentheses(self):
"""
Mixed operators with parentheses for precedence
"""
query = "hello AND (world OR planet)"
        result: list[SearchSubquery] = self.parser.parse(query)
self.assertEqual(len(result), 3)
self.assertEqual(result[0].value, "hello")
self.assertEqual(result[0].operator, "AND")
self.assertEqual(result[1].value, "world")
self.assertEqual(result[1].operator, "OR")
self.assertEqual(result[2].value, "planet")
self.assertEqual(result[2].operator, None)
def test_complex_nested_query(self):
"""
Complex nested query with multiple operators
"""
query = '(content:"error message" AND (status:404 OR status:500)) AND NOT url:example.com'
result: list[SearchSubquery] = self.parser.parse(query)
self.assertEqual(len(result), 4)
self.assertEqual(result[0].field, "content")
self.assertEqual(result[0].value, "error message")
self.assertEqual(result[0].operator, "AND")
self.assertEqual(result[1].field, "status")
self.assertEqual(result[1].value, 404)
self.assertEqual(result[1].operator, "OR")
self.assertEqual(result[2].field, "status")
self.assertEqual(result[2].value, 500)
self.assertEqual(result[2].operator, "NOT")
self.assertEqual(result[3].field, "url")
self.assertEqual(result[3].value, "example.com")
self.assertEqual(result[3].operator, None)
def test_all_features_combined(self):
"""
Comprehensive test with all features combined
"""
query = 'content:"critical error" AND (status:500 OR type:html) AND NOT url:example* AND size:1024'
result: list[SearchSubquery] = self.parser.parse(query)
self.assertEqual(len(result), 5)
self.assertEqual(result[0].field, "content")
self.assertEqual(result[0].value, "critical error")
self.assertEqual(result[0].type, "phrase")
self.assertEqual(result[0].operator, "AND")
self.assertEqual(result[1].field, "status")
self.assertEqual(result[1].value, 500)
self.assertEqual(result[1].operator, "OR")
self.assertEqual(result[2].field, "type")
self.assertEqual(result[2].value, "html")
self.assertEqual(result[2].operator, "AND")
self.assertEqual(result[3].field, "url")
self.assertEqual(result[3].value, "example")
self.assertEqual(result[3].type, "wildcard")
self.assertTrue('NOT' in result[3].modifiers)
self.assertEqual(result[3].operator, "AND")
self.assertEqual(result[4].field, "size")
self.assertEqual(result[4].value, 1024)
self.assertEqual(result[4].operator, None)
def test_to_sqlite_fts(self):
"""
Test conversion to SQLite FTS format
"""
query = 'content:"error" AND status:404'
result: list[SearchSubquery] = self.parser.parse(query)
query_parts, params = self.parser.to_sqlite_fts(result)
self.assertEqual(len(query_parts), 3)
self.assertEqual(query_parts[0], "ResourcesFullText.Content MATCH :query0")
self.assertEqual(query_parts[1], "AND")
self.assertEqual(query_parts[2], "Resources.Status = :query1")
self.assertEqual(len(params), 2)
self.assertEqual(params["query0"], '"error"')
self.assertEqual(params["query1"], 404)
def test_operator_assignment_bug(self):
"""
Test that exposes the double operator assignment bug.
Query: "term1 AND term2 OR term3" should create:
[term1(op=AND), term2(op=OR), term3(op=None)]
Were the bug present, term3 would incorrectly get operator == OR
"""
from mcp_server_webcrawl.utils.parser import SearchLexer, SearchParser
lexer = SearchLexer()
parser = SearchParser(lexer)
query = "term1 AND term2 OR term3"
result = parser.parser.parse(query, lexer=lexer.lexer)
self.assertEqual(len(result), 3)
self.assertEqual(result[0].value, "term1")
self.assertEqual(result[0].operator, "AND")
self.assertEqual(result[1].value, "term2")
self.assertEqual(result[1].operator, "OR")
self.assertEqual(result[2].value, "term3")
self.assertEqual(result[2].operator, None)
```
--------------------------------------------------------------------------------
/docs/_modules/mcp_server_webcrawl/crawlers.html:
--------------------------------------------------------------------------------
```html
<!DOCTYPE html>
<html class="writer-html5" lang="en" data-content_root="../../">
<head>
<meta charset="utf-8" />
<meta name="viewport" content="width=device-width, initial-scale=1.0" />
<title>mcp_server_webcrawl.crawlers — mcp-server-webcrawl documentation</title>
<link rel="stylesheet" type="text/css" href="../../_static/pygments.css?v=80d5e7a1" />
<link rel="stylesheet" type="text/css" href="../../_static/css/theme.css?v=e59714d7" />
<script src="../../_static/jquery.js?v=5d32c60e"></script>
<script src="../../_static/_sphinx_javascript_frameworks_compat.js?v=2cd50e6c"></script>
<script src="../../_static/documentation_options.js?v=5929fcd5"></script>
<script src="../../_static/doctools.js?v=888ff710"></script>
<script src="../../_static/sphinx_highlight.js?v=dc90522c"></script>
<script src="../../_static/js/theme.js"></script>
<link rel="index" title="Index" href="../../genindex.html" />
<link rel="search" title="Search" href="../../search.html" />
</head>
<body class="wy-body-for-nav">
<div class="wy-grid-for-nav">
<nav data-toggle="wy-nav-shift" class="wy-nav-side">
<div class="wy-side-scroll">
<div class="wy-side-nav-search" >
<a href="../../index.html" class="icon icon-home">
mcp-server-webcrawl
</a>
<div role="search">
<form id="rtd-search-form" class="wy-form" action="../../search.html" method="get">
<input type="text" name="q" placeholder="Search docs" aria-label="Search docs" />
<input type="hidden" name="check_keywords" value="yes" />
<input type="hidden" name="area" value="default" />
</form>
</div>
</div><div class="wy-menu wy-menu-vertical" data-spy="affix" role="navigation" aria-label="Navigation menu">
<p class="caption" role="heading"><span class="caption-text">Contents:</span></p>
<ul>
<li class="toctree-l1"><a class="reference internal" href="../../installation.html">Installation</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../guides.html">Setup Guides</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../usage.html">Usage</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../prompts.html">Prompt Routines</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../interactive.html">Interactive Mode</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../modules.html">mcp_server_webcrawl</a></li>
</ul>
</div>
</div>
</nav>
<section data-toggle="wy-nav-shift" class="wy-nav-content-wrap"><nav class="wy-nav-top" aria-label="Mobile navigation menu" >
<i data-toggle="wy-nav-top" class="fa fa-bars"></i>
<a href="../../index.html">mcp-server-webcrawl</a>
</nav>
<div class="wy-nav-content">
<div class="rst-content">
<div role="navigation" aria-label="Page navigation">
<ul class="wy-breadcrumbs">
<li><a href="../../index.html" class="icon icon-home" aria-label="Home"></a></li>
<li class="breadcrumb-item"><a href="../index.html">Module code</a></li>
<li class="breadcrumb-item active">mcp_server_webcrawl.crawlers</li>
<li class="wy-breadcrumbs-aside">
</li>
</ul>
<hr/>
</div>
<div role="main" class="document" itemscope="itemscope" itemtype="http://schema.org/Article">
<div itemprop="articleBody">
<h1>Source code for mcp_server_webcrawl.crawlers</h1><div class="highlight"><pre>
<span></span>
<span class="kn">import</span> <span class="nn">sys</span>
<span class="kn">from</span> <span class="nn">pathlib</span> <span class="kn">import</span> <span class="n">Path</span>
<span class="kn">from</span> <span class="nn">mcp_server_webcrawl.settings</span> <span class="kn">import</span> <span class="n">FIXTURES_DIRECTORY</span>
<span class="n">VALID_CRAWLER_CHOICES</span><span class="p">:</span> <span class="nb">list</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="p">[</span><span class="s2">"archivebox"</span><span class="p">,</span> <span class="s2">"httrack"</span><span class="p">,</span> <span class="s2">"interrobot"</span><span class="p">,</span> <span class="s2">"katana"</span><span class="p">,</span> <span class="s2">"siteone"</span><span class="p">,</span> <span class="s2">"warc"</span><span class="p">,</span> <span class="s2">"wget"</span><span class="p">]</span>
<div class="viewcode-block" id="get_fixture_directory">
<a class="viewcode-back" href="../../mcp_server_webcrawl.crawlers.html#mcp_server_webcrawl.crawlers.get_fixture_directory">[docs]</a>
<span class="k">def</span> <span class="nf">get_fixture_directory</span><span class="p">()</span> <span class="o">-></span> <span class="n">Path</span><span class="p">:</span>
<span class="c1"># only to be used for devs on test runs, configured in settings_local.py</span>
<span class="c1"># settings_local.py added as sibling of settings.py if not present</span>
<span class="c1"># download https://github.com/pragmar/mcp-server-webcrawl-fixtures</span>
<span class="k">assert</span> <span class="n">FIXTURES_DIRECTORY</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span> <span class="ow">and</span> <span class="n">FIXTURES_DIRECTORY</span><span class="o">.</span><span class="n">is_dir</span><span class="p">(),</span> \
<span class="sa">f</span><span class="s2">"Fixtures not configured in settings_local.py, or is not a valid directory.</span><span class="se">\n</span><span class="s2">FIXTURES_DIRECTORY: </span><span class="si">{</span><span class="n">FIXTURES_DIRECTORY</span><span class="si">}</span><span class="s2">"</span>
<span class="k">return</span> <span class="n">FIXTURES_DIRECTORY</span></div>
<div class="viewcode-block" id="get_crawler">
<a class="viewcode-back" href="../../mcp_server_webcrawl.crawlers.html#mcp_server_webcrawl.crawlers.get_crawler">[docs]</a>
<span class="k">def</span> <span class="nf">get_crawler</span><span class="p">(</span><span class="n">crawler_name</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-></span> <span class="nb">str</span> <span class="o">|</span> <span class="kc">None</span><span class="p">:</span>
<span class="w"> </span><span class="sd">"""</span>
<span class="sd"> lazy load crawler, some classes have additional package dependencies</span>
<span class="sd"> """</span>
<span class="k">if</span> <span class="n">crawler_name</span> <span class="ow">is</span> <span class="kc">None</span><span class="p">:</span>
<span class="k">return</span> <span class="kc">None</span>
<span class="n">crawler_name</span> <span class="o">=</span> <span class="n">crawler_name</span><span class="o">.</span><span class="n">lower</span><span class="p">()</span>
<span class="k">if</span> <span class="n">crawler_name</span> <span class="o">==</span> <span class="s2">"archivebox"</span><span class="p">:</span>
<span class="kn">from</span> <span class="nn">mcp_server_webcrawl.crawlers.archivebox.crawler</span> <span class="kn">import</span> <span class="n">ArchiveBoxCrawler</span>
<span class="k">return</span> <span class="n">ArchiveBoxCrawler</span>
<span class="k">elif</span> <span class="n">crawler_name</span> <span class="o">==</span> <span class="s2">"httrack"</span><span class="p">:</span>
<span class="kn">from</span> <span class="nn">mcp_server_webcrawl.crawlers.httrack.crawler</span> <span class="kn">import</span> <span class="n">HtTrackCrawler</span>
<span class="k">return</span> <span class="n">HtTrackCrawler</span>
<span class="k">elif</span> <span class="n">crawler_name</span> <span class="o">==</span> <span class="s2">"interrobot"</span><span class="p">:</span>
<span class="kn">from</span> <span class="nn">mcp_server_webcrawl.crawlers.interrobot.crawler</span> <span class="kn">import</span> <span class="n">InterroBotCrawler</span>
<span class="k">return</span> <span class="n">InterroBotCrawler</span>
<span class="k">elif</span> <span class="n">crawler_name</span> <span class="o">==</span> <span class="s2">"katana"</span><span class="p">:</span>
<span class="kn">from</span> <span class="nn">mcp_server_webcrawl.crawlers.katana.crawler</span> <span class="kn">import</span> <span class="n">KatanaCrawler</span>
<span class="k">return</span> <span class="n">KatanaCrawler</span>
<span class="k">elif</span> <span class="n">crawler_name</span> <span class="o">==</span> <span class="s2">"siteone"</span><span class="p">:</span>
<span class="kn">from</span> <span class="nn">mcp_server_webcrawl.crawlers.siteone.crawler</span> <span class="kn">import</span> <span class="n">SiteOneCrawler</span>
<span class="k">return</span> <span class="n">SiteOneCrawler</span>
<span class="k">elif</span> <span class="n">crawler_name</span> <span class="o">==</span> <span class="s2">"warc"</span><span class="p">:</span>
<span class="kn">from</span> <span class="nn">mcp_server_webcrawl.crawlers.warc.crawler</span> <span class="kn">import</span> <span class="n">WarcCrawler</span>
<span class="k">return</span> <span class="n">WarcCrawler</span>
<span class="k">elif</span> <span class="n">crawler_name</span> <span class="o">==</span> <span class="s2">"wget"</span><span class="p">:</span>
<span class="kn">from</span> <span class="nn">mcp_server_webcrawl.crawlers.wget.crawler</span> <span class="kn">import</span> <span class="n">WgetCrawler</span>
<span class="k">return</span> <span class="n">WgetCrawler</span>
<span class="k">else</span><span class="p">:</span>
<span class="n">valid_choices</span> <span class="o">=</span> <span class="s2">", "</span><span class="o">.</span><span class="n">join</span><span class="p">(</span><span class="n">VALID_CRAWLER_CHOICES</span><span class="p">)</span>
<span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span><span class="sa">f</span><span class="s2">"unsupported crawler '</span><span class="si">{</span><span class="n">crawler_name</span><span class="si">}</span><span class="s2">' (</span><span class="si">{</span><span class="n">valid_choices</span><span class="si">}</span><span class="s2">)"</span><span class="p">)</span></div>
</pre></div>
</div>
</div>
<footer>
<hr/>
<div role="contentinfo">
<p>© Copyright 2025, pragmar.</p>
</div>
Built with <a href="https://www.sphinx-doc.org/">Sphinx</a> using a
<a href="https://github.com/readthedocs/sphinx_rtd_theme">theme</a>
provided by <a href="https://readthedocs.org">Read the Docs</a>.
</footer>
</div>
</div>
</section>
</div>
<script>
jQuery(function () {
SphinxRtdTheme.Navigation.enable(true);
});
</script>
</body>
</html>
```
--------------------------------------------------------------------------------
/prompts/auditperf.md:
--------------------------------------------------------------------------------
```markdown
# Web Performance Detective
## Query Sequence
### 1. Identify Target Domain & Homepage
**FIRST:** Get available sites and let user choose:
```
webcrawl_sites() - get all available domains
```
**THEN:** Find homepage with sorted URL approach:
```
query: type: html AND url: [target_site_domain]
limit: 1
sites: [target_site_id]
fields: ["content"]
sort: +url
```
**Extract exact domain** from homepage URL for filtering (e.g., `example.com`)
**Parse homepage for embedded assets** (see the sketch after this list):
- `<style>` blocks (inline CSS)
- `<script>` blocks (inline JS)
- `<link rel="stylesheet">` references
- `<script src="">` references
- CSS `@import` statements
- Performance-critical patterns in HTML
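A minimal sketch of this parsing step, using only Python's standard `re` module. It assumes the homepage HTML was already retrieved via `fields: ["content"]`; the `extract_homepage_assets` helper name and the exact patterns are illustrative, not part of mcp-server-webcrawl.
```python
# minimal sketch: inventory embedded and linked assets in homepage HTML
# `html` is assumed to hold the homepage content field; helper name is illustrative
import re

def extract_homepage_assets(html: str) -> dict[str, list[str]]:
    return {
        "inline_css": re.findall(r"<style[^>]*>(.*?)</style>", html, re.S | re.I),
        "inline_js": re.findall(r"<script(?![^>]*\bsrc=)[^>]*>(.*?)</script>", html, re.S | re.I),
        "stylesheets": re.findall(r"<link[^>]+rel=[\"']stylesheet[\"'][^>]*href=[\"']([^\"']+)", html, re.I),
        "scripts": re.findall(r"<script[^>]+src=[\"']([^\"']+)", html, re.I),
        "css_imports": re.findall(r"@import\s+(?:url\()?[\"']?([^\"')\s;]+)", html, re.I),
    }
```
The regexes are deliberately rough; they miss unusual attribute orderings, but they are enough to inventory the obvious embedded and linked assets before moving to site-wide queries.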
### 2. Site-Wide CSS Analysis
**Primary CSS Query:**
```
query: type: style AND url: [target_site_domain]
limit: 100
sites: [target_site_id]
fields: []
sort: +url
```
**If 0 results, try Asset Domain Discovery:**
```
query: type: style
limit: 100
sites: [target_site_id]
fields: []
sort: +url
```
**Then filter results for common asset domain patterns:**
- `static.[domain]` (static.nasa.gov)
- `cdn.[domain]` (cdn.nasa.gov)
- `assets.[domain]` (assets.nasa.gov)
- `media.[domain]` (media.nasa.gov)
- `img.[domain]` or `images.[domain]`
- `js.[domain]` or `css.[domain]`
- `files.[domain]` or `downloads.[domain]`
- Third-party CDNs (cdnjs.cloudflare.com, jsdelivr.net, unpkg.com)
**Asset Domain Detection Logic** (see the sketch after this list):
1. Extract all unique domains from CSS file URLs
2. Score domains by asset-hosting likelihood:
- Contains "static", "cdn", "assets", "media" = High
- Subdomain of main site = Medium
- Third-party CDN = Medium
- Same as main domain = Low (already checked)
3. Use highest-scoring domain as asset domain
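The scoring heuristic above can be written as a small helper. This is a minimal sketch, assuming the CSS result URLs are available as plain strings; the score values, the CDN list, and the `score_asset_domain`/`pick_asset_domain` names are illustrative.
```python
# minimal sketch of the domain-scoring heuristic; values and names are illustrative
from urllib.parse import urlparse

THIRD_PARTY_CDNS = {"cdnjs.cloudflare.com", "cdn.jsdelivr.net", "unpkg.com"}

def score_asset_domain(domain: str, main_domain: str) -> int:
    if domain == main_domain:
        return 0  # already checked in the primary query
    if any(marker in domain for marker in ("static.", "cdn.", "assets.", "media.")):
        return 3  # high likelihood of hosting assets
    if domain.endswith("." + main_domain):
        return 2  # subdomain of the main site
    if domain in THIRD_PARTY_CDNS:
        return 2  # known third-party CDN
    return 1

def pick_asset_domain(css_urls: list[str], main_domain: str) -> str | None:
    domains = {urlparse(url).netloc for url in css_urls if urlparse(url).netloc}
    return max(domains, key=lambda d: score_asset_domain(d, main_domain), default=None)
```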
**Analyze each CSS file for:**
- `@import` usage (render-blocking)
- `!important` overuse (specificity issues)
- Unused vendor prefixes
- Large file sizes
- Duplicate rules across files
### 3. Site-Wide JavaScript Analysis
**Primary JS Query:**
```
query: type: script AND url: [target_site_domain]
limit: 100
sites: [target_site_id]
fields: []
sort: +url
```
**If 0 results, use discovered asset domain:**
```
query: type: script AND url: [discovered_asset_domain]
limit: 100
sites: [target_site_id]
fields: []
sort: +url
```
**If still 0 results, broad asset discovery:**
```
query: type: script
limit: 100
sites: [target_site_id]
fields: []
sort: +url
```
**Then apply same domain filtering logic as CSS**
**Analyze each JS file for** (see the sketch after this list):
- `document.getElementById` (inefficient DOM queries)
- jQuery usage patterns
- Blocking script patterns
- Large library imports
- Performance anti-patterns
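Both this JS checklist and the CSS checklist in step 2 reduce to pattern counting over the raw `content` field. A minimal sketch, assuming file contents were fetched with `fields: ["content"]`; the pattern set and the `count_antipatterns` name are illustrative, not exhaustive.
```python
# minimal sketch: count performance anti-patterns in a fetched asset's content
# the pattern list is illustrative, not exhaustive
import re

ANTIPATTERNS: dict[str, str] = {
    "css_import": r"@import\b",
    "css_important": r"!important\b",
    "js_get_element_by_id": r"document\.getElementById\(",
    "js_jquery": r"\$\(|\bjQuery\b",
    "js_sync_xhr": r"\.open\([^)]*,\s*false\s*\)",
}

def count_antipatterns(content: str) -> dict[str, int]:
    return {name: len(re.findall(pattern, content)) for name, pattern in ANTIPATTERNS.items()}
```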
### 4. Cross-Reference & Dependency Mapping
**Find render-blocking resources:**
```
query: type: html AND (content: "<link rel=\"stylesheet\"" OR content: "<script src=") AND url: [target_site_domain]
limit: 50
sites: [target_site_id]
fields: []
sort: +url
```
**Map critical rendering path dependencies**
### 5. Offer Advanced Analysis or Tool Research
After completing the main audit report, offer the user two additional options:
- **Detailed Analysis:** More comprehensive investigation of specific performance bottlenecks, asset optimization opportunities, or advanced performance patterns
- **Tool Research:** Research and recommend specific tools to address identified performance issues and implement monitoring solutions
## Performance Anti-Pattern Detection
### CSS Performance Issues
#### Critical Render-Blocking Patterns
- **@import usage:** Delays CSS parsing
- **Excessive !important:** Indicates poor CSS architecture
- **Large CSS files:** >100KB uncompressed
- **Unused CSS:** High selector count vs actual usage
- **CSS-in-JS:** React/Vue component styles
#### Specificity & Maintainability Issues
- **ID selectors overuse:** High specificity conflicts
- **Deep nesting:** >4 levels indicates complexity
- **Vendor prefix bloat:** Outdated browser support
- **Duplicate declarations:** Maintenance overhead
### JavaScript Performance Issues
#### DOM Manipulation Anti-Patterns
- **document.getElementById in loops:** Performance killer
- **jQuery chaining abuse:** Memory leaks potential
- **No event delegation:** Too many event listeners
- **Synchronous AJAX:** Blocking user interaction
#### Loading & Execution Issues
- **Blocking scripts in `<head>`:** Delays page rendering
- **Large library imports:** jQuery, Lodash entire libraries
- **Polyfill overuse:** Unnecessary for modern browsers
- **No async/defer attributes:** Blocking HTML parsing
## Asset Segmentation Strategy
### Asset Domain Classification
#### Main Domain Assets
**Scope:** `[target_site_domain]` - same domain as website
- Self-hosted assets on primary domain
- Often includes basic CSS/JS for small sites
#### Asset Domain Assets
**Scope:** `static.[domain]`, `cdn.[domain]`, `assets.[domain]`, etc.
- Dedicated asset subdomains for performance
- Usually contains bulk of CSS/JS files
- Better caching and CDN optimization
#### Third-Party Assets
**Scope:** External CDNs and services
- `cdnjs.cloudflare.com`, `jsdelivr.net`, `unpkg.com`
- Google Fonts, jQuery CDN, Bootstrap CDN
- Analytics, tracking, and widget scripts
#### Asset Discovery Strategy
1. **Primary search:** Target main domain first
2. **Asset domain discovery:** If 0 results, scan all domains for asset patterns
3. **Domain scoring:** Rank by likelihood of hosting assets
4. **Fallback analysis:** Use highest-scoring asset domain
### Homepage-Specific Assets
**Scope:** Assets only loaded on homepage
- **Inline styles:** `<style>` blocks in homepage HTML
- **Inline scripts:** `<script>` blocks in homepage HTML
- **Homepage-only CSS:** Files referenced only by homepage
- **Homepage-only JS:** Files referenced only by homepage
**Analysis Focus:**
- Critical CSS identification
- Above-the-fold optimization
- Homepage-specific performance bottlenecks
### Site-Global Assets
**Scope:** Assets loaded across multiple pages (any domain)
- **Global stylesheets:** Referenced by >1 page
- **Framework CSS:** Bootstrap, Foundation, custom frameworks
- **Global JavaScript:** Site-wide functionality
- **Third-party libraries:** Analytics, tracking, widgets
**Analysis Focus:**
- Caching optimization opportunities
- Bundle size optimization
- Progressive loading strategies
### Page-Type Specific Assets
**Scope:** Assets for specific page categories
- **Blog-specific:** Article styling, commenting systems
- **Gallery-specific:** Image viewers, slideshow libraries
- **Form-specific:** Validation libraries, UI components
## Common Performance Issues
### High Priority Issues
1. **Render-blocking CSS/JS:** Assets that delay initial page rendering
2. **@import usage:** CSS imports that create dependency chains
3. **Synchronous JavaScript:** Blocking scripts that prevent HTML parsing
4. **Oversized assets:** CSS >200KB or JS >500KB affecting load times
5. **Missing async/defer:** JavaScript without proper loading attributes
### Medium Priority Issues
1. **jQuery dependency:** Legacy library usage for simple DOM operations
2. **Unused CSS/JS:** Large files with low utilization rates
3. **Vendor prefix bloat:** Outdated browser support adding file size
4. **Inefficient DOM queries:** Performance-killing selection patterns
5. **Missing CDN usage:** Large assets served from main domain
### Low Priority Issues
1. **CSS specificity wars:** Excessive !important usage indicating architectural issues
2. **Minor bundle optimization:** Small files that could be combined
3. **Cache optimization opportunities:** Suboptimal asset caching strategies
## Reporting Template
### 📊 Executive Performance Summary
| Metric | Value | Status |
|--------|-------|--------|
| **Total Assets Analyzed** | X CSS, Y JS files | Based on crawl results |
| **Critical Issues** | X render-blocking resources | Immediate attention needed |
| **Optimization Potential** | Estimated X% improvement | Conservative estimate |
| **Performance Grade** | [A-F] | Based on issue severity |
### 🏗️ Asset Distribution Analysis
| Domain Type | Example | Asset Count | Total Size | Performance Notes |
|-------------|---------|-------------|------------|-------------------|
| Main Domain | example.com | X CSS, Y JS | Z KB | Self-hosted control |
| Asset Subdomain | static.example.com | X CSS, Y JS | Z KB | Optimized delivery |
| Third-party CDN | cdnjs.cloudflare.com | X CSS, Y JS | Z KB | External dependency |
| **Totals** | - | **X CSS, Y JS** | **Z KB** | **[Strategy Assessment]** |
### ⚡ Critical Rendering Path Analysis
| Asset Type | Domain | Count | Size | Blocking Impact | Priority |
|------------|--------|-------|------|-----------------|----------|
| Inline CSS | Main | X | Y KB | @import usage | High |
| External CSS | Asset/CDN | X | Y KB | Render-blocking | High |
| Inline JS | Main | X | Y KB | DOM queries | Medium |
| External JS | Asset/CDN | X | Y KB | jQuery patterns | Medium |
### 🔍 Performance Anti-Pattern Detection
| Issue Type | Occurrences | Impact Level | Root Cause |
|------------|-------------|--------------|------------|
| @import Usage | X files | Critical | CSS dependency chains |
| Blocking Scripts | Y files | High | Missing async/defer |
| Oversized Assets | Z files | Medium | Bundle optimization needed |
| jQuery Dependencies | W files | Low | Legacy library usage |
### 🎯 Asset Architecture Health
| Metric | Current | Benchmark | Status |
|--------|---------|-----------|--------|
| Total CSS Size | X KB | <200KB | ✅/⚠️/❌ |
| Total JS Size | Y KB | <500KB | ✅/⚠️/❌ |
| Asset Domains | Z domains | 2-3 optimal | ✅/⚠️/❌ |
| Render Blockers | W resources | <3 critical | ✅/⚠️/❌ |
## What's Next?
Your performance audit reveals the current state of your asset delivery strategy and identifies the biggest opportunities for improvement. Whether you're dealing with render-blocking resources, oversized bundles, or architectural complexity, most performance gains come from addressing the highest-impact patterns first.
**Ready to optimize?** I can help you:
- **Focus on critical fixes** - Let's prioritize your specific performance bottlenecks and create detailed optimization strategies, including implementation approaches and expected performance gains
- **Expand the technical analysis** - Examine dependency chains, analyze Core Web Vitals impact, or investigate advanced optimization techniques like critical CSS extraction and progressive loading
- **Research performance tools** - Find the right monitoring, bundling, and optimization solutions that fit your development workflow and technical constraints
**What would be most helpful for your next steps?**
## Methodology
You will review this web project from the perspective of an accomplished but patient web developer. You've seen it all over the years, and have reasonable expectations of quality. At the same time, you have a fondness for anyone who wants to improve the web at all. It's a noble pursuit that you can encourage without being overbearing. Nobody wants a scolding or patronizing AI. It's a fine line to walk, but you somehow manage it well. As these "reviews" can be hard to take, you will break the news gently, but firmly when things are out of whack.
Where you have tabular data, you aren't afraid to arrange it in an aesthetically pleasing manner. You will prefer tables over unordered lists. Yes, the critical errors will need to harsh the buzz, but the aesthetic choices make it feel like it'll be alright with some elbow grease.
```
--------------------------------------------------------------------------------
/src/mcp_server_webcrawl/interactive/views/help.py:
--------------------------------------------------------------------------------
```python
import curses
import textwrap
from typing import TYPE_CHECKING
from mcp_server_webcrawl.interactive.views.base import CONTENT_MARGIN
from mcp_server_webcrawl.interactive.views.base import BaseCursesView
from mcp_server_webcrawl.interactive.ui import ThemeDefinition
from mcp_server_webcrawl.interactive.ui import safe_addstr
if TYPE_CHECKING:
from mcp_server_webcrawl.interactive.session import InteractiveSession
INTERROBOT_LINK: str = "<https://interro.bot>"
HELP_CONTENT: str = """Boolean Search Syntax
The query engine supports field-specific (`field: value`) searches and complex boolean expressions. Fulltext is supported as a combination of the url, content, and headers fields.
Example Queries
| Query Example | Description |
|------------------------------|---------------------------------------|
| privacy | Fulltext single keyword match |
| "privacy policy" | Fulltext exact phrase match |
| boundar* | Fulltext wildcard (boundary, |
| | boundaries, etc.) |
| id: 12345 | Match specific resource by ID |
| url: example.com/dir | URL contains example.com/dir |
| type: html | HTML pages only |
| status: 200 | HTTP status equals 200 |
| status: >=400 | HTTP status >= 400 |
| content: h1 | Content contains h1 |
| headers: text/xml | Headers contain text/xml |
| privacy AND policy | Fulltext matches both terms |
| privacy OR policy | Fulltext matches either term |
| policy NOT privacy | Policy but not privacy |
| (login OR signin) AND form | Login/signin with form |
| type: html AND status: 200 | HTML pages with HTTP success |
Field Reference
`id`: Resource identifier (integer)
- Example: id: 12345
`url`: URL field matching
- Supports partial matches and wildcards
- Example: `url: example.com/about`
- Example: `url: *.pdf`
`type`: Resource type filtering
- Common types: html, img, script, style, font, audio, video, pdf, doc
- Example: `type: html`
- Example: `type: img`
`status`: HTTP status code
- Supports exact matches and comparisons
- Example: `status: 200`
- Example: `status: >=400`
- Example: `status: <300`
`content`: Full-text search within resource content
- Searches the actual content/body of resources
- Example: `content: "user login"`
- Example: `content: javascript`
`headers`: HTTP response headers search
- Searches within response headers
- Example: `headers: application/json`
- Example: `headers: gzip`
Boolean Operators
`AND`: Both terms must be present
- Example: `privacy AND policy`
- Example: `type: html AND status: 200`
`OR`: Either term can be present
- Example: `login OR signin`
- Example: `type: img OR type: video`
`NOT`: Exclude documents containing the term
- Example: `policy NOT privacy`
- Example: `type: html NOT status: 404`
`Parentheses`: Group expressions
- Example: `(login OR signin) AND (form OR page)`
- Example: `type: html AND (status: 200 OR status: 301)`
Wildcards
`Suffix wildcard` (*): Matches terms starting with the prefix
- Example: `admin*` matches admin, administrator, administration
- Example: `java*` matches java, javascript, javadoc
Tips
- Use quotes for exact phrase matching: `"privacy policy"`
- Combine field searches with fulltext: `type: html AND privacy`
- Use wildcards for partial matches: `admin*`
- Group complex expressions with parentheses
- Field names are case-sensitive, values are case-insensitive
- Extra whitespace around operators is ignored: `A AND B` = `A  AND  B`
If you enjoy mcp-server-webcrawl --interactive, you will almost assuredly appreciate the InterroBot crawler and analyzer <https://interro.bot>, by the same developer."""
class HelpView(BaseCursesView):
"""
Interactive help view displaying scrollable documentation.
"""
def __init__(self, session: 'InteractiveSession'):
"""
Initialize the help view.
Args:
session: The interactive session instance
"""
super().__init__(session)
self._focused = True
self.__scroll_offset: int = 0
self.__cached_content_lines: list[str] | None = None
def draw_inner_footer(self, stdscr: curses.window, bounds, text: str) -> None:
"""
Draw footer with scroll position information.
Args:
stdscr: The curses window to draw on
bounds: The view bounds defining the drawing area
text: The footer text to display
"""
if not self._focused:
super().draw_inner_footer(stdscr, bounds, text)
return
content_lines: list[str] = self.__get_content_lines()
content_height: int = max(0, bounds.height - 4)
total_lines: int = len(content_lines)
if total_lines == 0:
super().draw_inner_footer(stdscr, bounds, text)
return
showing_start: int = self.__scroll_offset + 1
showing_end: int = min(total_lines, self.__scroll_offset + content_height)
footer_text: str = f"Viewing lines {showing_start}-{showing_end} of {total_lines}"
footer_y: int = bounds.y + bounds.height - 1
try:
safe_addstr(stdscr, footer_y, 0, self._get_bounded_line(), self._get_inner_header_style())
safe_addstr(stdscr, footer_y, 1, footer_text, self._get_inner_header_style())
except curses.error:
pass
def handle_input(self, key: int) -> bool:
"""
Handle document navigation input.
Args:
key: The curses key code from user input
Returns:
bool: True if the input was handled, False otherwise
"""
if not self._focused:
return False
handlers: dict[int, callable] = {
curses.KEY_UP: self.__scroll_up,
curses.KEY_DOWN: self.__scroll_down,
curses.KEY_PPAGE: lambda: self.__scroll_page_up(max(1, self.bounds.height - 4)),
curses.KEY_NPAGE: lambda: self.__scroll_page_down(max(1, self.bounds.height - 4)),
curses.KEY_HOME: self.__scroll_to_top,
curses.KEY_END: self.__scroll_to_bottom,
}
handler = handlers.get(key)
if handler:
handler()
return True
return False
def render(self, stdscr: curses.window) -> None:
"""
Render the help content as a scrollable document.
Args:
stdscr: The curses window to draw on
"""
if not self._renderable(stdscr):
return
y_current: int = self.bounds.y + 2
y_max: int = self.bounds.y + self.bounds.height - 1
content_height: int = max(0, self.bounds.height - 4)
content_width: int = self.bounds.width - 4
content_lines: list[str] = self.__get_content_lines()
visible_lines: list[str] = content_lines[self.__scroll_offset: self.__scroll_offset + content_height]
for i, line in enumerate(visible_lines):
line_y: int = y_current + i
if line_y >= y_max:
break
display_line: str = line[:content_width] if len(line) > content_width else line
display_line_is_bold: bool = line.startswith('##') or (line.startswith('**') and line.endswith('**') and len(line) > 4)
default_line_style = curses.A_BOLD if display_line_is_bold else curses.A_NORMAL
if INTERROBOT_LINK in line:
link_index = line.index(INTERROBOT_LINK)
safe_addstr(stdscr, line_y, 2, display_line, curses.A_NORMAL)
safe_addstr(stdscr, line_y, 2 + link_index, INTERROBOT_LINK, self.session.get_theme_color_pair(ThemeDefinition.HELP_LINK))
else:
safe_addstr(stdscr, line_y, 2, display_line, default_line_style)
def __calculate_max_scroll(self) -> int:
"""
Calculate maximum scroll offset based on content and view size.
Returns:
int: The maximum scroll offset value
"""
content_lines: list[str] = self.__get_content_lines()
content_height: int = max(0, self.bounds.height - 4)
return max(0, len(content_lines) - content_height)
def __get_content_lines(self) -> list[str]:
"""
Get wrapped content lines with caching.
Returns:
list[str]: The wrapped and cached content lines
"""
if self.__cached_content_lines is not None:
return self.__cached_content_lines
content_width: int = max(20, self.bounds.width - CONTENT_MARGIN)
wrapped_lines: list[str] = []
text_lines: list[str] = HELP_CONTENT.split("\n")
for line in text_lines:
if not line.strip():
wrapped_lines.append("")
else:
if (line.startswith('|') or
line.startswith('##') or
(line.startswith('**') and line.endswith('**'))):
wrapped_lines.append(line.rstrip())
else:
wrapped: str = textwrap.fill(
line.rstrip(),
width=content_width,
expand_tabs=True,
replace_whitespace=True,
break_long_words=True,
break_on_hyphens=True
)
wrapped_lines.extend(wrapped.split("\n"))
self.__cached_content_lines = wrapped_lines
return wrapped_lines
def __scroll_down(self, lines: int = 1) -> None:
"""
Scroll down by specified number of lines.
Args:
lines: Number of lines to scroll down
"""
max_scroll: int = self.__calculate_max_scroll()
self.__scroll_offset = min(max_scroll, self.__scroll_offset + lines)
def __scroll_page_down(self, page_size: int = 10) -> None:
"""
Scroll down by page.
Args:
page_size: Number of lines to scroll for a page
"""
self.__scroll_down(page_size)
def __scroll_page_up(self, page_size: int = 10) -> None:
"""
Scroll up by page.
Args:
page_size: Number of lines to scroll for a page
"""
self.__scroll_up(page_size)
def __scroll_to_bottom(self) -> None:
"""
Scroll to bottom of document.
"""
self.__scroll_offset = self.__calculate_max_scroll()
def __scroll_to_top(self) -> None:
"""
Scroll to top of document.
"""
self.__scroll_offset = 0
def __scroll_up(self, lines: int = 1) -> None:
"""
Scroll up by specified number of lines.
Args:
lines: Number of lines to scroll up
"""
self.__scroll_offset = max(0, self.__scroll_offset - lines)
```
--------------------------------------------------------------------------------
/src/mcp_server_webcrawl/utils/tools.py:
--------------------------------------------------------------------------------
```python
from mcp.types import Tool
from mcp_server_webcrawl.models.resources import (
ResourceResultType,
RESOURCES_FIELDS_BASE,
RESOURCES_FIELDS_OPTIONS,
RESOURCES_DEFAULT_SORT_MAPPING,
RESOURCES_TOOL_NAME,
)
from mcp_server_webcrawl.models.sites import (
SiteResult,
SITES_FIELDS_DEFAULT,
SITES_FIELDS_BASE,
SITES_TOOL_NAME,
)
def get_crawler_tools(sites: list[SiteResult] | None = None) -> list[Tool]:
"""
Generate crawler tools based on available sites.
Args:
sites: optional list of site results to include in tool descriptions
Returns:
List of Tool objects for sites and resources
"""
# you'd think maybe pass these in, but no, descriptions will also require tweaking
# each crawler having its own peculiarities -- just let the subclass hack this
# into whatever misshapen ball of clay it needs to be
sites_field_options = list(set(SITES_FIELDS_DEFAULT) - set(SITES_FIELDS_BASE))
resources_type_options = list(ResourceResultType.values())
resources_sort_options = list(RESOURCES_DEFAULT_SORT_MAPPING.keys())
sites_display = ", ".join([f"{s.name} (site: {s.id})" for s in sites]) if sites is not None else ""
sites_ids = [s.id for s in sites] if sites is not None else []
tools = [
Tool(
name=SITES_TOOL_NAME,
description="Retrieves a list of sites (project websites or crawl directories).",
inputSchema={
"type": "object",
"properties": {
"ids": {
"type": "array",
"items": {"type": "integer"},
"description": "List of project IDs to retrieve. Leave empty for all projects."
},
"fields": {
"type": "array",
"items": {
"enum": sites_field_options
},
"description": ("List of additional fields to include in the response beyond the defaults "
"(id, name, type, urls) Empty list means default fields only. Options include created (ISO 8601), "
"modified (ISO 8601).")
}
},
"required": []
},
),
Tool(
name=RESOURCES_TOOL_NAME,
description=("Searches for resources (webpages, images, CSS, JS, etc.) across web crawler projects and "
"retrieves specified fields. "
"Supports boolean queries and field searching, along with site filtering to "
"filter with fine control. "
"To find a site homepage reliably, query type: html AND url: example.com (crawled domain) with sort='+url' and a LIMIT of 1. "
"This pattern works consistently across all crawlers."
"Most sites indexed by this tool will be small to moderately sized websites. "
"Don't assume most keywords will generate results; start broader on first search (until you have a feel for results). "
"A vital aspect of this API is field control; you can open up the limit wide when dealing with lightweight "
"fields and dial way back when using larger fields, like content. Adjust dynamically. The best strategy "
"balances preserving the user's context window while minimizing number of queries necessary to answer their question."
),
inputSchema={
"type": "object",
"properties": {
"query": {
"type": "string",
"description": ("The query field is the workhorse of the API and supports fulltext boolean queries "
"along with field searching using the name: value pattern. "
"Fields supported include page/resource id as id: <resource_id|int> (OR together for multiple docs), "
"HTTP status as status: <code|int>, URL as url: <url|str>, and content type as type: <type|str>. "
f"Valid types include ({', '.join(resources_type_options)}). "
"Additionally, headers as headers: <term|str> and content as content: <term|str> can be "
"searched specifically. You would only search content when fulltext search is diluted by other field hits. "
"For the status field, numerical operators are supported, e.g. status: >=400. "
"For the url and type fields, along with fulltext search terms (fieldless), FTS5 stem* suffix "
"wildcarding is enabled. An empty query returns all results. "
"A query MUST use one of these formats: (1) empty query for unfiltered results, (2) single keyword, "
"(3) quoted phrase: \"keyword1 keyword2\", (4) "
"explicit AND: keyword1 AND type: html, (5) explicit OR: keyword1 OR keyword2, or (6) advanced boolean: "
"(keyword1 OR keyword2) AND (status: 200 AND type: html). "
"The search index does not support stemming, use wildcards (keyword*), or the boolean OR and your "
"imagination instead."
)
},
"sites": {
"type": "array",
"items": {
"enum": sites_ids
},
"description": ("List of crawl site IDs to filter search results to a specific site. In most "
"scenarios, you should filter to only one site, but multiple site filtering is offered for "
f"advanced search scenarios. Available sites include {sites_display}.")
},
"fields": {
"type": "array",
"items": {
"enum": RESOURCES_FIELDS_OPTIONS
},
"description": ("List of additional fields to include in the response beyond the base fields "
f"({', '.join(RESOURCES_FIELDS_BASE)}) returned for all results. "
"Empty list means base fields only. Use headers and content to retrieve raw HTTP contents, "
"and size to collect file size in bytes. "
"The content field can lead to large results and should be used judiciously with LIMIT. "
"Fields must be explicitly requested, even when used with sort. ")
},
"sort": {
"enum": resources_sort_options,
"default": "+url",
"description": ("Sort order for results. Prefixed with + for ascending, - for descending "
f"({', '.join(resources_sort_options)}). "
"? is a special option for random sort, useful in statistical sampling. The API expects exactly "
"one of the enum values above, not a quoted string.")
},
"limit": {
"type": "integer",
"description": "Maximum number of results to return. Default is 20, max is 100."
},
"offset": {
"type": "integer",
"description": "Number of results to skip for pagination. Default is 0."
},
"extras": {
"type": "array",
"items": {
"enum": ["thumbnails", "markdown", "snippets", "regex", "xpath"]
},
"description": ("Optional array of extra features to include in results. Available options include:\n"
"- 'thumbnails': generates base64 encoded thumbnails for image resources that can be viewed and "
"analyzed by AI models. Enables image description, content analysis, and visual understanding while"
"keeping token output minimal. Only works for image "
"(img) types, which can be filtered using `type: img` in queries. SVG is not supported.\n"
"- 'markdown': transforms the HTML content field into concise markdown, "
"reducing token usage and improving readability for LLMs.\n"
"- 'snippets': matches fulltext queries to contextual keyword usage within the content. When "
"used without requesting the content field (or markdown extra), it can provide an efficient means "
"of refining a search without pulling down the complete page contents. Also great for rendering "
"old school hit-highlighted results as a list, like Google search in 1999. Works with HTML, CSS, JS, "
"or any text-based, crawled file.\n"
"- 'regex': extracts regular expression matches from crawled files such as HTML, CSS, JavaScript, "
"etc.. Not as precise a tool as xpath for HTML, but supports any text file as a data source. "
"- 'xpath': extracts xpath selector data. Supports count(). Use xpath's text() for "
"text only, element selectors for HTML data. Only supported for HTML, other "
"types will be ignored. Sometimes referred to as scraping."
"")
},
"extrasRegex": {
"type": "array",
"items": {
"type": "string"
},
"description": ("Array of regular expression patterns to extract content. "
"Examples: `\\d{3}-\\d{3}-\\d{4}` (phone numbers), `https?://[^\\s]+` (URLs). "
"Use capture groups `(pattern)` to extract specific parts. "
"Only used when 'regex' is included in the extras array. "
"Results include matches, capture groups, and position information.")
},
"extrasXpath": {
"type": "array",
"items": {
"type": "string"
},
"description": ("Array of XPath expressions to extract specific content from HTML resources. "
"Each XPath should be a valid selector expression like `/html/body/h1`, `//h1/text()`, "
"//a, //a/@href, or count(//a). If you need many values (such as connected a/text() "
"and a/@href), request elements to preserve the relationship. "
"Use text() or @name when targeting text, elements will return outer HTML. "
"Only used when 'xpath' is included in the extras array. Many xpath expressions can be "
"passed at once to extract multiple selectors. Results are grouped by document within results. ")
}
},
"required": []
},
),
]
return tools
```
--------------------------------------------------------------------------------
/docs/modules.html:
--------------------------------------------------------------------------------
```html
<!DOCTYPE html>
<html class="writer-html5" lang="en" data-content_root="./">
<head>
<meta charset="utf-8" /><meta name="viewport" content="width=device-width, initial-scale=1" />
<meta name="viewport" content="width=device-width, initial-scale=1.0" />
<title>mcp_server_webcrawl — mcp-server-webcrawl documentation</title>
<link rel="stylesheet" type="text/css" href="_static/pygments.css?v=80d5e7a1" />
<link rel="stylesheet" type="text/css" href="_static/css/theme.css?v=e59714d7" />
<script src="_static/jquery.js?v=5d32c60e"></script>
<script src="_static/_sphinx_javascript_frameworks_compat.js?v=2cd50e6c"></script>
<script src="_static/documentation_options.js?v=5929fcd5"></script>
<script src="_static/doctools.js?v=888ff710"></script>
<script src="_static/sphinx_highlight.js?v=dc90522c"></script>
<script src="_static/js/theme.js"></script>
<link rel="index" title="Index" href="genindex.html" />
<link rel="search" title="Search" href="search.html" />
<link rel="next" title="mcp_server_webcrawl package" href="mcp_server_webcrawl.html" />
<link rel="prev" title="Interactive Mode" href="interactive.html" />
</head>
<body class="wy-body-for-nav">
<div class="wy-grid-for-nav">
<nav data-toggle="wy-nav-shift" class="wy-nav-side">
<div class="wy-side-scroll">
<div class="wy-side-nav-search" >
<a href="index.html" class="icon icon-home">
mcp-server-webcrawl
</a>
<div role="search">
<form id="rtd-search-form" class="wy-form" action="search.html" method="get">
<input type="text" name="q" placeholder="Search docs" aria-label="Search docs" />
<input type="hidden" name="check_keywords" value="yes" />
<input type="hidden" name="area" value="default" />
</form>
</div>
</div><div class="wy-menu wy-menu-vertical" data-spy="affix" role="navigation" aria-label="Navigation menu">
<p class="caption" role="heading"><span class="caption-text">Contents:</span></p>
<ul class="current">
<li class="toctree-l1"><a class="reference internal" href="installation.html">Installation</a></li>
<li class="toctree-l1"><a class="reference internal" href="guides.html">Setup Guides</a></li>
<li class="toctree-l1"><a class="reference internal" href="usage.html">Usage</a></li>
<li class="toctree-l1"><a class="reference internal" href="prompts.html">Prompt Routines</a></li>
<li class="toctree-l1"><a class="reference internal" href="interactive.html">Interactive Mode</a></li>
<li class="toctree-l1 current"><a class="current reference internal" href="#">mcp_server_webcrawl</a><ul>
<li class="toctree-l2"><a class="reference internal" href="mcp_server_webcrawl.html">mcp_server_webcrawl package</a></li>
</ul>
</li>
</ul>
</div>
</div>
</nav>
<section data-toggle="wy-nav-shift" class="wy-nav-content-wrap"><nav class="wy-nav-top" aria-label="Mobile navigation menu" >
<i data-toggle="wy-nav-top" class="fa fa-bars"></i>
<a href="index.html">mcp-server-webcrawl</a>
</nav>
<div class="wy-nav-content">
<div class="rst-content">
<div role="navigation" aria-label="Page navigation">
<ul class="wy-breadcrumbs">
<li><a href="index.html" class="icon icon-home" aria-label="Home"></a></li>
<li class="breadcrumb-item active">mcp_server_webcrawl</li>
<li class="wy-breadcrumbs-aside">
<a href="_sources/modules.rst.txt" rel="nofollow"> View page source</a>
</li>
</ul>
<hr/>
</div>
<div role="main" class="document" itemscope="itemscope" itemtype="http://schema.org/Article">
<div itemprop="articleBody">
<section id="mcp-server-webcrawl">
<h1>mcp_server_webcrawl<a class="headerlink" href="#mcp-server-webcrawl" title="Link to this heading"></a></h1>
<div class="toctree-wrapper compound">
<ul>
<li class="toctree-l1"><a class="reference internal" href="mcp_server_webcrawl.html">mcp_server_webcrawl package</a><ul>
<li class="toctree-l2"><a class="reference internal" href="mcp_server_webcrawl.html#subpackages">Subpackages</a><ul>
<li class="toctree-l3"><a class="reference internal" href="mcp_server_webcrawl.crawlers.html">mcp_server_webcrawl.crawlers package</a><ul>
<li class="toctree-l4"><a class="reference internal" href="mcp_server_webcrawl.crawlers.html#subpackages">Subpackages</a></li>
<li class="toctree-l4"><a class="reference internal" href="mcp_server_webcrawl.crawlers.html#module-mcp_server_webcrawl.crawlers">Module contents</a></li>
</ul>
</li>
<li class="toctree-l3"><a class="reference internal" href="mcp_server_webcrawl.extras.html">mcp_server_webcrawl.extras package</a><ul>
<li class="toctree-l4"><a class="reference internal" href="mcp_server_webcrawl.extras.html#submodules">Submodules</a></li>
<li class="toctree-l4"><a class="reference internal" href="mcp_server_webcrawl.extras.html#module-mcp_server_webcrawl.extras.markdown">mcp_server_webcrawl.extras.markdown module</a></li>
<li class="toctree-l4"><a class="reference internal" href="mcp_server_webcrawl.extras.html#module-mcp_server_webcrawl.extras.regex">mcp_server_webcrawl.extras.regex module</a></li>
<li class="toctree-l4"><a class="reference internal" href="mcp_server_webcrawl.extras.html#module-mcp_server_webcrawl.extras.snippets">mcp_server_webcrawl.extras.snippets module</a></li>
<li class="toctree-l4"><a class="reference internal" href="mcp_server_webcrawl.extras.html#module-mcp_server_webcrawl.extras.thumbnails">mcp_server_webcrawl.extras.thumbnails module</a></li>
<li class="toctree-l4"><a class="reference internal" href="mcp_server_webcrawl.extras.html#module-mcp_server_webcrawl.extras.xpath">mcp_server_webcrawl.extras.xpath module</a></li>
<li class="toctree-l4"><a class="reference internal" href="mcp_server_webcrawl.extras.html#module-mcp_server_webcrawl.extras">Module contents</a></li>
</ul>
</li>
<li class="toctree-l3"><a class="reference internal" href="mcp_server_webcrawl.interactive.html">mcp_server_webcrawl.interactive package</a><ul>
<li class="toctree-l4"><a class="reference internal" href="mcp_server_webcrawl.interactive.html#submodules">Submodules</a></li>
<li class="toctree-l4"><a class="reference internal" href="mcp_server_webcrawl.interactive.html#module-mcp_server_webcrawl.interactive.highlights">mcp_server_webcrawl.interactive.highlights module</a></li>
<li class="toctree-l4"><a class="reference internal" href="mcp_server_webcrawl.interactive.html#module-mcp_server_webcrawl.interactive.search">mcp_server_webcrawl.interactive.search module</a></li>
<li class="toctree-l4"><a class="reference internal" href="mcp_server_webcrawl.interactive.html#module-mcp_server_webcrawl.interactive.session">mcp_server_webcrawl.interactive.session module</a></li>
<li class="toctree-l4"><a class="reference internal" href="mcp_server_webcrawl.interactive.html#module-mcp_server_webcrawl.interactive.ui">mcp_server_webcrawl.interactive.ui module</a></li>
<li class="toctree-l4"><a class="reference internal" href="mcp_server_webcrawl.interactive.html#module-mcp_server_webcrawl.interactive">Module contents</a></li>
</ul>
</li>
<li class="toctree-l3"><a class="reference internal" href="mcp_server_webcrawl.models.html">mcp_server_webcrawl.models package</a><ul>
<li class="toctree-l4"><a class="reference internal" href="mcp_server_webcrawl.models.html#submodules">Submodules</a></li>
<li class="toctree-l4"><a class="reference internal" href="mcp_server_webcrawl.models.html#module-mcp_server_webcrawl.models.resources">mcp_server_webcrawl.models.resources module</a></li>
<li class="toctree-l4"><a class="reference internal" href="mcp_server_webcrawl.models.html#module-mcp_server_webcrawl.models.sites">mcp_server_webcrawl.models.sites module</a></li>
<li class="toctree-l4"><a class="reference internal" href="mcp_server_webcrawl.models.html#module-mcp_server_webcrawl.models">Module contents</a></li>
</ul>
</li>
<li class="toctree-l3"><a class="reference internal" href="mcp_server_webcrawl.templates.html">mcp_server_webcrawl.templates package</a><ul>
<li class="toctree-l4"><a class="reference internal" href="mcp_server_webcrawl.templates.html#submodules">Submodules</a></li>
<li class="toctree-l4"><a class="reference internal" href="mcp_server_webcrawl.templates.html#module-mcp_server_webcrawl.templates.tests">mcp_server_webcrawl.templates.tests module</a></li>
<li class="toctree-l4"><a class="reference internal" href="mcp_server_webcrawl.templates.html#module-mcp_server_webcrawl.templates">Module contents</a></li>
</ul>
</li>
<li class="toctree-l3"><a class="reference internal" href="mcp_server_webcrawl.utils.html">mcp_server_webcrawl.utils package</a><ul>
<li class="toctree-l4"><a class="reference internal" href="mcp_server_webcrawl.utils.html#submodules">Submodules</a></li>
<li class="toctree-l4"><a class="reference internal" href="mcp_server_webcrawl.utils.html#module-mcp_server_webcrawl.utils.cli">mcp_server_webcrawl.utils.cli module</a></li>
<li class="toctree-l4"><a class="reference internal" href="mcp_server_webcrawl.utils.html#module-mcp_server_webcrawl.utils.logger">mcp_server_webcrawl.utils.logger module</a></li>
<li class="toctree-l4"><a class="reference internal" href="mcp_server_webcrawl.utils.html#module-mcp_server_webcrawl.utils.server">mcp_server_webcrawl.utils.server module</a></li>
<li class="toctree-l4"><a class="reference internal" href="mcp_server_webcrawl.utils.html#module-mcp_server_webcrawl.utils.tools">mcp_server_webcrawl.utils.tools module</a></li>
<li class="toctree-l4"><a class="reference internal" href="mcp_server_webcrawl.utils.html#module-mcp_server_webcrawl.utils">Module contents</a></li>
</ul>
</li>
</ul>
</li>
<li class="toctree-l2"><a class="reference internal" href="mcp_server_webcrawl.html#submodules">Submodules</a></li>
<li class="toctree-l2"><a class="reference internal" href="mcp_server_webcrawl.html#mcp-server-webcrawl-main-module">mcp_server_webcrawl.main module</a></li>
<li class="toctree-l2"><a class="reference internal" href="mcp_server_webcrawl.html#module-mcp_server_webcrawl.settings">mcp_server_webcrawl.settings module</a></li>
<li class="toctree-l2"><a class="reference internal" href="mcp_server_webcrawl.html#module-mcp_server_webcrawl.settings_local">mcp_server_webcrawl.settings_local module</a></li>
<li class="toctree-l2"><a class="reference internal" href="mcp_server_webcrawl.html#module-mcp_server_webcrawl">Module contents</a></li>
</ul>
</li>
</ul>
</div>
</section>
</div>
</div>
<footer><div class="rst-footer-buttons" role="navigation" aria-label="Footer">
<a href="interactive.html" class="btn btn-neutral float-left" title="Interactive Mode" accesskey="p" rel="prev"><span class="fa fa-arrow-circle-left" aria-hidden="true"></span> Previous</a>
<a href="mcp_server_webcrawl.html" class="btn btn-neutral float-right" title="mcp_server_webcrawl package" accesskey="n" rel="next">Next <span class="fa fa-arrow-circle-right" aria-hidden="true"></span></a>
</div>
<hr/>
<div role="contentinfo">
<p>© Copyright 2025, pragmar.</p>
</div>
Built with <a href="https://www.sphinx-doc.org/">Sphinx</a> using a
<a href="https://github.com/readthedocs/sphinx_rtd_theme">theme</a>
provided by <a href="https://readthedocs.org">Read the Docs</a>.
</footer>
</div>
</div>
</section>
</div>
<script>
jQuery(function () {
SphinxRtdTheme.Navigation.enable(true);
});
</script>
</body>
</html>
```
--------------------------------------------------------------------------------
/docs/guides/wget.html:
--------------------------------------------------------------------------------
```html
<!DOCTYPE html>
<html class="writer-html5" lang="en" data-content_root="../">
<head>
<meta charset="utf-8" /><meta name="viewport" content="width=device-width, initial-scale=1" />
<meta name="viewport" content="width=device-width, initial-scale=1.0" />
<title>wget MCP Setup Guide — mcp-server-webcrawl documentation</title>
<link rel="stylesheet" type="text/css" href="../_static/pygments.css?v=80d5e7a1" />
<link rel="stylesheet" type="text/css" href="../_static/css/theme.css?v=e59714d7" />
<script src="../_static/jquery.js?v=5d32c60e"></script>
<script src="../_static/_sphinx_javascript_frameworks_compat.js?v=2cd50e6c"></script>
<script src="../_static/documentation_options.js?v=5929fcd5"></script>
<script src="../_static/doctools.js?v=888ff710"></script>
<script src="../_static/sphinx_highlight.js?v=dc90522c"></script>
<script src="../_static/js/theme.js"></script>
<link rel="index" title="Index" href="../genindex.html" />
<link rel="search" title="Search" href="../search.html" />
<link rel="next" title="Usage" href="../usage.html" />
<link rel="prev" title="WARC MCP Setup Guide" href="warc.html" />
</head>
<body class="wy-body-for-nav">
<div class="wy-grid-for-nav">
<nav data-toggle="wy-nav-shift" class="wy-nav-side">
<div class="wy-side-scroll">
<div class="wy-side-nav-search" >
<a href="../index.html" class="icon icon-home">
mcp-server-webcrawl
</a>
<div role="search">
<form id="rtd-search-form" class="wy-form" action="../search.html" method="get">
<input type="text" name="q" placeholder="Search docs" aria-label="Search docs" />
<input type="hidden" name="check_keywords" value="yes" />
<input type="hidden" name="area" value="default" />
</form>
</div>
</div><div class="wy-menu wy-menu-vertical" data-spy="affix" role="navigation" aria-label="Navigation menu">
<p class="caption" role="heading"><span class="caption-text">Contents:</span></p>
<ul class="current">
<li class="toctree-l1"><a class="reference internal" href="../installation.html">Installation</a></li>
<li class="toctree-l1 current"><a class="reference internal" href="../guides.html">Setup Guides</a><ul class="current">
<li class="toctree-l2"><a class="reference internal" href="archivebox.html">ArchiveBox MCP Setup Guide</a></li>
<li class="toctree-l2"><a class="reference internal" href="httrack.html">HTTrack MCP Setup Guide</a></li>
<li class="toctree-l2"><a class="reference internal" href="interrobot.html">InterroBot MCP Setup Guide</a></li>
<li class="toctree-l2"><a class="reference internal" href="katana.html">Katana MCP Setup Guide</a></li>
<li class="toctree-l2"><a class="reference internal" href="siteone.html">SiteOne MCP Setup Guide</a></li>
<li class="toctree-l2"><a class="reference internal" href="warc.html">WARC MCP Setup Guide</a></li>
<li class="toctree-l2 current"><a class="current reference internal" href="#">wget MCP Setup Guide</a></li>
</ul>
</li>
<li class="toctree-l1"><a class="reference internal" href="../usage.html">Usage</a></li>
<li class="toctree-l1"><a class="reference internal" href="../prompts.html">Prompt Routines</a></li>
<li class="toctree-l1"><a class="reference internal" href="../modules.html">mcp_server_webcrawl</a></li>
</ul>
</div>
</div>
</nav>
<section data-toggle="wy-nav-shift" class="wy-nav-content-wrap"><nav class="wy-nav-top" aria-label="Mobile navigation menu" >
<i data-toggle="wy-nav-top" class="fa fa-bars"></i>
<a href="../index.html">mcp-server-webcrawl</a>
</nav>
<div class="wy-nav-content">
<div class="rst-content">
<div role="navigation" aria-label="Page navigation">
<ul class="wy-breadcrumbs">
<li><a href="../index.html" class="icon icon-home" aria-label="Home"></a></li>
<li class="breadcrumb-item"><a href="../guides.html">Setup Guides</a></li>
<li class="breadcrumb-item active">wget MCP Setup Guide</li>
<li class="wy-breadcrumbs-aside">
<a href="../_sources/guides/wget.rst.txt" rel="nofollow"> View page source</a>
</li>
</ul>
<hr/>
</div>
<div role="main" class="document" itemscope="itemscope" itemtype="http://schema.org/Article">
<div itemprop="articleBody">
<section id="wget-mcp-setup-guide">
<h1>wget MCP Setup Guide<a class="headerlink" href="#wget-mcp-setup-guide" title="Link to this heading"></a></h1>
<p>Instructions for setting up <a class="reference external" href="https://pragmar.com/mcp-server-webcrawl/">mcp-server-webcrawl</a> with
<a class="reference external" href="https://en.wikipedia.org/wiki/Wget">wget</a>.
This allows your LLM (e.g. Claude Desktop) to search content and metadata from websites you’ve crawled.</p>
<iframe width="560" height="315" src="https://www.youtube.com/embed/uqEEqVsofhc" frameborder="0" allowfullscreen></iframe><p>Follow along with the video, or the step-action guide.</p>
<section id="requirements">
<h2>Requirements<a class="headerlink" href="#requirements" title="Link to this heading"></a></h2>
<p>Before you begin, ensure you have:</p>
<ul class="simple">
<li><p><a class="reference external" href="https://claude.ai/download">Claude Desktop</a> installed</p></li>
<li><p><a class="reference external" href="https://python.org">Python</a> 3.10 or later installed</p></li>
<li><p>Basic familiarity with command line interfaces</p></li>
<li><p>wget installed (macOS users can install via Homebrew; Windows users need WSL/Ubuntu; example install commands follow this list)</p></li>
</ul>
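<p>If wget is not already installed, the typical package-manager commands are shown below. These are suggestions only; adjust for your platform and package manager.</p>
<div class="highlight-bash notranslate"><div class="highlight"><pre><span></span># macOS (Homebrew)
brew install wget

# Windows, inside a WSL/Ubuntu shell
sudo apt-get update
sudo apt-get install wget
</pre></div>
</div>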
</section>
<section id="installation-steps">
<h2>Installation Steps<a class="headerlink" href="#installation-steps" title="Link to this heading"></a></h2>
<section id="install-mcp-server-webcrawl">
<h3>1. Install mcp-server-webcrawl<a class="headerlink" href="#install-mcp-server-webcrawl" title="Link to this heading"></a></h3>
<p>Open your terminal or command line and install the package:</p>
<div class="highlight-bash notranslate"><div class="highlight"><pre><span></span>pip<span class="w"> </span>install<span class="w"> </span>mcp-server-webcrawl
</pre></div>
</div>
<p>Verify the installation was successful by checking that the command-line help displays:</p>
<div class="highlight-bash notranslate"><div class="highlight"><pre><span></span>mcp-server-webcrawl<span class="w"> </span>--help
</pre></div>
</div>
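<p>The configuration in the next step may need the absolute path to the console script (macOS in particular). Assuming the script landed on your PATH, the following commands will print that path:</p>
<div class="highlight-bash notranslate"><div class="highlight"><pre><span></span># macOS / Linux / WSL
which mcp-server-webcrawl

# Windows (Command Prompt or PowerShell)
where mcp-server-webcrawl
</pre></div>
</div>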
</section>
<section id="configure-claude-desktop">
<h3>2. Configure Claude Desktop<a class="headerlink" href="#configure-claude-desktop" title="Link to this heading"></a></h3>
<ol class="arabic simple">
<li><p>Open Claude Desktop</p></li>
<li><p>Go to <strong>File → Settings → Developer → Edit Config</strong></p></li>
<li><p>Add the following configuration (modify paths as needed):</p></li>
</ol>
<div class="highlight-json notranslate"><div class="highlight"><pre><span></span><span class="p">{</span>
<span class="w"> </span><span class="nt">"mcpServers"</span><span class="p">:</span><span class="w"> </span><span class="p">{</span>
<span class="w"> </span><span class="nt">"webcrawl"</span><span class="p">:</span><span class="w"> </span><span class="p">{</span>
<span class="w"> </span><span class="nt">"command"</span><span class="p">:</span><span class="w"> </span><span class="s2">"/path/to/mcp-server-webcrawl"</span><span class="p">,</span>
<span class="w"> </span><span class="nt">"args"</span><span class="p">:</span><span class="w"> </span><span class="p">[</span><span class="s2">"--crawler"</span><span class="p">,</span><span class="w"> </span><span class="s2">"wget"</span><span class="p">,</span><span class="w"> </span><span class="s2">"--datasrc"</span><span class="p">,</span>
<span class="w"> </span><span class="s2">"/path/to/wget/archives/"</span><span class="p">]</span>
<span class="w"> </span><span class="p">}</span>
<span class="w"> </span><span class="p">}</span>
<span class="p">}</span>
</pre></div>
</div>
<div class="admonition note">
<p class="admonition-title">Note</p>
<ul class="simple">
<li><p>On Windows, use <code class="docutils literal notranslate"><span class="pre">"mcp-server-webcrawl"</span></code> as the command (a full Windows example follows this note)</p></li>
<li><p>On macOS, use the absolute path (output of <code class="docutils literal notranslate"><span class="pre">which</span> <span class="pre">mcp-server-webcrawl</span></code>)</p></li>
<li><p>Change <code class="docutils literal notranslate"><span class="pre">/path/to/wget/archives/</span></code> to your actual directory path</p></li>
</ul>
</div>
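<p>For reference, a Windows-style configuration might look like the sketch below. The archive path is a placeholder and must point to your own crawl directory; backslashes are doubled per JSON escaping rules.</p>
<div class="highlight-json notranslate"><div class="highlight"><pre><span></span>{
  "mcpServers": {
    "webcrawl": {
      "command": "mcp-server-webcrawl",
      "args": ["--crawler", "wget", "--datasrc", "C:\\path\\to\\wget\\archives\\"]
    }
  }
}
</pre></div>
</div>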
<ol class="arabic simple" start="4">
<li><p>Save the file and <strong>completely exit</strong> Claude Desktop (not just close the window)</p></li>
<li><p>Restart Claude Desktop</p></li>
</ol>
</section>
<section id="crawl-websites-with-wget">
<h3>3. Crawl Websites with wget<a class="headerlink" href="#crawl-websites-with-wget" title="Link to this heading"></a></h3>
<ol class="arabic simple">
<li><p>Open Terminal (macOS) or Ubuntu/WSL (Windows)</p></li>
<li><p>Navigate to your target directory for storing crawls</p></li>
<li><p>Run wget with the mirror option:</p></li>
</ol>
<div class="highlight-bash notranslate"><div class="highlight"><pre><span></span>wget<span class="w"> </span>--mirror<span class="w"> </span>https://example.com
</pre></div>
</div>
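<p>The bare mirror command above is sufficient. If you want a more complete, locally browsable copy, wget's standard options can be combined with <code class="docutils literal notranslate"><span class="pre">--mirror</span></code>; they are optional and not required by mcp-server-webcrawl.</p>
<div class="highlight-bash notranslate"><div class="highlight"><pre><span></span># optional: also fetch page assets, fix extensions, rewrite links locally, and rate-limit politely
wget --mirror --page-requisites --adjust-extension --convert-links --wait=1 https://example.com
</pre></div>
</div>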
</section>
<section id="verify-and-use">
<h3>4. Verify and Use<a class="headerlink" href="#verify-and-use" title="Link to this heading"></a></h3>
<ol class="arabic simple">
<li><p>In Claude Desktop, you should now see an MCP tool option under Search and Tools</p></li>
<li><p>Ask Claude to list your crawled sites:</p></li>
</ol>
<div class="highlight-text notranslate"><div class="highlight"><pre><span></span>Can you list the crawled sites available?
</pre></div>
</div>
<ol class="arabic simple" start="3">
<li><p>Try searching content from your crawls:</p></li>
</ol>
<div class="highlight-text notranslate"><div class="highlight"><pre><span></span>Can you find information about [topic] on [crawled site]?
</pre></div>
</div>
</section>
</section>
<section id="troubleshooting">
<h2>Troubleshooting<a class="headerlink" href="#troubleshooting" title="Link to this heading"></a></h2>
<ul class="simple">
<li><p>If Claude doesn’t show MCP tools after restart, verify your configuration file is correctly formatted (a quick syntax check is shown after this list)</p></li>
<li><p>Ensure Python and mcp-server-webcrawl are properly installed and available on PATH, or reference them by absolute path in the configuration</p></li>
<li><p>Check that your crawl directory path in the configuration is correct</p></li>
<li><p>Remember that the first time you use a function, Claude will ask for permission</p></li>
<li><p>Indexing for file-based archives (wget included) is built on the first search; build time depends on archive size</p></li>
</ul>
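<p>One quick way to catch a malformed configuration file is to run it through a JSON parser. The command below assumes Python is on your PATH; replace the placeholder with the path to your Claude Desktop configuration file.</p>
<div class="highlight-bash notranslate"><div class="highlight"><pre><span></span># prints the parsed JSON on success, or an error naming the offending line
python -m json.tool "/path/to/claude_desktop_config.json"
</pre></div>
</div>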
<p>For more details, including API documentation and other crawler options, visit the <a class="reference external" href="https://github.com/pragmar/mcp-server-webcrawl">mcp-server-webcrawl documentation</a>.</p>
</section>
</section>
</div>
</div>
<footer><div class="rst-footer-buttons" role="navigation" aria-label="Footer">
<a href="warc.html" class="btn btn-neutral float-left" title="WARC MCP Setup Guide" accesskey="p" rel="prev"><span class="fa fa-arrow-circle-left" aria-hidden="true"></span> Previous</a>
<a href="../usage.html" class="btn btn-neutral float-right" title="Usage" accesskey="n" rel="next">Next <span class="fa fa-arrow-circle-right" aria-hidden="true"></span></a>
</div>
<hr/>
<div role="contentinfo">
<p>© Copyright 2025, pragmar.</p>
</div>
Built with <a href="https://www.sphinx-doc.org/">Sphinx</a> using a
<a href="https://github.com/readthedocs/sphinx_rtd_theme">theme</a>
provided by <a href="https://readthedocs.org">Read the Docs</a>.
</footer>
</div>
</div>
</section>
</div>
<script>
jQuery(function () {
SphinxRtdTheme.Navigation.enable(true);
});
</script>
</body>
</html>
```