This is page 5 of 33. Use http://codebase.md/pragmar/mcp_server_webcrawl?page={x} to view the full context.
# Directory Structure
```
├── .gitignore
├── CONTRIBUTING.md
├── docs
│ ├── _images
│ │ ├── interactive.document.webp
│ │ ├── interactive.search.webp
│ │ └── mcpswc.svg
│ ├── _modules
│ │ ├── index.html
│ │ ├── mcp_server_webcrawl
│ │ │ ├── crawlers
│ │ │ │ ├── archivebox
│ │ │ │ │ ├── adapter.html
│ │ │ │ │ ├── crawler.html
│ │ │ │ │ └── tests.html
│ │ │ │ ├── base
│ │ │ │ │ ├── adapter.html
│ │ │ │ │ ├── api.html
│ │ │ │ │ ├── crawler.html
│ │ │ │ │ ├── indexed.html
│ │ │ │ │ └── tests.html
│ │ │ │ ├── httrack
│ │ │ │ │ ├── adapter.html
│ │ │ │ │ ├── crawler.html
│ │ │ │ │ └── tests.html
│ │ │ │ ├── interrobot
│ │ │ │ │ ├── adapter.html
│ │ │ │ │ ├── crawler.html
│ │ │ │ │ └── tests.html
│ │ │ │ ├── katana
│ │ │ │ │ ├── adapter.html
│ │ │ │ │ ├── crawler.html
│ │ │ │ │ └── tests.html
│ │ │ │ ├── siteone
│ │ │ │ │ ├── adapter.html
│ │ │ │ │ ├── crawler.html
│ │ │ │ │ └── tests.html
│ │ │ │ ├── warc
│ │ │ │ │ ├── adapter.html
│ │ │ │ │ ├── crawler.html
│ │ │ │ │ └── tests.html
│ │ │ │ └── wget
│ │ │ │ ├── adapter.html
│ │ │ │ ├── crawler.html
│ │ │ │ └── tests.html
│ │ │ ├── crawlers.html
│ │ │ ├── extras
│ │ │ │ ├── markdown.html
│ │ │ │ ├── regex.html
│ │ │ │ ├── snippets.html
│ │ │ │ ├── thumbnails.html
│ │ │ │ └── xpath.html
│ │ │ ├── interactive
│ │ │ │ ├── highlights.html
│ │ │ │ ├── search.html
│ │ │ │ ├── session.html
│ │ │ │ └── ui.html
│ │ │ ├── main.html
│ │ │ ├── models
│ │ │ │ ├── resources.html
│ │ │ │ └── sites.html
│ │ │ ├── templates
│ │ │ │ └── tests.html
│ │ │ ├── utils
│ │ │ │ ├── blobs.html
│ │ │ │ ├── cli.html
│ │ │ │ ├── logger.html
│ │ │ │ ├── querycache.html
│ │ │ │ ├── server.html
│ │ │ │ └── tools.html
│ │ │ └── utils.html
│ │ └── re.html
│ ├── _sources
│ │ ├── guides
│ │ │ ├── archivebox.rst.txt
│ │ │ ├── httrack.rst.txt
│ │ │ ├── interrobot.rst.txt
│ │ │ ├── katana.rst.txt
│ │ │ ├── siteone.rst.txt
│ │ │ ├── warc.rst.txt
│ │ │ └── wget.rst.txt
│ │ ├── guides.rst.txt
│ │ ├── index.rst.txt
│ │ ├── installation.rst.txt
│ │ ├── interactive.rst.txt
│ │ ├── mcp_server_webcrawl.crawlers.archivebox.rst.txt
│ │ ├── mcp_server_webcrawl.crawlers.base.rst.txt
│ │ ├── mcp_server_webcrawl.crawlers.httrack.rst.txt
│ │ ├── mcp_server_webcrawl.crawlers.interrobot.rst.txt
│ │ ├── mcp_server_webcrawl.crawlers.katana.rst.txt
│ │ ├── mcp_server_webcrawl.crawlers.rst.txt
│ │ ├── mcp_server_webcrawl.crawlers.siteone.rst.txt
│ │ ├── mcp_server_webcrawl.crawlers.warc.rst.txt
│ │ ├── mcp_server_webcrawl.crawlers.wget.rst.txt
│ │ ├── mcp_server_webcrawl.extras.rst.txt
│ │ ├── mcp_server_webcrawl.interactive.rst.txt
│ │ ├── mcp_server_webcrawl.models.rst.txt
│ │ ├── mcp_server_webcrawl.rst.txt
│ │ ├── mcp_server_webcrawl.templates.rst.txt
│ │ ├── mcp_server_webcrawl.utils.rst.txt
│ │ ├── modules.rst.txt
│ │ ├── prompts.rst.txt
│ │ └── usage.rst.txt
│ ├── _static
│ │ ├── _sphinx_javascript_frameworks_compat.js
│ │ ├── basic.css
│ │ ├── css
│ │ │ ├── badge_only.css
│ │ │ ├── fonts
│ │ │ │ ├── fontawesome-webfont.eot
│ │ │ │ ├── fontawesome-webfont.svg
│ │ │ │ ├── fontawesome-webfont.ttf
│ │ │ │ ├── fontawesome-webfont.woff
│ │ │ │ ├── fontawesome-webfont.woff2
│ │ │ │ ├── lato-bold-italic.woff
│ │ │ │ ├── lato-bold-italic.woff2
│ │ │ │ ├── lato-bold.woff
│ │ │ │ ├── lato-bold.woff2
│ │ │ │ ├── lato-normal-italic.woff
│ │ │ │ ├── lato-normal-italic.woff2
│ │ │ │ ├── lato-normal.woff
│ │ │ │ ├── lato-normal.woff2
│ │ │ │ ├── Roboto-Slab-Bold.woff
│ │ │ │ ├── Roboto-Slab-Bold.woff2
│ │ │ │ ├── Roboto-Slab-Regular.woff
│ │ │ │ └── Roboto-Slab-Regular.woff2
│ │ │ └── theme.css
│ │ ├── doctools.js
│ │ ├── documentation_options.js
│ │ ├── file.png
│ │ ├── fonts
│ │ │ ├── Lato
│ │ │ │ ├── lato-bold.eot
│ │ │ │ ├── lato-bold.ttf
│ │ │ │ ├── lato-bold.woff
│ │ │ │ ├── lato-bold.woff2
│ │ │ │ ├── lato-bolditalic.eot
│ │ │ │ ├── lato-bolditalic.ttf
│ │ │ │ ├── lato-bolditalic.woff
│ │ │ │ ├── lato-bolditalic.woff2
│ │ │ │ ├── lato-italic.eot
│ │ │ │ ├── lato-italic.ttf
│ │ │ │ ├── lato-italic.woff
│ │ │ │ ├── lato-italic.woff2
│ │ │ │ ├── lato-regular.eot
│ │ │ │ ├── lato-regular.ttf
│ │ │ │ ├── lato-regular.woff
│ │ │ │ └── lato-regular.woff2
│ │ │ └── RobotoSlab
│ │ │ ├── roboto-slab-v7-bold.eot
│ │ │ ├── roboto-slab-v7-bold.ttf
│ │ │ ├── roboto-slab-v7-bold.woff
│ │ │ ├── roboto-slab-v7-bold.woff2
│ │ │ ├── roboto-slab-v7-regular.eot
│ │ │ ├── roboto-slab-v7-regular.ttf
│ │ │ ├── roboto-slab-v7-regular.woff
│ │ │ └── roboto-slab-v7-regular.woff2
│ │ ├── images
│ │ │ ├── interactive.document.png
│ │ │ ├── interactive.document.webp
│ │ │ ├── interactive.search.png
│ │ │ ├── interactive.search.webp
│ │ │ └── mcpswc.svg
│ │ ├── jquery.js
│ │ ├── js
│ │ │ ├── badge_only.js
│ │ │ ├── theme.js
│ │ │ └── versions.js
│ │ ├── language_data.js
│ │ ├── minus.png
│ │ ├── plus.png
│ │ ├── pygments.css
│ │ ├── searchtools.js
│ │ └── sphinx_highlight.js
│ ├── .buildinfo
│ ├── .nojekyll
│ ├── genindex.html
│ ├── guides
│ │ ├── archivebox.html
│ │ ├── httrack.html
│ │ ├── interrobot.html
│ │ ├── katana.html
│ │ ├── siteone.html
│ │ ├── warc.html
│ │ └── wget.html
│ ├── guides.html
│ ├── index.html
│ ├── installation.html
│ ├── interactive.html
│ ├── mcp_server_webcrawl.crawlers.archivebox.html
│ ├── mcp_server_webcrawl.crawlers.base.html
│ ├── mcp_server_webcrawl.crawlers.html
│ ├── mcp_server_webcrawl.crawlers.httrack.html
│ ├── mcp_server_webcrawl.crawlers.interrobot.html
│ ├── mcp_server_webcrawl.crawlers.katana.html
│ ├── mcp_server_webcrawl.crawlers.siteone.html
│ ├── mcp_server_webcrawl.crawlers.warc.html
│ ├── mcp_server_webcrawl.crawlers.wget.html
│ ├── mcp_server_webcrawl.extras.html
│ ├── mcp_server_webcrawl.html
│ ├── mcp_server_webcrawl.interactive.html
│ ├── mcp_server_webcrawl.models.html
│ ├── mcp_server_webcrawl.templates.html
│ ├── mcp_server_webcrawl.utils.html
│ ├── modules.html
│ ├── objects.inv
│ ├── prompts.html
│ ├── py-modindex.html
│ ├── search.html
│ ├── searchindex.js
│ └── usage.html
├── LICENSE
├── MANIFEST.in
├── prompts
│ ├── audit404.md
│ ├── auditfiles.md
│ ├── auditperf.md
│ ├── auditseo.md
│ ├── gopher.md
│ ├── README.md
│ └── testsearch.md
├── pyproject.toml
├── README.md
├── setup.py
├── sphinx
│ ├── _static
│ │ └── images
│ │ ├── interactive.document.png
│ │ ├── interactive.document.webp
│ │ ├── interactive.search.png
│ │ ├── interactive.search.webp
│ │ └── mcpswc.svg
│ ├── _templates
│ │ └── layout.html
│ ├── conf.py
│ ├── guides
│ │ ├── archivebox.rst
│ │ ├── httrack.rst
│ │ ├── interrobot.rst
│ │ ├── katana.rst
│ │ ├── siteone.rst
│ │ ├── warc.rst
│ │ └── wget.rst
│ ├── guides.rst
│ ├── index.rst
│ ├── installation.rst
│ ├── interactive.rst
│ ├── make.bat
│ ├── Makefile
│ ├── mcp_server_webcrawl.crawlers.archivebox.rst
│ ├── mcp_server_webcrawl.crawlers.base.rst
│ ├── mcp_server_webcrawl.crawlers.httrack.rst
│ ├── mcp_server_webcrawl.crawlers.interrobot.rst
│ ├── mcp_server_webcrawl.crawlers.katana.rst
│ ├── mcp_server_webcrawl.crawlers.rst
│ ├── mcp_server_webcrawl.crawlers.siteone.rst
│ ├── mcp_server_webcrawl.crawlers.warc.rst
│ ├── mcp_server_webcrawl.crawlers.wget.rst
│ ├── mcp_server_webcrawl.extras.rst
│ ├── mcp_server_webcrawl.interactive.rst
│ ├── mcp_server_webcrawl.models.rst
│ ├── mcp_server_webcrawl.rst
│ ├── mcp_server_webcrawl.templates.rst
│ ├── mcp_server_webcrawl.utils.rst
│ ├── modules.rst
│ ├── prompts.rst
│ ├── readme.txt
│ └── usage.rst
└── src
└── mcp_server_webcrawl
├── __init__.py
├── crawlers
│ ├── __init__.py
│ ├── archivebox
│ │ ├── __init__.py
│ │ ├── adapter.py
│ │ ├── crawler.py
│ │ └── tests.py
│ ├── base
│ │ ├── __init__.py
│ │ ├── adapter.py
│ │ ├── api.py
│ │ ├── crawler.py
│ │ ├── indexed.py
│ │ └── tests.py
│ ├── httrack
│ │ ├── __init__.py
│ │ ├── adapter.py
│ │ ├── crawler.py
│ │ └── tests.py
│ ├── interrobot
│ │ ├── __init__.py
│ │ ├── adapter.py
│ │ ├── crawler.py
│ │ └── tests.py
│ ├── katana
│ │ ├── __init__.py
│ │ ├── adapter.py
│ │ ├── crawler.py
│ │ └── tests.py
│ ├── siteone
│ │ ├── __init__.py
│ │ ├── adapter.py
│ │ ├── crawler.py
│ │ └── tests.py
│ ├── warc
│ │ ├── __init__.py
│ │ ├── adapter.py
│ │ ├── crawler.py
│ │ └── tests.py
│ └── wget
│ ├── __init__.py
│ ├── adapter.py
│ ├── crawler.py
│ └── tests.py
├── extras
│ ├── __init__.py
│ ├── markdown.py
│ ├── regex.py
│ ├── snippets.py
│ ├── thumbnails.py
│ └── xpath.py
├── interactive
│ ├── __init__.py
│ ├── highlights.py
│ ├── search.py
│ ├── session.py
│ ├── ui.py
│ └── views
│ ├── base.py
│ ├── document.py
│ ├── help.py
│ ├── requirements.py
│ ├── results.py
│ └── searchform.py
├── main.py
├── models
│ ├── __init__.py
│ ├── base.py
│ ├── resources.py
│ └── sites.py
├── settings.py
├── templates
│ ├── __init__.py
│ ├── markdown.xslt
│ ├── tests_core.html
│ └── tests.py
└── utils
├── __init__.py
├── cli.py
├── logger.py
├── parser.py
├── parsetab.py
├── search.py
├── server.py
├── tests.py
└── tools.py
```
# Files
--------------------------------------------------------------------------------
/src/mcp_server_webcrawl/utils/parser.py:
--------------------------------------------------------------------------------
```python
import re
from ply import lex
from ply import yacc
from logging import Logger
from mcp_server_webcrawl.models.resources import RESOURCES_DEFAULT_FIELD_MAPPING
from mcp_server_webcrawl.utils.logger import get_logger
logger: Logger = get_logger()
class SearchSubquery:
"""
Subquery component in a structured search.
These are grouped into an ordered list, and are the basis the SQL query.
"""
def __init__(
self,
field: str | None,
value: str | int,
type: str,
modifiers: list[str] | None,
operator: str | None,
comparator: str = "=",
group: int | None = None,
):
"""
Initialize a SearchSubquery instance.
Args:
field: field to search, or None for fulltext search
value: search value (string or integer)
type: value type (term, phrase, wildcard, etc.)
modifiers: list of modifiers applied to the query (e.g., 'NOT')
operator: boolean operator connecting to the next subquery ('AND', 'OR', or None)
comparator: comparison operator for numerics ('=', '>', '>=', '<', '<=', '!=')
"""
self.field: str | None = field
self.value: str | int = value
self.type: str = type
self.modifiers: list[str] = modifiers or []
self.operator: str | None = operator or None
self.comparator: str = comparator
self.group: int | None = group
def get_safe_sql_field(self, field: str) -> str:
if field in RESOURCES_DEFAULT_FIELD_MAPPING:
return RESOURCES_DEFAULT_FIELD_MAPPING[field]
else:
logger.error(f"Field {field} failed to validate.")
raise Exception(f"Unknown database field {field}")
def to_dict(self) -> dict[str, str | int | list[str] | None]:
"""
Convert SearchSubquery to dictionary representation.
Args:
field: Field name to use in the dictionary (overrides self.field)
Returns:
Dictionary containing all SearchSubquery attributes
"""
return {
"field": self.field,
"value": self.value,
"type": self.type,
"modifiers": self.modifiers,
"operator": self.operator,
"comparator": self.comparator,
"group": self.group,
}
class SearchLexer:
    """
    PLY (lex) tokenizer for the search query language.

    Each t_* method's docstring is the regular expression for that token, so
    those docstrings are code and must not be reworded. The "precedence
    matters" notes below mark rules whose position relative to their
    neighbours is significant.
    """
    tokens = (
        "FIELD", # e.g. url:, content:
        "QUOTED_STRING", # "hello world"
        "TERM", # standard search term
        "WILDCARD", # wildcard terms, e.g. search*
        "AND",
        "OR",
        "NOT",
        "LPAREN", # (
        "RPAREN", # )
        "COLON", # :
        "COMPARATOR", # :>=, :>, :<, etc.
        "COMP_OP", # >=
        "URL_FIELD"
    )
    # field names accepted by t_FIELD; anything else raises ValueError
    valid_fields: list[str] = ["id", "url", "status", "type", "size", "headers", "content", "time"]
    t_LPAREN = r"\("
    t_RPAREN = r"\)"
    # characters skipped between tokens
    t_ignore = " \t\n"
    def __init__(self):
        # build the PLY lexer from the rules defined on this instance
        self.lexer = lex.lex(module=self)
    def t_COMPARATOR(self, token: lex.LexToken) -> lex.LexToken:
        r":(?:>=|>|<=|<|!=|=)"
        # colon immediately followed by a comparison operator, e.g. ":>="
        token.value = token.value[1:] # strip colon
        return token
    def t_COLON(self, token: lex.LexToken) -> lex.LexToken:
        r":"
        # bare colon; defined after t_COMPARATOR so ":>=" is not split
        return token
    def t_QUOTED_STRING(self, token: lex.LexToken) -> lex.LexToken:
        r'"[^"]*"'
        # drop the surrounding double quotes
        token.value = token.value[1:-1]
        return token
    # precedence matters
    def t_URL_FIELD(self, token: lex.LexToken) -> lex.LexToken:
        # this field must terminate not only on url end, but on parens
        r"url\s*:\s*((?:https?://)?[^\s()]+)"
        token.type = "URL_FIELD"
        # everything after the first colon, trimmed, is the url value
        url_value = token.value[token.value.find(':')+1:].strip()
        # value becomes a (field, value) tuple, unpacked by the parser
        token.value = ("url", url_value)
        return token
    # precedence matters
    def t_FIELD(self, token: lex.LexToken) -> lex.LexToken:
        r"[a-zA-Z_][a-zA-Z0-9_]*(?=\s*:)"
        # identifier followed (lookahead only) by optional whitespace and a colon
        if token.value not in self.valid_fields:
            raise ValueError(f"Invalid field: {token.value}. Valid fields are: {', '.join(self.valid_fields)}")
        return token
    def t_AND(self, token: lex.LexToken) -> lex.LexToken:
        r"AND\b"
        # uppercase keyword only; lowercase "and" lexes as a TERM
        return token
    def t_OR(self, token: lex.LexToken) -> lex.LexToken:
        r"OR\b"
        # uppercase keyword only
        return token
    def t_NOT(self, token: lex.LexToken) -> lex.LexToken:
        r"NOT\b"
        # uppercase keyword only
        return token
    def t_WILDCARD(self, token: lex.LexToken) -> lex.LexToken:
        r"[a-zA-Z0-9_\.\-\/\+]+\*"
        # term with a trailing asterisk; the asterisk is removed from the value
        token.value = token.value[:-1]
        return token
    def t_TERM(self, token: lex.LexToken) -> lex.LexToken:
        r"[a-zA-Z0-9_\.\-\/\+]+"
        # dedicated t_AND, t_OR, t_NOT to handle those
        # this is fts5 workaround, -_ are tokenizer preserves
        # terms shaped like word-word / word_word are retyped as QUOTED_STRING
        if re.match(r"^[\w]+[\-_][\-_\w]+$", token.value, re.UNICODE):
            token.type = "QUOTED_STRING"
        return token
    def t_COMP_OP(self, token: lex.LexToken) -> lex.LexToken:
        r">=|>|<=|<|!=|="
        # comparison operator that is not directly attached to a colon
        return token
    def t_error(self, token: lex.LexToken) -> None:
        # log the offending character and skip it so lexing can continue
        logger.error(f"Illegal character '{token.value[0]}'")
        token.lexer.skip(1)
class SearchParser:
    """
    PLY (yacc) grammar that turns SearchLexer tokens into SearchSubquery objects.

    A query reduces either to a single SearchSubquery (one bare term or field
    search) or to an ordered list of SearchSubquery objects whose ``operator``
    attribute links each entry to the next.

    The docstring of every p_* method is a grammar rule read by PLY, so those
    docstrings are code; explanatory notes are kept in ``#`` comments.
    """
    tokens = SearchLexer.tokens
    # NOTE(review): PLY gives entries listed later a higher precedence, which
    # would make OR bind tighter than AND here and NOT bind loosest - confirm
    # this ordering is intended before relying on it.
    precedence = (
        ('right', 'NOT'),
        ('left', 'AND'),
        ('left', 'OR'),
    )
    # fields whose values are converted to numbers and may use comparators
    numeric_fields: list[str] = ["id", "status", "size", "time"]

    def __init__(self, lexer):
        # lexer: the tokenizer whose token list this grammar shares
        self.lexer = lexer
        # running counter used to label parenthesized groups (first id is 1)
        self.__group_sequence: int = 0
        self.parser = yacc.yacc(module=self, debug=False)

    def p_query(self, production: yacc.YaccProduction) -> None:
        """
        query : expression
        """
        production[0] = production[1]

    def p_expression_binary(self, production: yacc.YaccProduction) -> None:
        """
        expression : expression AND expression
                   | expression OR expression
                   | expression NOT expression
        """
        operator = production[2]
        left = production[1]
        right = production[3]

        # special handling for the AND NOT pattern:
        # A AND (NOT B) is treated like the binary form A NOT B
        if (operator == "AND" and isinstance(right, list) and
                len(right) == 1 and "NOT" in right[0].modifiers):
            # remove the NOT modifier, the relationship is carried by the operator
            right[0].modifiers = [m for m in right[0].modifiers if m != "NOT"]
            operator = "NOT"

        # binary NOT is handled as a set difference (left EXCEPT right); for
        # every operator the link is stored on the last subquery of the left side
        if isinstance(left, list):
            if left:
                left[-1].operator = operator
            if isinstance(right, list):
                production[0] = left + right
            else:
                # NOTE(review): for AND/OR the appended right-hand subquery also
                # carries the operator although nothing follows it yet, while
                # NOT (and the two-term case below) leave it as None. Kept
                # as-is to preserve existing output - confirm whether the
                # trailing operator is intended.
                trailing_operator = None if operator == "NOT" else operator
                production[0] = left + [self.__create_subquery(right, trailing_operator)]
        elif isinstance(right, list):
            production[0] = [self.__create_subquery(left, operator)] + right
        else:
            # both sides are single terms, create subqueries for both
            production[0] = [
                self.__create_subquery(left, operator),
                self.__create_subquery(right, None)
            ]

    def p_expression_not(self, production: yacc.YaccProduction) -> None:
        """
        expression : NOT expression
        """
        # unary (prefix) NOT: tag every affected subquery with a NOT modifier
        expr = production[2]
        if isinstance(expr, list):
            for item in expr:
                item.modifiers.append("NOT")
            production[0] = expr
        else:
            subquery = self.__create_subquery(expr, None)
            subquery.modifiers.append("NOT")
            production[0] = [subquery]

    def p_expression_group(self, production: yacc.YaccProduction) -> None:
        """
        expression : LPAREN expression RPAREN
        """
        expr = production[2]
        # A per-parser counter replaces id(production): PLY passes the same
        # YaccProduction object to every rule during a parse, so its id() is
        # identical for all groups and could not tell two groups apart.
        # Ids start at 1 so a group id is always truthy, as id() values were.
        self.__group_sequence += 1
        group_id = self.__group_sequence

        # mark all subqueries in this expression with the group; an enclosing
        # group reduced later overwrites the ids assigned by inner groups
        if isinstance(expr, list):
            for subquery in expr:
                subquery.group = group_id
        else:
            expr.group = group_id
        production[0] = expr

    def p_expression_url_field(self, production: yacc.YaccProduction) -> None:
        """
        expression : URL_FIELD
        """
        field, value = production[1]  # the lexer packs (field, value) into a tuple
        # a trailing * on the url requests wildcard matching
        value_type = "term"
        if value.endswith('*'):
            value = value[:-1]  # remove wildcard
            value_type = "wildcard"
        production[0] = SearchSubquery(
            field=field,
            value=value,
            type=value_type,
            modifiers=[],
            operator=None
        )

    def p_value(self, production: yacc.YaccProduction) -> None:
        """
        value : TERM
              | WILDCARD
              | QUOTED_STRING
        """
        # the value type follows the token type that matched
        token_type = production.slice[1].type
        if token_type == "WILDCARD":
            value_type = "wildcard"
        elif token_type == "QUOTED_STRING":
            value_type = "phrase"
        else:
            value_type = "term"
        production[0] = {"value": production[1], "type": value_type}

    def p_expression_term(self, production: yacc.YaccProduction) -> None:
        """
        expression : value
        """
        term = production[1]
        production[0] = SearchSubquery(
            field=None,  # no field means fulltext search
            value=term["value"],
            type=term["type"],
            modifiers=[],
            operator=None
        )

    def p_expression_field_search(self, production: yacc.YaccProduction) -> None:
        """
        expression : FIELD COLON COMP_OP value
                   | FIELD COLON value
                   | FIELD COMPARATOR value
        """
        field = production[1]
        # determine comparator and value based on which alternative matched
        if len(production) == 5:
            # FIELD COLON COMP_OP value
            comparator, value = production[3], production[4]
        elif production[2] == ":":
            # FIELD COLON value, defaults to equality
            comparator, value = "=", production[3]
        else:
            # FIELD COMPARATOR value, the lexer already stripped the colon
            comparator, value = production[2], production[3]
        production[0] = self.__create_field_subquery(field, value, comparator)

    def __create_field_subquery(self, field: str, value_dict: dict[str, str] | str | int, comparator: str = "=") -> SearchSubquery:
        """
        Helper method to create SearchSubquery for field searches.
        Consolidates all the validation and conversion logic.

        Args:
            field: the field name being searched
            value_dict: dictionary with 'value' and 'type' keys, or a raw value
            comparator: comparison operator applied to the field

        Returns:
            SearchSubquery for the field with no modifiers or operator set

        Raises:
            ValueError: if the comparator is not allowed for the field, or a
                numeric field receives a non-numeric value
        """
        self.__validate_comparator_for_field(field, comparator)
        processed_value = self.__process_field_value(field, value_dict)
        # raw (non-dict) values carry no type information, treat them as terms
        value_type = value_dict.get("type", "term") if isinstance(value_dict, dict) else "term"
        return SearchSubquery(
            field=field,
            value=processed_value,
            type=value_type,
            modifiers=[],
            operator=None,
            comparator=comparator
        )

    def __create_subquery(self, term, operator: str | None):
        """
        Helper to create a SearchSubquery instance.

        Copies every attribute of ``term`` (modifiers as a new list) and sets
        the supplied operator on the copy.

        Args:
            term: the SearchSubquery to copy
            operator: operator to store on the copy

        Returns:
            A new SearchSubquery
        """
        assert isinstance(term, SearchSubquery), "__create_subquery expected a SearchSubquery instance"
        return SearchSubquery(
            field=term.field,
            value=term.value,
            type=term.type,
            modifiers=term.modifiers.copy(),
            operator=operator,
            comparator=term.comparator,
            group=term.group,
        )

    def __process_field_value(
        self,
        field: str | None,
        value_dict: dict[str, str] | str | int,
        swap_values: dict[str, dict[str, str | int]] | None = None
    ) -> str | int | float:
        """
        Process and validate a field value with type conversion and swapping.

        Args:
            field: The field name (or None for fulltext)
            value_dict: Dictionary with 'value' and 'type' keys, or raw value
            swap_values: Optional dictionary for value replacement, keyed by
                field name ("" for fulltext) and then by original value

        Returns:
            Processed value (string, int, or float)

        Raises:
            ValueError: if a numeric field receives a value that is neither
                an int nor a float
        """
        value = value_dict["value"] if isinstance(value_dict, dict) else value_dict

        if swap_values:
            swap_key = field if field else ""
            if swap_key in swap_values and value in swap_values[swap_key]:
                value = swap_values[swap_key][value]

        if field and field in self.numeric_fields:
            # prefer int, fall back to float, otherwise reject the value
            for convert in (int, float):
                try:
                    return convert(value)
                except ValueError:
                    continue
            raise ValueError(f"Field {field} requires a numeric value, got: {value}")
        return value

    def __validate_comparator_for_field(self, field: str, comparator: str) -> None:
        """
        Validate that a comparator is appropriate for the given field.

        Args:
            field: The field name
            comparator: The comparison operator

        Raises:
            ValueError: If comparator is invalid for the field type
        """
        if comparator != "=" and field not in self.numeric_fields:
            raise ValueError(f"Comparison operator '{comparator}' can only be used with numeric fields")

    def p_error(self, production: yacc.YaccProduction | None) -> None:
        # syntax errors are only logged; the argument is None at end of input
        if production:
            logger.info(f"Syntax error at '{production.value}'")
        else:
            logger.info("Syntax error at EOF")
--------------------------------------------------------------------------------
/src/mcp_server_webcrawl/interactive/views/requirements.py:
--------------------------------------------------------------------------------
```python
import curses
import os
import traceback
from enum import Enum, auto
from pathlib import Path
from typing import TYPE_CHECKING
from mcp_server_webcrawl.crawlers import VALID_CRAWLER_CHOICES, get_crawler
from mcp_server_webcrawl.crawlers.base.api import BaseJsonApi
from mcp_server_webcrawl.crawlers.base.crawler import BaseCrawler
from mcp_server_webcrawl.interactive.ui import InputRadioGroup, InputText, ThemeDefinition, UiState
from mcp_server_webcrawl.interactive.views.base import BaseCursesView
from mcp_server_webcrawl.interactive.ui import safe_addstr
from mcp_server_webcrawl.interactive.views.searchform import SearchFormView
if TYPE_CHECKING:
from mcp_server_webcrawl.interactive.session import InteractiveSession
LAYOUT_BOX_MAX_WIDTH = 60
LAYOUT_BOX_MARGIN = 8
VALIDATION_HEADER_X_OFFSET = 24
VALIDATION_TEXT_INDENT = 2
class RequirementsFormField(Enum):
    """
    Field groups of the requirements form that can hold the selection.
    """
    DATASRC = auto()  # the data source path text input
    CRAWLER = auto()  # the crawler radio group
class RequirementsView(BaseCursesView):
"""
Interactive requirements view for configuring crawler and data source.
"""
def __init__(self, session: 'InteractiveSession', crawler: str, datasrc: str):
"""
Initialize the requirements view.
Args:
session: The interactive session instance
crawler: Initial crawler type selection
datasrc: Initial data source path
"""
super().__init__(session)
self.__validated: bool = self.__validate(crawler, datasrc)
self.__form_selected_field: RequirementsFormField = RequirementsFormField.DATASRC
self.__form_selected_index: int = 0
initial_datasrc: str = datasrc if datasrc is not None else self.__get_default_directory()
self.__datasrc_input: InputText = InputText(initial_value=initial_datasrc, label="Data Source Path")
self.__crawler_group: InputRadioGroup = InputRadioGroup("crawler")
if not self.__validated:
detected_crawler: str | None
detected_datasrc: str | None
detected_crawler, detected_datasrc = self.__autosense_crawler_and_datasrc()
initial_crawler: str = crawler if crawler is not None else detected_crawler
initial_datasrc = datasrc if datasrc is not None else detected_datasrc
self.__set_initial_crawler_selection(initial_crawler)
self.__datasrc_input.set_value(initial_datasrc)
self._focused: bool = True
@property
def validated(self) -> bool:
return self.__validated
def handle_input(self, key: int) -> bool:
"""
Handle keyboard input for requirements form navigation and validation.
Args:
key: The curses key code from user input
Returns:
bool: True if the input was handled, False otherwise
"""
handlers: dict[int, callable] = {
curses.KEY_UP: self.__navigate_form_selection_up,
curses.KEY_DOWN: self.__navigate_form_selection_down,
ord('\t'): self.__handle_tab,
ord(' '): self.__handle_spacebar,
ord('\n'): self.__handle_enter,
ord('\r'): self.__handle_enter,
}
handler = handlers.get(key)
if handler:
handler()
return True
if (self.__form_selected_field == RequirementsFormField.DATASRC and
self.__form_selected_index == 0):
return self.__datasrc_input.handle_input(key)
return False
def render(self, stdscr: curses.window) -> None:
"""
Render the requirements form showing crawler selection and datasrc input.
Args:
stdscr: The curses window to draw on
"""
xb: int = self.bounds.x
yb: int = self.bounds.y
y_current: int = yb + 2
# y_max: int = yb + self.bounds.height
safe_addstr(stdscr, y_current, xb + 2, "Data Source Path:", curses.A_BOLD)
y_current += 1
box_width: int = min(LAYOUT_BOX_MAX_WIDTH, self.bounds.width - LAYOUT_BOX_MARGIN)
is_datasrc_selected: bool = (
self.__form_selected_field == RequirementsFormField.DATASRC
and self.__form_selected_index == 0
)
field_style: int
if is_datasrc_selected:
field_style = curses.A_REVERSE
else:
field_style = self.session.get_theme_color_pair(ThemeDefinition.INACTIVE_QUERY)
self.__datasrc_input.render(stdscr, y_current, xb + 4, box_width,
focused=is_datasrc_selected, style=field_style)
y_current += 2
crawler_y_start: int = y_current
safe_addstr(stdscr, y_current, xb + 2, self.__crawler_group.label, curses.A_BOLD)
y_current += 1
for i, radio in enumerate(self.__crawler_group.radios):
crawler_field_index: int = i + 1
is_crawler_field_selected: bool = (self.__form_selected_field == RequirementsFormField.CRAWLER and
self.__form_selected_index == crawler_field_index)
radio.render(stdscr, y_current, xb + 4, crawler_field_index, 100, is_crawler_field_selected)
y_current += 1
validation_y: int = crawler_y_start
selected_crawler: str = self.__crawler_group.value
crawler_valid: bool = selected_crawler in VALID_CRAWLER_CHOICES
crawler_symbol: str = "🗹" if crawler_valid else "☒"
crawler_style: int
if crawler_valid:
crawler_style = curses.A_NORMAL
else:
crawler_style = self.session.get_theme_color_pair(ThemeDefinition.UI_ERROR)
datasrc_path: str = self.__datasrc_input.value
datasrc_path_obj: Path = Path(datasrc_path)
datasrc_exists: bool = datasrc_path_obj.exists()
datasrc_symbol: str
datasrc_valid: bool
if not datasrc_exists:
datasrc_symbol = "☒"
datasrc_valid = False
else:
is_correct_type: bool
if selected_crawler in ("interrobot", "warc"):
is_correct_type = datasrc_path_obj.is_file()
else:
is_correct_type = datasrc_path_obj.is_dir()
datasrc_symbol = "🗹" if is_correct_type else "☒"
datasrc_valid = is_correct_type
datasrc_style: int
if datasrc_valid:
datasrc_style = curses.A_NORMAL
else:
datasrc_style = self.session.get_theme_color_pair(ThemeDefinition.UI_ERROR)
validation_header: str = "Validation Status:"
header_x: int = xb + VALIDATION_HEADER_X_OFFSET
safe_addstr(stdscr, validation_y, header_x, validation_header, curses.A_BOLD)
validation_y += 1
validation_word_x: int = header_x
crawler_text: str = f"{crawler_symbol} --crawler"
safe_addstr(stdscr, validation_y, validation_word_x, " ", curses.A_NORMAL)
safe_addstr(stdscr, validation_y, validation_word_x + VALIDATION_TEXT_INDENT, crawler_text, crawler_style)
validation_y += 1
datasrc_text: str = f"{datasrc_symbol} --datasrc"
safe_addstr(stdscr, validation_y, validation_word_x, " ", curses.A_NORMAL)
safe_addstr(stdscr, validation_y, validation_word_x + VALIDATION_TEXT_INDENT, datasrc_text, datasrc_style)
def __autosense_crawler_and_datasrc(self) -> tuple[str, str] | tuple[None, None]:
"""
Auto-detect crawler type and datasrc based on cwd and parent directory signatures.
Returns:
tuple: (crawler, datasrc) tuple or (None, None) if no match found
"""
cwd: Path = Path(os.getcwd()).absolute()
if list(cwd.glob("*.v2.db")):
db_file: Path = next(cwd.glob("*.v2.db"))
return ("interrobot", str(db_file))
archive_directories: list[Path] = list(cwd.glob("*/archive"))
if archive_directories:
for archive_directory in archive_directories:
timestamp_directories: list[Path] = [d for d in archive_directory.iterdir()
if d.is_dir() and d.name.replace('.', '').isdigit()]
if timestamp_directories:
return ("archivebox", str(cwd))
if list(cwd.glob("*/output.*.txt")):
return ("siteone", str(cwd))
if list(cwd.glob("*/hts-log.txt")) or list(cwd.glob("*/*/hts-log.txt")):
return ("httrack", str(cwd))
katana_files: list[Path] = list(cwd.glob("*/*/*.txt"))
for f in katana_files:
if len(f.stem) == 40 and all(c in '0123456789abcdef' for c in f.stem.lower()):
return ("katana", str(cwd))
warc_files: list[Path] = list(cwd.glob("*.warc.gz")) + list(cwd.glob("*.warc"))
if warc_files:
return ("warc", str(cwd))
if list(cwd.glob("*/index.html")):
return ("wget", str(cwd))
return ("wget", self.__get_default_directory())
def __get_default_directory(self) -> str:
"""
Get the default directory path.
Returns:
str: The absolute path of the current working directory
"""
return str(Path(os.getcwd()).absolute())
def __handle_enter(self) -> None:
"""
Handle ENTER key to revalidate in datasrc field or toggle in crawler field.
"""
if self.__form_selected_field == RequirementsFormField.DATASRC:
selected_crawler: str = self.__crawler_group.value
self.__validated = self.__validate(selected_crawler, self.__datasrc_input.value)
self.__update_session()
if self.__validated:
self.session.set_ui_state(UiState.SEARCH_INIT)
elif self.__form_selected_field == RequirementsFormField.CRAWLER:
crawler_index: int = self.__form_selected_index - 1
if 0 <= crawler_index < len(self.__crawler_group.radios):
self.__crawler_group.radios[crawler_index].next_state()
def __handle_spacebar(self) -> None:
"""
Handle spacebar to toggle crawler selection or add space to datasrc.
"""
if self.__form_selected_field == RequirementsFormField.DATASRC:
self.__datasrc_input.handle_input(ord(" "))
elif self.__form_selected_field == RequirementsFormField.CRAWLER:
crawler_index: int = self.__form_selected_index - 1
if 0 <= crawler_index < len(self.__crawler_group.radios):
self.__crawler_group.radios[crawler_index].next_state()
def __handle_tab(self) -> None:
"""
Handle TAB key to switch between field groups.
"""
if self.__form_selected_field == RequirementsFormField.DATASRC:
self.__form_selected_field = RequirementsFormField.CRAWLER
self.__form_selected_index = 1
else:
self.__form_selected_field = RequirementsFormField.DATASRC
self.__form_selected_index = 0
def __navigate_form_selection_down(self) -> None:
"""
Navigate down within current field or switch to next field group.
"""
if self.__form_selected_field == RequirementsFormField.DATASRC:
self.__form_selected_field = RequirementsFormField.CRAWLER
self.__form_selected_index = 1
elif self.__form_selected_field == RequirementsFormField.CRAWLER:
if self.__form_selected_index < len(self.__crawler_group.radios):
self.__form_selected_index += 1
else:
self.__form_selected_field = RequirementsFormField.DATASRC
self.__form_selected_index = 0
def __navigate_form_selection_up(self) -> None:
"""
Navigate up within current field or switch to previous field group.
"""
if self.__form_selected_field == RequirementsFormField.DATASRC:
self.__form_selected_field = RequirementsFormField.CRAWLER
self.__form_selected_index = len(self.__crawler_group.radios)
elif self.__form_selected_field == RequirementsFormField.CRAWLER:
if self.__form_selected_index > 1:
self.__form_selected_index -= 1
else:
self.__form_selected_field = RequirementsFormField.DATASRC
self.__form_selected_index = 0
def __set_initial_crawler_selection(self, initial_crawler: str) -> None:
"""
Set the initial crawler selection in the radio group.
Args:
initial_crawler: The crawler type to initially select
"""
if initial_crawler in VALID_CRAWLER_CHOICES:
crawler_index: int = VALID_CRAWLER_CHOICES.index(initial_crawler)
if 0 <= crawler_index < len(self.__crawler_group.radios):
self.__crawler_group.radios[crawler_index].next_state()
def __update_session(self) -> None:
"""
Update the session with current form values.
"""
# push a new app configuration into the ui
selected_crawler: str = self.__crawler_group.value
self.session.set_init_input_args(selected_crawler, self.__datasrc_input.value)
if self.__validated:
try:
crawl_model: BaseCrawler = get_crawler(selected_crawler)
crawler: BaseCrawler = crawl_model(Path(self.__datasrc_input.value))
self.session.set_init_crawler(crawler)
sites_api: BaseJsonApi = self.session.crawler.get_sites_api()
self.session.set_init_sites(sites_api.get_results())
searchform: SearchFormView = SearchFormView(
self.session,
self.session.sites
)
self.session.set_init_searchform(searchform)
except Exception as ex:
self.session.debug_add(f"Error initializing crawler: {ex}\n{traceback.format_exc()}")
self.__validated = False
def __validate(self, crawler: str, datasrc: str) -> bool:
    """
    Check that a crawler name and data source path form a usable pair.

    The pair is valid when both values are strings, the crawler is listed
    in VALID_CRAWLER_CHOICES, and the data source exists on disk as a file
    (for "interrobot" and "warc") or as a directory (for every other
    crawler).

    Args:
        crawler: The crawler type to validate
        datasrc: The data source path to validate

    Returns:
        bool: True if the combination is valid, False otherwise
    """
    if not (isinstance(datasrc, str) and isinstance(crawler, str)):
        return False

    known_crawler: bool = crawler in VALID_CRAWLER_CHOICES

    # datasrc is known to be a string here, so only the empty case remains
    if datasrc == "":
        return False

    location: Path = Path(datasrc)
    if not location.exists():
        return False

    # these two crawlers take a single file as data source, the rest a directory
    expects_file: bool = crawler in ("interrobot", "warc")
    shape_matches: bool = location.is_file() if expects_file else location.is_dir()

    return known_crawler and shape_matches
```
--------------------------------------------------------------------------------
/docs/_modules/mcp_server_webcrawl/crawlers/wget/tests.html:
--------------------------------------------------------------------------------
```html
<!DOCTYPE html>
<html class="writer-html5" lang="en" data-content_root="../../../../">
<head>
<meta charset="utf-8" />
<meta name="viewport" content="width=device-width, initial-scale=1.0" />
<title>mcp_server_webcrawl.crawlers.wget.tests — mcp-server-webcrawl documentation</title>
<link rel="stylesheet" type="text/css" href="../../../../_static/pygments.css?v=80d5e7a1" />
<link rel="stylesheet" type="text/css" href="../../../../_static/css/theme.css?v=e59714d7" />
<script src="../../../../_static/jquery.js?v=5d32c60e"></script>
<script src="../../../../_static/_sphinx_javascript_frameworks_compat.js?v=2cd50e6c"></script>
<script src="../../../../_static/documentation_options.js?v=5929fcd5"></script>
<script src="../../../../_static/doctools.js?v=888ff710"></script>
<script src="../../../../_static/sphinx_highlight.js?v=dc90522c"></script>
<script src="../../../../_static/js/theme.js"></script>
<link rel="index" title="Index" href="../../../../genindex.html" />
<link rel="search" title="Search" href="../../../../search.html" />
</head>
<body class="wy-body-for-nav">
<div class="wy-grid-for-nav">
<nav data-toggle="wy-nav-shift" class="wy-nav-side">
<div class="wy-side-scroll">
<div class="wy-side-nav-search" >
<a href="../../../../index.html" class="icon icon-home">
mcp-server-webcrawl
</a>
<div role="search">
<form id="rtd-search-form" class="wy-form" action="../../../../search.html" method="get">
<input type="text" name="q" placeholder="Search docs" aria-label="Search docs" />
<input type="hidden" name="check_keywords" value="yes" />
<input type="hidden" name="area" value="default" />
</form>
</div>
</div><div class="wy-menu wy-menu-vertical" data-spy="affix" role="navigation" aria-label="Navigation menu">
<p class="caption" role="heading"><span class="caption-text">Contents:</span></p>
<ul>
<li class="toctree-l1"><a class="reference internal" href="../../../../installation.html">Installation</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../../guides.html">Setup Guides</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../../usage.html">Usage</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../../prompts.html">Prompt Routines</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../../interactive.html">Interactive Mode</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../../modules.html">mcp_server_webcrawl</a></li>
</ul>
</div>
</div>
</nav>
<section data-toggle="wy-nav-shift" class="wy-nav-content-wrap"><nav class="wy-nav-top" aria-label="Mobile navigation menu" >
<i data-toggle="wy-nav-top" class="fa fa-bars"></i>
<a href="../../../../index.html">mcp-server-webcrawl</a>
</nav>
<div class="wy-nav-content">
<div class="rst-content">
<div role="navigation" aria-label="Page navigation">
<ul class="wy-breadcrumbs">
<li><a href="../../../../index.html" class="icon icon-home" aria-label="Home"></a></li>
<li class="breadcrumb-item"><a href="../../../index.html">Module code</a></li>
<li class="breadcrumb-item"><a href="../../crawlers.html">mcp_server_webcrawl.crawlers</a></li>
<li class="breadcrumb-item active">mcp_server_webcrawl.crawlers.wget.tests</li>
<li class="wy-breadcrumbs-aside">
</li>
</ul>
<hr/>
</div>
<div role="main" class="document" itemscope="itemscope" itemtype="http://schema.org/Article">
<div itemprop="articleBody">
<h1>Source code for mcp_server_webcrawl.crawlers.wget.tests</h1><div class="highlight"><pre>
<span></span><span class="kn">from</span> <span class="nn">mcp_server_webcrawl.crawlers</span> <span class="kn">import</span> <span class="n">get_fixture_directory</span>
<span class="kn">from</span> <span class="nn">mcp_server_webcrawl.crawlers.wget.adapter</span> <span class="kn">import</span> <span class="n">WgetManager</span>
<span class="kn">from</span> <span class="nn">mcp_server_webcrawl.crawlers.wget.crawler</span> <span class="kn">import</span> <span class="n">WgetCrawler</span>
<span class="kn">from</span> <span class="nn">mcp_server_webcrawl.crawlers.base.tests</span> <span class="kn">import</span> <span class="n">BaseCrawlerTests</span>
<span class="kn">from</span> <span class="nn">mcp_server_webcrawl.utils.logger</span> <span class="kn">import</span> <span class="n">get_logger</span>
<span class="n">logger</span> <span class="o">=</span> <span class="n">get_logger</span><span class="p">()</span>
<span class="n">EXAMPLE_SITE_ID</span> <span class="o">=</span> <span class="n">WgetManager</span><span class="o">.</span><span class="n">string_to_id</span><span class="p">(</span><span class="s2">"example.com"</span><span class="p">)</span>
<span class="n">PRAGMAR_SITE_ID</span> <span class="o">=</span> <span class="n">WgetManager</span><span class="o">.</span><span class="n">string_to_id</span><span class="p">(</span><span class="s2">"pragmar.com"</span><span class="p">)</span>
<div class="viewcode-block" id="WgetTests">
<a class="viewcode-back" href="../../../../mcp_server_webcrawl.crawlers.wget.html#mcp_server_webcrawl.crawlers.wget.tests.WgetTests">[docs]</a>
<span class="k">class</span> <span class="nc">WgetTests</span><span class="p">(</span><span class="n">BaseCrawlerTests</span><span class="p">):</span>
<span class="w"> </span><span class="sd">"""</span>
<span class="sd"> Test suite for the wget crawler implementation.</span>
<span class="sd"> Uses all wrapped test methods from BaseCrawlerTests.</span>
<span class="sd"> """</span>
<div class="viewcode-block" id="WgetTests.setUp">
<a class="viewcode-back" href="../../../../mcp_server_webcrawl.crawlers.wget.html#mcp_server_webcrawl.crawlers.wget.tests.WgetTests.setUp">[docs]</a>
<span class="k">def</span> <span class="nf">setUp</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
<span class="w"> </span><span class="sd">"""</span>
<span class="sd"> Set up the test environment with fixture data.</span>
<span class="sd"> """</span>
<span class="nb">super</span><span class="p">()</span><span class="o">.</span><span class="n">setUp</span><span class="p">()</span>
<span class="bp">self</span><span class="o">.</span><span class="n">_datasrc</span> <span class="o">=</span> <span class="n">get_fixture_directory</span><span class="p">()</span> <span class="o">/</span> <span class="s2">"wget"</span></div>
<div class="viewcode-block" id="WgetTests.test_wget_pulse">
<a class="viewcode-back" href="../../../../mcp_server_webcrawl.crawlers.wget.html#mcp_server_webcrawl.crawlers.wget.tests.WgetTests.test_wget_pulse">[docs]</a>
<span class="k">def</span> <span class="nf">test_wget_pulse</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
<span class="w"> </span><span class="sd">"""</span>
<span class="sd"> Test basic crawler initialization.</span>
<span class="sd"> """</span>
<span class="n">crawler</span> <span class="o">=</span> <span class="n">WgetCrawler</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_datasrc</span><span class="p">)</span>
<span class="bp">self</span><span class="o">.</span><span class="n">assertIsNotNone</span><span class="p">(</span><span class="n">crawler</span><span class="p">)</span>
<span class="bp">self</span><span class="o">.</span><span class="n">assertTrue</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_datasrc</span><span class="o">.</span><span class="n">is_dir</span><span class="p">())</span></div>
<div class="viewcode-block" id="WgetTests.test_wget_sites">
<a class="viewcode-back" href="../../../../mcp_server_webcrawl.crawlers.wget.html#mcp_server_webcrawl.crawlers.wget.tests.WgetTests.test_wget_sites">[docs]</a>
<span class="k">def</span> <span class="nf">test_wget_sites</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
<span class="w"> </span><span class="sd">"""</span>
<span class="sd"> Test site retrieval API functionality.</span>
<span class="sd"> """</span>
<span class="n">crawler</span> <span class="o">=</span> <span class="n">WgetCrawler</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_datasrc</span><span class="p">)</span>
<span class="bp">self</span><span class="o">.</span><span class="n">run_pragmar_site_tests</span><span class="p">(</span><span class="n">crawler</span><span class="p">,</span> <span class="n">PRAGMAR_SITE_ID</span><span class="p">)</span></div>
<div class="viewcode-block" id="WgetTests.test_wget_search">
<a class="viewcode-back" href="../../../../mcp_server_webcrawl.crawlers.wget.html#mcp_server_webcrawl.crawlers.wget.tests.WgetTests.test_wget_search">[docs]</a>
<span class="k">def</span> <span class="nf">test_wget_search</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
<span class="w"> </span><span class="sd">"""</span>
<span class="sd"> Test boolean search functionality</span>
<span class="sd"> """</span>
<span class="c1"># moved fixtures to own repo, lost some local media,</span>
<span class="c1"># but checks out. wget fixture has no CSS/JS/etc.</span>
<span class="c1"># HTML-only and just doesn't do well with the full array of</span>
<span class="c1"># tests concerning fulltext, media, and mixed search result</span>
<span class="c1"># counts. probably needs a reduced set of tests</span>
<span class="c1"># self.run_pragmar_search_tests(crawler, PRAGMAR_SITE_ID)</span>
<span class="k">return</span></div>
<div class="viewcode-block" id="WgetTests.test_wget_resources">
<a class="viewcode-back" href="../../../../mcp_server_webcrawl.crawlers.wget.html#mcp_server_webcrawl.crawlers.wget.tests.WgetTests.test_wget_resources">[docs]</a>
<span class="k">def</span> <span class="nf">test_wget_resources</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
<span class="w"> </span><span class="sd">"""</span>
<span class="sd"> Test resource retrieval API functionality with various parameters.</span>
<span class="sd"> """</span>
<span class="n">crawler</span> <span class="o">=</span> <span class="n">WgetCrawler</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_datasrc</span><span class="p">)</span>
<span class="bp">self</span><span class="o">.</span><span class="n">run_sites_resources_tests</span><span class="p">(</span><span class="n">crawler</span><span class="p">,</span> <span class="n">PRAGMAR_SITE_ID</span><span class="p">,</span> <span class="n">EXAMPLE_SITE_ID</span><span class="p">)</span></div>
<div class="viewcode-block" id="WgetTests.test_wget_sorts">
<a class="viewcode-back" href="../../../../mcp_server_webcrawl.crawlers.wget.html#mcp_server_webcrawl.crawlers.wget.tests.WgetTests.test_wget_sorts">[docs]</a>
<span class="k">def</span> <span class="nf">test_wget_sorts</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
<span class="w"> </span><span class="sd">"""</span>
<span class="sd"> Test random sort functionality using the '?' sort parameter.</span>
<span class="sd"> """</span>
<span class="n">crawler</span> <span class="o">=</span> <span class="n">WgetCrawler</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_datasrc</span><span class="p">)</span>
<span class="bp">self</span><span class="o">.</span><span class="n">run_pragmar_sort_tests</span><span class="p">(</span><span class="n">crawler</span><span class="p">,</span> <span class="n">PRAGMAR_SITE_ID</span><span class="p">)</span></div>
<div class="viewcode-block" id="WgetTests.test_wget_content_parsing">
<a class="viewcode-back" href="../../../../mcp_server_webcrawl.crawlers.wget.html#mcp_server_webcrawl.crawlers.wget.tests.WgetTests.test_wget_content_parsing">[docs]</a>
<span class="k">def</span> <span class="nf">test_wget_content_parsing</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
<span class="w"> </span><span class="sd">"""</span>
<span class="sd"> Test content type detection and parsing.</span>
<span class="sd"> """</span>
<span class="n">crawler</span> <span class="o">=</span> <span class="n">WgetCrawler</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_datasrc</span><span class="p">)</span>
<span class="bp">self</span><span class="o">.</span><span class="n">run_pragmar_content_tests</span><span class="p">(</span><span class="n">crawler</span><span class="p">,</span> <span class="n">PRAGMAR_SITE_ID</span><span class="p">,</span> <span class="kc">False</span><span class="p">)</span></div>
<div class="viewcode-block" id="WgetTests.test_report">
<a class="viewcode-back" href="../../../../mcp_server_webcrawl.crawlers.wget.html#mcp_server_webcrawl.crawlers.wget.tests.WgetTests.test_report">[docs]</a>
<span class="k">def</span> <span class="nf">test_report</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
<span class="w"> </span><span class="sd">"""</span>
<span class="sd"> Run test report, save to data directory.</span>
<span class="sd"> """</span>
<span class="n">crawler</span> <span class="o">=</span> <span class="n">WgetCrawler</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_datasrc</span><span class="p">)</span>
<span class="n">logger</span><span class="o">.</span><span class="n">info</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">run_pragmar_report</span><span class="p">(</span><span class="n">crawler</span><span class="p">,</span> <span class="n">PRAGMAR_SITE_ID</span><span class="p">,</span> <span class="s2">"wget"</span><span class="p">))</span></div>
</div>
</pre></div>
</div>
</div>
<footer>
<hr/>
<div role="contentinfo">
<p>© Copyright 2025, pragmar.</p>
</div>
Built with <a href="https://www.sphinx-doc.org/">Sphinx</a> using a
<a href="https://github.com/readthedocs/sphinx_rtd_theme">theme</a>
provided by <a href="https://readthedocs.org">Read the Docs</a>.
</footer>
</div>
</div>
</section>
</div>
<script>
jQuery(function () {
SphinxRtdTheme.Navigation.enable(true);
});
</script>
</body>
</html>
```
--------------------------------------------------------------------------------
/docs/_static/basic.css:
--------------------------------------------------------------------------------
```css
/*
* basic.css
* ~~~~~~~~~
*
* Sphinx stylesheet -- basic theme.
*
* :copyright: Copyright 2007-2023 by the Sphinx team, see AUTHORS.
* :license: BSD, see LICENSE for details.
*
*/
/* -- main layout ----------------------------------------------------------- */
div.clearer {
clear: both;
}
div.section::after {
display: block;
content: '';
clear: left;
}
/* -- relbar ---------------------------------------------------------------- */
div.related {
width: 100%;
font-size: 90%;
}
div.related h3 {
display: none;
}
div.related ul {
margin: 0;
padding: 0 0 0 10px;
list-style: none;
}
div.related li {
display: inline;
}
div.related li.right {
float: right;
margin-right: 5px;
}
/* -- sidebar --------------------------------------------------------------- */
div.sphinxsidebarwrapper {
padding: 10px 5px 0 10px;
}
div.sphinxsidebar {
float: left;
width: 230px;
margin-left: -100%;
font-size: 90%;
word-wrap: break-word;
overflow-wrap : break-word;
}
div.sphinxsidebar ul {
list-style: none;
}
div.sphinxsidebar ul ul,
div.sphinxsidebar ul.want-points {
margin-left: 20px;
list-style: square;
}
div.sphinxsidebar ul ul {
margin-top: 0;
margin-bottom: 0;
}
div.sphinxsidebar form {
margin-top: 10px;
}
div.sphinxsidebar input {
border: 1px solid #98dbcc;
font-family: sans-serif;
font-size: 1em;
}
div.sphinxsidebar #searchbox form.search {
overflow: hidden;
}
div.sphinxsidebar #searchbox input[type="text"] {
float: left;
width: 80%;
padding: 0.25em;
box-sizing: border-box;
}
div.sphinxsidebar #searchbox input[type="submit"] {
float: left;
width: 20%;
border-left: none;
padding: 0.25em;
box-sizing: border-box;
}
img {
border: 0;
max-width: 100%;
}
/* -- search page ----------------------------------------------------------- */
ul.search {
margin: 10px 0 0 20px;
padding: 0;
}
ul.search li {
padding: 5px 0 5px 20px;
background-image: url(file.png);
background-repeat: no-repeat;
background-position: 0 7px;
}
ul.search li a {
font-weight: bold;
}
ul.search li p.context {
color: #888;
margin: 2px 0 0 30px;
text-align: left;
}
ul.keywordmatches li.goodmatch a {
font-weight: bold;
}
/* -- index page ------------------------------------------------------------ */
table.contentstable {
width: 90%;
margin-left: auto;
margin-right: auto;
}
table.contentstable p.biglink {
line-height: 150%;
}
a.biglink {
font-size: 1.3em;
}
span.linkdescr {
font-style: italic;
padding-top: 5px;
font-size: 90%;
}
/* -- general index --------------------------------------------------------- */
table.indextable {
width: 100%;
}
table.indextable td {
text-align: left;
vertical-align: top;
}
table.indextable ul {
margin-top: 0;
margin-bottom: 0;
list-style-type: none;
}
table.indextable > tbody > tr > td > ul {
padding-left: 0em;
}
table.indextable tr.pcap {
height: 10px;
}
table.indextable tr.cap {
margin-top: 10px;
background-color: #f2f2f2;
}
img.toggler {
margin-right: 3px;
margin-top: 3px;
cursor: pointer;
}
div.modindex-jumpbox {
border-top: 1px solid #ddd;
border-bottom: 1px solid #ddd;
margin: 1em 0 1em 0;
padding: 0.4em;
}
div.genindex-jumpbox {
border-top: 1px solid #ddd;
border-bottom: 1px solid #ddd;
margin: 1em 0 1em 0;
padding: 0.4em;
}
/* -- domain module index --------------------------------------------------- */
table.modindextable td {
padding: 2px;
border-collapse: collapse;
}
/* -- general body styles --------------------------------------------------- */
div.body {
min-width: 360px;
max-width: 800px;
}
div.body p, div.body dd, div.body li, div.body blockquote {
-moz-hyphens: auto;
-ms-hyphens: auto;
-webkit-hyphens: auto;
hyphens: auto;
}
a.headerlink {
visibility: hidden;
}
a:visited {
color: #551A8B;
}
h1:hover > a.headerlink,
h2:hover > a.headerlink,
h3:hover > a.headerlink,
h4:hover > a.headerlink,
h5:hover > a.headerlink,
h6:hover > a.headerlink,
dt:hover > a.headerlink,
caption:hover > a.headerlink,
p.caption:hover > a.headerlink,
div.code-block-caption:hover > a.headerlink {
visibility: visible;
}
div.body p.caption {
text-align: inherit;
}
div.body td {
text-align: left;
}
.first {
margin-top: 0 !important;
}
p.rubric {
margin-top: 30px;
font-weight: bold;
}
img.align-left, figure.align-left, .figure.align-left, object.align-left {
clear: left;
float: left;
margin-right: 1em;
}
img.align-right, figure.align-right, .figure.align-right, object.align-right {
clear: right;
float: right;
margin-left: 1em;
}
img.align-center, figure.align-center, .figure.align-center, object.align-center {
display: block;
margin-left: auto;
margin-right: auto;
}
img.align-default, figure.align-default, .figure.align-default {
display: block;
margin-left: auto;
margin-right: auto;
}
.align-left {
text-align: left;
}
.align-center {
text-align: center;
}
.align-default {
text-align: center;
}
.align-right {
text-align: right;
}
/* -- sidebars -------------------------------------------------------------- */
div.sidebar,
aside.sidebar {
margin: 0 0 0.5em 1em;
border: 1px solid #ddb;
padding: 7px;
background-color: #ffe;
width: 40%;
float: right;
clear: right;
overflow-x: auto;
}
p.sidebar-title {
font-weight: bold;
}
nav.contents,
aside.topic,
div.admonition, div.topic, blockquote {
clear: left;
}
/* -- topics ---------------------------------------------------------------- */
nav.contents,
aside.topic,
div.topic {
border: 1px solid #ccc;
padding: 7px;
margin: 10px 0 10px 0;
}
p.topic-title {
font-size: 1.1em;
font-weight: bold;
margin-top: 10px;
}
/* -- admonitions ----------------------------------------------------------- */
div.admonition {
margin-top: 10px;
margin-bottom: 10px;
padding: 7px;
}
div.admonition dt {
font-weight: bold;
}
p.admonition-title {
margin: 0px 10px 5px 0px;
font-weight: bold;
}
div.body p.centered {
text-align: center;
margin-top: 25px;
}
/* -- content of sidebars/topics/admonitions -------------------------------- */
div.sidebar > :last-child,
aside.sidebar > :last-child,
nav.contents > :last-child,
aside.topic > :last-child,
div.topic > :last-child,
div.admonition > :last-child {
margin-bottom: 0;
}
div.sidebar::after,
aside.sidebar::after,
nav.contents::after,
aside.topic::after,
div.topic::after,
div.admonition::after,
blockquote::after {
display: block;
content: '';
clear: both;
}
/* -- tables ---------------------------------------------------------------- */
table.docutils {
margin-top: 10px;
margin-bottom: 10px;
border: 0;
border-collapse: collapse;
}
table.align-center {
margin-left: auto;
margin-right: auto;
}
table.align-default {
margin-left: auto;
margin-right: auto;
}
table caption span.caption-number {
font-style: italic;
}
table caption span.caption-text {
}
table.docutils td, table.docutils th {
padding: 1px 8px 1px 5px;
border-top: 0;
border-left: 0;
border-right: 0;
border-bottom: 1px solid #aaa;
}
th {
text-align: left;
padding-right: 5px;
}
table.citation {
border-left: solid 1px gray;
margin-left: 1px;
}
table.citation td {
border-bottom: none;
}
th > :first-child,
td > :first-child {
margin-top: 0px;
}
th > :last-child,
td > :last-child {
margin-bottom: 0px;
}
/* -- figures --------------------------------------------------------------- */
div.figure, figure {
margin: 0.5em;
padding: 0.5em;
}
div.figure p.caption, figcaption {
padding: 0.3em;
}
div.figure p.caption span.caption-number,
figcaption span.caption-number {
font-style: italic;
}
div.figure p.caption span.caption-text,
figcaption span.caption-text {
}
/* -- field list styles ----------------------------------------------------- */
table.field-list td, table.field-list th {
border: 0 !important;
}
.field-list ul {
margin: 0;
padding-left: 1em;
}
.field-list p {
margin: 0;
}
.field-name {
-moz-hyphens: manual;
-ms-hyphens: manual;
-webkit-hyphens: manual;
hyphens: manual;
}
/* -- hlist styles ---------------------------------------------------------- */
table.hlist {
margin: 1em 0;
}
table.hlist td {
vertical-align: top;
}
/* -- object description styles --------------------------------------------- */
.sig {
font-family: 'Consolas', 'Menlo', 'DejaVu Sans Mono', 'Bitstream Vera Sans Mono', monospace;
}
.sig-name, code.descname {
background-color: transparent;
font-weight: bold;
}
.sig-name {
font-size: 1.1em;
}
code.descname {
font-size: 1.2em;
}
.sig-prename, code.descclassname {
background-color: transparent;
}
.optional {
font-size: 1.3em;
}
.sig-paren {
font-size: larger;
}
.sig-param.n {
font-style: italic;
}
/* C++ specific styling */
.sig-inline.c-texpr,
.sig-inline.cpp-texpr {
font-family: unset;
}
.sig.c .k, .sig.c .kt,
.sig.cpp .k, .sig.cpp .kt {
color: #0033B3;
}
.sig.c .m,
.sig.cpp .m {
color: #1750EB;
}
.sig.c .s, .sig.c .sc,
.sig.cpp .s, .sig.cpp .sc {
color: #067D17;
}
/* -- other body styles ----------------------------------------------------- */
ol.arabic {
list-style: decimal;
}
ol.loweralpha {
list-style: lower-alpha;
}
ol.upperalpha {
list-style: upper-alpha;
}
ol.lowerroman {
list-style: lower-roman;
}
ol.upperroman {
list-style: upper-roman;
}
:not(li) > ol > li:first-child > :first-child,
:not(li) > ul > li:first-child > :first-child {
margin-top: 0px;
}
:not(li) > ol > li:last-child > :last-child,
:not(li) > ul > li:last-child > :last-child {
margin-bottom: 0px;
}
ol.simple ol p,
ol.simple ul p,
ul.simple ol p,
ul.simple ul p {
margin-top: 0;
}
ol.simple > li:not(:first-child) > p,
ul.simple > li:not(:first-child) > p {
margin-top: 0;
}
ol.simple p,
ul.simple p {
margin-bottom: 0;
}
aside.footnote > span,
div.citation > span {
float: left;
}
aside.footnote > span:last-of-type,
div.citation > span:last-of-type {
padding-right: 0.5em;
}
aside.footnote > p {
margin-left: 2em;
}
div.citation > p {
margin-left: 4em;
}
aside.footnote > p:last-of-type,
div.citation > p:last-of-type {
margin-bottom: 0em;
}
aside.footnote > p:last-of-type:after,
div.citation > p:last-of-type:after {
content: "";
clear: both;
}
dl.field-list {
display: grid;
grid-template-columns: fit-content(30%) auto;
}
dl.field-list > dt {
font-weight: bold;
word-break: break-word;
padding-left: 0.5em;
padding-right: 5px;
}
dl.field-list > dd {
padding-left: 0.5em;
margin-top: 0em;
margin-left: 0em;
margin-bottom: 0em;
}
dl {
margin-bottom: 15px;
}
dd > :first-child {
margin-top: 0px;
}
dd ul, dd table {
margin-bottom: 10px;
}
dd {
margin-top: 3px;
margin-bottom: 10px;
margin-left: 30px;
}
.sig dd {
margin-top: 0px;
margin-bottom: 0px;
}
.sig dl {
margin-top: 0px;
margin-bottom: 0px;
}
dl > dd:last-child,
dl > dd:last-child > :last-child {
margin-bottom: 0;
}
dt:target, span.highlighted {
background-color: #fbe54e;
}
rect.highlighted {
fill: #fbe54e;
}
dl.glossary dt {
font-weight: bold;
font-size: 1.1em;
}
.versionmodified {
font-style: italic;
}
.system-message {
background-color: #fda;
padding: 5px;
border: 3px solid red;
}
.footnote:target {
background-color: #ffa;
}
.line-block {
display: block;
margin-top: 1em;
margin-bottom: 1em;
}
.line-block .line-block {
margin-top: 0;
margin-bottom: 0;
margin-left: 1.5em;
}
.guilabel, .menuselection {
font-family: sans-serif;
}
.accelerator {
text-decoration: underline;
}
.classifier {
font-style: oblique;
}
.classifier:before {
font-style: normal;
margin: 0 0.5em;
content: ":";
display: inline-block;
}
abbr, acronym {
border-bottom: dotted 1px;
cursor: help;
}
.translated {
background-color: rgba(207, 255, 207, 0.2)
}
.untranslated {
background-color: rgba(255, 207, 207, 0.2)
}
/* -- code displays --------------------------------------------------------- */
pre {
overflow: auto;
overflow-y: hidden; /* fixes display issues on Chrome browsers */
}
pre, div[class*="highlight-"] {
clear: both;
}
span.pre {
-moz-hyphens: none;
-ms-hyphens: none;
-webkit-hyphens: none;
hyphens: none;
white-space: nowrap;
}
div[class*="highlight-"] {
margin: 1em 0;
}
td.linenos pre {
border: 0;
background-color: transparent;
color: #aaa;
}
table.highlighttable {
display: block;
}
table.highlighttable tbody {
display: block;
}
table.highlighttable tr {
display: flex;
}
table.highlighttable td {
margin: 0;
padding: 0;
}
table.highlighttable td.linenos {
padding-right: 0.5em;
}
table.highlighttable td.code {
flex: 1;
overflow: hidden;
}
.highlight .hll {
display: block;
}
div.highlight pre,
table.highlighttable pre {
margin: 0;
}
div.code-block-caption + div {
margin-top: 0;
}
div.code-block-caption {
margin-top: 1em;
padding: 2px 5px;
font-size: small;
}
div.code-block-caption code {
background-color: transparent;
}
table.highlighttable td.linenos,
span.linenos,
div.highlight span.gp { /* gp: Generic.Prompt */
user-select: none;
-webkit-user-select: text; /* Safari fallback only */
-webkit-user-select: none; /* Chrome/Safari */
-moz-user-select: none; /* Firefox */
-ms-user-select: none; /* IE10+ */
}
div.code-block-caption span.caption-number {
padding: 0.1em 0.3em;
font-style: italic;
}
div.code-block-caption span.caption-text {
}
div.literal-block-wrapper {
margin: 1em 0;
}
code.xref, a code {
background-color: transparent;
font-weight: bold;
}
h1 code, h2 code, h3 code, h4 code, h5 code, h6 code {
background-color: transparent;
}
.viewcode-link {
float: right;
}
.viewcode-back {
float: right;
font-family: sans-serif;
}
div.viewcode-block:target {
margin: -1px -10px;
padding: 0 10px;
}
/* -- math display ---------------------------------------------------------- */
img.math {
vertical-align: middle;
}
div.body div.math p {
text-align: center;
}
span.eqno {
float: right;
}
span.eqno a.headerlink {
position: absolute;
z-index: 1;
}
div.math:hover a.headerlink {
visibility: visible;
}
/* -- printout stylesheet --------------------------------------------------- */
@media print {
div.document,
div.documentwrapper,
div.bodywrapper {
margin: 0 !important;
width: 100%;
}
div.sphinxsidebar,
div.related,
div.footer,
#top-link {
display: none;
}
}
```
--------------------------------------------------------------------------------
/docs/_modules/mcp_server_webcrawl/models/sites.html:
--------------------------------------------------------------------------------
```html
<!DOCTYPE html>
<html class="writer-html5" lang="en" data-content_root="../../../">
<head>
<meta charset="utf-8" />
<meta name="viewport" content="width=device-width, initial-scale=1.0" />
<title>mcp_server_webcrawl.models.sites — mcp-server-webcrawl documentation</title>
<link rel="stylesheet" type="text/css" href="../../../_static/pygments.css?v=80d5e7a1" />
<link rel="stylesheet" type="text/css" href="../../../_static/css/theme.css?v=e59714d7" />
<script src="../../../_static/jquery.js?v=5d32c60e"></script>
<script src="../../../_static/_sphinx_javascript_frameworks_compat.js?v=2cd50e6c"></script>
<script src="../../../_static/documentation_options.js?v=5929fcd5"></script>
<script src="../../../_static/doctools.js?v=888ff710"></script>
<script src="../../../_static/sphinx_highlight.js?v=dc90522c"></script>
<script src="../../../_static/js/theme.js"></script>
<link rel="index" title="Index" href="../../../genindex.html" />
<link rel="search" title="Search" href="../../../search.html" />
</head>
<body class="wy-body-for-nav">
<div class="wy-grid-for-nav">
<nav data-toggle="wy-nav-shift" class="wy-nav-side">
<div class="wy-side-scroll">
<div class="wy-side-nav-search" >
<a href="../../../index.html" class="icon icon-home">
mcp-server-webcrawl
</a>
<div role="search">
<form id="rtd-search-form" class="wy-form" action="../../../search.html" method="get">
<input type="text" name="q" placeholder="Search docs" aria-label="Search docs" />
<input type="hidden" name="check_keywords" value="yes" />
<input type="hidden" name="area" value="default" />
</form>
</div>
</div><div class="wy-menu wy-menu-vertical" data-spy="affix" role="navigation" aria-label="Navigation menu">
<p class="caption" role="heading"><span class="caption-text">Contents:</span></p>
<ul>
<li class="toctree-l1"><a class="reference internal" href="../../../installation.html">Installation</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../guides.html">Setup Guides</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../usage.html">Usage</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../prompts.html">Prompt Routines</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../interactive.html">Interactive Mode</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../modules.html">mcp_server_webcrawl</a></li>
</ul>
</div>
</div>
</nav>
<section data-toggle="wy-nav-shift" class="wy-nav-content-wrap"><nav class="wy-nav-top" aria-label="Mobile navigation menu" >
<i data-toggle="wy-nav-top" class="fa fa-bars"></i>
<a href="../../../index.html">mcp-server-webcrawl</a>
</nav>
<div class="wy-nav-content">
<div class="rst-content">
<div role="navigation" aria-label="Page navigation">
<ul class="wy-breadcrumbs">
<li><a href="../../../index.html" class="icon icon-home" aria-label="Home"></a></li>
<li class="breadcrumb-item"><a href="../../index.html">Module code</a></li>
<li class="breadcrumb-item active">mcp_server_webcrawl.models.sites</li>
<li class="wy-breadcrumbs-aside">
</li>
</ul>
<hr/>
</div>
<div role="main" class="document" itemscope="itemscope" itemtype="http://schema.org/Article">
<div itemprop="articleBody">
<h1>Source code for mcp_server_webcrawl.models.sites</h1><div class="highlight"><pre>
<span></span><span class="kn">from</span> <span class="nn">datetime</span> <span class="kn">import</span> <span class="n">datetime</span>
<span class="kn">from</span> <span class="nn">typing</span> <span class="kn">import</span> <span class="n">Final</span>
<span class="kn">from</span> <span class="nn">pathlib</span> <span class="kn">import</span> <span class="n">Path</span>
<span class="kn">from</span> <span class="nn">enum</span> <span class="kn">import</span> <span class="n">Enum</span>
<span class="kn">from</span> <span class="nn">mcp_server_webcrawl.models.base</span> <span class="kn">import</span> <span class="n">BaseModel</span><span class="p">,</span> <span class="n">METADATA_VALUE_TYPE</span>
<span class="kn">from</span> <span class="nn">mcp_server_webcrawl.utils</span> <span class="kn">import</span> <span class="n">to_isoformat_zulu</span>
<div class="viewcode-block" id="SiteType">
<a class="viewcode-back" href="../../../mcp_server_webcrawl.models.html#mcp_server_webcrawl.models.sites.SiteType">[docs]</a>
<span class="k">class</span> <span class="nc">SiteType</span><span class="p">(</span><span class="n">Enum</span><span class="p">):</span>
<span class="n">UNDEFINED</span> <span class="o">=</span> <span class="s2">"undefined"</span>
<span class="n">CRAWLED_URL</span> <span class="o">=</span> <span class="s2">"url"</span>
<span class="n">CRAWLED_LIST</span> <span class="o">=</span> <span class="s2">"list"</span></div>
<span class="n">SITES_TOOL_NAME</span><span class="p">:</span> <span class="n">Final</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="s2">"webcrawl_sites"</span>
<span class="n">SITES_FIELDS_BASE</span><span class="p">:</span> <span class="n">Final</span><span class="p">[</span><span class="nb">list</span><span class="p">[</span><span class="nb">str</span><span class="p">]]</span> <span class="o">=</span> <span class="p">[</span><span class="s2">"id"</span><span class="p">,</span> <span class="s2">"name"</span><span class="p">,</span> <span class="s2">"type"</span><span class="p">,</span> <span class="s2">"urls"</span><span class="p">]</span>
<span class="n">SITES_FIELDS_DEFAULT</span><span class="p">:</span> <span class="n">Final</span><span class="p">[</span><span class="nb">list</span><span class="p">[</span><span class="nb">str</span><span class="p">]]</span> <span class="o">=</span> <span class="n">SITES_FIELDS_BASE</span> <span class="o">+</span> <span class="p">[</span><span class="s2">"created"</span><span class="p">,</span> <span class="s2">"modified"</span><span class="p">]</span>
<div class="viewcode-block" id="SiteResult">
<a class="viewcode-back" href="../../../mcp_server_webcrawl.models.html#mcp_server_webcrawl.models.sites.SiteResult">[docs]</a>
<span class="k">class</span> <span class="nc">SiteResult</span><span class="p">(</span><span class="n">BaseModel</span><span class="p">):</span>
<span class="w"> </span><span class="sd">"""</span>
<span class="sd"> Represents a website or crawl directory result.</span>
<span class="sd"> """</span>
<div class="viewcode-block" id="SiteResult.__init__">
<a class="viewcode-back" href="../../../mcp_server_webcrawl.models.html#mcp_server_webcrawl.models.sites.SiteResult.__init__">[docs]</a>
<span class="k">def</span> <span class="fm">__init__</span><span class="p">(</span>
<span class="bp">self</span><span class="p">,</span>
<span class="nb">id</span><span class="p">:</span> <span class="nb">int</span><span class="p">,</span>
<span class="n">name</span><span class="p">:</span> <span class="nb">str</span> <span class="o">|</span> <span class="kc">None</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="nb">type</span><span class="p">:</span> <span class="n">SiteType</span> <span class="o">=</span> <span class="n">SiteType</span><span class="o">.</span><span class="n">CRAWLED_URL</span><span class="p">,</span>
<span class="n">urls</span><span class="p">:</span> <span class="nb">list</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">|</span> <span class="kc">None</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">path</span><span class="p">:</span> <span class="n">Path</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">created</span><span class="p">:</span> <span class="n">datetime</span> <span class="o">|</span> <span class="kc">None</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">modified</span><span class="p">:</span> <span class="n">datetime</span> <span class="o">|</span> <span class="kc">None</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">robots</span><span class="p">:</span> <span class="nb">str</span> <span class="o">|</span> <span class="kc">None</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">metadata</span><span class="p">:</span> <span class="nb">dict</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="n">METADATA_VALUE_TYPE</span><span class="p">]</span> <span class="o">|</span> <span class="kc">None</span> <span class="o">=</span> <span class="kc">None</span>
<span class="p">):</span>
<span class="w"> </span><span class="sd">"""</span>
<span class="sd"> Initialize a SiteResult instance.</span>
<span class="sd"> Args:</span>
<span class="sd"> id: site identifier</span>
<span class="sd"> name: site name, either a URL or a custom job</span>
<span class="sd"> type: site type (SiteType), defaults to SiteType.CRAWLED_URL</span>
<span class="sd"> urls: site URL(s), multiple for list type crawls</span>
<span class="sd"> path: path to site data, different from datasrc</span>
<span class="sd"> created: creation timestamp</span>
<span class="sd"> modified: last modification timestamp</span>
<span class="sd"> robots: robots.txt content</span>
<span class="sd"> metadata: additional metadata for the site</span>
<span class="sd"> """</span>
<span class="bp">self</span><span class="o">.</span><span class="n">id</span> <span class="o">=</span> <span class="nb">id</span>
<span class="bp">self</span><span class="o">.</span><span class="n">name</span> <span class="o">=</span> <span class="n">name</span>
<span class="bp">self</span><span class="o">.</span><span class="n">type</span> <span class="o">=</span> <span class="nb">type</span>
<span class="bp">self</span><span class="o">.</span><span class="n">urls</span> <span class="o">=</span> <span class="n">urls</span>
<span class="bp">self</span><span class="o">.</span><span class="n">path</span> <span class="o">=</span> <span class="n">path</span>
<span class="bp">self</span><span class="o">.</span><span class="n">created</span> <span class="o">=</span> <span class="n">created</span>
<span class="bp">self</span><span class="o">.</span><span class="n">modified</span> <span class="o">=</span> <span class="n">modified</span>
<span class="bp">self</span><span class="o">.</span><span class="n">robots</span> <span class="o">=</span> <span class="n">robots</span>
<span class="bp">self</span><span class="o">.</span><span class="n">metadata</span> <span class="o">=</span> <span class="n">metadata</span> <span class="ow">or</span> <span class="p">{}</span></div>
<div class="viewcode-block" id="SiteResult.to_dict">
<a class="viewcode-back" href="../../../mcp_server_webcrawl.models.html#mcp_server_webcrawl.models.sites.SiteResult.to_dict">[docs]</a>
<span class="k">def</span> <span class="nf">to_dict</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-></span> <span class="nb">dict</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="n">METADATA_VALUE_TYPE</span><span class="p">]:</span>
<span class="w"> </span><span class="sd">"""</span>
<span class="sd"> Convert the object to a dictionary suitable for JSON serialization.</span>
<span class="sd"> """</span>
<span class="n">result</span><span class="p">:</span> <span class="nb">dict</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="n">METADATA_VALUE_TYPE</span><span class="p">]</span> <span class="o">=</span> <span class="p">{</span>
<span class="s2">"id"</span><span class="p">:</span> <span class="bp">self</span><span class="o">.</span><span class="n">id</span><span class="p">,</span>
<span class="s2">"name"</span><span class="p">:</span> <span class="bp">self</span><span class="o">.</span><span class="n">name</span><span class="p">,</span>
<span class="s2">"type"</span><span class="p">:</span> <span class="bp">self</span><span class="o">.</span><span class="n">type</span><span class="o">.</span><span class="n">value</span><span class="p">,</span>
<span class="s2">"urls"</span><span class="p">:</span> <span class="bp">self</span><span class="o">.</span><span class="n">urls</span><span class="p">,</span>
<span class="s2">"created"</span><span class="p">:</span> <span class="n">to_isoformat_zulu</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">created</span><span class="p">)</span> <span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">created</span> <span class="k">else</span> <span class="kc">None</span><span class="p">,</span>
<span class="s2">"modified"</span><span class="p">:</span> <span class="n">to_isoformat_zulu</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">modified</span><span class="p">)</span> <span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">modified</span> <span class="k">else</span> <span class="kc">None</span><span class="p">,</span>
<span class="s2">"metadata"</span><span class="p">:</span> <span class="bp">self</span><span class="o">.</span><span class="n">metadata</span> <span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">metadata</span> <span class="k">else</span> <span class="kc">None</span><span class="p">,</span>
<span class="p">}</span>
<span class="k">return</span> <span class="p">{</span><span class="n">k</span><span class="p">:</span> <span class="n">v</span> <span class="k">for</span> <span class="n">k</span><span class="p">,</span> <span class="n">v</span> <span class="ow">in</span> <span class="n">result</span><span class="o">.</span><span class="n">items</span><span class="p">()</span> <span class="k">if</span> <span class="n">v</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span> <span class="ow">and</span> <span class="ow">not</span> <span class="p">(</span><span class="n">k</span> <span class="o">==</span> <span class="s2">"metadata"</span> <span class="ow">and</span> <span class="n">v</span> <span class="o">==</span> <span class="p">{})}</span></div>
</div>
</pre></div>
</div>
</div>
<footer>
<hr/>
<div role="contentinfo">
<p>© Copyright 2025, pragmar.</p>
</div>
Built with <a href="https://www.sphinx-doc.org/">Sphinx</a> using a
<a href="https://github.com/readthedocs/sphinx_rtd_theme">theme</a>
provided by <a href="https://readthedocs.org">Read the Docs</a>.
</footer>
</div>
</div>
</section>
</div>
<script>
jQuery(function () {
SphinxRtdTheme.Navigation.enable(true);
});
</script>
</body>
</html>
```
--------------------------------------------------------------------------------
/docs/_modules/mcp_server_webcrawl/crawlers/warc/tests.html:
--------------------------------------------------------------------------------
```html
<!DOCTYPE html>
<html class="writer-html5" lang="en" data-content_root="../../../../">
<head>
<meta charset="utf-8" />
<meta name="viewport" content="width=device-width, initial-scale=1.0" />
<title>mcp_server_webcrawl.crawlers.warc.tests — mcp-server-webcrawl documentation</title>
<link rel="stylesheet" type="text/css" href="../../../../_static/pygments.css?v=80d5e7a1" />
<link rel="stylesheet" type="text/css" href="../../../../_static/css/theme.css?v=e59714d7" />
<script src="../../../../_static/jquery.js?v=5d32c60e"></script>
<script src="../../../../_static/_sphinx_javascript_frameworks_compat.js?v=2cd50e6c"></script>
<script src="../../../../_static/documentation_options.js?v=5929fcd5"></script>
<script src="../../../../_static/doctools.js?v=888ff710"></script>
<script src="../../../../_static/sphinx_highlight.js?v=dc90522c"></script>
<script src="../../../../_static/js/theme.js"></script>
<link rel="index" title="Index" href="../../../../genindex.html" />
<link rel="search" title="Search" href="../../../../search.html" />
</head>
<body class="wy-body-for-nav">
<div class="wy-grid-for-nav">
<nav data-toggle="wy-nav-shift" class="wy-nav-side">
<div class="wy-side-scroll">
<div class="wy-side-nav-search" >
<a href="../../../../index.html" class="icon icon-home">
mcp-server-webcrawl
</a>
<div role="search">
<form id="rtd-search-form" class="wy-form" action="../../../../search.html" method="get">
<input type="text" name="q" placeholder="Search docs" aria-label="Search docs" />
<input type="hidden" name="check_keywords" value="yes" />
<input type="hidden" name="area" value="default" />
</form>
</div>
</div><div class="wy-menu wy-menu-vertical" data-spy="affix" role="navigation" aria-label="Navigation menu">
<p class="caption" role="heading"><span class="caption-text">Contents:</span></p>
<ul>
<li class="toctree-l1"><a class="reference internal" href="../../../../installation.html">Installation</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../../guides.html">Setup Guides</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../../usage.html">Usage</a></li>
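<li class="toctree-l1"><a class="reference internal" href="../../../../prompts.html">Prompt Routines</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../../interactive.html">Interactive Mode</a></li>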
<li class="toctree-l1"><a class="reference internal" href="../../../../modules.html">mcp_server_webcrawl</a></li>
</ul>
</div>
</div>
</nav>
<section data-toggle="wy-nav-shift" class="wy-nav-content-wrap"><nav class="wy-nav-top" aria-label="Mobile navigation menu" >
<i data-toggle="wy-nav-top" class="fa fa-bars"></i>
<a href="../../../../index.html">mcp-server-webcrawl</a>
</nav>
<div class="wy-nav-content">
<div class="rst-content">
<div role="navigation" aria-label="Page navigation">
<ul class="wy-breadcrumbs">
<li><a href="../../../../index.html" class="icon icon-home" aria-label="Home"></a></li>
<li class="breadcrumb-item"><a href="../../../index.html">Module code</a></li>
<li class="breadcrumb-item"><a href="../../crawlers.html">mcp_server_webcrawl.crawlers</a></li>
<li class="breadcrumb-item active">mcp_server_webcrawl.crawlers.warc.tests</li>
<li class="wy-breadcrumbs-aside">
</li>
</ul>
<hr/>
</div>
<div role="main" class="document" itemscope="itemscope" itemtype="http://schema.org/Article">
<div itemprop="articleBody">
<h1>Source code for mcp_server_webcrawl.crawlers.warc.tests</h1><div class="highlight"><pre>
<span></span><span class="kn">from</span> <span class="nn">mcp_server_webcrawl.crawlers.warc.crawler</span> <span class="kn">import</span> <span class="n">WarcCrawler</span>
<span class="kn">from</span> <span class="nn">mcp_server_webcrawl.crawlers.warc.adapter</span> <span class="kn">import</span> <span class="n">WarcManager</span>
<span class="kn">from</span> <span class="nn">mcp_server_webcrawl.crawlers.base.tests</span> <span class="kn">import</span> <span class="n">BaseCrawlerTests</span>
<span class="kn">from</span> <span class="nn">mcp_server_webcrawl.crawlers</span> <span class="kn">import</span> <span class="n">get_fixture_directory</span>
<span class="kn">from</span> <span class="nn">mcp_server_webcrawl.utils.logger</span> <span class="kn">import</span> <span class="n">get_logger</span>
<span class="n">EXAMPLE_WARC_ID</span><span class="p">:</span> <span class="nb">int</span> <span class="o">=</span> <span class="n">WarcManager</span><span class="o">.</span><span class="n">string_to_id</span><span class="p">(</span><span class="s2">"example.warc.gz"</span><span class="p">)</span>
<span class="n">PRAGMAR_WARC_ID</span><span class="p">:</span> <span class="nb">int</span> <span class="o">=</span> <span class="n">WarcManager</span><span class="o">.</span><span class="n">string_to_id</span><span class="p">(</span><span class="s2">"pragmar.warc.gz"</span><span class="p">)</span>
<span class="n">logger</span> <span class="o">=</span> <span class="n">get_logger</span><span class="p">()</span>
<div class="viewcode-block" id="WarcTests">
<a class="viewcode-back" href="../../../../mcp_server_webcrawl.crawlers.warc.html#mcp_server_webcrawl.crawlers.warc.tests.WarcTests">[docs]</a>
<span class="k">class</span> <span class="nc">WarcTests</span><span class="p">(</span><span class="n">BaseCrawlerTests</span><span class="p">):</span>
<span class="w"> </span><span class="sd">"""</span>
<span class="sd"> Test suite for the WARC crawler implementation.</span>
<span class="sd"> Uses all wrapped test methods from BaseCrawlerTests.</span>
<span class="sd"> """</span>
<div class="viewcode-block" id="WarcTests.setUp">
<a class="viewcode-back" href="../../../../mcp_server_webcrawl.crawlers.warc.html#mcp_server_webcrawl.crawlers.warc.tests.WarcTests.setUp">[docs]</a>
<span class="k">def</span> <span class="nf">setUp</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
<span class="w"> </span><span class="sd">"""</span>
<span class="sd"> Set up the test environment with fixture data.</span>
<span class="sd"> """</span>
<span class="nb">super</span><span class="p">()</span><span class="o">.</span><span class="n">setUp</span><span class="p">()</span>
<span class="bp">self</span><span class="o">.</span><span class="n">_datasrc</span> <span class="o">=</span> <span class="n">get_fixture_directory</span><span class="p">()</span> <span class="o">/</span> <span class="s2">"warc"</span></div>
<div class="viewcode-block" id="WarcTests.test_warc_pulse">
<a class="viewcode-back" href="../../../../mcp_server_webcrawl.crawlers.warc.html#mcp_server_webcrawl.crawlers.warc.tests.WarcTests.test_warc_pulse">[docs]</a>
<span class="k">def</span> <span class="nf">test_warc_pulse</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
<span class="w"> </span><span class="sd">"""</span>
<span class="sd"> Test basic crawler initialization.</span>
<span class="sd"> """</span>
<span class="n">crawler</span> <span class="o">=</span> <span class="n">WarcCrawler</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_datasrc</span><span class="p">)</span>
<span class="bp">self</span><span class="o">.</span><span class="n">assertIsNotNone</span><span class="p">(</span><span class="n">crawler</span><span class="p">)</span>
<span class="bp">self</span><span class="o">.</span><span class="n">assertTrue</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_datasrc</span><span class="o">.</span><span class="n">is_dir</span><span class="p">())</span></div>
<div class="viewcode-block" id="WarcTests.test_warc_sites">
<a class="viewcode-back" href="../../../../mcp_server_webcrawl.crawlers.warc.html#mcp_server_webcrawl.crawlers.warc.tests.WarcTests.test_warc_sites">[docs]</a>
<span class="k">def</span> <span class="nf">test_warc_sites</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
<span class="w"> </span><span class="sd">"""</span>
<span class="sd"> Test site retrieval API functionality.</span>
<span class="sd"> """</span>
<span class="n">crawler</span> <span class="o">=</span> <span class="n">WarcCrawler</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_datasrc</span><span class="p">)</span>
<span class="bp">self</span><span class="o">.</span><span class="n">run_pragmar_site_tests</span><span class="p">(</span><span class="n">crawler</span><span class="p">,</span> <span class="n">PRAGMAR_WARC_ID</span><span class="p">)</span></div>
<div class="viewcode-block" id="WarcTests.test_warc_search">
<a class="viewcode-back" href="../../../../mcp_server_webcrawl.crawlers.warc.html#mcp_server_webcrawl.crawlers.warc.tests.WarcTests.test_warc_search">[docs]</a>
<span class="k">def</span> <span class="nf">test_warc_search</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
<span class="w"> </span><span class="sd">"""</span>
<span class="sd"> Test boolean search functionality</span>
<span class="sd"> """</span>
<span class="n">crawler</span> <span class="o">=</span> <span class="n">WarcCrawler</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_datasrc</span><span class="p">)</span>
<span class="bp">self</span><span class="o">.</span><span class="n">run_pragmar_search_tests</span><span class="p">(</span><span class="n">crawler</span><span class="p">,</span> <span class="n">PRAGMAR_WARC_ID</span><span class="p">)</span></div>
<div class="viewcode-block" id="WarcTests.test_warc_resources">
<a class="viewcode-back" href="../../../../mcp_server_webcrawl.crawlers.warc.html#mcp_server_webcrawl.crawlers.warc.tests.WarcTests.test_warc_resources">[docs]</a>
<span class="k">def</span> <span class="nf">test_warc_resources</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
<span class="w"> </span><span class="sd">"""</span>
<span class="sd"> Test resource retrieval API functionality with various parameters.</span>
<span class="sd"> """</span>
<span class="n">crawler</span> <span class="o">=</span> <span class="n">WarcCrawler</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_datasrc</span><span class="p">)</span>
<span class="bp">self</span><span class="o">.</span><span class="n">run_sites_resources_tests</span><span class="p">(</span><span class="n">crawler</span><span class="p">,</span> <span class="n">PRAGMAR_WARC_ID</span><span class="p">,</span> <span class="n">EXAMPLE_WARC_ID</span><span class="p">)</span></div>
<span class="c1"># pragmar WARC fixture legit contains no images</span>
<span class="c1"># may be default behavior of wget WARC gen, not sure</span>
<span class="c1"># this is a blind spot</span>
<span class="c1"># def test_warc_images(self):</span>
<span class="c1"># """</span>
<span class="c1"># Test WARC-specific image handling and thumbnails.</span>
<span class="c1"># """</span>
<span class="c1"># crawler = WarcCrawler(self._datasrc)</span>
<span class="c1"># self.run_pragmar_image_tests(crawler, PRAGMAR_WARC_ID)</span>
<div class="viewcode-block" id="WarcTests.test_warc_sorts">
<a class="viewcode-back" href="../../../../mcp_server_webcrawl.crawlers.warc.html#mcp_server_webcrawl.crawlers.warc.tests.WarcTests.test_warc_sorts">[docs]</a>
<span class="k">def</span> <span class="nf">test_warc_sorts</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
<span class="w"> </span><span class="sd">"""</span>
<span class="sd"> Test random sort functionality using the '?' sort parameter.</span>
<span class="sd"> """</span>
<span class="n">crawler</span> <span class="o">=</span> <span class="n">WarcCrawler</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_datasrc</span><span class="p">)</span>
<span class="bp">self</span><span class="o">.</span><span class="n">run_pragmar_sort_tests</span><span class="p">(</span><span class="n">crawler</span><span class="p">,</span> <span class="n">PRAGMAR_WARC_ID</span><span class="p">)</span></div>
<div class="viewcode-block" id="WarcTests.test_warc_content_parsing">
<a class="viewcode-back" href="../../../../mcp_server_webcrawl.crawlers.warc.html#mcp_server_webcrawl.crawlers.warc.tests.WarcTests.test_warc_content_parsing">[docs]</a>
<span class="k">def</span> <span class="nf">test_warc_content_parsing</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
<span class="w"> </span><span class="sd">"""</span>
<span class="sd"> Test content type detection and parsing for WARC files.</span>
<span class="sd"> """</span>
<span class="n">crawler</span> <span class="o">=</span> <span class="n">WarcCrawler</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_datasrc</span><span class="p">)</span>
<span class="bp">self</span><span class="o">.</span><span class="n">run_pragmar_content_tests</span><span class="p">(</span><span class="n">crawler</span><span class="p">,</span> <span class="n">PRAGMAR_WARC_ID</span><span class="p">,</span> <span class="kc">True</span><span class="p">)</span></div>
<div class="viewcode-block" id="WarcTests.test_report">
<a class="viewcode-back" href="../../../../mcp_server_webcrawl.crawlers.warc.html#mcp_server_webcrawl.crawlers.warc.tests.WarcTests.test_report">[docs]</a>
<span class="k">def</span> <span class="nf">test_report</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
<span class="w"> </span><span class="sd">"""</span>
<span class="sd"> Run test report, save to data directory.</span>
<span class="sd"> """</span>
<span class="n">crawler</span> <span class="o">=</span> <span class="n">WarcCrawler</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_datasrc</span><span class="p">)</span>
<span class="n">logger</span><span class="o">.</span><span class="n">info</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">run_pragmar_report</span><span class="p">(</span><span class="n">crawler</span><span class="p">,</span> <span class="n">PRAGMAR_WARC_ID</span><span class="p">,</span> <span class="s2">"WARC"</span><span class="p">))</span></div>
</div>
</pre></div>
</div>
</div>
<footer>
<hr/>
<div role="contentinfo">
<p>© Copyright 2025, pragmar.</p>
</div>
Built with <a href="https://www.sphinx-doc.org/">Sphinx</a> using a
<a href="https://github.com/readthedocs/sphinx_rtd_theme">theme</a>
provided by <a href="https://readthedocs.org">Read the Docs</a>.
</footer>
</div>
</div>
</section>
</div>
<script>
jQuery(function () {
SphinxRtdTheme.Navigation.enable(true);
});
</script>
</body>
</html>
```
--------------------------------------------------------------------------------
/docs/_modules/mcp_server_webcrawl/utils/logger.html:
--------------------------------------------------------------------------------
```html
<!DOCTYPE html>
<html class="writer-html5" lang="en" data-content_root="../../../">
<head>
<meta charset="utf-8" />
<meta name="viewport" content="width=device-width, initial-scale=1.0" />
<title>mcp_server_webcrawl.utils.logger — mcp-server-webcrawl documentation</title>
<link rel="stylesheet" type="text/css" href="../../../_static/pygments.css?v=80d5e7a1" />
<link rel="stylesheet" type="text/css" href="../../../_static/css/theme.css?v=e59714d7" />
<script src="../../../_static/jquery.js?v=5d32c60e"></script>
<script src="../../../_static/_sphinx_javascript_frameworks_compat.js?v=2cd50e6c"></script>
<script src="../../../_static/documentation_options.js?v=5929fcd5"></script>
<script src="../../../_static/doctools.js?v=888ff710"></script>
<script src="../../../_static/sphinx_highlight.js?v=dc90522c"></script>
<script src="../../../_static/js/theme.js"></script>
<link rel="index" title="Index" href="../../../genindex.html" />
<link rel="search" title="Search" href="../../../search.html" />
</head>
<body class="wy-body-for-nav">
<div class="wy-grid-for-nav">
<nav data-toggle="wy-nav-shift" class="wy-nav-side">
<div class="wy-side-scroll">
<div class="wy-side-nav-search" >
<a href="../../../index.html" class="icon icon-home">
mcp-server-webcrawl
</a>
<div role="search">
<form id="rtd-search-form" class="wy-form" action="../../../search.html" method="get">
<input type="text" name="q" placeholder="Search docs" aria-label="Search docs" />
<input type="hidden" name="check_keywords" value="yes" />
<input type="hidden" name="area" value="default" />
</form>
</div>
</div><div class="wy-menu wy-menu-vertical" data-spy="affix" role="navigation" aria-label="Navigation menu">
<p class="caption" role="heading"><span class="caption-text">Contents:</span></p>
<ul>
<li class="toctree-l1"><a class="reference internal" href="../../../installation.html">Installation</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../guides.html">Setup Guides</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../usage.html">Usage</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../prompts.html">Prompt Routines</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../interactive.html">Interactive Mode</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../modules.html">mcp_server_webcrawl</a></li>
</ul>
</div>
</div>
</nav>
<section data-toggle="wy-nav-shift" class="wy-nav-content-wrap"><nav class="wy-nav-top" aria-label="Mobile navigation menu" >
<i data-toggle="wy-nav-top" class="fa fa-bars"></i>
<a href="../../../index.html">mcp-server-webcrawl</a>
</nav>
<div class="wy-nav-content">
<div class="rst-content">
<div role="navigation" aria-label="Page navigation">
<ul class="wy-breadcrumbs">
<li><a href="../../../index.html" class="icon icon-home" aria-label="Home"></a></li>
<li class="breadcrumb-item"><a href="../../index.html">Module code</a></li>
<li class="breadcrumb-item"><a href="../utils.html">mcp_server_webcrawl.utils</a></li>
<li class="breadcrumb-item active">mcp_server_webcrawl.utils.logger</li>
<li class="wy-breadcrumbs-aside">
</li>
</ul>
<hr/>
</div>
<div role="main" class="document" itemscope="itemscope" itemtype="http://schema.org/Article">
<div itemprop="articleBody">
<h1>Source code for mcp_server_webcrawl.utils.logger</h1><div class="highlight"><pre>
<span></span><span class="kn">import</span> <span class="nn">logging</span>
<span class="kn">from</span> <span class="nn">pathlib</span> <span class="kn">import</span> <span class="n">Path</span>
<span class="kn">from</span> <span class="nn">typing</span> <span class="kn">import</span> <span class="n">Final</span>
<span class="kn">import</span> <span class="nn">mcp_server_webcrawl.settings</span> <span class="k">as</span> <span class="nn">settings</span>
<span class="kn">from</span> <span class="nn">mcp_server_webcrawl.settings</span> <span class="kn">import</span> <span class="n">DEBUG</span><span class="p">,</span> <span class="n">DATA_DIRECTORY</span>
<span class="n">DEFAULT_LOG_KEY</span><span class="p">:</span> <span class="n">Final</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="s2">"mcp-server-webcrawl"</span>
<span class="n">DEFAULT_LOG_PATH</span><span class="p">:</span> <span class="n">Final</span><span class="p">[</span><span class="n">Path</span><span class="p">]</span> <span class="o">=</span> <span class="n">DATA_DIRECTORY</span> <span class="o">/</span> <span class="s2">"mcp-server-webcrawl.log"</span>
<span class="n">DEFAULT_LOG_LEVEL</span><span class="p">:</span> <span class="n">Final</span><span class="p">[</span><span class="nb">int</span><span class="p">]</span> <span class="o">=</span> <span class="n">logging</span><span class="o">.</span><span class="n">WARNING</span>
<div class="viewcode-block" id="get_logger_configuration">
<a class="viewcode-back" href="../../../mcp_server_webcrawl.utils.html#mcp_server_webcrawl.utils.logger.get_logger_configuration">[docs]</a>
<span class="k">def</span> <span class="nf">get_logger_configuration</span><span class="p">()</span> <span class="o">-></span> <span class="nb">tuple</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="n">Path</span><span class="p">,</span> <span class="nb">int</span><span class="p">]:</span>
<span class="w"> </span><span class="sd">"""</span>
<span class="sd"> Get log name, path, and level (in that order)</span>
<span class="sd"> Returns:</span>
<span class="sd"> tuple[str, Path, int]: A tuple containing name, path, and level</span>
<span class="sd"> """</span>
<span class="n">log_level</span><span class="p">:</span> <span class="nb">int</span> <span class="o">=</span> <span class="n">logging</span><span class="o">.</span><span class="n">DEBUG</span> <span class="k">if</span> <span class="n">DEBUG</span> <span class="k">else</span> <span class="nb">getattr</span><span class="p">(</span><span class="n">settings</span><span class="p">,</span> <span class="s2">"LOG_LEVEL"</span><span class="p">,</span> <span class="n">DEFAULT_LOG_LEVEL</span><span class="p">)</span>
<span class="n">log_path</span><span class="p">:</span> <span class="n">Path</span> <span class="o">=</span> <span class="nb">getattr</span><span class="p">(</span><span class="n">settings</span><span class="p">,</span> <span class="s2">"LOG_PATH"</span><span class="p">,</span> <span class="n">DEFAULT_LOG_PATH</span><span class="p">)</span>
<span class="k">return</span> <span class="p">(</span><span class="n">DEFAULT_LOG_KEY</span><span class="p">,</span> <span class="n">log_path</span><span class="p">,</span> <span class="n">log_level</span><span class="p">)</span></div>
<div class="viewcode-block" id="get_logger">
<a class="viewcode-back" href="../../../mcp_server_webcrawl.utils.html#mcp_server_webcrawl.utils.logger.get_logger">[docs]</a>
<span class="k">def</span> <span class="nf">get_logger</span><span class="p">()</span> <span class="o">-></span> <span class="n">logging</span><span class="o">.</span><span class="n">Logger</span><span class="p">:</span>
<span class="w"> </span><span class="sd">"""</span>
<span class="sd"> Get logger, usually in order to write to it</span>
<span class="sd"> Returns:</span>
<span class="sd"> Logger: a writable logging object (error/warn/info/debug)</span>
<span class="sd"> """</span>
<span class="p">(</span><span class="n">log_name</span><span class="p">,</span> <span class="n">_</span><span class="p">,</span> <span class="n">_</span><span class="p">)</span> <span class="o">=</span> <span class="n">get_logger_configuration</span><span class="p">()</span>
<span class="k">return</span> <span class="n">logging</span><span class="o">.</span><span class="n">getLogger</span><span class="p">(</span><span class="n">log_name</span><span class="p">)</span></div>
<div class="viewcode-block" id="initialize_logger">
<a class="viewcode-back" href="../../../mcp_server_webcrawl.utils.html#mcp_server_webcrawl.utils.logger.initialize_logger">[docs]</a>
<span class="k">def</span> <span class="nf">initialize_logger</span><span class="p">()</span> <span class="o">-></span> <span class="kc">None</span><span class="p">:</span>
<span class="w"> </span><span class="sd">"""</span>
<span class="sd"> Validate and set up logger for writing</span>
<span class="sd"> Returns:</span>
<span class="sd"> None</span>
<span class="sd"> """</span>
<span class="p">(</span><span class="n">log_name</span><span class="p">,</span> <span class="n">log_path</span><span class="p">,</span> <span class="n">log_level</span><span class="p">)</span> <span class="o">=</span> <span class="n">get_logger_configuration</span><span class="p">()</span>
<span class="k">if</span> <span class="n">log_level</span> <span class="o">==</span> <span class="n">logging</span><span class="o">.</span><span class="n">NOTSET</span><span class="p">:</span>
<span class="c1"># don't set up anything, named logging will effectively evaporate</span>
<span class="k">return</span>
<span class="k">assert</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">log_level</span><span class="p">,</span> <span class="nb">int</span><span class="p">)</span> <span class="ow">and</span> <span class="n">log_level</span> <span class="o">!=</span> <span class="mi">0</span><span class="p">,</span> <span class="s2">"LOG_LEVEL must be set"</span>
<span class="k">assert</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">log_path</span><span class="p">,</span> <span class="n">Path</span><span class="p">),</span> <span class="s2">"LOG_PATH must be a Path object"</span>
<span class="k">assert</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">log_name</span><span class="p">,</span> <span class="nb">str</span><span class="p">)</span> <span class="ow">and</span> <span class="n">log_name</span><span class="o">.</span><span class="n">strip</span><span class="p">()</span> <span class="o">!=</span> <span class="s2">""</span><span class="p">,</span> <span class="s2">"LOG_NAME must be a non-empty string"</span>
<span class="k">assert</span> <span class="nb">all</span><span class="p">(</span><span class="n">c</span><span class="o">.</span><span class="n">isalpha</span><span class="p">()</span> <span class="ow">or</span> <span class="n">c</span> <span class="ow">in</span> <span class="s2">"-_"</span> <span class="k">for</span> <span class="n">c</span> <span class="ow">in</span> <span class="n">log_name</span><span class="p">),</span> <span class="s2">"LOG_NAME must contain only A-Z, a-z, hyphens, and underscores"</span>
<span class="c1"># handle custom log paths differently, don't generate directories</span>
<span class="k">if</span> <span class="s2">".mcp_server_webcrawl"</span> <span class="ow">in</span> <span class="nb">str</span><span class="p">(</span><span class="n">log_path</span><span class="p">):</span>
<span class="n">log_path</span><span class="o">.</span><span class="n">parent</span><span class="o">.</span><span class="n">mkdir</span><span class="p">(</span><span class="n">parents</span><span class="o">=</span><span class="kc">True</span><span class="p">,</span> <span class="n">exist_ok</span><span class="o">=</span><span class="kc">True</span><span class="p">)</span>
<span class="k">else</span><span class="p">:</span>
<span class="k">assert</span> <span class="n">log_path</span><span class="o">.</span><span class="n">parent</span><span class="o">.</span><span class="n">exists</span><span class="p">()</span> <span class="ow">and</span> <span class="n">log_path</span><span class="o">.</span><span class="n">parent</span><span class="o">.</span><span class="n">is_dir</span><span class="p">(),</span> \
<span class="sa">f</span><span class="s2">"Custom parent directory `</span><span class="si">{</span><span class="n">log_path</span><span class="o">.</span><span class="n">parent</span><span class="si">}</span><span class="s2">` does not exist or is not a directory"</span>
<span class="n">logging</span><span class="o">.</span><span class="n">basicConfig</span><span class="p">(</span><span class="n">filename</span><span class="o">=</span><span class="nb">str</span><span class="p">(</span><span class="n">log_path</span><span class="p">),</span> <span class="n">filemode</span><span class="o">=</span><span class="s2">"w"</span><span class="p">,</span> <span class="n">level</span><span class="o">=</span><span class="n">log_level</span><span class="p">,</span>
<span class="nb">format</span><span class="o">=</span><span class="s2">"</span><span class="si">%(asctime)s</span><span class="s2"> - </span><span class="si">%(name)s</span><span class="s2"> - </span><span class="si">%(levelname)s</span><span class="s2"> - </span><span class="si">%(message)s</span><span class="s2">"</span><span class="p">,</span>
<span class="n">datefmt</span><span class="o">=</span><span class="s2">"%Y-%m-</span><span class="si">%d</span><span class="s2"> %H:%M:%S"</span><span class="p">,</span> <span class="n">encoding</span><span class="o">=</span><span class="s2">"utf-8"</span><span class="p">)</span>
<span class="n">logger</span><span class="p">:</span> <span class="n">logging</span><span class="o">.</span><span class="n">Logger</span> <span class="o">=</span> <span class="n">logging</span><span class="o">.</span><span class="n">getLogger</span><span class="p">(</span><span class="n">log_name</span><span class="p">)</span>
<span class="c1"># just set a few ops back, concurrent logger might not be ready</span>
<span class="k">if</span> <span class="n">log_level</span> <span class="o"><=</span> <span class="n">logging</span><span class="o">.</span><span class="n">INFO</span><span class="p">:</span>
<span class="n">logger</span><span class="o">.</span><span class="n">info</span><span class="p">(</span><span class="s2">"🖥️ starting webcrawl MCP server"</span><span class="p">)</span>
<span class="n">log_extra</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="s2">"(Debug is True)"</span> <span class="k">if</span> <span class="n">DEBUG</span> <span class="k">else</span> <span class="s2">""</span>
<span class="n">logger</span><span class="o">.</span><span class="n">info</span><span class="p">(</span><span class="sa">f</span><span class="s2">"log level set to </span><span class="si">{</span><span class="n">log_level</span><span class="si">}</span><span class="s2"> </span><span class="si">{</span><span class="n">log_extra</span><span class="si">}</span><span class="s2">"</span><span class="p">)</span></div>
</pre></div>
</div>
</div>
<footer>
<hr/>
<div role="contentinfo">
<p>© Copyright 2025, pragmar.</p>
</div>
Built with <a href="https://www.sphinx-doc.org/">Sphinx</a> using a
<a href="https://github.com/readthedocs/sphinx_rtd_theme">theme</a>
provided by <a href="https://readthedocs.org">Read the Docs</a>.
</footer>
</div>
</div>
</section>
</div>
<script>
jQuery(function () {
SphinxRtdTheme.Navigation.enable(true);
});
</script>
</body>
</html>
```
--------------------------------------------------------------------------------
/docs/_modules/mcp_server_webcrawl/utils/querycache.html:
--------------------------------------------------------------------------------
```html
<!DOCTYPE html>
<html class="writer-html5" lang="en" data-content_root="../../../">
<head>
<meta charset="utf-8" />
<meta name="viewport" content="width=device-width, initial-scale=1.0" />
<title>mcp_server_webcrawl.utils.querycache — mcp-server-webcrawl documentation</title>
<link rel="stylesheet" type="text/css" href="../../../_static/pygments.css?v=80d5e7a1" />
<link rel="stylesheet" type="text/css" href="../../../_static/css/theme.css?v=e59714d7" />
<script src="../../../_static/jquery.js?v=5d32c60e"></script>
<script src="../../../_static/_sphinx_javascript_frameworks_compat.js?v=2cd50e6c"></script>
<script src="../../../_static/documentation_options.js?v=5929fcd5"></script>
<script src="../../../_static/doctools.js?v=888ff710"></script>
<script src="../../../_static/sphinx_highlight.js?v=dc90522c"></script>
<script src="../../../_static/js/theme.js"></script>
<link rel="index" title="Index" href="../../../genindex.html" />
<link rel="search" title="Search" href="../../../search.html" />
</head>
<body class="wy-body-for-nav">
<div class="wy-grid-for-nav">
<nav data-toggle="wy-nav-shift" class="wy-nav-side">
<div class="wy-side-scroll">
<div class="wy-side-nav-search" >
<a href="../../../index.html" class="icon icon-home">
mcp-server-webcrawl
</a>
<div role="search">
<form id="rtd-search-form" class="wy-form" action="../../../search.html" method="get">
<input type="text" name="q" placeholder="Search docs" aria-label="Search docs" />
<input type="hidden" name="check_keywords" value="yes" />
<input type="hidden" name="area" value="default" />
</form>
</div>
</div><div class="wy-menu wy-menu-vertical" data-spy="affix" role="navigation" aria-label="Navigation menu">
<p class="caption" role="heading"><span class="caption-text">Contents:</span></p>
<ul>
<li class="toctree-l1"><a class="reference internal" href="../../../installation.html">Installation</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../guides.html">Setup Guides</a></li>
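<li class="toctree-l1"><a class="reference internal" href="../../../usage.html">Usage</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../prompts.html">Prompt Routines</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../interactive.html">Interactive Mode</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../modules.html">mcp_server_webcrawl</a></li>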
</ul>
</div>
</div>
</nav>
<section data-toggle="wy-nav-shift" class="wy-nav-content-wrap"><nav class="wy-nav-top" aria-label="Mobile navigation menu" >
<i data-toggle="wy-nav-top" class="fa fa-bars"></i>
<a href="../../../index.html">mcp-server-webcrawl</a>
</nav>
<div class="wy-nav-content">
<div class="rst-content">
<div role="navigation" aria-label="Page navigation">
<ul class="wy-breadcrumbs">
<li><a href="../../../index.html" class="icon icon-home" aria-label="Home"></a></li>
<li class="breadcrumb-item"><a href="../../index.html">Module code</a></li>
<li class="breadcrumb-item"><a href="../utils.html">mcp_server_webcrawl.utils</a></li>
<li class="breadcrumb-item active">mcp_server_webcrawl.utils.querycache</li>
<li class="wy-breadcrumbs-aside">
</li>
</ul>
<hr/>
</div>
<div role="main" class="document" itemscope="itemscope" itemtype="http://schema.org/Article">
<div itemprop="articleBody">
<h1>Source code for mcp_server_webcrawl.utils.querycache</h1><div class="highlight"><pre>
<span></span><span class="kn">import</span> <span class="nn">hashlib</span>
<span class="kn">import</span> <span class="nn">time</span>
<div class="viewcode-block" id="QueryCountCache">
<a class="viewcode-back" href="../../../mcp_server_webcrawl.utils.html#mcp_server_webcrawl.utils.querycache.QueryCountCache">[docs]</a>
<span class="k">class</span> <span class="nc">QueryCountCache</span><span class="p">:</span>
<span class="w"> </span><span class="sd">"""</span>
<span class="sd"> A cache for storing total count results from database queries.</span>
<span class="sd"> Only caches the count integer values, as these are reusable and light.</span>
<span class="sd"> """</span>
<div class="viewcode-block" id="QueryCountCache.__init__">
<a class="viewcode-back" href="../../../mcp_server_webcrawl.utils.html#mcp_server_webcrawl.utils.querycache.QueryCountCache.__init__">[docs]</a>
<span class="k">def</span> <span class="fm">__init__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="nb">max</span><span class="p">:</span> <span class="nb">int</span> <span class="o">=</span> <span class="mi">250</span><span class="p">,</span> <span class="n">ttl</span><span class="p">:</span> <span class="nb">int</span> <span class="o">=</span> <span class="mi">900</span><span class="p">):</span>
<span class="w"> </span><span class="sd">"""</span>
<span class="sd"> Initialize the query count cache.</span>
<span class="sd"> Parameters:</span>
<span class="sd"> max: Maximum number of entries to store in the cache</span>
<span class="sd"> ttl: Time-to-live for cache entries in seconds</span>
<span class="sd"> """</span>
<span class="bp">self</span><span class="o">.</span><span class="n">_cache</span><span class="p">:</span> <span class="nb">dict</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="nb">int</span><span class="p">]</span> <span class="o">=</span> <span class="p">{}</span>
<span class="bp">self</span><span class="o">.</span><span class="n">_timestamps</span><span class="p">:</span> <span class="nb">dict</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="nb">float</span><span class="p">]</span> <span class="o">=</span> <span class="p">{}</span>
<span class="bp">self</span><span class="o">.</span><span class="n">_max</span> <span class="o">=</span> <span class="nb">max</span>
<span class="bp">self</span><span class="o">.</span><span class="n">_ttl</span> <span class="o">=</span> <span class="n">ttl</span></div>
<span class="k">def</span> <span class="nf">_hash_query</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">statement</span><span class="p">:</span> <span class="nb">str</span><span class="p">,</span> <span class="n">params</span><span class="p">:</span> <span class="nb">dict</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="nb">int</span> <span class="o">|</span> <span class="nb">str</span><span class="p">])</span> <span class="o">-></span> <span class="nb">str</span><span class="p">:</span>
<span class="w"> </span><span class="sd">"""</span>
<span class="sd"> Generate a hash key from a query statement and parameters.</span>
<span class="sd"> Parameters:</span>
<span class="sd"> statement: SQL statement</span>
<span class="sd"> params: Query parameters</span>
<span class="sd"> Returns:</span>
<span class="sd"> MD5 hash of the combined query string</span>
<span class="sd"> """</span>
<span class="n">query</span> <span class="o">=</span> <span class="sa">f</span><span class="s2">"</span><span class="si">{</span><span class="n">statement</span><span class="si">}</span><span class="s2">:</span><span class="si">{</span><span class="nb">str</span><span class="p">(</span><span class="n">params</span><span class="p">)</span><span class="si">}</span><span class="s2">"</span>
<span class="k">return</span> <span class="n">hashlib</span><span class="o">.</span><span class="n">md5</span><span class="p">(</span><span class="n">query</span><span class="o">.</span><span class="n">encode</span><span class="p">())</span><span class="o">.</span><span class="n">hexdigest</span><span class="p">()</span>
<div class="viewcode-block" id="QueryCountCache.get">
<a class="viewcode-back" href="../../../mcp_server_webcrawl.utils.html#mcp_server_webcrawl.utils.querycache.QueryCountCache.get">[docs]</a>
<span class="k">def</span> <span class="nf">get</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">statement</span><span class="p">:</span> <span class="nb">str</span><span class="p">,</span> <span class="n">params</span><span class="p">:</span> <span class="nb">dict</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="nb">int</span> <span class="o">|</span> <span class="nb">str</span><span class="p">])</span> <span class="o">-></span> <span class="nb">int</span> <span class="o">|</span> <span class="kc">None</span><span class="p">:</span>
<span class="w"> </span><span class="sd">"""</span>
<span class="sd"> Get a cached count result if available and not expired.</span>
<span class="sd"> Parameters:</span>
<span class="sd"> statement: SQL statement</span>
<span class="sd"> params: Query parameters</span>
<span class="sd"> Returns:</span>
<span class="sd"> Cached count value or None if not found or expired</span>
<span class="sd"> """</span>
<span class="n">key</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_hash_query</span><span class="p">(</span><span class="n">statement</span><span class="p">,</span> <span class="n">params</span><span class="p">)</span>
<span class="k">if</span> <span class="n">key</span> <span class="ow">in</span> <span class="bp">self</span><span class="o">.</span><span class="n">_cache</span><span class="p">:</span>
<span class="k">if</span> <span class="n">time</span><span class="o">.</span><span class="n">time</span><span class="p">()</span> <span class="o">-</span> <span class="bp">self</span><span class="o">.</span><span class="n">_timestamps</span><span class="p">[</span><span class="n">key</span><span class="p">]</span> <span class="o">></span> <span class="bp">self</span><span class="o">.</span><span class="n">_ttl</span><span class="p">:</span>
<span class="k">del</span> <span class="bp">self</span><span class="o">.</span><span class="n">_cache</span><span class="p">[</span><span class="n">key</span><span class="p">]</span>
<span class="k">del</span> <span class="bp">self</span><span class="o">.</span><span class="n">_timestamps</span><span class="p">[</span><span class="n">key</span><span class="p">]</span>
<span class="k">return</span> <span class="kc">None</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_cache</span><span class="p">[</span><span class="n">key</span><span class="p">]</span>
<span class="k">return</span> <span class="kc">None</span></div>
<div class="viewcode-block" id="QueryCountCache.set">
<a class="viewcode-back" href="../../../mcp_server_webcrawl.utils.html#mcp_server_webcrawl.utils.querycache.QueryCountCache.set">[docs]</a>
<span class="k">def</span> <span class="nf">set</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">statement</span><span class="p">:</span> <span class="nb">str</span><span class="p">,</span> <span class="n">params</span><span class="p">:</span> <span class="nb">dict</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="nb">int</span> <span class="o">|</span> <span class="nb">str</span><span class="p">],</span> <span class="n">count</span><span class="p">:</span> <span class="nb">int</span><span class="p">)</span> <span class="o">-></span> <span class="kc">None</span><span class="p">:</span>
<span class="w"> </span><span class="sd">"""</span>
<span class="sd"> Store a count result in the cache.</span>
<span class="sd"> Parameters:</span>
<span class="sd"> statement: SQL statement</span>
<span class="sd"> params: Query parameters</span>
<span class="sd"> count: Count value to cache</span>
<span class="sd"> """</span>
<span class="n">key</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_hash_query</span><span class="p">(</span><span class="n">statement</span><span class="p">,</span> <span class="n">params</span><span class="p">)</span>
<span class="c1"># if cache is full, remove oldest entry</span>
<span class="k">if</span> <span class="nb">len</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_cache</span><span class="p">)</span> <span class="o">>=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_max</span> <span class="ow">and</span> <span class="n">key</span> <span class="ow">not</span> <span class="ow">in</span> <span class="bp">self</span><span class="o">.</span><span class="n">_cache</span><span class="p">:</span>
<span class="n">oldest_key</span> <span class="o">=</span> <span class="nb">min</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_timestamps</span><span class="p">,</span> <span class="n">key</span><span class="o">=</span><span class="k">lambda</span> <span class="n">k</span><span class="p">:</span> <span class="bp">self</span><span class="o">.</span><span class="n">_timestamps</span><span class="p">[</span><span class="n">k</span><span class="p">])</span>
<span class="k">del</span> <span class="bp">self</span><span class="o">.</span><span class="n">_cache</span><span class="p">[</span><span class="n">oldest_key</span><span class="p">]</span>
<span class="k">del</span> <span class="bp">self</span><span class="o">.</span><span class="n">_timestamps</span><span class="p">[</span><span class="n">oldest_key</span><span class="p">]</span>
<span class="c1"># store new entry</span>
<span class="bp">self</span><span class="o">.</span><span class="n">_cache</span><span class="p">[</span><span class="n">key</span><span class="p">]</span> <span class="o">=</span> <span class="n">count</span>
<span class="bp">self</span><span class="o">.</span><span class="n">_timestamps</span><span class="p">[</span><span class="n">key</span><span class="p">]</span> <span class="o">=</span> <span class="n">time</span><span class="o">.</span><span class="n">time</span><span class="p">()</span></div>
<div class="viewcode-block" id="QueryCountCache.clear">
<a class="viewcode-back" href="../../../mcp_server_webcrawl.utils.html#mcp_server_webcrawl.utils.querycache.QueryCountCache.clear">[docs]</a>
<span class="k">def</span> <span class="nf">clear</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-></span> <span class="kc">None</span><span class="p">:</span>
<span class="w"> </span><span class="sd">"""</span>
<span class="sd"> Clear all entries from the cache.</span>
<span class="sd"> """</span>
<span class="bp">self</span><span class="o">.</span><span class="n">_cache</span><span class="o">.</span><span class="n">clear</span><span class="p">()</span>
<span class="bp">self</span><span class="o">.</span><span class="n">_timestamps</span><span class="o">.</span><span class="n">clear</span><span class="p">()</span></div>
</div>
</pre></div>
</div>
</div>
<footer>
<hr/>
<div role="contentinfo">
<p>© Copyright 2025, pragmar.</p>
</div>
Built with <a href="https://www.sphinx-doc.org/">Sphinx</a> using a
<a href="https://github.com/readthedocs/sphinx_rtd_theme">theme</a>
provided by <a href="https://readthedocs.org">Read the Docs</a>.
</footer>
</div>
</div>
</section>
</div>
<script>
jQuery(function () {
SphinxRtdTheme.Navigation.enable(true);
});
</script>
</body>
</html>
```
--------------------------------------------------------------------------------
/docs/guides/katana.html:
--------------------------------------------------------------------------------
```html
<!DOCTYPE html>
<html class="writer-html5" lang="en" data-content_root="../">
<head>
<meta charset="utf-8" />
<meta name="viewport" content="width=device-width, initial-scale=1.0" />
<title>Katana MCP Setup Guide — mcp-server-webcrawl documentation</title>
<link rel="stylesheet" type="text/css" href="../_static/pygments.css?v=80d5e7a1" />
<link rel="stylesheet" type="text/css" href="../_static/css/theme.css?v=e59714d7" />
<script src="../_static/jquery.js?v=5d32c60e"></script>
<script src="../_static/_sphinx_javascript_frameworks_compat.js?v=2cd50e6c"></script>
<script src="../_static/documentation_options.js?v=5929fcd5"></script>
<script src="../_static/doctools.js?v=888ff710"></script>
<script src="../_static/sphinx_highlight.js?v=dc90522c"></script>
<script src="../_static/js/theme.js"></script>
<link rel="index" title="Index" href="../genindex.html" />
<link rel="search" title="Search" href="../search.html" />
<link rel="next" title="SiteOne MCP Setup Guide" href="siteone.html" />
<link rel="prev" title="InterroBot MCP Setup Guide" href="interrobot.html" />
</head>
<body class="wy-body-for-nav">
<div class="wy-grid-for-nav">
<nav data-toggle="wy-nav-shift" class="wy-nav-side">
<div class="wy-side-scroll">
<div class="wy-side-nav-search" >
<a href="../index.html" class="icon icon-home">
mcp-server-webcrawl
</a>
<div role="search">
<form id="rtd-search-form" class="wy-form" action="../search.html" method="get">
<input type="text" name="q" placeholder="Search docs" aria-label="Search docs" />
<input type="hidden" name="check_keywords" value="yes" />
<input type="hidden" name="area" value="default" />
</form>
</div>
</div><div class="wy-menu wy-menu-vertical" data-spy="affix" role="navigation" aria-label="Navigation menu">
<p class="caption" role="heading"><span class="caption-text">Contents:</span></p>
<ul class="current">
<li class="toctree-l1"><a class="reference internal" href="../installation.html">Installation</a></li>
<li class="toctree-l1 current"><a class="reference internal" href="../guides.html">Setup Guides</a><ul class="current">
<li class="toctree-l2"><a class="reference internal" href="archivebox.html">ArchiveBox MCP Setup Guide</a></li>
<li class="toctree-l2"><a class="reference internal" href="httrack.html">HTTrack MCP Setup Guide</a></li>
<li class="toctree-l2"><a class="reference internal" href="interrobot.html">InterroBot MCP Setup Guide</a></li>
<li class="toctree-l2 current"><a class="current reference internal" href="#">Katana MCP Setup Guide</a></li>
<li class="toctree-l2"><a class="reference internal" href="siteone.html">SiteOne MCP Setup Guide</a></li>
<li class="toctree-l2"><a class="reference internal" href="warc.html">WARC MCP Setup Guide</a></li>
<li class="toctree-l2"><a class="reference internal" href="wget.html">wget MCP Setup Guide</a></li>
</ul>
</li>
<li class="toctree-l1"><a class="reference internal" href="../usage.html">Usage</a></li>
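<li class="toctree-l1"><a class="reference internal" href="../prompts.html">Prompt Routines</a></li>
<li class="toctree-l1"><a class="reference internal" href="../interactive.html">Interactive Mode</a></li>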
<li class="toctree-l1"><a class="reference internal" href="../modules.html">mcp_server_webcrawl</a></li>
</ul>
</div>
</div>
</nav>
<section data-toggle="wy-nav-shift" class="wy-nav-content-wrap"><nav class="wy-nav-top" aria-label="Mobile navigation menu" >
<i data-toggle="wy-nav-top" class="fa fa-bars"></i>
<a href="../index.html">mcp-server-webcrawl</a>
</nav>
<div class="wy-nav-content">
<div class="rst-content">
<div role="navigation" aria-label="Page navigation">
<ul class="wy-breadcrumbs">
<li><a href="../index.html" class="icon icon-home" aria-label="Home"></a></li>
<li class="breadcrumb-item"><a href="../guides.html">Setup Guides</a></li>
<li class="breadcrumb-item active">Katana MCP Setup Guide</li>
<li class="wy-breadcrumbs-aside">
<a href="../_sources/guides/katana.rst.txt" rel="nofollow"> View page source</a>
</li>
</ul>
<hr/>
</div>
<div role="main" class="document" itemscope="itemscope" itemtype="http://schema.org/Article">
<div itemprop="articleBody">
<section id="katana-mcp-setup-guide">
<h1>Katana MCP Setup Guide<a class="headerlink" href="#katana-mcp-setup-guide" title="Link to this heading"></a></h1>
<p>Instructions for setting up <a class="reference external" href="https://pragmar.com/mcp-server-webcrawl/">mcp-server-webcrawl</a> with <a class="reference external" href="https://github.com/projectdiscovery/katana">Katana</a> crawler.
This allows your LLM (e.g. Claude Desktop) to search content and metadata from websites you’ve crawled using Katana.</p>
<iframe width="560" height="315" src="https://www.youtube.com/embed/sOMaojm0R0Y" frameborder="0" allowfullscreen></iframe><p>Follow along with the video, or the step-action guide below.</p>
<section id="requirements">
<h2>Requirements<a class="headerlink" href="#requirements" title="Link to this heading"></a></h2>
<p>Before you begin, ensure you have:</p>
<ul class="simple">
<li><p><a class="reference external" href="https://claude.ai/download">Claude Desktop</a> installed</p></li>
<li><p><a class="reference external" href="https://python.org">Python</a> 3.10 or later installed</p></li>
<li><p><a class="reference external" href="https://go.dev/doc/install">Go programming language</a> installed</p></li>
<li><p><a class="reference external" href="https://github.com/projectdiscovery/katana">Katana crawler</a> installed</p></li>
<li><p>Basic familiarity with command line interfaces</p></li>
</ul>
</section>
<section id="what-is-katana">
<h2>What is Katana?<a class="headerlink" href="#what-is-katana" title="Link to this heading"></a></h2>
<p>Katana is an open-source web crawler from Project Discovery that offers:</p>
<ul class="simple">
<li><p>Fast and efficient web crawling capabilities</p></li>
<li><p>Command-line interface for flexibility and automation</p></li>
<li><p>Highly configurable crawling parameters</p></li>
<li><p>Ability to store complete HTTP responses for analysis</p></li>
</ul>
</section>
<section id="installation-steps">
<h2>Installation Steps<a class="headerlink" href="#installation-steps" title="Link to this heading"></a></h2>
<section id="install-mcp-server-webcrawl">
<h3>1. Install mcp-server-webcrawl<a class="headerlink" href="#install-mcp-server-webcrawl" title="Link to this heading"></a></h3>
<p>Open your terminal or command line and install the package:</p>
<div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="n">pip</span> <span class="n">install</span> <span class="n">mcp</span><span class="o">-</span><span class="n">server</span><span class="o">-</span><span class="n">webcrawl</span>
</pre></div>
</div>
<p>Verify installation was successful:</p>
<div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="n">mcp</span><span class="o">-</span><span class="n">server</span><span class="o">-</span><span class="n">webcrawl</span> <span class="o">--</span><span class="n">help</span>
</pre></div>
</div>
</section>
<section id="install-and-run-katana">
<h3>2. Install and Run Katana<a class="headerlink" href="#install-and-run-katana" title="Link to this heading"></a></h3>
<ol class="arabic">
<li><p>Verify Go is installed and on your PATH:</p>
<div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="n">go</span> <span class="n">version</span>
</pre></div>
</div>
</li>
<li><p>Install Katana using Go:</p>
<div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="n">go</span> <span class="n">install</span> <span class="n">github</span><span class="o">.</span><span class="n">com</span><span class="o">/</span><span class="n">projectdiscovery</span><span class="o">/</span><span class="n">katana</span><span class="o">/</span><span class="n">cmd</span><span class="o">/</span><span class="n">katana</span><span class="nd">@latest</span>
</pre></div>
</div>
</li>
<li><p>Create a directory for your crawls and run Katana with storage options:</p>
<div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="c1"># Create a directory for storing crawls</span>
<span class="n">mkdir</span> <span class="n">archives</span>
<span class="c1"># Run Katana with storage options</span>
<span class="n">katana</span> <span class="o">-</span><span class="n">u</span> <span class="n">https</span><span class="p">:</span><span class="o">//</span><span class="n">example</span><span class="o">.</span><span class="n">com</span> <span class="o">-</span><span class="n">store</span><span class="o">-</span><span class="n">response</span> <span class="o">-</span><span class="n">store</span><span class="o">-</span><span class="n">response</span><span class="o">-</span><span class="nb">dir</span> <span class="n">archives</span><span class="o">/</span><span class="n">example</span><span class="o">.</span><span class="n">com</span><span class="o">/</span>
</pre></div>
</div>
</li>
<li><p>Repeat for additional websites as needed:</p>
<div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="n">katana</span> <span class="o">-</span><span class="n">u</span> <span class="n">https</span><span class="p">:</span><span class="o">//</span><span class="n">pragmar</span><span class="o">.</span><span class="n">com</span> <span class="o">-</span><span class="n">store</span><span class="o">-</span><span class="n">response</span> <span class="o">-</span><span class="n">store</span><span class="o">-</span><span class="n">response</span><span class="o">-</span><span class="nb">dir</span> <span class="n">archives</span><span class="o">/</span><span class="n">pragmar</span><span class="o">.</span><span class="n">com</span><span class="o">/</span>
</pre></div>
</div>
</li>
</ol>
<p>In this case, the ./archives directory is the datasrc. Katana creates
a separate host directory for each unique host within
the specified directory, so a nested path such as
example.com/example.com is expected. Sites with external dependencies will branch
out by origin host in the -store-response-dir, and remain searchable as a
single site.</p>
</section>
<section id="configure-claude-desktop">
<h3>3. Configure Claude Desktop<a class="headerlink" href="#configure-claude-desktop" title="Link to this heading"></a></h3>
<ol class="arabic simple">
<li><p>Open Claude Desktop</p></li>
<li><p>Go to <strong>File → Settings → Developer → Edit Config</strong></p></li>
<li><p>Add the following configuration (modify paths as needed):</p></li>
</ol>
<div class="highlight-json notranslate"><div class="highlight"><pre><span></span><span class="p">{</span>
<span class="w"> </span><span class="nt">"mcpServers"</span><span class="p">:</span><span class="w"> </span><span class="p">{</span>
<span class="w"> </span><span class="nt">"webcrawl"</span><span class="p">:</span><span class="w"> </span><span class="p">{</span>
<span class="w"> </span><span class="nt">"command"</span><span class="p">:</span><span class="w"> </span><span class="s2">"/path/to/mcp-server-webcrawl"</span><span class="p">,</span>
<span class="w"> </span><span class="nt">"args"</span><span class="p">:</span><span class="w"> </span><span class="p">[</span><span class="s2">"--crawler"</span><span class="p">,</span><span class="w"> </span><span class="s2">"katana"</span><span class="p">,</span><span class="w"> </span><span class="s2">"--datasrc"</span><span class="p">,</span>
<span class="w"> </span><span class="s2">"/path/to/katana/crawls/"</span><span class="p">]</span>
<span class="w"> </span><span class="p">}</span>
<span class="w"> </span><span class="p">}</span>
<span class="p">}</span>
</pre></div>
</div>
<div class="admonition note">
<p class="admonition-title">Note</p>
<ul class="simple">
<li><p>On Windows, use <code class="docutils literal notranslate"><span class="pre">"mcp-server-webcrawl"</span></code> as the command</p></li>
<li><p>On macOS/Linux, use the absolute path (output of <code class="docutils literal notranslate"><span class="pre">which</span> <span class="pre">mcp-server-webcrawl</span></code>)</p></li>
<li><p>Change <code class="docutils literal notranslate"><span class="pre">/path/to/katana/crawls/</span></code> to the actual path where you stored your Katana crawls</p></li>
</ul>
</div>
<ol class="arabic simple" start="4">
<li><p>Save the file and <strong>completely exit</strong> Claude Desktop (not just close the window)</p></li>
<li><p>Restart Claude Desktop</p></li>
</ol>
</section>
<section id="verify-and-use">
<h3>4. Verify and Use<a class="headerlink" href="#verify-and-use" title="Link to this heading"></a></h3>
<ol class="arabic">
<li><p>In Claude Desktop, you should now see MCP tools available under Search and Tools</p></li>
<li><p>Ask Claude to list your crawled sites:</p>
<div class="highlight-default notranslate"><div class="highlight"><pre><span></span>Can you list the crawled sites available?
</pre></div>
</div>
</li>
<li><p>Try searching content from your crawls:</p>
<div class="highlight-default notranslate"><div class="highlight"><pre><span></span>Can you find information about [topic] on [crawled site]?
</pre></div>
</div>
</li>
<li><p>Try specialized searches that use Katana’s comprehensive data collection:</p>
<div class="highlight-default notranslate"><div class="highlight"><pre><span></span>Can you find all the help pages on this site and tell me how they're different?
</pre></div>
</div>
</li>
</ol>
</section>
</section>
<section id="troubleshooting">
<h2>Troubleshooting<a class="headerlink" href="#troubleshooting" title="Link to this heading"></a></h2>
<ul class="simple">
<li><p>If Claude doesn’t show MCP tools after restart, verify your configuration file is correctly formatted</p></li>
<li><p>Ensure Python and mcp-server-webcrawl are properly installed</p></li>
<li><p>Check that your Katana crawls directory path in the configuration is correct</p></li>
<li><p>Make sure the <code class="docutils literal notranslate"><span class="pre">-store-response</span></code> flag was used during crawling, as this is required to save content</p></li>
<li><p>Verify that each crawl completed successfully and files were saved to the expected location</p></li>
<li><p>Remember that the first time you use a function, Claude will ask for permission</p></li>
</ul>
<p>For more details, including API documentation and other crawler options, visit the <a class="reference external" href="https://github.com/pragmar/mcp-server-webcrawl">mcp-server-webcrawl documentation</a>.</p>
</section>
</section>
</div>
</div>
<footer><div class="rst-footer-buttons" role="navigation" aria-label="Footer">
<a href="interrobot.html" class="btn btn-neutral float-left" title="InterroBot MCP Setup Guide" accesskey="p" rel="prev"><span class="fa fa-arrow-circle-left" aria-hidden="true"></span> Previous</a>
<a href="siteone.html" class="btn btn-neutral float-right" title="SiteOne MCP Setup Guide" accesskey="n" rel="next">Next <span class="fa fa-arrow-circle-right" aria-hidden="true"></span></a>
</div>
<hr/>
<div role="contentinfo">
<p>© Copyright 2025, pragmar.</p>
</div>
Built with <a href="https://www.sphinx-doc.org/">Sphinx</a> using a
<a href="https://github.com/readthedocs/sphinx_rtd_theme">theme</a>
provided by <a href="https://readthedocs.org">Read the Docs</a>.
</footer>
</div>
</div>
</section>
</div>
<script>
jQuery(function () {
SphinxRtdTheme.Navigation.enable(true);
});
</script>
</body>
</html>
```
--------------------------------------------------------------------------------
/docs/guides/archivebox.html:
--------------------------------------------------------------------------------
```html
<!DOCTYPE html>
<html class="writer-html5" lang="en" data-content_root="../">
<head>
<meta charset="utf-8" /><meta name="viewport" content="width=device-width, initial-scale=1" />
<meta name="viewport" content="width=device-width, initial-scale=1.0" />
<title>ArchiveBox MCP Setup Guide — mcp-server-webcrawl documentation</title>
<link rel="stylesheet" type="text/css" href="../_static/pygments.css?v=80d5e7a1" />
<link rel="stylesheet" type="text/css" href="../_static/css/theme.css?v=e59714d7" />
<script src="../_static/jquery.js?v=5d32c60e"></script>
<script src="../_static/_sphinx_javascript_frameworks_compat.js?v=2cd50e6c"></script>
<script src="../_static/documentation_options.js?v=5929fcd5"></script>
<script src="../_static/doctools.js?v=888ff710"></script>
<script src="../_static/sphinx_highlight.js?v=dc90522c"></script>
<script src="../_static/js/theme.js"></script>
<link rel="index" title="Index" href="../genindex.html" />
<link rel="search" title="Search" href="../search.html" />
<link rel="next" title="HTTrack MCP Setup Guide" href="httrack.html" />
<link rel="prev" title="Setup Guides" href="../guides.html" />
</head>
<body class="wy-body-for-nav">
<div class="wy-grid-for-nav">
<nav data-toggle="wy-nav-shift" class="wy-nav-side">
<div class="wy-side-scroll">
<div class="wy-side-nav-search" >
<a href="../index.html" class="icon icon-home">
mcp-server-webcrawl
</a>
<div role="search">
<form id="rtd-search-form" class="wy-form" action="../search.html" method="get">
<input type="text" name="q" placeholder="Search docs" aria-label="Search docs" />
<input type="hidden" name="check_keywords" value="yes" />
<input type="hidden" name="area" value="default" />
</form>
</div>
</div><div class="wy-menu wy-menu-vertical" data-spy="affix" role="navigation" aria-label="Navigation menu">
<p class="caption" role="heading"><span class="caption-text">Contents:</span></p>
<ul class="current">
<li class="toctree-l1"><a class="reference internal" href="../installation.html">Installation</a></li>
<li class="toctree-l1 current"><a class="reference internal" href="../guides.html">Setup Guides</a><ul class="current">
<li class="toctree-l2 current"><a class="current reference internal" href="#">ArchiveBox MCP Setup Guide</a></li>
<li class="toctree-l2"><a class="reference internal" href="httrack.html">HTTrack MCP Setup Guide</a></li>
<li class="toctree-l2"><a class="reference internal" href="interrobot.html">InterroBot MCP Setup Guide</a></li>
<li class="toctree-l2"><a class="reference internal" href="katana.html">Katana MCP Setup Guide</a></li>
<li class="toctree-l2"><a class="reference internal" href="siteone.html">SiteOne MCP Setup Guide</a></li>
<li class="toctree-l2"><a class="reference internal" href="warc.html">WARC MCP Setup Guide</a></li>
<li class="toctree-l2"><a class="reference internal" href="wget.html">wget MCP Setup Guide</a></li>
</ul>
</li>
<li class="toctree-l1"><a class="reference internal" href="../usage.html">Usage</a></li>
<li class="toctree-l1"><a class="reference internal" href="../prompts.html">Prompt Routines</a></li>
<li class="toctree-l1"><a class="reference internal" href="../modules.html">mcp_server_webcrawl</a></li>
</ul>
</div>
</div>
</nav>
<section data-toggle="wy-nav-shift" class="wy-nav-content-wrap"><nav class="wy-nav-top" aria-label="Mobile navigation menu" >
<i data-toggle="wy-nav-top" class="fa fa-bars"></i>
<a href="../index.html">mcp-server-webcrawl</a>
</nav>
<div class="wy-nav-content">
<div class="rst-content">
<div role="navigation" aria-label="Page navigation">
<ul class="wy-breadcrumbs">
<li><a href="../index.html" class="icon icon-home" aria-label="Home"></a></li>
<li class="breadcrumb-item"><a href="../guides.html">Setup Guides</a></li>
<li class="breadcrumb-item active">ArchiveBox MCP Setup Guide</li>
<li class="wy-breadcrumbs-aside">
<a href="../_sources/guides/archivebox.rst.txt" rel="nofollow"> View page source</a>
</li>
</ul>
<hr/>
</div>
<div role="main" class="document" itemscope="itemscope" itemtype="http://schema.org/Article">
<div itemprop="articleBody">
<section id="archivebox-mcp-setup-guide">
<h1>ArchiveBox MCP Setup Guide<a class="headerlink" href="#archivebox-mcp-setup-guide" title="Link to this heading"></a></h1>
<p>Instructions for setting up <a class="reference external" href="https://pragmar.com/mcp-server-webcrawl/">mcp-server-webcrawl</a> with <a class="reference external" href="https://archivebox.io/">ArchiveBox</a>.
This allows your LLM (e.g. Claude Desktop) to search content and metadata from websites you’ve archived using ArchiveBox.</p>
<iframe width="560" height="315" src="https://www.youtube.com/embed/0KFqhSYf3f4" frameborder="0" allowfullscreen></iframe><p>Follow along with the video, or the step-action guide below.</p>
<section id="requirements">
<h2>Requirements<a class="headerlink" href="#requirements" title="Link to this heading"></a></h2>
<p>Before you begin, ensure you have:</p>
<ul class="simple">
<li><p><a class="reference external" href="https://claude.ai/download">Claude Desktop</a> installed</p></li>
<li><p><a class="reference external" href="https://python.org">Python</a> 3.10 or later installed</p></li>
<li><p><a class="reference external" href="https://archivebox.io/">ArchiveBox</a> installed</p></li>
<li><p>Basic familiarity with command line interfaces</p></li>
</ul>
</section>
<section id="what-is-archivebox">
<h2>What is ArchiveBox?<a class="headerlink" href="#what-is-archivebox" title="Link to this heading"></a></h2>
<p>ArchiveBox is a powerful open-source web archiving solution that offers:</p>
<ul class="simple">
<li><p>Multiple output formats (HTML, PDF, screenshots, WARC, etc.)</p></li>
<li><p>Comprehensive metadata</p></li>
<li><p>CLI + webadmin for browsing and managing archives</p></li>
<li><p>Support for various input sources (URLs, browser bookmarks, RSS feeds)</p></li>
<li><p>Self-hosted solution for long-term web content preservation</p></li>
</ul>
</section>
<section id="installation-steps">
<h2>Installation Steps<a class="headerlink" href="#installation-steps" title="Link to this heading"></a></h2>
<section id="install-mcp-server-webcrawl">
<h3>1. Install mcp-server-webcrawl<a class="headerlink" href="#install-mcp-server-webcrawl" title="Link to this heading"></a></h3>
<p>Open your terminal or command line and install the package:</p>
<div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="n">pip</span> <span class="n">install</span> <span class="n">mcp</span><span class="o">-</span><span class="n">server</span><span class="o">-</span><span class="n">webcrawl</span>
</pre></div>
</div>
<p>Verify installation was successful:</p>
<div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="n">mcp</span><span class="o">-</span><span class="n">server</span><span class="o">-</span><span class="n">webcrawl</span> <span class="o">--</span><span class="n">help</span>
</pre></div>
</div>
</section>
<section id="install-and-set-up-archivebox">
<h3>2. Install and Set Up ArchiveBox<a class="headerlink" href="#install-and-set-up-archivebox" title="Link to this heading"></a></h3>
<p>macOS/Linux only; Windows may work under Docker but is untested.</p>
<ol class="arabic">
<li><p>Install ArchiveBox (macOS/Linux):</p>
<div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="n">pip</span> <span class="n">install</span> <span class="n">archivebox</span>
</pre></div>
</div>
</li>
<li><p>macOS only: install wget using Homebrew (install Homebrew first if you do not already have it):</p>
<div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="n">brew</span> <span class="n">install</span> <span class="n">wget</span>
</pre></div>
</div>
</li>
<li><p>Create ArchiveBox collections. Unlike other crawlers that focus on single websites, ArchiveBox uses a collection-based approach where each collection can contain multiple URLs. You can create separate collections for different projects, or group related URLs together in a single collection:</p>
<div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="c1"># Create a directory structure for your collections</span>
<span class="n">mkdir</span> <span class="o">~/</span><span class="n">archivebox</span><span class="o">-</span><span class="n">data</span>
<span class="c1"># Create an "example" collection</span>
<span class="n">mkdir</span> <span class="o">~/</span><span class="n">archivebox</span><span class="o">-</span><span class="n">data</span><span class="o">/</span><span class="n">example</span>
<span class="n">cd</span> <span class="o">~/</span><span class="n">archivebox</span><span class="o">-</span><span class="n">data</span><span class="o">/</span><span class="n">example</span>
<span class="n">archivebox</span> <span class="n">init</span>
<span class="n">archivebox</span> <span class="n">add</span> <span class="n">https</span><span class="p">:</span><span class="o">//</span><span class="n">example</span><span class="o">.</span><span class="n">com</span>
<span class="c1"># Create a "pragmar" collection</span>
<span class="n">mkdir</span> <span class="o">~/</span><span class="n">archivebox</span><span class="o">-</span><span class="n">data</span><span class="o">/</span><span class="n">pragmar</span>
<span class="n">cd</span> <span class="o">~/</span><span class="n">archivebox</span><span class="o">-</span><span class="n">data</span><span class="o">/</span><span class="n">pragmar</span>
<span class="n">archivebox</span> <span class="n">init</span>
<span class="n">archivebox</span> <span class="n">add</span> <span class="n">https</span><span class="p">:</span><span class="o">//</span><span class="n">pragmar</span><span class="o">.</span><span class="n">com</span>
</pre></div>
</div>
</li>
<li><p>Each <code class="docutils literal notranslate"><span class="pre">archivebox</span> <span class="pre">init</span></code> creates a complete ArchiveBox instance with its own database and archive directory structure. The typical structure includes:</p>
<div class="highlight-default notranslate"><div class="highlight"><pre><span></span>collection-name/
├── archive/ # Archived content organized by timestamp
├── logs/ # ArchiveBox operation logs
├── sources/ # Source URL lists and metadata
└── index.sqlite3 # Database containing all metadata
</pre></div>
</div>
</li>
</ol>
</section>
<section id="configure-claude-desktop">
<h3>3. Configure Claude Desktop<a class="headerlink" href="#configure-claude-desktop" title="Link to this heading"></a></h3>
<ol class="arabic simple">
<li><p>Open Claude Desktop</p></li>
<li><p>Go to <strong>File → Settings → Developer → Edit Config</strong></p></li>
<li><p>Add the following configuration (modify paths as needed):</p></li>
</ol>
<div class="highlight-json notranslate"><div class="highlight"><pre><span></span><span class="p">{</span>
<span class="w"> </span><span class="nt">"mcpServers"</span><span class="p">:</span><span class="w"> </span><span class="p">{</span>
<span class="w"> </span><span class="nt">"webcrawl"</span><span class="p">:</span><span class="w"> </span><span class="p">{</span>
<span class="w"> </span><span class="nt">"command"</span><span class="p">:</span><span class="w"> </span><span class="s2">"/path/to/mcp-server-webcrawl"</span><span class="p">,</span>
<span class="w"> </span><span class="nt">"args"</span><span class="p">:</span><span class="w"> </span><span class="p">[</span><span class="s2">"--crawler"</span><span class="p">,</span><span class="w"> </span><span class="s2">"archivebox"</span><span class="p">,</span><span class="w"> </span><span class="s2">"--datasrc"</span><span class="p">,</span>
<span class="w"> </span><span class="s2">"/path/to/archivebox-data/"</span><span class="p">]</span>
<span class="w"> </span><span class="p">}</span>
<span class="w"> </span><span class="p">}</span>
<span class="p">}</span>
</pre></div>
</div>
<div class="admonition note">
<p class="admonition-title">Note</p>
<ul class="simple">
<li><p>On Windows, use <code class="docutils literal notranslate"><span class="pre">"mcp-server-webcrawl"</span></code> as the command</p></li>
<li><p>On macOS/Linux, use the absolute path (output of <code class="docutils literal notranslate"><span class="pre">which</span> <span class="pre">mcp-server-webcrawl</span></code>)</p></li>
<li><p>The datasrc path should point to the parent directory containing your ArchiveBox collections (e.g., <code class="docutils literal notranslate"><span class="pre">~/archivebox-data/</span></code>), not to individual collection directories</p></li>
<li><p>Each collection directory (example, pragmar, etc.) will appear as a separate “site” in MCP</p></li>
</ul>
</div>
<ol class="arabic simple" start="4">
<li><p>Save the file and <strong>completely exit</strong> Claude Desktop (not just close the window)</p></li>
<li><p>Restart Claude Desktop</p></li>
</ol>
</section>
<section id="verify-and-use">
<h3>4. Verify and Use<a class="headerlink" href="#verify-and-use" title="Link to this heading"></a></h3>
<ol class="arabic">
<li><p>In Claude Desktop, you should now see MCP tools available under Search and Tools</p></li>
<li><p>Ask Claude to list your archived sites:</p>
<div class="highlight-default notranslate"><div class="highlight"><pre><span></span>Can you list the crawled sites available?
</pre></div>
</div>
</li>
<li><p>Try searching content from your archives:</p>
<div class="highlight-default notranslate"><div class="highlight"><pre><span></span>Can you find information about [topic] on [archived site]?
</pre></div>
</div>
</li>
<li><p>Use the rich metadata for content discovery:</p>
<div class="highlight-default notranslate"><div class="highlight"><pre><span></span>Can you find all the archived pages related to [keyword] from [archive]?
</pre></div>
</div>
</li>
</ol>
</section>
</section>
<section id="troubleshooting">
<h2>Troubleshooting<a class="headerlink" href="#troubleshooting" title="Link to this heading"></a></h2>
<ul class="simple">
<li><p>If Claude doesn’t show MCP tools after restart, verify your configuration file is correctly formatted</p></li>
<li><p>Ensure Python and mcp-server-webcrawl are properly installed</p></li>
<li><p>Check that the datasrc path in the configuration is correct and points to the parent directory containing your ArchiveBox collections</p></li>
<li><p>Make sure ArchiveBox has successfully archived the websites and created the database</p></li>
<li><p>Verify that files exist in your archive/[timestamp] directories</p></li>
<li><p>Remember that the first time you use a function, Claude will ask for permission</p></li>
<li><p>For large archives, initial indexing may take some time during the first search</p></li>
</ul>
<p>ArchiveBox’s comprehensive archiving capabilities combined with mcp-server-webcrawl provide powerful tools for content preservation, research, and analysis across your archived web content.</p>
<p>For more details, including API documentation and other crawler options, visit the <a class="reference external" href="https://github.com/pragmar/mcp-server-webcrawl">mcp-server-webcrawl documentation</a>.</p>
</section>
</section>
</div>
</div>
<footer><div class="rst-footer-buttons" role="navigation" aria-label="Footer">
<a href="../guides.html" class="btn btn-neutral float-left" title="Setup Guides" accesskey="p" rel="prev"><span class="fa fa-arrow-circle-left" aria-hidden="true"></span> Previous</a>
<a href="httrack.html" class="btn btn-neutral float-right" title="HTTrack MCP Setup Guide" accesskey="n" rel="next">Next <span class="fa fa-arrow-circle-right" aria-hidden="true"></span></a>
</div>
<hr/>
<div role="contentinfo">
<p>© Copyright 2025, pragmar.</p>
</div>
Built with <a href="https://www.sphinx-doc.org/">Sphinx</a> using a
<a href="https://github.com/readthedocs/sphinx_rtd_theme">theme</a>
provided by <a href="https://readthedocs.org">Read the Docs</a>.
</footer>
</div>
</div>
</section>
</div>
<script>
jQuery(function () {
SphinxRtdTheme.Navigation.enable(true);
});
</script>
</body>
</html>
```