This is page 6 of 33. Use http://codebase.md/pragmar/mcp_server_webcrawl?lines=false&page={x} to view the full context.
# Directory Structure
```
├── .gitignore
├── CONTRIBUTING.md
├── docs
│ ├── _images
│ │ ├── interactive.document.webp
│ │ ├── interactive.search.webp
│ │ └── mcpswc.svg
│ ├── _modules
│ │ ├── index.html
│ │ ├── mcp_server_webcrawl
│ │ │ ├── crawlers
│ │ │ │ ├── archivebox
│ │ │ │ │ ├── adapter.html
│ │ │ │ │ ├── crawler.html
│ │ │ │ │ └── tests.html
│ │ │ │ ├── base
│ │ │ │ │ ├── adapter.html
│ │ │ │ │ ├── api.html
│ │ │ │ │ ├── crawler.html
│ │ │ │ │ ├── indexed.html
│ │ │ │ │ └── tests.html
│ │ │ │ ├── httrack
│ │ │ │ │ ├── adapter.html
│ │ │ │ │ ├── crawler.html
│ │ │ │ │ └── tests.html
│ │ │ │ ├── interrobot
│ │ │ │ │ ├── adapter.html
│ │ │ │ │ ├── crawler.html
│ │ │ │ │ └── tests.html
│ │ │ │ ├── katana
│ │ │ │ │ ├── adapter.html
│ │ │ │ │ ├── crawler.html
│ │ │ │ │ └── tests.html
│ │ │ │ ├── siteone
│ │ │ │ │ ├── adapter.html
│ │ │ │ │ ├── crawler.html
│ │ │ │ │ └── tests.html
│ │ │ │ ├── warc
│ │ │ │ │ ├── adapter.html
│ │ │ │ │ ├── crawler.html
│ │ │ │ │ └── tests.html
│ │ │ │ └── wget
│ │ │ │ ├── adapter.html
│ │ │ │ ├── crawler.html
│ │ │ │ └── tests.html
│ │ │ ├── crawlers.html
│ │ │ ├── extras
│ │ │ │ ├── markdown.html
│ │ │ │ ├── regex.html
│ │ │ │ ├── snippets.html
│ │ │ │ ├── thumbnails.html
│ │ │ │ └── xpath.html
│ │ │ ├── interactive
│ │ │ │ ├── highlights.html
│ │ │ │ ├── search.html
│ │ │ │ ├── session.html
│ │ │ │ └── ui.html
│ │ │ ├── main.html
│ │ │ ├── models
│ │ │ │ ├── resources.html
│ │ │ │ └── sites.html
│ │ │ ├── templates
│ │ │ │ └── tests.html
│ │ │ ├── utils
│ │ │ │ ├── blobs.html
│ │ │ │ ├── cli.html
│ │ │ │ ├── logger.html
│ │ │ │ ├── querycache.html
│ │ │ │ ├── server.html
│ │ │ │ └── tools.html
│ │ │ └── utils.html
│ │ └── re.html
│ ├── _sources
│ │ ├── guides
│ │ │ ├── archivebox.rst.txt
│ │ │ ├── httrack.rst.txt
│ │ │ ├── interrobot.rst.txt
│ │ │ ├── katana.rst.txt
│ │ │ ├── siteone.rst.txt
│ │ │ ├── warc.rst.txt
│ │ │ └── wget.rst.txt
│ │ ├── guides.rst.txt
│ │ ├── index.rst.txt
│ │ ├── installation.rst.txt
│ │ ├── interactive.rst.txt
│ │ ├── mcp_server_webcrawl.crawlers.archivebox.rst.txt
│ │ ├── mcp_server_webcrawl.crawlers.base.rst.txt
│ │ ├── mcp_server_webcrawl.crawlers.httrack.rst.txt
│ │ ├── mcp_server_webcrawl.crawlers.interrobot.rst.txt
│ │ ├── mcp_server_webcrawl.crawlers.katana.rst.txt
│ │ ├── mcp_server_webcrawl.crawlers.rst.txt
│ │ ├── mcp_server_webcrawl.crawlers.siteone.rst.txt
│ │ ├── mcp_server_webcrawl.crawlers.warc.rst.txt
│ │ ├── mcp_server_webcrawl.crawlers.wget.rst.txt
│ │ ├── mcp_server_webcrawl.extras.rst.txt
│ │ ├── mcp_server_webcrawl.interactive.rst.txt
│ │ ├── mcp_server_webcrawl.models.rst.txt
│ │ ├── mcp_server_webcrawl.rst.txt
│ │ ├── mcp_server_webcrawl.templates.rst.txt
│ │ ├── mcp_server_webcrawl.utils.rst.txt
│ │ ├── modules.rst.txt
│ │ ├── prompts.rst.txt
│ │ └── usage.rst.txt
│ ├── _static
│ │ ├── _sphinx_javascript_frameworks_compat.js
│ │ ├── basic.css
│ │ ├── css
│ │ │ ├── badge_only.css
│ │ │ ├── fonts
│ │ │ │ ├── fontawesome-webfont.eot
│ │ │ │ ├── fontawesome-webfont.svg
│ │ │ │ ├── fontawesome-webfont.ttf
│ │ │ │ ├── fontawesome-webfont.woff
│ │ │ │ ├── fontawesome-webfont.woff2
│ │ │ │ ├── lato-bold-italic.woff
│ │ │ │ ├── lato-bold-italic.woff2
│ │ │ │ ├── lato-bold.woff
│ │ │ │ ├── lato-bold.woff2
│ │ │ │ ├── lato-normal-italic.woff
│ │ │ │ ├── lato-normal-italic.woff2
│ │ │ │ ├── lato-normal.woff
│ │ │ │ ├── lato-normal.woff2
│ │ │ │ ├── Roboto-Slab-Bold.woff
│ │ │ │ ├── Roboto-Slab-Bold.woff2
│ │ │ │ ├── Roboto-Slab-Regular.woff
│ │ │ │ └── Roboto-Slab-Regular.woff2
│ │ │ └── theme.css
│ │ ├── doctools.js
│ │ ├── documentation_options.js
│ │ ├── file.png
│ │ ├── fonts
│ │ │ ├── Lato
│ │ │ │ ├── lato-bold.eot
│ │ │ │ ├── lato-bold.ttf
│ │ │ │ ├── lato-bold.woff
│ │ │ │ ├── lato-bold.woff2
│ │ │ │ ├── lato-bolditalic.eot
│ │ │ │ ├── lato-bolditalic.ttf
│ │ │ │ ├── lato-bolditalic.woff
│ │ │ │ ├── lato-bolditalic.woff2
│ │ │ │ ├── lato-italic.eot
│ │ │ │ ├── lato-italic.ttf
│ │ │ │ ├── lato-italic.woff
│ │ │ │ ├── lato-italic.woff2
│ │ │ │ ├── lato-regular.eot
│ │ │ │ ├── lato-regular.ttf
│ │ │ │ ├── lato-regular.woff
│ │ │ │ └── lato-regular.woff2
│ │ │ └── RobotoSlab
│ │ │ ├── roboto-slab-v7-bold.eot
│ │ │ ├── roboto-slab-v7-bold.ttf
│ │ │ ├── roboto-slab-v7-bold.woff
│ │ │ ├── roboto-slab-v7-bold.woff2
│ │ │ ├── roboto-slab-v7-regular.eot
│ │ │ ├── roboto-slab-v7-regular.ttf
│ │ │ ├── roboto-slab-v7-regular.woff
│ │ │ └── roboto-slab-v7-regular.woff2
│ │ ├── images
│ │ │ ├── interactive.document.png
│ │ │ ├── interactive.document.webp
│ │ │ ├── interactive.search.png
│ │ │ ├── interactive.search.webp
│ │ │ └── mcpswc.svg
│ │ ├── jquery.js
│ │ ├── js
│ │ │ ├── badge_only.js
│ │ │ ├── theme.js
│ │ │ └── versions.js
│ │ ├── language_data.js
│ │ ├── minus.png
│ │ ├── plus.png
│ │ ├── pygments.css
│ │ ├── searchtools.js
│ │ └── sphinx_highlight.js
│ ├── .buildinfo
│ ├── .nojekyll
│ ├── genindex.html
│ ├── guides
│ │ ├── archivebox.html
│ │ ├── httrack.html
│ │ ├── interrobot.html
│ │ ├── katana.html
│ │ ├── siteone.html
│ │ ├── warc.html
│ │ └── wget.html
│ ├── guides.html
│ ├── index.html
│ ├── installation.html
│ ├── interactive.html
│ ├── mcp_server_webcrawl.crawlers.archivebox.html
│ ├── mcp_server_webcrawl.crawlers.base.html
│ ├── mcp_server_webcrawl.crawlers.html
│ ├── mcp_server_webcrawl.crawlers.httrack.html
│ ├── mcp_server_webcrawl.crawlers.interrobot.html
│ ├── mcp_server_webcrawl.crawlers.katana.html
│ ├── mcp_server_webcrawl.crawlers.siteone.html
│ ├── mcp_server_webcrawl.crawlers.warc.html
│ ├── mcp_server_webcrawl.crawlers.wget.html
│ ├── mcp_server_webcrawl.extras.html
│ ├── mcp_server_webcrawl.html
│ ├── mcp_server_webcrawl.interactive.html
│ ├── mcp_server_webcrawl.models.html
│ ├── mcp_server_webcrawl.templates.html
│ ├── mcp_server_webcrawl.utils.html
│ ├── modules.html
│ ├── objects.inv
│ ├── prompts.html
│ ├── py-modindex.html
│ ├── search.html
│ ├── searchindex.js
│ └── usage.html
├── LICENSE
├── MANIFEST.in
├── prompts
│ ├── audit404.md
│ ├── auditfiles.md
│ ├── auditperf.md
│ ├── auditseo.md
│ ├── gopher.md
│ ├── README.md
│ └── testsearch.md
├── pyproject.toml
├── README.md
├── setup.py
├── sphinx
│ ├── _static
│ │ └── images
│ │ ├── interactive.document.png
│ │ ├── interactive.document.webp
│ │ ├── interactive.search.png
│ │ ├── interactive.search.webp
│ │ └── mcpswc.svg
│ ├── _templates
│ │ └── layout.html
│ ├── conf.py
│ ├── guides
│ │ ├── archivebox.rst
│ │ ├── httrack.rst
│ │ ├── interrobot.rst
│ │ ├── katana.rst
│ │ ├── siteone.rst
│ │ ├── warc.rst
│ │ └── wget.rst
│ ├── guides.rst
│ ├── index.rst
│ ├── installation.rst
│ ├── interactive.rst
│ ├── make.bat
│ ├── Makefile
│ ├── mcp_server_webcrawl.crawlers.archivebox.rst
│ ├── mcp_server_webcrawl.crawlers.base.rst
│ ├── mcp_server_webcrawl.crawlers.httrack.rst
│ ├── mcp_server_webcrawl.crawlers.interrobot.rst
│ ├── mcp_server_webcrawl.crawlers.katana.rst
│ ├── mcp_server_webcrawl.crawlers.rst
│ ├── mcp_server_webcrawl.crawlers.siteone.rst
│ ├── mcp_server_webcrawl.crawlers.warc.rst
│ ├── mcp_server_webcrawl.crawlers.wget.rst
│ ├── mcp_server_webcrawl.extras.rst
│ ├── mcp_server_webcrawl.interactive.rst
│ ├── mcp_server_webcrawl.models.rst
│ ├── mcp_server_webcrawl.rst
│ ├── mcp_server_webcrawl.templates.rst
│ ├── mcp_server_webcrawl.utils.rst
│ ├── modules.rst
│ ├── prompts.rst
│ ├── readme.txt
│ └── usage.rst
└── src
└── mcp_server_webcrawl
├── __init__.py
├── crawlers
│ ├── __init__.py
│ ├── archivebox
│ │ ├── __init__.py
│ │ ├── adapter.py
│ │ ├── crawler.py
│ │ └── tests.py
│ ├── base
│ │ ├── __init__.py
│ │ ├── adapter.py
│ │ ├── api.py
│ │ ├── crawler.py
│ │ ├── indexed.py
│ │ └── tests.py
│ ├── httrack
│ │ ├── __init__.py
│ │ ├── adapter.py
│ │ ├── crawler.py
│ │ └── tests.py
│ ├── interrobot
│ │ ├── __init__.py
│ │ ├── adapter.py
│ │ ├── crawler.py
│ │ └── tests.py
│ ├── katana
│ │ ├── __init__.py
│ │ ├── adapter.py
│ │ ├── crawler.py
│ │ └── tests.py
│ ├── siteone
│ │ ├── __init__.py
│ │ ├── adapter.py
│ │ ├── crawler.py
│ │ └── tests.py
│ ├── warc
│ │ ├── __init__.py
│ │ ├── adapter.py
│ │ ├── crawler.py
│ │ └── tests.py
│ └── wget
│ ├── __init__.py
│ ├── adapter.py
│ ├── crawler.py
│ └── tests.py
├── extras
│ ├── __init__.py
│ ├── markdown.py
│ ├── regex.py
│ ├── snippets.py
│ ├── thumbnails.py
│ └── xpath.py
├── interactive
│ ├── __init__.py
│ ├── highlights.py
│ ├── search.py
│ ├── session.py
│ ├── ui.py
│ └── views
│ ├── base.py
│ ├── document.py
│ ├── help.py
│ ├── requirements.py
│ ├── results.py
│ └── searchform.py
├── main.py
├── models
│ ├── __init__.py
│ ├── base.py
│ ├── resources.py
│ └── sites.py
├── settings.py
├── templates
│ ├── __init__.py
│ ├── markdown.xslt
│ ├── tests_core.html
│ └── tests.py
└── utils
├── __init__.py
├── cli.py
├── logger.py
├── parser.py
├── parsetab.py
├── search.py
├── server.py
├── tests.py
└── tools.py
```
# Files
--------------------------------------------------------------------------------
/src/mcp_server_webcrawl/utils/search.py:
--------------------------------------------------------------------------------
```python
from logging import Logger
from typing import Any
from mcp_server_webcrawl.utils.logger import get_logger
from mcp_server_webcrawl.utils.parser import SearchLexer, SearchParser, SearchSubquery
# url is technically fts but handled differently, uses LIKE; without type in
# fts field mode, the "A long chained OR should not return all results" fails
FTS5_MATCH_FIELDS: list[str] = ["type", "headers", "content"]
logger: Logger = get_logger()
class ParameterManager:
"""
Helper class to manage SQL parameter naming and counting.
"""
def __init__(self):
self.params: dict[str, str | int | float] = {}
self.counter: int = 0
def add_param(self, value: str | int | float) -> str:
"""
Add a parameter and return its name.
"""
assert isinstance(value, (str, int, float)), f"Parameter value must be str, int, or float."
param_name: str = f"query{self.counter}"
self.params[param_name] = value
self.counter += 1
return param_name
def get_params(self) -> dict[str, str | int | float]:
"""
Get all accumulated parameters.
"""
return self.params
class SearchQueryParser:
"""
Implementation of ply lexer to capture field-expanded boolean queries.
"""
def __init__(self):
self.lexer: SearchLexer = SearchLexer()
self.parser: SearchParser = SearchParser(self.lexer)
def get_fulltext_terms(self, query: str) -> list[str]:
"""
Extract fulltext search terms from a query string.
Returns list of search terms suitable for snippet extraction.
"""
parsed_query: list[SearchSubquery] = self.parse(query)
search_terms: list[str] = []
fulltext_fields: tuple[str | None, ...] = ("content", "headers", "fulltext", "", None)
# prepare for match, lowercase, and eliminate wildcards
for subquery in parsed_query:
if subquery.field in fulltext_fields:
term: str = str(subquery.value).lower().strip("*")
if term:
search_terms.append(term)
return search_terms
def parse(self, query_string: str) -> list[SearchSubquery]:
"""
Parse a query string into a list of SearchSubquery instances
"""
result: SearchSubquery | list[SearchSubquery] = self.parser.parser.parse(query_string, lexer=self.lexer.lexer)
if isinstance(result, SearchSubquery):
return [result]
elif isinstance(result, list) and all(isinstance(item, SearchSubquery) for item in result):
return result
else:
return []
def to_sqlite_fts(
self,
parsed_query: list[SearchSubquery],
swap_values: dict[str, dict[str, str | int]] = {}
) -> tuple[list[str], dict[str, str | int]]:
"""
Convert the parsed query to SQLite FTS5 compatible WHERE clause components.
Returns a tuple of (query_parts, params) where query_parts is a list of SQL
conditions and params is a dictionary of parameter values with named parameters.
"""
query_parts: list[str] = []
param_manager: ParameterManager = ParameterManager()
current_index: int = 0
while current_index < len(parsed_query):
subquery: SearchSubquery = parsed_query[current_index]
# fts vs pure sql is handled differently
if not subquery.field or subquery.field in FTS5_MATCH_FIELDS:
# check if previous subquery targeted this FTS field with NOT
previous_subquery: SearchSubquery | None = parsed_query[current_index - 1] if current_index > 0 else None
has_unary_not: bool = "NOT" in subquery.modifiers
has_binary_not: bool = previous_subquery and previous_subquery.operator == "NOT"
should_negate: bool = has_unary_not or has_binary_not
# group consecutive fulltext terms with their operators
fts_field_query: dict[str, str | int] = self.__build_fts_field_subquery(parsed_query, subquery.field, current_index, swap_values)
if fts_field_query["querystring"]:
param_name: str = param_manager.add_param(fts_field_query["querystring"])
field_name: str = "fulltext" if subquery.field is None else subquery.field
safe_sql_field: str = subquery.get_safe_sql_field(field_name)
# handle NOT with subquery to avoid JOIN issues
if should_negate:
# generate subquery exclusion pattern to avoid JOIN + NOT (MATCH) issues
sql_part: str = f"ResourcesFullText.Id NOT IN (SELECT Id FROM ResourcesFullText WHERE {safe_sql_field} MATCH :{param_name})"
else:
sql_part: str = f"{safe_sql_field} MATCH :{param_name}"
query_parts.append(sql_part)
current_index = fts_field_query["next_index"]
else:
# handle field searches
sql_part: str = ""
field: str = subquery.field
processed_value: str | int | float = self.__process_field_value(field, subquery.value, swap_values)
value_type: str = subquery.type
modifiers: list[str] = subquery.modifiers
# check if prior subquery targeted this with NOT
previous_subquery: SearchSubquery | None = parsed_query[current_index - 1] if current_index > 0 else None
# NOT modifier if present
if "NOT" in modifiers:
sql_part += "NOT "
elif previous_subquery and previous_subquery.operator == "NOT":
sql_part += "NOT "
safe_sql_field: str = subquery.get_safe_sql_field(field)
if field in self.parser.numeric_fields:
param_name: str = param_manager.add_param(processed_value)
sql_part += f"{safe_sql_field} {subquery.comparator} :{param_name}"
else:
# headers currently handled FTS5_MATCH_FIELDS handler
if field == "url":
# Use LIKE for certain field searches instead of MATCH, maximize the hits
# with %LIKE%. Think of https://example.com/logo.png?cache=20250112
# and a search of url: *.png and the 10s of ways broader match is better
# fit for intention
sql_part += f"{safe_sql_field} LIKE :"
trimmed_url: str = str(processed_value).strip("*\"'`")
param_name: str = param_manager.add_param(f"%{trimmed_url}%")
sql_part += param_name
elif value_type == "phrase":
formatted_term: str = self.__format_search_term(processed_value, value_type)
param_name: str = param_manager.add_param(formatted_term)
sql_part += f"{safe_sql_field} MATCH :{param_name}"
else:
# default fts query
param_name: str = param_manager.add_param(processed_value)
safe_sql_field: str = subquery.get_safe_sql_field("fulltext")
sql_part += f"{safe_sql_field} MATCH :{param_name}"
query_parts.append(sql_part)
current_index += 1
# add operator between clauses
if current_index < len(parsed_query):
# look at the previous subquery's operator to determine how to connect
previous_subquery: SearchSubquery | None = parsed_query[current_index - 1] if current_index > 0 else None
if previous_subquery and previous_subquery.operator:
# skip NOT - it will be handled by the next clause
# sqlite doesn't support interclause NOT, errors/0 results
# AND NOT is the way (FTS is different)
op: str = previous_subquery.operator if previous_subquery.operator != "NOT" else "AND"
else:
op: str = "AND" # default
query_parts.append(op)
return query_parts, param_manager.get_params()
def __build_fts_field_subquery(
self,
parsed_query: list[SearchSubquery],
field: str | None,
start_index: int,
swap_values: dict[str, dict[str, str | int]] = {}
) -> dict[str, str | int]:
"""
The rule is one MATCH per column for fts5, so multiple pure booleans are compressed
into thier own little querystring, attempting to preserve the Boolean intent of the
original SearchSubquery substructure. There are complexity limits here. Group IDs
preserve the parenthetical home of each SearchSubquery, None if not in parens.
"""
current_index: int = start_index
# this modifies subqueries in place, prevents fts conversion leaking
parsed_query: list[SearchSubquery] = self.__normalize_fts_match_operators(parsed_query)
# determine the condition for continuing the loop based on field type
def continue_sequencing(subquery_field: str | None) -> bool:
return subquery_field is None if field is None else subquery_field == field
# group consecutive, group is None unless parenthetical (A OR B)
groups: list[tuple[Any, list[tuple[str, str | None]]]] = []
current_group: list[tuple[str, str | None]] = []
current_group_id: Any = None
while current_index < len(parsed_query) and continue_sequencing(parsed_query[current_index].field):
subquery: SearchSubquery = parsed_query[current_index]
# new group
if subquery.group != current_group_id:
if current_group:
groups.append((current_group_id, current_group))
current_group = []
current_group_id = subquery.group
processed_value: str | int | float = self.__process_field_value(field, subquery.value, swap_values)
formatted_term: str = self.__format_search_term(processed_value, subquery.type, subquery.modifiers)
current_group.append((formatted_term, subquery.operator))
current_index += 1
# last group
if current_group:
groups.append((current_group_id, current_group))
# build query string with parentheses for grouped terms
query_parts: list[str] = []
for group_id, group_terms in groups:
if group_id is not None and len(group_terms) > 1:
# multiple terms in a group, add parentheses
group_str: str = ""
for i, (term, operator) in enumerate(group_terms):
group_str += term
if operator and i < len(group_terms) - 1:
group_str += f" {operator} "
query_parts.append(f"({group_str})")
else:
# single term or ungrouped, no parentheses
for i, (term, operator) in enumerate(group_terms):
query_parts.append(term)
if operator and i < len(group_terms) - 1:
query_parts.append(operator)
# add inter-group operator (from last term in previous group)
if groups.index((group_id, group_terms)) < len(groups) - 1:
last_term: tuple[str, str | None] = group_terms[-1]
if last_term[1]: # operator exists
query_parts.append(last_term[1])
querystring: str = " ".join(query_parts)
return {
"querystring": querystring,
"next_index": current_index
}
def __format_search_term(
self,
value: str | int | float,
value_type: str,
modifiers: list[str] | None = None
) -> str:
"""
Format a fulltext search term based on type and modifiers. This takes some
of the sharp edges of the secondary fts5 parser in conversion.
Args:
value: The search value
value_type: Type of value ('term', 'phrase', 'wildcard')
modifiers: List of modifiers (e.g., ['NOT'])
Returns:
Formatted search term string
"""
modifiers: list[str] = modifiers or []
value_string: str = str(value)
if value_type == "phrase":
return f'"{value_string}"'
elif value_type == "wildcard":
# for wildcards, only quote if contains hyphens/spaces require it
if "-" in value_string or " " in value_string:
return f'"{value_string}"*'
else:
return f"{value_string}*"
else:
# for terms like one-click etc.
# avoid confusing the secondary fts parser
# where hyphens in unquoted matches can be confused for
# fts negation (-term)
if '-' in value_string:
return f'"{value_string}"'
else:
return value_string
def __normalize_fts_match_operators(self, parsed_query: list[SearchSubquery]) -> list[SearchSubquery]:
"""
Clean up operators on fulltext sequences so they don't leak into interclause SQL
Why? ONE MATCH per column. the SearchSubquery sharing a fts field must be compressed
into a single MATCH. If the next clause does not share the same field as current, it
requires an operator set to None so as not to leak into the next field. Basically,
this firewalls boolean logic for combined fts subqueries. The flagship error of not
doing this is to have "this OR that OR there" return unfiltered or 0 results instead
of the appropriate number. (unfiltered in the case of a leaky OR status: >=100, which
instead of defining the result should limit it)
NOT operations must NEVER be cleared because they always represent
separate SQL exclusion clauses. Only clear AND/OR when they would cause leakage.
"""
for i in range(len(parsed_query) - 1):
current: SearchSubquery = parsed_query[i]
next_item: SearchSubquery = parsed_query[i + 1]
# never clear NOT operators, they need separate SQL clauses for exclusion
if current.operator == "NOT":
continue
# only clear AND/OR operators in transitions that would cause SQL leakage
# while preserving legitimate inter-clause boolean operations
# clear when transitioning from fulltext to non-FTS field
if (current.field is None and
next_item.field is not None and
next_item.field not in FTS5_MATCH_FIELDS):
current.operator = None
# clear when transitioning from FTS field to non-FTS field
elif (current.field in FTS5_MATCH_FIELDS and
next_item.field is not None and
next_item.field not in FTS5_MATCH_FIELDS):
current.operator = None
return parsed_query
def __process_field_value(
self,
field: str | None,
value_dict: dict[str, str] | str | int,
swap_values: dict[str, dict[str, str | int]] | None = None
) -> str | int | float:
"""
Process and validate a field value with type conversion and swapping.
Args:
field: The field name (or None for fulltext)
value_dict: Dictionary with 'value' and 'type' keys, or raw value
swap_values: Optional dictionary for value replacement
Returns:
Processed value (string, int, or float)
"""
if isinstance(value_dict, dict):
value: str | int = value_dict["value"]
else:
value: str | int = value_dict # raw value
if swap_values:
swap_key: str = field if field else ""
if swap_key in swap_values and value in swap_values[swap_key]:
value = swap_values[swap_key][value]
if field and field in self.parser.numeric_fields:
try:
return int(value)
except ValueError:
try:
return float(value)
except ValueError:
raise ValueError(f"Field {field} requires a numeric value, got: {value}")
return value
```
--------------------------------------------------------------------------------
/docs/mcp_server_webcrawl.utils.html:
--------------------------------------------------------------------------------
```html
<!DOCTYPE html>
<html class="writer-html5" lang="en" data-content_root="./">
<head>
<meta charset="utf-8" /><meta name="viewport" content="width=device-width, initial-scale=1" />
<meta name="viewport" content="width=device-width, initial-scale=1.0" />
<title>mcp_server_webcrawl.utils package — mcp-server-webcrawl documentation</title>
<link rel="stylesheet" type="text/css" href="_static/pygments.css?v=80d5e7a1" />
<link rel="stylesheet" type="text/css" href="_static/css/theme.css?v=e59714d7" />
<script src="_static/jquery.js?v=5d32c60e"></script>
<script src="_static/_sphinx_javascript_frameworks_compat.js?v=2cd50e6c"></script>
<script src="_static/documentation_options.js?v=5929fcd5"></script>
<script src="_static/doctools.js?v=888ff710"></script>
<script src="_static/sphinx_highlight.js?v=dc90522c"></script>
<script src="_static/js/theme.js"></script>
<link rel="index" title="Index" href="genindex.html" />
<link rel="search" title="Search" href="search.html" />
<link rel="prev" title="mcp_server_webcrawl.templates package" href="mcp_server_webcrawl.templates.html" />
</head>
<body class="wy-body-for-nav">
<div class="wy-grid-for-nav">
<nav data-toggle="wy-nav-shift" class="wy-nav-side">
<div class="wy-side-scroll">
<div class="wy-side-nav-search" >
<a href="index.html" class="icon icon-home">
mcp-server-webcrawl
</a>
<div role="search">
<form id="rtd-search-form" class="wy-form" action="search.html" method="get">
<input type="text" name="q" placeholder="Search docs" aria-label="Search docs" />
<input type="hidden" name="check_keywords" value="yes" />
<input type="hidden" name="area" value="default" />
</form>
</div>
</div><div class="wy-menu wy-menu-vertical" data-spy="affix" role="navigation" aria-label="Navigation menu">
<p class="caption" role="heading"><span class="caption-text">Contents:</span></p>
<ul class="current">
<li class="toctree-l1"><a class="reference internal" href="installation.html">Installation</a></li>
<li class="toctree-l1"><a class="reference internal" href="guides.html">Setup Guides</a></li>
<li class="toctree-l1"><a class="reference internal" href="usage.html">Usage</a></li>
<li class="toctree-l1"><a class="reference internal" href="prompts.html">Prompt Routines</a></li>
<li class="toctree-l1"><a class="reference internal" href="interactive.html">Interactive Mode</a></li>
<li class="toctree-l1 current"><a class="reference internal" href="modules.html">mcp_server_webcrawl</a><ul class="current">
<li class="toctree-l2 current"><a class="reference internal" href="mcp_server_webcrawl.html">mcp_server_webcrawl package</a></li>
</ul>
</li>
</ul>
</div>
</div>
</nav>
<section data-toggle="wy-nav-shift" class="wy-nav-content-wrap"><nav class="wy-nav-top" aria-label="Mobile navigation menu" >
<i data-toggle="wy-nav-top" class="fa fa-bars"></i>
<a href="index.html">mcp-server-webcrawl</a>
</nav>
<div class="wy-nav-content">
<div class="rst-content">
<div role="navigation" aria-label="Page navigation">
<ul class="wy-breadcrumbs">
<li><a href="index.html" class="icon icon-home" aria-label="Home"></a></li>
<li class="breadcrumb-item"><a href="modules.html">mcp_server_webcrawl</a></li>
<li class="breadcrumb-item"><a href="mcp_server_webcrawl.html">mcp_server_webcrawl package</a></li>
<li class="breadcrumb-item active">mcp_server_webcrawl.utils package</li>
<li class="wy-breadcrumbs-aside">
<a href="_sources/mcp_server_webcrawl.utils.rst.txt" rel="nofollow"> View page source</a>
</li>
</ul>
<hr/>
</div>
<div role="main" class="document" itemscope="itemscope" itemtype="http://schema.org/Article">
<div itemprop="articleBody">
<section id="mcp-server-webcrawl-utils-package">
<h1>mcp_server_webcrawl.utils package<a class="headerlink" href="#mcp-server-webcrawl-utils-package" title="Link to this heading"></a></h1>
<section id="submodules">
<h2>Submodules<a class="headerlink" href="#submodules" title="Link to this heading"></a></h2>
</section>
<section id="module-mcp_server_webcrawl.utils.cli">
<span id="mcp-server-webcrawl-utils-cli-module"></span><h2>mcp_server_webcrawl.utils.cli module<a class="headerlink" href="#module-mcp_server_webcrawl.utils.cli" title="Link to this heading"></a></h2>
<dl class="py function">
<dt class="sig sig-object py" id="mcp_server_webcrawl.utils.cli.get_help_short_message">
<span class="sig-name descname"><span class="pre">get_help_short_message</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">version</span></span></em><span class="sig-paren">)</span><a class="reference internal" href="_modules/mcp_server_webcrawl/utils/cli.html#get_help_short_message"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#mcp_server_webcrawl.utils.cli.get_help_short_message" title="Link to this definition"></a></dt>
<dd><dl class="field-list simple">
<dt class="field-odd">Parameters<span class="colon">:</span></dt>
<dd class="field-odd"><p><strong>version</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.14)"><em>str</em></a>) – </p>
</dd>
<dt class="field-even">Return type<span class="colon">:</span></dt>
<dd class="field-even"><p><a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.14)">str</a></p>
</dd>
</dl>
</dd></dl>
<dl class="py function">
<dt class="sig sig-object py" id="mcp_server_webcrawl.utils.cli.get_help_long_message">
<span class="sig-name descname"><span class="pre">get_help_long_message</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">version</span></span></em><span class="sig-paren">)</span><a class="reference internal" href="_modules/mcp_server_webcrawl/utils/cli.html#get_help_long_message"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#mcp_server_webcrawl.utils.cli.get_help_long_message" title="Link to this definition"></a></dt>
<dd><dl class="field-list simple">
<dt class="field-odd">Parameters<span class="colon">:</span></dt>
<dd class="field-odd"><p><strong>version</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.14)"><em>str</em></a>) – </p>
</dd>
<dt class="field-even">Return type<span class="colon">:</span></dt>
<dd class="field-even"><p><a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.14)">str</a></p>
</dd>
</dl>
</dd></dl>
</section>
<section id="module-mcp_server_webcrawl.utils.logger">
<span id="mcp-server-webcrawl-utils-logger-module"></span><h2>mcp_server_webcrawl.utils.logger module<a class="headerlink" href="#module-mcp_server_webcrawl.utils.logger" title="Link to this heading"></a></h2>
<dl class="py function">
<dt class="sig sig-object py" id="mcp_server_webcrawl.utils.logger.get_logger_configuration">
<span class="sig-name descname"><span class="pre">get_logger_configuration</span></span><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="reference internal" href="_modules/mcp_server_webcrawl/utils/logger.html#get_logger_configuration"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#mcp_server_webcrawl.utils.logger.get_logger_configuration" title="Link to this definition"></a></dt>
<dd><p>Get log name, path, and level (in that order)</p>
<dl class="field-list simple">
<dt class="field-odd">Returns<span class="colon">:</span></dt>
<dd class="field-odd"><p>A tuple containing name, path, and level</p>
</dd>
<dt class="field-even">Return type<span class="colon">:</span></dt>
<dd class="field-even"><p><a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#tuple" title="(in Python v3.14)">tuple</a>[<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.14)">str</a>, Path, <a class="reference external" href="https://docs.python.org/3/library/functions.html#int" title="(in Python v3.14)">int</a>]</p>
</dd>
</dl>
</dd></dl>
<dl class="py function">
<dt class="sig sig-object py" id="mcp_server_webcrawl.utils.logger.get_logger">
<span class="sig-name descname"><span class="pre">get_logger</span></span><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="reference internal" href="_modules/mcp_server_webcrawl/utils/logger.html#get_logger"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#mcp_server_webcrawl.utils.logger.get_logger" title="Link to this definition"></a></dt>
<dd><p>Get logger, usually in order to write to it</p>
<dl class="field-list simple">
<dt class="field-odd">Returns<span class="colon">:</span></dt>
<dd class="field-odd"><p>a writable logging object (error/warn/info/debug)</p>
</dd>
<dt class="field-even">Return type<span class="colon">:</span></dt>
<dd class="field-even"><p>Logger</p>
</dd>
</dl>
</dd></dl>
<dl class="py function">
<dt class="sig sig-object py" id="mcp_server_webcrawl.utils.logger.initialize_logger">
<span class="sig-name descname"><span class="pre">initialize_logger</span></span><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="reference internal" href="_modules/mcp_server_webcrawl/utils/logger.html#initialize_logger"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#mcp_server_webcrawl.utils.logger.initialize_logger" title="Link to this definition"></a></dt>
<dd><p>Validate and set up logger for writing</p>
<dl class="field-list simple">
<dt class="field-odd">Returns<span class="colon">:</span></dt>
<dd class="field-odd"><p>None</p>
</dd>
<dt class="field-even">Return type<span class="colon">:</span></dt>
<dd class="field-even"><p>None</p>
</dd>
</dl>
</dd></dl>
</section>
<section id="module-mcp_server_webcrawl.utils.server">
<span id="mcp-server-webcrawl-utils-server-module"></span><h2>mcp_server_webcrawl.utils.server module<a class="headerlink" href="#module-mcp_server_webcrawl.utils.server" title="Link to this heading"></a></h2>
<dl class="py function">
<dt class="sig sig-object py" id="mcp_server_webcrawl.utils.server.initialize_mcp_server">
<span class="sig-name descname"><span class="pre">initialize_mcp_server</span></span><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="reference internal" href="_modules/mcp_server_webcrawl/utils/server.html#initialize_mcp_server"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#mcp_server_webcrawl.utils.server.initialize_mcp_server" title="Link to this definition"></a></dt>
<dd><p>MCP stdio streams require utf-8 explicitly set for Windows (default cp1252)
or internationalized content will fail.</p>
<dl class="field-list simple">
<dt class="field-odd">Return type<span class="colon">:</span></dt>
<dd class="field-odd"><p>None</p>
</dd>
</dl>
</dd></dl>
</section>
<section id="module-mcp_server_webcrawl.utils.tools">
<span id="mcp-server-webcrawl-utils-tools-module"></span><h2>mcp_server_webcrawl.utils.tools module<a class="headerlink" href="#module-mcp_server_webcrawl.utils.tools" title="Link to this heading"></a></h2>
<dl class="py function">
<dt class="sig sig-object py" id="mcp_server_webcrawl.utils.tools.get_crawler_tools">
<span class="sig-name descname"><span class="pre">get_crawler_tools</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">sites</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">None</span></span></em><span class="sig-paren">)</span><a class="reference internal" href="_modules/mcp_server_webcrawl/utils/tools.html#get_crawler_tools"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#mcp_server_webcrawl.utils.tools.get_crawler_tools" title="Link to this definition"></a></dt>
<dd><p>Generate crawler tools based on available sites.</p>
<dl class="field-list simple">
<dt class="field-odd">Parameters<span class="colon">:</span></dt>
<dd class="field-odd"><p><strong>sites</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#list" title="(in Python v3.14)"><em>list</em></a><em>[</em><a class="reference internal" href="mcp_server_webcrawl.models.html#mcp_server_webcrawl.models.sites.SiteResult" title="mcp_server_webcrawl.models.sites.SiteResult"><em>SiteResult</em></a><em>] </em><em>| </em><em>None</em>) – optional list of site results to include in tool descriptions</p>
</dd>
<dt class="field-even">Returns<span class="colon">:</span></dt>
<dd class="field-even"><p>List of Tool objects for sites and resources</p>
</dd>
</dl>
</dd></dl>
</section>
<section id="module-mcp_server_webcrawl.utils">
<span id="module-contents"></span><h2>Module contents<a class="headerlink" href="#module-mcp_server_webcrawl.utils" title="Link to this heading"></a></h2>
<dl class="py function">
<dt class="sig sig-object py" id="mcp_server_webcrawl.utils.to_isoformat_zulu">
<span class="sig-name descname"><span class="pre">to_isoformat_zulu</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">dt</span></span></em><span class="sig-paren">)</span><a class="reference internal" href="_modules/mcp_server_webcrawl/utils.html#to_isoformat_zulu"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#mcp_server_webcrawl.utils.to_isoformat_zulu" title="Link to this definition"></a></dt>
<dd><p>Convert datetime to iso Z.</p>
<p>python<=3.10 struggles with Z and fractions of seconds, will
throw. smooth out the iso string, second precision isn’t key here</p>
<dl class="field-list simple">
<dt class="field-odd">Parameters<span class="colon">:</span></dt>
<dd class="field-odd"><p><strong>dt</strong> (<a class="reference external" href="https://docs.python.org/3/library/datetime.html#datetime.datetime" title="(in Python v3.14)"><em>datetime</em></a>) – </p>
</dd>
</dl>
</dd></dl>
<dl class="py function">
<dt class="sig sig-object py" id="mcp_server_webcrawl.utils.from_isoformat_zulu">
<span class="sig-name descname"><span class="pre">from_isoformat_zulu</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">dt_string</span></span></em><span class="sig-paren">)</span><a class="reference internal" href="_modules/mcp_server_webcrawl/utils.html#from_isoformat_zulu"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#mcp_server_webcrawl.utils.from_isoformat_zulu" title="Link to this definition"></a></dt>
<dd><p>Convert ISO string to datetime.</p>
<p>python<=3.10 struggles with Z and fractions of seconds, will
throw. smooth out the iso string, second precision isn’t key here</p>
<dl class="field-list simple">
<dt class="field-odd">Parameters<span class="colon">:</span></dt>
<dd class="field-odd"><p><strong>dt_string</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.14)"><em>str</em></a><em> | </em><em>None</em>) – </p>
</dd>
<dt class="field-even">Return type<span class="colon">:</span></dt>
<dd class="field-even"><p><a class="reference external" href="https://docs.python.org/3/library/datetime.html#datetime.datetime" title="(in Python v3.14)"><em>datetime</em></a></p>
</dd>
</dl>
</dd></dl>
</section>
</section>
</div>
</div>
<footer><div class="rst-footer-buttons" role="navigation" aria-label="Footer">
<a href="mcp_server_webcrawl.templates.html" class="btn btn-neutral float-left" title="mcp_server_webcrawl.templates package" accesskey="p" rel="prev"><span class="fa fa-arrow-circle-left" aria-hidden="true"></span> Previous</a>
</div>
<hr/>
<div role="contentinfo">
<p>© Copyright 2025, pragmar.</p>
</div>
Built with <a href="https://www.sphinx-doc.org/">Sphinx</a> using a
<a href="https://github.com/readthedocs/sphinx_rtd_theme">theme</a>
provided by <a href="https://readthedocs.org">Read the Docs</a>.
</footer>
</div>
</div>
</section>
</div>
<script>
jQuery(function () {
SphinxRtdTheme.Navigation.enable(true);
});
</script>
</body>
</html>
```
--------------------------------------------------------------------------------
/src/mcp_server_webcrawl/interactive/views/document.py:
--------------------------------------------------------------------------------
```python
import curses
import textwrap
from dataclasses import dataclass
from typing import TYPE_CHECKING, Optional
from mcp_server_webcrawl.interactive.ui import DocumentMode, ThemeDefinition, ViewBounds
from mcp_server_webcrawl.interactive.views.base import BaseCursesView, CONTENT_MARGIN
from mcp_server_webcrawl.interactive.highlights import HighlightProcessor, HighlightSpan
from mcp_server_webcrawl.models.resources import ResourceResult
from mcp_server_webcrawl.interactive.ui import safe_addstr
if TYPE_CHECKING:
from mcp_server_webcrawl.interactive.session import InteractiveSession
# TAB cycles the document display mode in this fixed order:
# MARKDOWN -> RAW -> HEADERS -> (back to MARKDOWN)
DOCUMENT_MODE_NEXT: dict[DocumentMode, DocumentMode] = {
    DocumentMode.MARKDOWN: DocumentMode.RAW,
    DocumentMode.RAW: DocumentMode.HEADERS,
    DocumentMode.HEADERS: DocumentMode.MARKDOWN
}
@dataclass
class DocumentLineData:
    """
    Container for processed document line data with highlights.
    """
    # the line text as taken from the source document, unprocessed
    original_line: str
    # presumably the markup-stripped form of the line used for term matching
    # — TODO confirm against the producer of this dataclass
    clean_text: str
    # search-term highlight spans located within this line
    highlights: list[HighlightSpan]
class SearchDocumentView(BaseCursesView):
    """
    Document viewer with markdown/raw/headers modes, scrolling support, and search highlighting.
    """

    def __init__(self, session: 'InteractiveSession'):
        """
        Initialize the document view.
        Args:
            session: The interactive session instance
        """
        super().__init__(session)
        self.__document: Optional[ResourceResult] = None
        self.__scroll_offset: int = 0
        self.__document_mode: DocumentMode = DocumentMode.MARKDOWN
        # (content lines, mode, query) cache avoids re-wrapping text on every render
        self.__cached_content_lines: Optional[list[str]] = None
        self.__cached_mode: Optional[DocumentMode] = None
        self.__cached_query: Optional[str] = None
        self.__search_terms: list[str] = []

    @property
    def document_mode(self) -> DocumentMode:
        return self.__document_mode

    @property
    def scroll_offset(self) -> int:
        return self.__scroll_offset

    @property
    def url(self) -> str:
        return self.__document.url if self.__document else ""

    def clear(self) -> None:
        """
        Clear the document.
        """
        self.__document = None
        self.__scroll_offset = 0
        self.__invalidate_cache()

    def draw_inner_footer(self, stdscr: curses.window, bounds: ViewBounds, text: str) -> None:
        """
        Draw document footer with scroll position and mode switcher.
        Args:
            stdscr: The curses window to draw on
            bounds: The view bounds defining the drawing area
            text: The footer text to display
        """
        if not self.__document:
            super().draw_inner_footer(stdscr, bounds, text)
            return
        style: int = self._get_inner_header_style()
        footer_y: int = bounds.y + bounds.height - 1
        terminal_height: int
        terminal_height, _ = stdscr.getmaxyx()
        if footer_y >= terminal_height:
            return
        content_lines: list[str] = self.__get_content_lines()
        content_height: int = max(0, bounds.height - 4)
        total_lines: int = len(content_lines)
        showing_start: int = self.__scroll_offset + 1
        showing_end: int = min(total_lines, self.__scroll_offset + content_height)
        left_info: str = f"Viewing lines {showing_start}-{showing_end} of {total_lines}"
        modes: list[tuple[str, DocumentMode]] = [
            (" MD ", DocumentMode.MARKDOWN),
            (" RAW ", DocumentMode.RAW),
            (" HDR ", DocumentMode.HEADERS)
        ]
        mode_buttons_width: int = sum(len(mode_name) for mode_name, _ in modes)
        mode_start_x: int = bounds.width - mode_buttons_width - 1
        document_mode_style: int = self.session.get_theme_color_pair(ThemeDefinition.DOCUMENT_MODE)
        safe_addstr(stdscr, footer_y, 0, self._get_bounded_line(), style)
        safe_addstr(stdscr, footer_y, 1, left_info, style)
        # only draw the mode buttons when they will not collide with left_info
        if mode_start_x > len(left_info) + 3:
            current_x: int = mode_start_x
            for mode_name, mode_enum in modes:
                is_current: bool = self.__document_mode == mode_enum
                mode_style: int = document_mode_style if is_current else style
                if current_x + len(mode_name) <= bounds.width:
                    safe_addstr(stdscr, footer_y, current_x, mode_name, mode_style)
                current_x += len(mode_name)

    def handle_input(self, key: int) -> bool:
        """
        Handle document navigation input.
        Args:
            key: The curses key code from user input
        Returns:
            bool: True if the input was handled, False otherwise
        """
        if not self._focused or not self.__document:
            return False
        handlers: dict[int, callable] = {
            curses.KEY_UP: self.__scroll_up,
            curses.KEY_DOWN: self.__scroll_down,
            curses.KEY_LEFT: self.__jump_to_previous_highlight,
            curses.KEY_RIGHT: self.__jump_to_next_highlight,
            curses.KEY_PPAGE: lambda: self.__scroll_page_up(max(1, self.bounds.height - 4)),
            curses.KEY_NPAGE: lambda: self.__scroll_page_down(max(1, self.bounds.height - 4)),
            curses.KEY_HOME: self.__scroll_to_top,
            curses.KEY_END: self.__scroll_to_bottom,
            ord('\t'): self.__cycle_mode,
        }
        handler = handlers.get(key)
        if handler:
            handler()
            return True
        return False

    def render(self, stdscr: curses.window) -> None:
        """
        Render the document view within bounds with search highlighting.
        Args:
            stdscr: The curses window to draw on
        """
        if not self._renderable(stdscr):
            return
        if not self.__document:
            self.__render_no_document(stdscr)
            return
        yb: int = self.bounds.y
        y_current: int = yb + 2
        y_max: int = yb + self.bounds.height
        content_height: int = max(0, self.bounds.height - 4)
        content_width: int = self.bounds.width - 4
        content_lines: list[str] = self.__get_content_lines()
        visible_lines: list[str] = content_lines[self.__scroll_offset: self.__scroll_offset + content_height]
        self.__update_search_terms()
        # NOTE(review): content is drawn at absolute column 2 rather than
        # bounds.x + 2; confirm bounds.x is always 0 for this view
        for i, line in enumerate(visible_lines):
            line_y: int = y_current + i
            # bugfix: clip against the bottom of the view (bounds.y + height);
            # the previous check against the bare height value wrongly
            # truncated visible content whenever bounds.y > 0
            if line_y >= y_max:
                break
            if self.__search_terms and line.strip():
                self.__render_line_with_highlights(stdscr, line, line_y, 2, content_width)
            else:
                display_line: str = line[:content_width] if len(line) > content_width else line
                safe_addstr(stdscr, line_y, 2, display_line)

    def update(self, document: ResourceResult) -> None:
        """
        Update the document and reset scroll position.
        Args:
            document: The resource result document to display
        """
        self.__document = document
        self.__scroll_offset = 0
        self.__invalidate_cache()

    def __calculate_max_scroll(self) -> int:
        """
        Calculate maximum scroll offset based on content and view size.
        Returns:
            int: The maximum scroll offset value
        """
        if not self.__document:
            return 0
        content_lines: list[str] = self.__get_content_lines()
        content_height: int = max(0, self.bounds.height - 4)
        return max(0, len(content_lines) - content_height)

    def __cycle_mode(self) -> None:
        """
        Cycle to the next document mode.
        """
        self.__document_mode = DOCUMENT_MODE_NEXT.get(
            self.__document_mode,
            DocumentMode.MARKDOWN
        )
        self.__scroll_offset = 0
        self.__invalidate_cache()

    def __get_content_lines(self) -> list[str]:
        """
        Get content lines based on current mode with caching.
        Returns:
            list[str]: The content lines for the current document mode
        """
        current_query: str = self.session.searchform.query if hasattr(self.session, 'searchform') else ""
        # cache is valid only while mode and query are both unchanged
        if (self.__cached_content_lines is not None and
                self.__cached_mode == self.__document_mode and
                self.__cached_query == current_query):
            return self.__cached_content_lines
        if not self.__document:
            return []
        content_lines: list[str]
        if self.__document_mode == DocumentMode.MARKDOWN:
            content_lines = self.__get_markdown_lines()
        elif self.__document_mode == DocumentMode.RAW:
            content_lines = self.__get_raw_lines()
        elif self.__document_mode == DocumentMode.HEADERS:
            content_lines = self.__get_header_lines()
        else:
            content_lines = ["Unknown document mode"]
        self.__cached_content_lines = content_lines
        self.__cached_mode = self.__document_mode
        self.__cached_query = current_query
        return content_lines

    def __get_header_lines(self) -> list[str]:
        """
        Get headers with proper wrapping.
        Returns:
            list[str]: The wrapped header lines
        """
        if not self.__document.headers:
            return ["No headers available for this resource."]
        return self.__wrap_text_content(self.__document.headers)

    def __get_markdown_lines(self) -> list[str]:
        """
        Get markdown content with proper wrapping.
        Returns:
            list[str]: The wrapped markdown content lines
        """
        raw_markdown: str = self.__document.get_extra("markdown")
        if not raw_markdown:
            return ["", "Markdown unavailable for this resource."]
        return self.__wrap_text_content(raw_markdown)

    def __get_raw_lines(self) -> list[str]:
        """
        Get raw content with proper wrapping.
        Returns:
            list[str]: The wrapped raw content lines
        """
        if not self.__document.content:
            return ["No raw content available for this resource."]
        return self.__wrap_text_content(self.__document.content.strip())

    def __invalidate_cache(self) -> None:
        """
        Invalidate cached content lines.
        """
        self.__cached_content_lines = None
        self.__cached_mode = None
        self.__cached_query = None

    def __jump_to_next_highlight(self) -> None:
        """
        Jump to next highlight, positioning it at line 5 of screen.
        """
        if not self.__search_terms:
            return
        content_lines: list[str] = self.__get_content_lines()
        current_line: int = self.__scroll_offset + 3
        # search forward from the current position...
        for line_num in range(current_line + 1, len(content_lines)):
            highlights: list[HighlightSpan] = HighlightProcessor.find_highlights_in_text(
                content_lines[line_num],
                self.__search_terms
            )
            if highlights:
                self.__scroll_offset = max(0, line_num - 3)
                return
        # ...then wrap around to the top of the document
        for line_num in range(0, current_line + 1):
            highlights: list[HighlightSpan] = HighlightProcessor.find_highlights_in_text(
                content_lines[line_num],
                self.__search_terms
            )
            if highlights:
                self.__scroll_offset = max(0, line_num - 3)
                return

    def __jump_to_previous_highlight(self) -> None:
        """
        Jump to previous highlight, positioning it at line 5 of screen.
        """
        if not self.__search_terms:
            return
        content_lines: list[str] = self.__get_content_lines()
        current_line: int = self.__scroll_offset + 3
        # search backward from the current position...
        for line_num in range(current_line - 1, -1, -1):
            highlights: list[HighlightSpan] = HighlightProcessor.find_highlights_in_text(
                content_lines[line_num],
                self.__search_terms
            )
            if highlights:
                self.__scroll_offset = max(0, line_num - 3)
                return
        # ...then wrap around to the bottom of the document
        for line_num in range(len(content_lines) - 1, current_line - 1, -1):
            highlights: list[HighlightSpan] = HighlightProcessor.find_highlights_in_text(
                content_lines[line_num],
                self.__search_terms
            )
            if highlights:
                self.__scroll_offset = max(0, line_num - 3)
                return

    def __render_line_with_highlights(self, stdscr: curses.window, line: str, y: int, x: int, max_width: int) -> None:
        """
        Render a line with search term highlighting using the shared utility.
        Args:
            stdscr: The curses window to draw on
            line: The text line to render
            y: Y position to render at
            x: X position to render at
            max_width: Maximum width for rendering
        """
        if not line.strip():
            return
        highlights: list[HighlightSpan] = HighlightProcessor.find_highlights_in_text(line, self.__search_terms)
        normal_style: int = curses.A_NORMAL
        highlight_style: int = self.session.get_theme_color_pair(ThemeDefinition.SNIPPET_HIGHLIGHT)
        HighlightProcessor.render_text_with_highlights(
            stdscr, line, highlights, x, y, max_width, normal_style, highlight_style
        )

    def __render_no_document(self, stdscr: curses.window) -> None:
        """
        Render message when no document is loaded.
        Args:
            stdscr: The curses window to draw on
        """
        x: int = self.bounds.x
        y: int = self.bounds.y
        width: int = self.bounds.width
        height: int = self.bounds.height
        if height > 2 and width > 20:
            safe_addstr(stdscr, y + 2, x + 2, "No document loaded.", curses.A_DIM)

    def __scroll_down(self, lines: int = 1) -> None:
        """
        Scroll down by specified number of lines.
        Args:
            lines: Number of lines to scroll down
        """
        max_scroll: int = self.__calculate_max_scroll()
        self.__scroll_offset = min(max_scroll, self.__scroll_offset + lines)

    def __scroll_page_down(self, page_size: int = 10) -> None:
        """
        Scroll down by page.
        Args:
            page_size: Number of lines to scroll for a page
        """
        self.__scroll_down(page_size)

    def __scroll_page_up(self, page_size: int = 10) -> None:
        """
        Scroll up by page.
        Args:
            page_size: Number of lines to scroll for a page
        """
        self.__scroll_up(page_size)

    def __scroll_to_bottom(self) -> None:
        """
        Scroll to bottom of document.
        """
        self.__scroll_offset = self.__calculate_max_scroll()

    def __scroll_to_top(self) -> None:
        """
        Scroll to top of document.
        """
        self.__scroll_offset = 0

    def __scroll_up(self, lines: int = 1) -> None:
        """
        Scroll up by specified number of lines.
        Args:
            lines: Number of lines to scroll up
        """
        self.__scroll_offset = max(0, self.__scroll_offset - lines)

    def __update_search_terms(self) -> None:
        """
        Update search terms from current search form query using shared utility.
        """
        if hasattr(self.session, 'searchform') and self.session.searchform:
            query: str = self.session.searchform.query
            self.__search_terms = HighlightProcessor.extract_search_terms(query)
        else:
            self.__search_terms = []

    def __wrap_text_content(self, raw_text: str) -> list[str]:
        """
        Wrap text content for display with proper line handling.
        Args:
            raw_text: The raw text content to wrap
        Returns:
            list[str]: The wrapped text lines
        """
        if not raw_text:
            return []
        content_width: int = max(20, self.bounds.width - CONTENT_MARGIN)
        wrapped_lines: list[str] = []
        text_lines: list[str] = raw_text.split("\n")
        for line in text_lines:
            if not line.strip():
                # preserve blank lines as paragraph separators
                wrapped_lines.append("")
            else:
                wrapped: str = textwrap.fill(
                    line.rstrip(),
                    width=content_width,
                    expand_tabs=True,
                    replace_whitespace=False,
                    break_long_words=True,
                    break_on_hyphens=True
                )
                wrapped_lines.extend(wrapped.split("\n"))
        return wrapped_lines
```
--------------------------------------------------------------------------------
/docs/mcp_server_webcrawl.crawlers.html:
--------------------------------------------------------------------------------
```html
<!DOCTYPE html>
<html class="writer-html5" lang="en" data-content_root="./">
<head>
<meta charset="utf-8" /><meta name="viewport" content="width=device-width, initial-scale=1" />
<meta name="viewport" content="width=device-width, initial-scale=1.0" />
<title>mcp_server_webcrawl.crawlers package — mcp-server-webcrawl documentation</title>
<link rel="stylesheet" type="text/css" href="_static/pygments.css?v=80d5e7a1" />
<link rel="stylesheet" type="text/css" href="_static/css/theme.css?v=e59714d7" />
<script src="_static/jquery.js?v=5d32c60e"></script>
<script src="_static/_sphinx_javascript_frameworks_compat.js?v=2cd50e6c"></script>
<script src="_static/documentation_options.js?v=5929fcd5"></script>
<script src="_static/doctools.js?v=888ff710"></script>
<script src="_static/sphinx_highlight.js?v=dc90522c"></script>
<script src="_static/js/theme.js"></script>
<link rel="index" title="Index" href="genindex.html" />
<link rel="search" title="Search" href="search.html" />
<link rel="next" title="mcp_server_webcrawl.crawlers.base package" href="mcp_server_webcrawl.crawlers.base.html" />
<link rel="prev" title="mcp_server_webcrawl package" href="mcp_server_webcrawl.html" />
</head>
<body class="wy-body-for-nav">
<div class="wy-grid-for-nav">
<nav data-toggle="wy-nav-shift" class="wy-nav-side">
<div class="wy-side-scroll">
<div class="wy-side-nav-search" >
<a href="index.html" class="icon icon-home">
mcp-server-webcrawl
</a>
<div role="search">
<form id="rtd-search-form" class="wy-form" action="search.html" method="get">
<input type="text" name="q" placeholder="Search docs" aria-label="Search docs" />
<input type="hidden" name="check_keywords" value="yes" />
<input type="hidden" name="area" value="default" />
</form>
</div>
</div><div class="wy-menu wy-menu-vertical" data-spy="affix" role="navigation" aria-label="Navigation menu">
<p class="caption" role="heading"><span class="caption-text">Contents:</span></p>
<ul class="current">
<li class="toctree-l1"><a class="reference internal" href="installation.html">Installation</a></li>
<li class="toctree-l1"><a class="reference internal" href="guides.html">Setup Guides</a></li>
<li class="toctree-l1"><a class="reference internal" href="usage.html">Usage</a></li>
<li class="toctree-l1"><a class="reference internal" href="prompts.html">Prompt Routines</a></li>
<li class="toctree-l1"><a class="reference internal" href="interactive.html">Interactive Mode</a></li>
<li class="toctree-l1 current"><a class="reference internal" href="modules.html">mcp_server_webcrawl</a><ul class="current">
<li class="toctree-l2 current"><a class="reference internal" href="mcp_server_webcrawl.html">mcp_server_webcrawl package</a></li>
</ul>
</li>
</ul>
</div>
</div>
</nav>
<section data-toggle="wy-nav-shift" class="wy-nav-content-wrap"><nav class="wy-nav-top" aria-label="Mobile navigation menu" >
<i data-toggle="wy-nav-top" class="fa fa-bars"></i>
<a href="index.html">mcp-server-webcrawl</a>
</nav>
<div class="wy-nav-content">
<div class="rst-content">
<div role="navigation" aria-label="Page navigation">
<ul class="wy-breadcrumbs">
<li><a href="index.html" class="icon icon-home" aria-label="Home"></a></li>
<li class="breadcrumb-item"><a href="modules.html">mcp_server_webcrawl</a></li>
<li class="breadcrumb-item"><a href="mcp_server_webcrawl.html">mcp_server_webcrawl package</a></li>
<li class="breadcrumb-item active">mcp_server_webcrawl.crawlers package</li>
<li class="wy-breadcrumbs-aside">
<a href="_sources/mcp_server_webcrawl.crawlers.rst.txt" rel="nofollow"> View page source</a>
</li>
</ul>
<hr/>
</div>
<div role="main" class="document" itemscope="itemscope" itemtype="http://schema.org/Article">
<div itemprop="articleBody">
<section id="mcp-server-webcrawl-crawlers-package">
<h1>mcp_server_webcrawl.crawlers package<a class="headerlink" href="#mcp-server-webcrawl-crawlers-package" title="Link to this heading"></a></h1>
<section id="subpackages">
<h2>Subpackages<a class="headerlink" href="#subpackages" title="Link to this heading"></a></h2>
<div class="toctree-wrapper compound">
<ul>
<li class="toctree-l1"><a class="reference internal" href="mcp_server_webcrawl.crawlers.base.html">mcp_server_webcrawl.crawlers.base package</a><ul>
<li class="toctree-l2"><a class="reference internal" href="mcp_server_webcrawl.crawlers.base.html#submodules">Submodules</a></li>
<li class="toctree-l2"><a class="reference internal" href="mcp_server_webcrawl.crawlers.base.html#module-mcp_server_webcrawl.crawlers.base.adapter">mcp_server_webcrawl.crawlers.base.adapter module</a></li>
<li class="toctree-l2"><a class="reference internal" href="mcp_server_webcrawl.crawlers.base.html#module-mcp_server_webcrawl.crawlers.base.api">mcp_server_webcrawl.crawlers.base.api module</a></li>
<li class="toctree-l2"><a class="reference internal" href="mcp_server_webcrawl.crawlers.base.html#module-mcp_server_webcrawl.crawlers.base.crawler">mcp_server_webcrawl.crawlers.base.crawler module</a></li>
<li class="toctree-l2"><a class="reference internal" href="mcp_server_webcrawl.crawlers.base.html#module-mcp_server_webcrawl.crawlers.base.indexed">mcp_server_webcrawl.crawlers.base.indexed module</a></li>
<li class="toctree-l2"><a class="reference internal" href="mcp_server_webcrawl.crawlers.base.html#module-mcp_server_webcrawl.crawlers.base.tests">mcp_server_webcrawl.crawlers.base.tests module</a></li>
<li class="toctree-l2"><a class="reference internal" href="mcp_server_webcrawl.crawlers.base.html#module-mcp_server_webcrawl.crawlers.base">Module contents</a></li>
</ul>
</li>
<li class="toctree-l1"><a class="reference internal" href="mcp_server_webcrawl.crawlers.archivebox.html">mcp_server_webcrawl.crawlers.archivebox package</a><ul>
<li class="toctree-l2"><a class="reference internal" href="mcp_server_webcrawl.crawlers.archivebox.html#submodules">Submodules</a></li>
<li class="toctree-l2"><a class="reference internal" href="mcp_server_webcrawl.crawlers.archivebox.html#module-mcp_server_webcrawl.crawlers.archivebox.adapter">mcp_server_webcrawl.crawlers.archivebox.adapter module</a></li>
<li class="toctree-l2"><a class="reference internal" href="mcp_server_webcrawl.crawlers.archivebox.html#module-mcp_server_webcrawl.crawlers.archivebox.crawler">mcp_server_webcrawl.crawlers.archivebox.crawler module</a></li>
<li class="toctree-l2"><a class="reference internal" href="mcp_server_webcrawl.crawlers.archivebox.html#module-mcp_server_webcrawl.crawlers.archivebox.tests">mcp_server_webcrawl.crawlers.archivebox.tests module</a></li>
<li class="toctree-l2"><a class="reference internal" href="mcp_server_webcrawl.crawlers.archivebox.html#module-mcp_server_webcrawl.crawlers.archivebox">Module contents</a></li>
</ul>
</li>
<li class="toctree-l1"><a class="reference internal" href="mcp_server_webcrawl.crawlers.httrack.html">mcp_server_webcrawl.crawlers.httrack package</a><ul>
<li class="toctree-l2"><a class="reference internal" href="mcp_server_webcrawl.crawlers.httrack.html#submodules">Submodules</a></li>
<li class="toctree-l2"><a class="reference internal" href="mcp_server_webcrawl.crawlers.httrack.html#module-mcp_server_webcrawl.crawlers.httrack.adapter">mcp_server_webcrawl.crawlers.httrack.adapter module</a></li>
<li class="toctree-l2"><a class="reference internal" href="mcp_server_webcrawl.crawlers.httrack.html#module-mcp_server_webcrawl.crawlers.httrack.crawler">mcp_server_webcrawl.crawlers.httrack.crawler module</a></li>
<li class="toctree-l2"><a class="reference internal" href="mcp_server_webcrawl.crawlers.httrack.html#module-mcp_server_webcrawl.crawlers.httrack.tests">mcp_server_webcrawl.crawlers.httrack.tests module</a></li>
<li class="toctree-l2"><a class="reference internal" href="mcp_server_webcrawl.crawlers.httrack.html#module-mcp_server_webcrawl.crawlers.httrack">Module contents</a></li>
</ul>
</li>
<li class="toctree-l1"><a class="reference internal" href="mcp_server_webcrawl.crawlers.interrobot.html">mcp_server_webcrawl.crawlers.interrobot package</a><ul>
<li class="toctree-l2"><a class="reference internal" href="mcp_server_webcrawl.crawlers.interrobot.html#submodules">Submodules</a></li>
<li class="toctree-l2"><a class="reference internal" href="mcp_server_webcrawl.crawlers.interrobot.html#module-mcp_server_webcrawl.crawlers.interrobot.adapter">mcp_server_webcrawl.crawlers.interrobot.adapter module</a></li>
<li class="toctree-l2"><a class="reference internal" href="mcp_server_webcrawl.crawlers.interrobot.html#module-mcp_server_webcrawl.crawlers.interrobot.crawler">mcp_server_webcrawl.crawlers.interrobot.crawler module</a></li>
<li class="toctree-l2"><a class="reference internal" href="mcp_server_webcrawl.crawlers.interrobot.html#module-mcp_server_webcrawl.crawlers.interrobot.tests">mcp_server_webcrawl.crawlers.interrobot.tests module</a></li>
<li class="toctree-l2"><a class="reference internal" href="mcp_server_webcrawl.crawlers.interrobot.html#module-mcp_server_webcrawl.crawlers.interrobot">Module contents</a></li>
</ul>
</li>
<li class="toctree-l1"><a class="reference internal" href="mcp_server_webcrawl.crawlers.katana.html">mcp_server_webcrawl.crawlers.katana package</a><ul>
<li class="toctree-l2"><a class="reference internal" href="mcp_server_webcrawl.crawlers.katana.html#submodules">Submodules</a></li>
<li class="toctree-l2"><a class="reference internal" href="mcp_server_webcrawl.crawlers.katana.html#module-mcp_server_webcrawl.crawlers.katana.adapter">mcp_server_webcrawl.crawlers.katana.adapter module</a></li>
<li class="toctree-l2"><a class="reference internal" href="mcp_server_webcrawl.crawlers.katana.html#module-mcp_server_webcrawl.crawlers.katana.crawler">mcp_server_webcrawl.crawlers.katana.crawler module</a></li>
<li class="toctree-l2"><a class="reference internal" href="mcp_server_webcrawl.crawlers.katana.html#module-mcp_server_webcrawl.crawlers.katana.tests">mcp_server_webcrawl.crawlers.katana.tests module</a></li>
<li class="toctree-l2"><a class="reference internal" href="mcp_server_webcrawl.crawlers.katana.html#module-mcp_server_webcrawl.crawlers.katana">Module contents</a></li>
</ul>
</li>
<li class="toctree-l1"><a class="reference internal" href="mcp_server_webcrawl.crawlers.siteone.html">mcp_server_webcrawl.crawlers.siteone package</a><ul>
<li class="toctree-l2"><a class="reference internal" href="mcp_server_webcrawl.crawlers.siteone.html#submodules">Submodules</a></li>
<li class="toctree-l2"><a class="reference internal" href="mcp_server_webcrawl.crawlers.siteone.html#module-mcp_server_webcrawl.crawlers.siteone.adapter">mcp_server_webcrawl.crawlers.siteone.adapter module</a></li>
<li class="toctree-l2"><a class="reference internal" href="mcp_server_webcrawl.crawlers.siteone.html#module-mcp_server_webcrawl.crawlers.siteone.crawler">mcp_server_webcrawl.crawlers.siteone.crawler module</a></li>
<li class="toctree-l2"><a class="reference internal" href="mcp_server_webcrawl.crawlers.siteone.html#module-mcp_server_webcrawl.crawlers.siteone.tests">mcp_server_webcrawl.crawlers.siteone.tests module</a></li>
<li class="toctree-l2"><a class="reference internal" href="mcp_server_webcrawl.crawlers.siteone.html#module-mcp_server_webcrawl.crawlers.siteone">Module contents</a></li>
</ul>
</li>
<li class="toctree-l1"><a class="reference internal" href="mcp_server_webcrawl.crawlers.warc.html">mcp_server_webcrawl.crawlers.warc package</a><ul>
<li class="toctree-l2"><a class="reference internal" href="mcp_server_webcrawl.crawlers.warc.html#submodules">Submodules</a></li>
<li class="toctree-l2"><a class="reference internal" href="mcp_server_webcrawl.crawlers.warc.html#module-mcp_server_webcrawl.crawlers.warc.adapter">mcp_server_webcrawl.crawlers.warc.adapter module</a></li>
<li class="toctree-l2"><a class="reference internal" href="mcp_server_webcrawl.crawlers.warc.html#module-mcp_server_webcrawl.crawlers.warc.crawler">mcp_server_webcrawl.crawlers.warc.crawler module</a></li>
<li class="toctree-l2"><a class="reference internal" href="mcp_server_webcrawl.crawlers.warc.html#module-mcp_server_webcrawl.crawlers.warc.tests">mcp_server_webcrawl.crawlers.warc.tests module</a></li>
<li class="toctree-l2"><a class="reference internal" href="mcp_server_webcrawl.crawlers.warc.html#module-mcp_server_webcrawl.crawlers.warc">Module contents</a></li>
</ul>
</li>
<li class="toctree-l1"><a class="reference internal" href="mcp_server_webcrawl.crawlers.wget.html">mcp_server_webcrawl.crawlers.wget package</a><ul>
<li class="toctree-l2"><a class="reference internal" href="mcp_server_webcrawl.crawlers.wget.html#submodules">Submodules</a></li>
<li class="toctree-l2"><a class="reference internal" href="mcp_server_webcrawl.crawlers.wget.html#module-mcp_server_webcrawl.crawlers.wget.adapter">mcp_server_webcrawl.crawlers.wget.adapter module</a></li>
<li class="toctree-l2"><a class="reference internal" href="mcp_server_webcrawl.crawlers.wget.html#module-mcp_server_webcrawl.crawlers.wget.crawler">mcp_server_webcrawl.crawlers.wget.crawler module</a></li>
<li class="toctree-l2"><a class="reference internal" href="mcp_server_webcrawl.crawlers.wget.html#module-mcp_server_webcrawl.crawlers.wget.tests">mcp_server_webcrawl.crawlers.wget.tests module</a></li>
<li class="toctree-l2"><a class="reference internal" href="mcp_server_webcrawl.crawlers.wget.html#module-mcp_server_webcrawl.crawlers.wget">Module contents</a></li>
</ul>
</li>
</ul>
</div>
</section>
<section id="module-mcp_server_webcrawl.crawlers">
<span id="module-contents"></span><h2>Module contents<a class="headerlink" href="#module-mcp_server_webcrawl.crawlers" title="Link to this heading"></a></h2>
<dl class="py function">
<dt class="sig sig-object py" id="mcp_server_webcrawl.crawlers.get_fixture_directory">
<span class="sig-name descname"><span class="pre">get_fixture_directory</span></span><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="reference internal" href="_modules/mcp_server_webcrawl/crawlers.html#get_fixture_directory"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#mcp_server_webcrawl.crawlers.get_fixture_directory" title="Link to this definition"></a></dt>
<dd><dl class="field-list simple">
<dt class="field-odd">Return type<span class="colon">:</span></dt>
<dd class="field-odd"><p><a class="reference external" href="https://docs.python.org/3/library/pathlib.html#pathlib.Path" title="(in Python v3.14)"><em>Path</em></a></p>
</dd>
</dl>
</dd></dl>
<dl class="py function">
<dt class="sig sig-object py" id="mcp_server_webcrawl.crawlers.get_crawler">
<span class="sig-name descname"><span class="pre">get_crawler</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">crawler_name</span></span></em><span class="sig-paren">)</span><a class="reference internal" href="_modules/mcp_server_webcrawl/crawlers.html#get_crawler"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#mcp_server_webcrawl.crawlers.get_crawler" title="Link to this definition"></a></dt>
<dd><p>lazy load crawler, some classes have additional package dependencies</p>
<dl class="field-list simple">
<dt class="field-odd">Parameters<span class="colon">:</span></dt>
<dd class="field-odd"><p><strong>crawler_name</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.14)"><em>str</em></a>) – </p>
</dd>
<dt class="field-even">Return type<span class="colon">:</span></dt>
<dd class="field-even"><p><a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.14)">str</a> | None</p>
</dd>
</dl>
</dd></dl>
</section>
</section>
</div>
</div>
<footer><div class="rst-footer-buttons" role="navigation" aria-label="Footer">
<a href="mcp_server_webcrawl.html" class="btn btn-neutral float-left" title="mcp_server_webcrawl package" accesskey="p" rel="prev"><span class="fa fa-arrow-circle-left" aria-hidden="true"></span> Previous</a>
<a href="mcp_server_webcrawl.crawlers.base.html" class="btn btn-neutral float-right" title="mcp_server_webcrawl.crawlers.base package" accesskey="n" rel="next">Next <span class="fa fa-arrow-circle-right" aria-hidden="true"></span></a>
</div>
<hr/>
<div role="contentinfo">
<p>© Copyright 2025, pragmar.</p>
</div>
Built with <a href="https://www.sphinx-doc.org/">Sphinx</a> using a
<a href="https://github.com/readthedocs/sphinx_rtd_theme">theme</a>
provided by <a href="https://readthedocs.org">Read the Docs</a>.
</footer>
</div>
</div>
</section>
</div>
<script>
jQuery(function () {
SphinxRtdTheme.Navigation.enable(true);
});
</script>
</body>
</html>
```
--------------------------------------------------------------------------------
/docs/_modules/mcp_server_webcrawl/crawlers/interrobot/crawler.html:
--------------------------------------------------------------------------------
```html
<!DOCTYPE html>
<html class="writer-html5" lang="en" data-content_root="../../../../">
<head>
<meta charset="utf-8" />
<meta name="viewport" content="width=device-width, initial-scale=1.0" />
<title>mcp_server_webcrawl.crawlers.interrobot.crawler — mcp-server-webcrawl documentation</title>
<link rel="stylesheet" type="text/css" href="../../../../_static/pygments.css?v=80d5e7a1" />
<link rel="stylesheet" type="text/css" href="../../../../_static/css/theme.css?v=e59714d7" />
<script src="../../../../_static/jquery.js?v=5d32c60e"></script>
<script src="../../../../_static/_sphinx_javascript_frameworks_compat.js?v=2cd50e6c"></script>
<script src="../../../../_static/documentation_options.js?v=5929fcd5"></script>
<script src="../../../../_static/doctools.js?v=888ff710"></script>
<script src="../../../../_static/sphinx_highlight.js?v=dc90522c"></script>
<script src="../../../../_static/js/theme.js"></script>
<link rel="index" title="Index" href="../../../../genindex.html" />
<link rel="search" title="Search" href="../../../../search.html" />
</head>
<body class="wy-body-for-nav">
<div class="wy-grid-for-nav">
<nav data-toggle="wy-nav-shift" class="wy-nav-side">
<div class="wy-side-scroll">
<div class="wy-side-nav-search" >
<a href="../../../../index.html" class="icon icon-home">
mcp-server-webcrawl
</a>
<div role="search">
<form id="rtd-search-form" class="wy-form" action="../../../../search.html" method="get">
<input type="text" name="q" placeholder="Search docs" aria-label="Search docs" />
<input type="hidden" name="check_keywords" value="yes" />
<input type="hidden" name="area" value="default" />
</form>
</div>
</div><div class="wy-menu wy-menu-vertical" data-spy="affix" role="navigation" aria-label="Navigation menu">
<p class="caption" role="heading"><span class="caption-text">Contents:</span></p>
<ul>
<li class="toctree-l1"><a class="reference internal" href="../../../../installation.html">Installation</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../../guides.html">Setup Guides</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../../usage.html">Usage</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../../prompts.html">Prompt Routines</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../../interactive.html">Interactive Mode</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../../modules.html">mcp_server_webcrawl</a></li>
</ul>
</div>
</div>
</nav>
<section data-toggle="wy-nav-shift" class="wy-nav-content-wrap"><nav class="wy-nav-top" aria-label="Mobile navigation menu" >
<i data-toggle="wy-nav-top" class="fa fa-bars"></i>
<a href="../../../../index.html">mcp-server-webcrawl</a>
</nav>
<div class="wy-nav-content">
<div class="rst-content">
<div role="navigation" aria-label="Page navigation">
<ul class="wy-breadcrumbs">
<li><a href="../../../../index.html" class="icon icon-home" aria-label="Home"></a></li>
<li class="breadcrumb-item"><a href="../../../index.html">Module code</a></li>
<li class="breadcrumb-item"><a href="../../crawlers.html">mcp_server_webcrawl.crawlers</a></li>
<li class="breadcrumb-item active">mcp_server_webcrawl.crawlers.interrobot.crawler</li>
<li class="wy-breadcrumbs-aside">
</li>
</ul>
<hr/>
</div>
<div role="main" class="document" itemscope="itemscope" itemtype="http://schema.org/Article">
<div itemprop="articleBody">
<h1>Source code for mcp_server_webcrawl.crawlers.interrobot.crawler</h1><div class="highlight"><pre>
<span></span>
<span class="kn">from</span> <span class="nn">pathlib</span> <span class="kn">import</span> <span class="n">Path</span>
<span class="kn">from</span> <span class="nn">mcp.types</span> <span class="kn">import</span> <span class="n">Tool</span>
<span class="kn">from</span> <span class="nn">mcp_server_webcrawl.models.sites</span> <span class="kn">import</span> <span class="n">SiteResult</span>
<span class="kn">from</span> <span class="nn">mcp_server_webcrawl.models.resources</span> <span class="kn">import</span> <span class="p">(</span>
<span class="n">RESOURCES_FIELDS_DEFAULT</span><span class="p">,</span>
<span class="n">RESOURCES_FIELDS_BASE</span><span class="p">,</span>
<span class="n">RESOURCES_DEFAULT_SORT_MAPPING</span><span class="p">,</span>
<span class="n">RESOURCES_FIELDS_OPTIONS</span><span class="p">,</span>
<span class="p">)</span>
<span class="kn">from</span> <span class="nn">mcp_server_webcrawl.crawlers.base.crawler</span> <span class="kn">import</span> <span class="n">BaseCrawler</span>
<span class="kn">from</span> <span class="nn">mcp_server_webcrawl.crawlers.interrobot.adapter</span> <span class="kn">import</span> <span class="p">(</span>
<span class="n">get_sites</span><span class="p">,</span>
<span class="n">get_resources</span><span class="p">,</span>
<span class="n">INTERROBOT_RESOURCE_FIELD_MAPPING</span><span class="p">,</span>
<span class="n">INTERROBOT_SITE_FIELD_MAPPING</span><span class="p">,</span>
<span class="n">INTERROBOT_SITE_FIELD_REQUIRED</span><span class="p">,</span>
<span class="p">)</span>
<span class="kn">from</span> <span class="nn">mcp_server_webcrawl.utils.tools</span> <span class="kn">import</span> <span class="n">get_crawler_tools</span>
<span class="kn">from</span> <span class="nn">mcp_server_webcrawl.utils.logger</span> <span class="kn">import</span> <span class="n">get_logger</span>
<span class="n">logger</span> <span class="o">=</span> <span class="n">get_logger</span><span class="p">()</span>
<div class="viewcode-block" id="InterroBotCrawler">
<a class="viewcode-back" href="../../../../mcp_server_webcrawl.crawlers.interrobot.html#mcp_server_webcrawl.crawlers.interrobot.crawler.InterroBotCrawler">[docs]</a>
<span class="k">class</span> <span class="nc">InterroBotCrawler</span><span class="p">(</span><span class="n">BaseCrawler</span><span class="p">):</span>
<span class="w"> </span><span class="sd">"""</span>
<span class="sd"> A crawler implementation for InterroBot data sources.</span>
<span class="sd"> Provides functionality for accessing and searching web content from InterroBot.</span>
<span class="sd"> """</span>
<div class="viewcode-block" id="InterroBotCrawler.__init__">
<a class="viewcode-back" href="../../../../mcp_server_webcrawl.crawlers.interrobot.html#mcp_server_webcrawl.crawlers.interrobot.crawler.InterroBotCrawler.__init__">[docs]</a>
<span class="k">def</span> <span class="fm">__init__</span><span class="p">(</span>
<span class="bp">self</span><span class="p">,</span>
<span class="n">datasrc</span><span class="p">:</span> <span class="n">Path</span><span class="p">,</span>
<span class="p">)</span> <span class="o">-></span> <span class="kc">None</span><span class="p">:</span>
<span class="w"> </span><span class="sd">"""</span>
<span class="sd"> Initialize the InterroBotCrawler with a data source path and required adapter functions.</span>
<span class="sd"> Args:</span>
<span class="sd"> datasrc: Path to the data source</span>
<span class="sd"> """</span>
<span class="nb">super</span><span class="p">()</span><span class="o">.</span><span class="fm">__init__</span><span class="p">(</span><span class="n">datasrc</span><span class="p">,</span> <span class="n">get_sites</span><span class="p">,</span> <span class="n">get_resources</span><span class="p">,</span> <span class="n">resource_field_mapping</span><span class="o">=</span><span class="n">INTERROBOT_RESOURCE_FIELD_MAPPING</span><span class="p">)</span>
<span class="k">assert</span> <span class="n">datasrc</span><span class="o">.</span><span class="n">is_file</span><span class="p">()</span> <span class="ow">and</span> <span class="n">datasrc</span><span class="o">.</span><span class="n">suffix</span> <span class="o">==</span> <span class="s2">".db"</span><span class="p">,</span> <span class="sa">f</span><span class="s2">"</span><span class="si">{</span><span class="bp">self</span><span class="o">.</span><span class="vm">__class__</span><span class="o">.</span><span class="vm">__name__</span><span class="si">}</span><span class="s2"> datasrc must be a db file"</span></div>
<div class="viewcode-block" id="InterroBotCrawler.mcp_list_tools">
<a class="viewcode-back" href="../../../../mcp_server_webcrawl.crawlers.interrobot.html#mcp_server_webcrawl.crawlers.interrobot.crawler.InterroBotCrawler.mcp_list_tools">[docs]</a>
<span class="k">async</span> <span class="k">def</span> <span class="nf">mcp_list_tools</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-></span> <span class="nb">list</span><span class="p">[</span><span class="n">Tool</span><span class="p">]:</span>
<span class="w"> </span><span class="sd">"""</span>
<span class="sd"> List available tools for this crawler.</span>
<span class="sd"> Returns:</span>
<span class="sd"> List of Tool objects</span>
<span class="sd"> """</span>
<span class="c1"># get the default crawler tools, then override necessary fields</span>
<span class="n">all_sites</span><span class="p">:</span> <span class="nb">list</span><span class="p">[</span><span class="n">SiteResult</span><span class="p">]</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_adapter_get_sites</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_datasrc</span><span class="p">)</span>
<span class="n">all_sites_ids</span><span class="p">:</span> <span class="nb">list</span><span class="p">[</span><span class="nb">int</span><span class="p">]</span> <span class="o">=</span> <span class="p">[</span><span class="n">s</span><span class="o">.</span><span class="n">id</span> <span class="k">for</span> <span class="n">s</span> <span class="ow">in</span> <span class="n">all_sites</span> <span class="k">if</span> <span class="n">s</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span> <span class="ow">and</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">s</span><span class="o">.</span><span class="n">id</span><span class="p">,</span> <span class="nb">int</span><span class="p">)]</span>
<span class="n">default_tools</span><span class="p">:</span> <span class="nb">list</span><span class="p">[</span><span class="n">Tool</span><span class="p">]</span> <span class="o">=</span> <span class="n">get_crawler_tools</span><span class="p">(</span><span class="n">sites</span><span class="o">=</span><span class="n">all_sites</span><span class="p">)</span>
<span class="k">assert</span> <span class="nb">len</span><span class="p">(</span><span class="n">default_tools</span><span class="p">)</span> <span class="o">==</span> <span class="mi">2</span><span class="p">,</span> <span class="s2">"expected exactly 2 Tools: sites and resources"</span>
<span class="c1"># can replace get_crawler_tools or extend, here it is overwritten from default</span>
<span class="c1"># you'd think maybe pass changes in, but no, it's better ad hoc</span>
<span class="n">default_sites_tool</span><span class="p">:</span> <span class="n">Tool</span>
<span class="n">default_resources_tool</span><span class="p">:</span> <span class="n">Tool</span>
<span class="n">default_sites_tool</span><span class="p">,</span> <span class="n">default_resources_tool</span> <span class="o">=</span> <span class="n">default_tools</span>
<span class="n">sites_field_options</span><span class="p">:</span> <span class="nb">list</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="nb">list</span><span class="p">(</span><span class="nb">set</span><span class="p">(</span><span class="n">INTERROBOT_SITE_FIELD_MAPPING</span><span class="o">.</span><span class="n">keys</span><span class="p">())</span> <span class="o">-</span> <span class="nb">set</span><span class="p">(</span><span class="n">INTERROBOT_SITE_FIELD_REQUIRED</span><span class="p">))</span>
<span class="n">dst_props</span><span class="p">:</span> <span class="nb">dict</span> <span class="o">=</span> <span class="n">default_sites_tool</span><span class="o">.</span><span class="n">inputSchema</span><span class="p">[</span><span class="s2">"properties"</span><span class="p">]</span>
<span class="n">dst_props</span><span class="p">[</span><span class="s2">"fields"</span><span class="p">][</span><span class="s2">"items"</span><span class="p">][</span><span class="s2">"enum"</span><span class="p">]</span> <span class="o">=</span> <span class="n">sites_field_options</span>
<span class="n">resources_sort_options</span><span class="p">:</span> <span class="nb">list</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="nb">list</span><span class="p">(</span><span class="n">RESOURCES_DEFAULT_SORT_MAPPING</span><span class="o">.</span><span class="n">keys</span><span class="p">())</span>
<span class="n">all_sites_display</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="s2">", "</span><span class="o">.</span><span class="n">join</span><span class="p">([</span><span class="sa">f</span><span class="s2">"</span><span class="si">{</span><span class="n">s</span><span class="o">.</span><span class="n">name</span><span class="si">}</span><span class="s2"> (site: </span><span class="si">{</span><span class="n">s</span><span class="o">.</span><span class="n">id</span><span class="si">}</span><span class="s2">)"</span> <span class="k">for</span> <span class="n">s</span> <span class="ow">in</span> <span class="n">all_sites</span><span class="p">])</span>
<span class="n">drt_props</span><span class="p">:</span> <span class="nb">dict</span> <span class="o">=</span> <span class="n">default_resources_tool</span><span class="o">.</span><span class="n">inputSchema</span><span class="p">[</span><span class="s2">"properties"</span><span class="p">]</span>
<span class="n">drt_props</span><span class="p">[</span><span class="s2">"fields"</span><span class="p">][</span><span class="s2">"items"</span><span class="p">][</span><span class="s2">"enum"</span><span class="p">]</span> <span class="o">=</span> <span class="n">RESOURCES_FIELDS_OPTIONS</span>
<span class="n">drt_props</span><span class="p">[</span><span class="s2">"sort"</span><span class="p">][</span><span class="s2">"enum"</span><span class="p">]</span> <span class="o">=</span> <span class="n">resources_sort_options</span>
<span class="n">drt_props</span><span class="p">[</span><span class="s2">"sites"</span><span class="p">][</span><span class="s2">"items"</span><span class="p">][</span><span class="s2">"enum"</span><span class="p">]</span> <span class="o">=</span> <span class="n">all_sites_ids</span>
<span class="n">drt_props</span><span class="p">[</span><span class="s2">"sites"</span><span class="p">][</span><span class="s2">"description"</span><span class="p">]</span> <span class="o">=</span> <span class="p">(</span><span class="s2">"Optional "</span>
<span class="s2">"list of project ID to filter search results to a specific site. In 95% "</span>
<span class="s2">"of scenarios, you'd filter to only one site, but many site filtering is offered "</span>
<span class="sa">f</span><span class="s2">"for advanced search scenarios. Available sites include </span><span class="si">{</span><span class="n">all_sites_display</span><span class="si">}</span><span class="s2">."</span><span class="p">)</span>
<span class="k">return</span> <span class="p">[</span><span class="n">default_sites_tool</span><span class="p">,</span> <span class="n">default_resources_tool</span><span class="p">]</span></div>
</div>
</pre></div>
</div>
</div>
<footer>
<hr/>
<div role="contentinfo">
<p>© Copyright 2025, pragmar.</p>
</div>
Built with <a href="https://www.sphinx-doc.org/">Sphinx</a> using a
<a href="https://github.com/readthedocs/sphinx_rtd_theme">theme</a>
provided by <a href="https://readthedocs.org">Read the Docs</a>.
</footer>
</div>
</div>
</section>
</div>
<script>
jQuery(function () {
SphinxRtdTheme.Navigation.enable(true);
});
</script>
</body>
</html>
```
--------------------------------------------------------------------------------
/src/mcp_server_webcrawl/crawlers/base/crawler.py:
--------------------------------------------------------------------------------
```python
import anyio
import re
import sqlite3
import traceback
from pathlib import Path
from typing import Any, Callable, Final
from urllib.parse import urlparse
from mcp.server import NotificationOptions, Server
from mcp.server.models import InitializationOptions
from mcp.types import EmbeddedResource, ImageContent, TextContent, Tool
from mcp_server_webcrawl.crawlers.base.api import BaseJsonApi
from mcp_server_webcrawl.crawlers.base.adapter import IndexState
from mcp_server_webcrawl.models.base import METADATA_VALUE_TYPE
from mcp_server_webcrawl.models.sites import SITES_TOOL_NAME
from mcp_server_webcrawl.models.resources import (
ResourceResult,
ResourceResultType,
RESOURCES_DEFAULT_FIELD_MAPPING,
RESOURCE_EXTRAS_ALLOWED,
RESOURCES_TOOL_NAME,
)
from mcp_server_webcrawl.extras.thumbnails import ThumbnailManager
from mcp_server_webcrawl.extras.markdown import get_markdown
from mcp_server_webcrawl.extras.regex import get_regex
from mcp_server_webcrawl.extras.snippets import get_snippets
from mcp_server_webcrawl.extras.xpath import get_xpath
from mcp_server_webcrawl.utils.logger import get_logger
# raised by the abstract MCP hooks when a subclass fails to override them;
# the method names must match the actual hooks registered on the Server
# (mcp_list_tools / mcp_call_tool), not the stale handle_* names
OVERRIDE_ERROR_MESSAGE: Final[str] = """BaseCrawler subclasses must implement the following \
methods: mcp_list_tools, mcp_call_tool, at minimum."""

# module-level logger shared by crawler implementations
logger = get_logger()
class BaseCrawler:
"""
Base crawler class that implements MCP server functionality.
This class provides the foundation for specialized crawlers to interact with
the MCP server and handle tool operations for web resources.
"""
def __init__(
self,
datasrc: Path,
get_sites_func: Callable,
get_resources_func: Callable,
resource_field_mapping: dict[str, str] = RESOURCES_DEFAULT_FIELD_MAPPING,
) -> None:
"""
Initialize the BaseCrawler with a data source path and required adapter functions.
Args:
datasrc: path to the data source
get_sites_func: function to retrieve sites from the data source
get_resources_func: function to retrieve resources from the data source
resource_field_mapping: mapping of resource field names to display names
"""
from mcp_server_webcrawl import __name__ as module_name, __version__ as module_version
assert datasrc is not None, f"{self.__class__.__name__} needs a datasrc, regardless of action"
assert callable(get_sites_func), f"{self.__class__.__name__} requires a callable get_sites_func"
assert callable(get_resources_func), f"{self.__class__.__name__} requires a callable get_resources_func"
assert isinstance(resource_field_mapping, dict), f"{self.__class__.__name__} resource_field_mapping must be a dict"
self._datasrc: Path = Path(datasrc)
self._module_name: str = module_name
self._module_version: str = module_version
self._server = Server(self._module_name)
self._server.list_tools()(self.mcp_list_tools)
self._server.call_tool()(self.mcp_call_tool)
self._server.list_prompts()(self.mcp_list_prompts)
self._server.list_resources()(self.mcp_list_resources)
self._resource_field_mapping = resource_field_mapping
self._adapter_get_sites = get_sites_func
self._adapter_get_resources = get_resources_func
@property
def datasrc(self) -> Path:
return self._datasrc
async def mcp_list_prompts(self) -> list:
"""List available prompts (currently none)."""
return []
async def mcp_list_resources(self) -> list:
"""List available resources (currently none)."""
return []
async def serve(self, stdin: anyio.AsyncFile[str] | None, stdout: anyio.AsyncFile[str] | None) -> dict[str, Any]:
"""
Launch the awaitable server.
Args:
stdin: input stream for the server
stdout: output stream for the server
Returns:
The MCP server over stdio
"""
return await self._server.run(stdin, stdout, self.get_initialization_options())
def get_initialization_options(self) -> InitializationOptions:
"""
Get the MCP initialization object.
Returns:
Dictionary containing project information
"""
notification_events = NotificationOptions(prompts_changed=False, resources_changed=False, tools_changed=False)
capabilities = self._server.get_capabilities(notification_options=notification_events, experimental_capabilities={})
return InitializationOptions(server_name=self._module_name, server_version=self._module_version, capabilities=capabilities)
def get_sites_api_json(self, **kwargs) -> str:
"""
Get sites API result as JSON.
Returns:
JSON string of sites API results
"""
json_result = self.get_sites_api(**kwargs)
return json_result.to_json()
def get_resources_api_json(self, **kwargs) -> str:
"""
Get resources API result as JSON.
Returns:
JSON string of resources API results
"""
json_result = self.get_resources_api(**kwargs)
return json_result.to_json()
def get_sites_api(
self,
ids: list[int] | None = None,
fields: list[str] | None = None,
) -> BaseJsonApi:
sites = self._adapter_get_sites(self._datasrc, ids=ids, fields=fields)
sites_kwargs = {
"ids": ids,
"fields": fields,
}
json_result = BaseJsonApi("GetProjects", sites_kwargs)
json_result.set_results(sites, len(sites), 0, len(sites))
return json_result
def get_resources_api(
self,
sites: list[int] | None = None,
query: str = "",
fields: list[str] | None = None,
sort: str | None = None,
limit: int = 20,
offset: int = 0,
extras: list[str] | None = None,
extrasRegex: list[str] | None = None,
extrasXpath: list[str] | None = None,
) -> BaseJsonApi:
resources_kwargs: dict[str, METADATA_VALUE_TYPE] = {
"sites": sites,
"query": query,
"fields": fields,
"sort": sort,
"limit": limit,
"offset": offset,
}
def no_results() -> BaseJsonApi:
api_result = BaseJsonApi("GetResources", resources_kwargs, index_state=IndexState())
api_result.set_results([], 0, 0, limit)
return api_result
if not sites:
all_sites = self._adapter_get_sites(self._datasrc)
if not all_sites:
return no_results()
# set to default of all sites if not specified
sites = [site.id for site in all_sites]
# sometimes the AI gets it in its head this is a good idea
# but it means no query, just take care of it here
if query.strip() in ('""',"''", "``", "*"):
query = ""
site_matches = self._adapter_get_sites(self._datasrc, ids=sites)
if not site_matches:
return no_results()
extras = extras or []
extrasXpath = extrasXpath or []
extrasRegex = extrasRegex or []
fields = fields or []
fields_extras_override: list[str] = fields.copy()
set_extras: set[str] = set(extras)
set_extras_content: set[str] = {"markdown", "snippets", "xpath", "regex"}
set_extras_headers: set[str] = {"snippets", "regex"}
add_content: bool = bool(set_extras_content & set_extras)
add_headers: bool = bool(set_extras_headers & set_extras)
if add_content and "content" not in fields:
fields_extras_override.append("content")
if add_headers and "headers" not in fields:
fields_extras_override.append("headers")
results, total, index_state = self._adapter_get_resources(
self._datasrc,
sites=sites,
query=query,
fields=fields_extras_override,
sort=sort,
limit=limit,
offset=offset,
)
if "markdown" in extras:
result: ResourceResult
for result in results:
markdown_result: str | None = None
if result.type == ResourceResultType.PAGE:
markdown_result = get_markdown(result.content)
result.set_extra("markdown", markdown_result)
if "xpath" in extras:
result: ResourceResult
for result in results:
xpath_result: list[dict[str, str | int | float]] = get_xpath(result.content, extrasXpath)
result.set_extra("xpath", xpath_result)
if "regex" in extras:
result: ResourceResult
for result in results:
regex_result: list[dict[str, str | int | float]] = get_regex(result.headers, result.content, extrasRegex)
result.set_extra("regex", regex_result)
if "snippets" in extras and query.strip():
result: ResourceResult
for result in results:
snippets: str | None = get_snippets(result.url, result.headers, result.content, query)
result.set_extra("snippets", snippets)
extras_only_fields = set(fields_extras_override) - set(fields)
if extras_only_fields:
for result in results:
for field in extras_only_fields:
if hasattr(result, field):
setattr(result, field, None)
# note: thumbnails extra a special case, handled in mcp_call_tool
api_result = BaseJsonApi("GetResources", resources_kwargs, index_state=index_state)
api_result.set_results(results, total, offset, limit)
return api_result
async def mcp_list_tools(self) -> list[Tool]:
"""
List available tools.
Returns:
List of available tools
Raises:
NotImplementedError: This method must be implemented by subclasses
"""
# each crawler subclass must provide this method
raise NotImplementedError(OVERRIDE_ERROR_MESSAGE)
async def mcp_call_tool(self, name: str, arguments: dict[str, Any] | None
) -> list[TextContent | ImageContent | EmbeddedResource]:
"""
Handle tool execution requests. You can override this or super(), then tweak.
Basically, it is a passthrough.
Args:
name: name of the tool to call
arguments: arguments to pass to the tool
Returns:
List of content objects resulting from the tool execution
Raises:
ValueError: If the specified tool does not exist
"""
try:
if name == SITES_TOOL_NAME:
ids: list[int] = [] if not arguments or "ids" not in arguments else arguments["ids"]
fields: list[str] = [] if not arguments or "fields" not in arguments else arguments["fields"]
assert isinstance(ids, list) and all(isinstance(item, int) for item in ids)
assert isinstance(fields, list) and all(isinstance(item, str) for item in fields)
results_json = self.get_sites_api_json(
ids=ids,
fields=fields
)
return [TextContent(type="text", text=results_json)]
elif name == RESOURCES_TOOL_NAME:
extras: list[str] = [] if not arguments or "extras" not in arguments else arguments["extras"]
# in case there is any LLM confusion of XPath/xpath or Markdown/markdown, these are
# defined lowercase in the MCP Tool definition, but have counter-weighting as proper nouns
extras = [extra.lower() for extra in extras if isinstance(extra, str)]
extrasRegex: list[str] = [] if not arguments or "extrasRegex" not in arguments else arguments["extrasRegex"]
extrasXpath: list[str] = [] if not arguments or "extrasXpath" not in arguments else arguments["extrasXpath"]
extras_set: set[str] = set(extras)
extras_removed: set[str] = extras_set - RESOURCE_EXTRAS_ALLOWED
if extras_removed:
# only allow known extras
extras = list(RESOURCE_EXTRAS_ALLOWED.intersection(extras))
# regular args pass through to the result
query: str = "" if not arguments or "query" not in arguments else arguments["query"]
fields: list[str] = [] if not arguments or "fields" not in arguments else arguments["fields"]
sites: list[int] = [] if not arguments or "sites" not in arguments else arguments["sites"]
sort: str | None = None if not arguments or "sort" not in arguments else arguments["sort"]
limit: int = 20 if not arguments or "limit" not in arguments else arguments["limit"]
offset: int = 0 if not arguments or "offset" not in arguments else arguments["offset"]
# claude keeps getting this wrong, it is properly enumerated in Tool definition
clean_sort = sort.strip("\"'`") if isinstance(sort, str) else None
assert isinstance(query, str)
assert isinstance(fields, list) and all(isinstance(item, str) for item in fields)
assert isinstance(sites, list) and all(isinstance(item, int) for item in sites)
assert isinstance(sort, (str, type(None)))
assert isinstance(limit, int)
assert isinstance(offset, int)
assert isinstance(extras, list) and all(isinstance(item, str) for item in extras)
assert isinstance(extrasXpath, list) and all(isinstance(item, str) for item in extrasXpath)
api_result: BaseJsonApi = self.get_resources_api(
sites=sites,
query=query,
fields=fields,
sort=clean_sort,
limit=limit,
offset=offset,
extras=extras,
extrasRegex=extrasRegex,
extrasXpath=extrasXpath,
)
# sometimes nudging makes things worse, AI doubles down on percieved
# rightousness of position. just let it have it. claims in the end it's
# a JSON encoding confusion with the +/- leading char. who knows? more
# importantly, who cares? play it loose.
# if sort != clean_sort:
# # let the MCP host know the error of its ways
# api_result.append_error(f"invalid sort ({sort}) requested [{', '.join(RESOURCES_DEFAULT_SORT_MAPPING.keys())}]")
if extras_removed:
# only allow known extras
api_result.append_error(f"invalid extras requested ({', '.join(extras_removed)})")
crawl_results: list[ResourceResult] = api_result.get_results()
results_json = api_result.to_json()
mcp_result = [TextContent(type="text", text=results_json)]
if "thumbnails" in extras:
crawl_results: list[ResourceResult] = api_result.get_results()
mcp_result += self.get_thumbnails(crawl_results) or []
return mcp_result
else:
raise ValueError(f"No such tool ({name})")
except sqlite3.Error as ex:
return [TextContent(type="text", text=f"mcp_call_tool/database\n{str(ex)}\n{traceback.format_exc()}")]
except Exception as ex:
return [TextContent(type="text", text=f"mcp_call_tool/exception\n{str(ex)}\n{traceback.format_exc()}")]
def get_thumbnails(self, results: list[ResourceResult]) -> list[ImageContent]:
    """
    Build webp thumbnail ImageContent entries for the image resources in results.

    Args:
        results: resource results to scan for image URLs

    Returns:
        List of ImageContent thumbnails; empty when no valid image URLs
        are present or on fetch error
    """
    collected: list[ImageContent] = []
    # unique URLs of image-typed resources only
    candidates = list(set(r.url for r in results if r.url and r.type == ResourceResultType.IMAGE))
    sanitized: list[str] = []
    for candidate in candidates:
        parsed = urlparse(candidate)
        if parsed.scheme in ("http", "https") and parsed.netloc:
            # absolute http(s) URL: keep scheme://host/path, dropping query/fragment
            sanitized.append(f"{parsed.scheme}://{parsed.netloc}{parsed.path}")
        elif re.search(r"\.(jpg|jpeg|png|gif|bmp|webp)$", candidate, re.IGNORECASE):
            # non-http path ending in a known image extension; strip any query string
            sanitized.append(candidate.split("?")[0])
    if not sanitized:
        return collected
    try:
        thumbnail_manager = ThumbnailManager()
        thumbnail_data = thumbnail_manager.get_thumbnails(sanitized)
        for thumbnail_url, thumbnail_base64 in thumbnail_data.items():
            if thumbnail_base64 is None:
                # request failed for this URL; skip it
                logger.debug(f"Thumbnail encountered error during request. {thumbnail_url}")
                continue
            collected.append(ImageContent(type="image", data=thumbnail_base64, mimeType="image/webp"))
        logger.debug(f"Fetched {len(thumbnail_data)} thumbnails out of {len(sanitized)} requested URLs")
    except Exception as ex:
        logger.error(f"Error fetching thumbnails: {ex}\n{traceback.format_exc()}")
    return collected
def _convert_to_resource_types(self, types: list[str] | None) -> list[ResourceResultType] | None:
"""
Convert string type values to ResourceResultType enums. Silently ignore invalid type strings.
Args:
types: optional list of string type values
Returns:
Optional list of ResourceResultType enums, or None if no valid types
"""
if not types:
return None
result = [rt for rt in ResourceResultType if rt.value in types]
return result if result else None
```
--------------------------------------------------------------------------------
/docs/_modules/mcp_server_webcrawl/crawlers/katana/tests.html:
--------------------------------------------------------------------------------
```html
<!DOCTYPE html>
<html class="writer-html5" lang="en" data-content_root="../../../../">
<head>
<meta charset="utf-8" />
<meta name="viewport" content="width=device-width, initial-scale=1.0" />
<title>mcp_server_webcrawl.crawlers.katana.tests — mcp-server-webcrawl documentation</title>
<link rel="stylesheet" type="text/css" href="../../../../_static/pygments.css?v=80d5e7a1" />
<link rel="stylesheet" type="text/css" href="../../../../_static/css/theme.css?v=e59714d7" />
<script src="../../../../_static/jquery.js?v=5d32c60e"></script>
<script src="../../../../_static/_sphinx_javascript_frameworks_compat.js?v=2cd50e6c"></script>
<script src="../../../../_static/documentation_options.js?v=5929fcd5"></script>
<script src="../../../../_static/doctools.js?v=888ff710"></script>
<script src="../../../../_static/sphinx_highlight.js?v=dc90522c"></script>
<script src="../../../../_static/js/theme.js"></script>
<link rel="index" title="Index" href="../../../../genindex.html" />
<link rel="search" title="Search" href="../../../../search.html" />
</head>
<body class="wy-body-for-nav">
<div class="wy-grid-for-nav">
<nav data-toggle="wy-nav-shift" class="wy-nav-side">
<div class="wy-side-scroll">
<div class="wy-side-nav-search" >
<a href="../../../../index.html" class="icon icon-home">
mcp-server-webcrawl
</a>
<div role="search">
<form id="rtd-search-form" class="wy-form" action="../../../../search.html" method="get">
<input type="text" name="q" placeholder="Search docs" aria-label="Search docs" />
<input type="hidden" name="check_keywords" value="yes" />
<input type="hidden" name="area" value="default" />
</form>
</div>
</div><div class="wy-menu wy-menu-vertical" data-spy="affix" role="navigation" aria-label="Navigation menu">
<p class="caption" role="heading"><span class="caption-text">Contents:</span></p>
<ul>
<li class="toctree-l1"><a class="reference internal" href="../../../../installation.html">Installation</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../../guides.html">Setup Guides</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../../usage.html">Usage</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../../prompts.html">Prompt Routines</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../../modules.html">mcp_server_webcrawl</a></li>
</ul>
</div>
</div>
</nav>
<section data-toggle="wy-nav-shift" class="wy-nav-content-wrap"><nav class="wy-nav-top" aria-label="Mobile navigation menu" >
<i data-toggle="wy-nav-top" class="fa fa-bars"></i>
<a href="../../../../index.html">mcp-server-webcrawl</a>
</nav>
<div class="wy-nav-content">
<div class="rst-content">
<div role="navigation" aria-label="Page navigation">
<ul class="wy-breadcrumbs">
<li><a href="../../../../index.html" class="icon icon-home" aria-label="Home"></a></li>
<li class="breadcrumb-item"><a href="../../../index.html">Module code</a></li>
<li class="breadcrumb-item"><a href="../../crawlers.html">mcp_server_webcrawl.crawlers</a></li>
<li class="breadcrumb-item active">mcp_server_webcrawl.crawlers.katana.tests</li>
<li class="wy-breadcrumbs-aside">
</li>
</ul>
<hr/>
</div>
<div role="main" class="document" itemscope="itemscope" itemtype="http://schema.org/Article">
<div itemprop="articleBody">
<h1>Source code for mcp_server_webcrawl.crawlers.katana.tests</h1><div class="highlight"><pre>
<span></span><span class="kn">from</span> <span class="nn">logging</span> <span class="kn">import</span> <span class="n">Logger</span>
<span class="kn">from</span> <span class="nn">mcp_server_webcrawl.crawlers.katana.crawler</span> <span class="kn">import</span> <span class="n">KatanaCrawler</span>
<span class="kn">from</span> <span class="nn">mcp_server_webcrawl.crawlers.katana.adapter</span> <span class="kn">import</span> <span class="n">KatanaManager</span>
<span class="kn">from</span> <span class="nn">mcp_server_webcrawl.crawlers.base.adapter</span> <span class="kn">import</span> <span class="n">SitesGroup</span>
<span class="kn">from</span> <span class="nn">mcp_server_webcrawl.crawlers.base.tests</span> <span class="kn">import</span> <span class="n">BaseCrawlerTests</span>
<span class="kn">from</span> <span class="nn">mcp_server_webcrawl.crawlers</span> <span class="kn">import</span> <span class="n">get_fixture_directory</span>
<span class="kn">from</span> <span class="nn">mcp_server_webcrawl.utils.logger</span> <span class="kn">import</span> <span class="n">get_logger</span>
<span class="c1"># calculate ids for test directories using the same hash function as adapter</span>
<span class="n">EXAMPLE_SITE_ID</span> <span class="o">=</span> <span class="n">KatanaManager</span><span class="o">.</span><span class="n">string_to_id</span><span class="p">(</span><span class="s2">"example.com"</span><span class="p">)</span>
<span class="n">PRAGMAR_SITE_ID</span> <span class="o">=</span> <span class="n">KatanaManager</span><span class="o">.</span><span class="n">string_to_id</span><span class="p">(</span><span class="s2">"pragmar.com"</span><span class="p">)</span>
<span class="n">logger</span><span class="p">:</span> <span class="n">Logger</span> <span class="o">=</span> <span class="n">get_logger</span><span class="p">()</span>
<div class="viewcode-block" id="KatanaTests">
<a class="viewcode-back" href="../../../../mcp_server_webcrawl.crawlers.katana.html#mcp_server_webcrawl.crawlers.katana.tests.KatanaTests">[docs]</a>
<span class="k">class</span> <span class="nc">KatanaTests</span><span class="p">(</span><span class="n">BaseCrawlerTests</span><span class="p">):</span>
<span class="w"> </span><span class="sd">"""</span>
<span class="sd"> test suite for the HTTP text crawler implementation.</span>
<span class="sd"> tests parsing and retrieval of web content from HTTP text files.</span>
<span class="sd"> """</span>
<div class="viewcode-block" id="KatanaTests.setUp">
<a class="viewcode-back" href="../../../../mcp_server_webcrawl.crawlers.katana.html#mcp_server_webcrawl.crawlers.katana.tests.KatanaTests.setUp">[docs]</a>
<span class="k">def</span> <span class="nf">setUp</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
<span class="w"> </span><span class="sd">"""</span>
<span class="sd"> set up the test environment with fixture data.</span>
<span class="sd"> """</span>
<span class="nb">super</span><span class="p">()</span><span class="o">.</span><span class="n">setUp</span><span class="p">()</span>
<span class="bp">self</span><span class="o">.</span><span class="n">_datasrc</span> <span class="o">=</span> <span class="n">get_fixture_directory</span><span class="p">()</span> <span class="o">/</span> <span class="s2">"katana"</span></div>
<div class="viewcode-block" id="KatanaTests.test_katana_pulse">
<a class="viewcode-back" href="../../../../mcp_server_webcrawl.crawlers.katana.html#mcp_server_webcrawl.crawlers.katana.tests.KatanaTests.test_katana_pulse">[docs]</a>
<span class="k">def</span> <span class="nf">test_katana_pulse</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
<span class="w"> </span><span class="sd">"""</span>
<span class="sd"> basic crawler initialization.</span>
<span class="sd"> """</span>
<span class="n">crawler</span> <span class="o">=</span> <span class="n">KatanaCrawler</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_datasrc</span><span class="p">)</span>
<span class="bp">self</span><span class="o">.</span><span class="n">assertIsNotNone</span><span class="p">(</span><span class="n">crawler</span><span class="p">)</span>
<span class="bp">self</span><span class="o">.</span><span class="n">assertTrue</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_datasrc</span><span class="o">.</span><span class="n">is_dir</span><span class="p">())</span></div>
<div class="viewcode-block" id="KatanaTests.test_katana_sites">
<a class="viewcode-back" href="../../../../mcp_server_webcrawl.crawlers.katana.html#mcp_server_webcrawl.crawlers.katana.tests.KatanaTests.test_katana_sites">[docs]</a>
<span class="k">def</span> <span class="nf">test_katana_sites</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
<span class="w"> </span><span class="sd">"""</span>
<span class="sd"> site retrieval API functionality.</span>
<span class="sd"> """</span>
<span class="n">crawler</span> <span class="o">=</span> <span class="n">KatanaCrawler</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_datasrc</span><span class="p">)</span>
<span class="bp">self</span><span class="o">.</span><span class="n">run_pragmar_site_tests</span><span class="p">(</span><span class="n">crawler</span><span class="p">,</span> <span class="n">PRAGMAR_SITE_ID</span><span class="p">)</span></div>
<div class="viewcode-block" id="KatanaTests.test_katana_search">
<a class="viewcode-back" href="../../../../mcp_server_webcrawl.crawlers.katana.html#mcp_server_webcrawl.crawlers.katana.tests.KatanaTests.test_katana_search">[docs]</a>
<span class="k">def</span> <span class="nf">test_katana_search</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
<span class="w"> </span><span class="sd">"""</span>
<span class="sd"> boolean search tests</span>
<span class="sd"> """</span>
<span class="n">crawler</span> <span class="o">=</span> <span class="n">KatanaCrawler</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_datasrc</span><span class="p">)</span>
<span class="bp">self</span><span class="o">.</span><span class="n">run_pragmar_search_tests</span><span class="p">(</span><span class="n">crawler</span><span class="p">,</span> <span class="n">PRAGMAR_SITE_ID</span><span class="p">)</span></div>
<div class="viewcode-block" id="KatanaTests.test_pragmar_tokenizer">
<a class="viewcode-back" href="../../../../mcp_server_webcrawl.crawlers.katana.html#mcp_server_webcrawl.crawlers.katana.tests.KatanaTests.test_pragmar_tokenizer">[docs]</a>
<span class="k">def</span> <span class="nf">test_pragmar_tokenizer</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
<span class="w"> </span><span class="sd">"""</span>
<span class="sd"> tokenizer search tests</span>
<span class="sd"> """</span>
<span class="n">crawler</span> <span class="o">=</span> <span class="n">KatanaCrawler</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_datasrc</span><span class="p">)</span>
<span class="bp">self</span><span class="o">.</span><span class="n">run_pragmar_tokenizer_tests</span><span class="p">(</span><span class="n">crawler</span><span class="p">,</span> <span class="n">PRAGMAR_SITE_ID</span><span class="p">)</span></div>
<div class="viewcode-block" id="KatanaTests.test_katana_resources">
<a class="viewcode-back" href="../../../../mcp_server_webcrawl.crawlers.katana.html#mcp_server_webcrawl.crawlers.katana.tests.KatanaTests.test_katana_resources">[docs]</a>
<span class="k">def</span> <span class="nf">test_katana_resources</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
<span class="w"> </span><span class="sd">"""</span>
<span class="sd"> resource retrieval API functionality with various parameters.</span>
<span class="sd"> """</span>
<span class="n">crawler</span> <span class="o">=</span> <span class="n">KatanaCrawler</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_datasrc</span><span class="p">)</span>
<span class="bp">self</span><span class="o">.</span><span class="n">run_sites_resources_tests</span><span class="p">(</span><span class="n">crawler</span><span class="p">,</span> <span class="n">PRAGMAR_SITE_ID</span><span class="p">,</span> <span class="n">EXAMPLE_SITE_ID</span><span class="p">)</span></div>
<div class="viewcode-block" id="KatanaTests.test_interrobot_images">
<a class="viewcode-back" href="../../../../mcp_server_webcrawl.crawlers.katana.html#mcp_server_webcrawl.crawlers.katana.tests.KatanaTests.test_interrobot_images">[docs]</a>
<span class="k">def</span> <span class="nf">test_interrobot_images</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
<span class="w"> </span><span class="sd">"""</span>
<span class="sd"> Test InterroBot-specific image handling and thumbnails.</span>
<span class="sd"> """</span>
<span class="n">crawler</span> <span class="o">=</span> <span class="n">KatanaCrawler</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_datasrc</span><span class="p">)</span>
<span class="bp">self</span><span class="o">.</span><span class="n">run_pragmar_image_tests</span><span class="p">(</span><span class="n">crawler</span><span class="p">,</span> <span class="n">PRAGMAR_SITE_ID</span><span class="p">)</span></div>
<div class="viewcode-block" id="KatanaTests.test_katana_sorts">
<a class="viewcode-back" href="../../../../mcp_server_webcrawl.crawlers.katana.html#mcp_server_webcrawl.crawlers.katana.tests.KatanaTests.test_katana_sorts">[docs]</a>
<span class="k">def</span> <span class="nf">test_katana_sorts</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
<span class="w"> </span><span class="sd">"""</span>
<span class="sd"> random sort functionality using the '?' sort parameter.</span>
<span class="sd"> """</span>
<span class="n">crawler</span> <span class="o">=</span> <span class="n">KatanaCrawler</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_datasrc</span><span class="p">)</span>
<span class="bp">self</span><span class="o">.</span><span class="n">run_pragmar_sort_tests</span><span class="p">(</span><span class="n">crawler</span><span class="p">,</span> <span class="n">PRAGMAR_SITE_ID</span><span class="p">)</span></div>
<div class="viewcode-block" id="KatanaTests.test_katana_content_parsing">
<a class="viewcode-back" href="../../../../mcp_server_webcrawl.crawlers.katana.html#mcp_server_webcrawl.crawlers.katana.tests.KatanaTests.test_katana_content_parsing">[docs]</a>
<span class="k">def</span> <span class="nf">test_katana_content_parsing</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
<span class="w"> </span><span class="sd">"""</span>
<span class="sd"> content type detection and parsing for HTTP text files.</span>
<span class="sd"> """</span>
<span class="n">crawler</span> <span class="o">=</span> <span class="n">KatanaCrawler</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_datasrc</span><span class="p">)</span>
<span class="bp">self</span><span class="o">.</span><span class="n">run_pragmar_content_tests</span><span class="p">(</span><span class="n">crawler</span><span class="p">,</span> <span class="n">PRAGMAR_SITE_ID</span><span class="p">,</span> <span class="kc">False</span><span class="p">)</span></div>
<div class="viewcode-block" id="KatanaTests.test_report">
<a class="viewcode-back" href="../../../../mcp_server_webcrawl.crawlers.katana.html#mcp_server_webcrawl.crawlers.katana.tests.KatanaTests.test_report">[docs]</a>
<span class="k">def</span> <span class="nf">test_report</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
<span class="w"> </span><span class="sd">"""</span>
<span class="sd"> Run test report, save to data directory.</span>
<span class="sd"> """</span>
<span class="n">crawler</span> <span class="o">=</span> <span class="n">KatanaCrawler</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_datasrc</span><span class="p">)</span>
<span class="n">logger</span><span class="o">.</span><span class="n">info</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">run_pragmar_report</span><span class="p">(</span><span class="n">crawler</span><span class="p">,</span> <span class="n">PRAGMAR_SITE_ID</span><span class="p">,</span> <span class="s2">"Katana"</span><span class="p">))</span></div>
</div>
</pre></div>
</div>
</div>
<footer>
<hr/>
<div role="contentinfo">
<p>© Copyright 2025, pragmar.</p>
</div>
Built with <a href="https://www.sphinx-doc.org/">Sphinx</a> using a
<a href="https://github.com/readthedocs/sphinx_rtd_theme">theme</a>
provided by <a href="https://readthedocs.org">Read the Docs</a>.
</footer>
</div>
</div>
</section>
</div>
<script>
jQuery(function () {
SphinxRtdTheme.Navigation.enable(true);
});
</script>
</body>
</html>
```
--------------------------------------------------------------------------------
/docs/_static/searchtools.js:
--------------------------------------------------------------------------------
```javascript
/*
* searchtools.js
* ~~~~~~~~~~~~~~~~
*
* Sphinx JavaScript utilities for the full-text search.
*
* :copyright: Copyright 2007-2023 by the Sphinx team, see AUTHORS.
* :license: BSD, see LICENSE for details.
*
*/
"use strict";
/**
 * Simple result scoring code.
 */
// A theme or page may define its own `Scorer` before this script loads;
// the typeof guard leaves such an override intact.
if (typeof Scorer === "undefined") {
var Scorer = {
// Implement the following function to further tweak the score for each result
// The function takes a result array [docname, title, anchor, descr, score, filename]
// and returns the new score.
/*
score: result => {
const [docname, title, anchor, descr, score, filename] = result
return score
},
*/
// query matches the full name of an object
objNameMatch: 11,
// or matches in the last dotted part of the object name
objPartialMatch: 6,
// Additive scores depending on the priority of the object
objPrio: {
0: 15, // used to be importantResults
1: 5, // used to be objectResults
2: -5, // used to be unimportantResults
},
// Used when the priority is not in the mapping.
objPrioDefault: 0,
// query found in title
title: 15,
partialTitle: 7,
// query found in terms
term: 5,
partialTerm: 2,
};
}
// Detach every child of `element`, last-first; tolerates a null/undefined node.
const _removeChildren = (element) => {
  if (!element) return;
  while (element.lastChild) {
    element.removeChild(element.lastChild);
  }
};
/**
 * Backslash-escape every RegExp metacharacter in `string` so the result can
 * be embedded verbatim in a pattern.
 * See https://developer.mozilla.org/en-US/docs/Web/JavaScript/Guide/Regular_Expressions#escaping
 */
const _escapeRegExp = (string) =>
  string.replace(/[.*+\-?^${}()|[\]\\]/g, (match) => `\\${match}`);
// Render one search result [docname, title, anchor, descr, score, filename]
// as an <li> appended to Search.output. Builds the link URL per the active
// Sphinx builder, then either shows the stored description or lazily fetches
// the target page to build a summary snippet.
const _displayItem = (item, searchTerms, highlightTerms) => {
const docBuilder = DOCUMENTATION_OPTIONS.BUILDER;
const docFileSuffix = DOCUMENTATION_OPTIONS.FILE_SUFFIX;
const docLinkSuffix = DOCUMENTATION_OPTIONS.LINK_SUFFIX;
const showSearchSummary = DOCUMENTATION_OPTIONS.SHOW_SEARCH_SUMMARY;
const contentRoot = document.documentElement.dataset.content_root;
const [docName, title, anchor, descr, score, _filename] = item;
let listItem = document.createElement("li");
let requestUrl;
let linkUrl;
if (docBuilder === "dirhtml") {
// dirhtml builder: pages live at directory URLs; strip a trailing
// "index/" segment so links point at the directory itself
let dirname = docName + "/";
if (dirname.match(/\/index\/$/))
dirname = dirname.substring(0, dirname.length - 6);
else if (dirname === "index/") dirname = "";
requestUrl = contentRoot + dirname;
linkUrl = requestUrl;
} else {
// normal html builders
requestUrl = contentRoot + docName + docFileSuffix;
linkUrl = docName + docLinkSuffix;
}
let linkEl = listItem.appendChild(document.createElement("a"));
linkEl.href = linkUrl + anchor;
linkEl.dataset.score = score;
linkEl.innerHTML = title;
if (descr) {
listItem.appendChild(document.createElement("span")).innerHTML =
" (" + descr + ")";
// highlight search terms in the description
if (SPHINX_HIGHLIGHT_ENABLED) // set in sphinx_highlight.js
highlightTerms.forEach((term) => _highlightText(listItem, term, "highlighted"));
}
else if (showSearchSummary)
// no stored description: fetch the page and extract a summary snippet
fetch(requestUrl)
.then((responseData) => responseData.text())
.then((data) => {
if (data)
listItem.appendChild(
Search.makeSearchSummary(data, searchTerms)
);
// highlight search terms in the summary
if (SPHINX_HIGHLIGHT_ENABLED) // set in sphinx_highlight.js
highlightTerms.forEach((term) => _highlightText(listItem, term, "highlighted"));
});
Search.output.appendChild(listItem);
};
// Stop the progress pulse and write the final title/status line once every
// result has been rendered.
const _finishSearch = (resultCount) => {
  Search.stopPulse();
  Search.title.innerText = _("Search Results");
  if (resultCount) {
    Search.status.innerText = _(
      `Search finished, found ${resultCount} page(s) matching the search query.`
    );
  } else {
    Search.status.innerText = Documentation.gettext(
      "Your search did not match any documents. Please make sure that all words are spelled correctly and that you've selected enough categories."
    );
  }
};
/**
 * Pop and render one queued result, then schedule the next pass; when the
 * queue is empty, finalize the title and status. `resultCount` stays fixed
 * across the recursion so the final message reports the original total.
 */
const _displayNextItem = (
  results,
  resultCount,
  searchTerms,
  highlightTerms,
) => {
  if (!results.length) {
    // search finished, update title and status message
    _finishSearch(resultCount);
    return;
  }
  // results left, load the summary and display it
  _displayItem(results.pop(), searchTerms, highlightTerms);
  setTimeout(
    () => _displayNextItem(results, resultCount, searchTerms, highlightTerms),
    5
  );
};
/**
 * Default splitQuery function. Can be overridden in ``sphinx.search`` with a
 * custom function per language (the typeof guard keeps any predefined
 * override intact).
 *
 * Breaks the query on runs of characters that are not Unicode letters,
 * numbers, underscores, or emoji — the same as ``\W+`` in Python, preserving
 * the surrogate pair area — and drops the empty fragments.
 */
if (typeof splitQuery === "undefined") {
  var splitQuery = (query) =>
    query
      .split(/[^\p{Letter}\p{Number}_\p{Emoji_Presentation}]+/gu)
      .filter((term) => term.length > 0);
}
/**
 * Search Module
 */
const Search = {
_index: null, // parsed search index; populated by setIndex()
_queued_query: null, // query deferred until the index finishes loading
_pulse_status: -1, // -1 = idle; otherwise cycles 0..3 driving the "..." animation
htmlToText: (htmlString) => {
const htmlElement = new DOMParser().parseFromString(htmlString, 'text/html');
htmlElement.querySelectorAll(".headerlink").forEach((el) => { el.remove() });
const docContent = htmlElement.querySelector('[role="main"]');
if (docContent !== undefined) return docContent.textContent;
console.warn(
"Content block not found. Sphinx search tries to obtain it via '[role=main]'. Could you check your theme or template."
);
return "";
},
init: () => {
const query = new URLSearchParams(window.location.search).get("q");
document
.querySelectorAll('input[name="q"]')
.forEach((el) => (el.value = query));
if (query) Search.performSearch(query);
},
// inject a <script> tag that loads the search index; the index file calls
// Search.setIndex() when it executes
loadIndex: (url) =>
(document.body.appendChild(document.createElement("script")).src = url),
// called by the loaded search index; runs any query that was deferred while
// the index was still loading
setIndex: (index) => {
Search._index = index;
if (Search._queued_query !== null) {
const query = Search._queued_query;
Search._queued_query = null;
Search.query(query);
}
},
hasIndex: () => Search._index !== null,
deferQuery: (query) => (Search._queued_query = query),
stopPulse: () => (Search._pulse_status = -1),
// animate 0..3 dots in the title every 500ms until stopPulse() sets -1
startPulse: () => {
if (Search._pulse_status >= 0) return;
const pulse = () => {
Search._pulse_status = (Search._pulse_status + 1) % 4;
Search.dots.innerText = ".".repeat(Search._pulse_status);
if (Search._pulse_status >= 0) window.setTimeout(pulse, 500);
};
pulse();
},
/**
 * perform a search for something (or wait until index is loaded)
 *
 * Builds the results title/status/list nodes under #search-results, starts
 * the pulse animation, then either runs the query now or defers it until
 * setIndex() fires.
 */
performSearch: (query) => {
// create the required interface elements
const searchText = document.createElement("h2");
searchText.textContent = _("Searching");
const searchSummary = document.createElement("p");
searchSummary.classList.add("search-summary");
searchSummary.innerText = "";
const searchList = document.createElement("ul");
searchList.classList.add("search");
const out = document.getElementById("search-results");
// stash the nodes on Search so query()/_finishSearch() can update them
Search.title = out.appendChild(searchText);
Search.dots = Search.title.appendChild(document.createElement("span"));
Search.status = out.appendChild(searchSummary);
Search.output = out.appendChild(searchList);
const searchProgress = document.getElementById("search-progress");
// Some themes don't use the search progress node
if (searchProgress) {
searchProgress.innerText = _("Preparing search...");
}
Search.startPulse();
// index already loaded, the browser was quick!
if (Search.hasIndex()) Search.query(query);
else Search.deferQuery(query);
},
/**
 * execute search (requires search index to be loaded)
 *
 * Collects candidate results from four sources — document titles, explicit
 * index entries, object names, and full-text terms — scores them, sorts,
 * de-duplicates, and hands the list to _displayNextItem for rendering.
 */
query: (query) => {
const filenames = Search._index.filenames;
const docNames = Search._index.docnames;
const titles = Search._index.titles;
const allTitles = Search._index.alltitles;
const indexEntries = Search._index.indexentries;
// stem the search terms and add them to the correct list
const stemmer = new Stemmer();
const searchTerms = new Set();
const excludedTerms = new Set();
const highlightTerms = new Set();
const objectTerms = new Set(splitQuery(query.toLowerCase().trim()));
splitQuery(query.trim()).forEach((queryTerm) => {
const queryTermLower = queryTerm.toLowerCase();
// maybe skip this "word"
// stopwords array is from language_data.js
if (
stopwords.indexOf(queryTermLower) !== -1 ||
queryTerm.match(/^\d+$/)
)
return;
// stem the word
let word = stemmer.stemWord(queryTermLower);
// select the correct list: "-word" excludes, everything else is required
if (word[0] === "-") excludedTerms.add(word.substr(1));
else {
searchTerms.add(word);
highlightTerms.add(queryTermLower);
}
});
if (SPHINX_HIGHLIGHT_ENABLED) { // set in sphinx_highlight.js
localStorage.setItem("sphinx_highlight_terms", [...highlightTerms].join(" "))
}
// console.debug("SEARCH: searching for:");
// console.info("required: ", [...searchTerms]);
// console.info("excluded: ", [...excludedTerms]);
// array of [docname, title, anchor, descr, score, filename]
let results = [];
_removeChildren(document.getElementById("search-progress"));
const queryLower = query.toLowerCase();
// match document/section titles containing the query, but only when the
// query makes up at least half the title's length
for (const [title, foundTitles] of Object.entries(allTitles)) {
if (title.toLowerCase().includes(queryLower) && (queryLower.length >= title.length/2)) {
for (const [file, id] of foundTitles) {
// score by how much of the title the query covers (max 100)
let score = Math.round(100 * queryLower.length / title.length)
results.push([
docNames[file],
titles[file] !== title ? `${titles[file]} > ${title}` : title,
id !== null ? "#" + id : "",
null,
score,
filenames[file],
]);
}
}
}
// search for explicit entries in index directives
for (const [entry, foundEntries] of Object.entries(indexEntries)) {
if (entry.includes(queryLower) && (queryLower.length >= entry.length/2)) {
for (const [file, id] of foundEntries) {
let score = Math.round(100 * queryLower.length / entry.length)
results.push([
docNames[file],
titles[file],
id ? "#" + id : "",
null,
score,
filenames[file],
]);
}
}
}
// lookup as object
objectTerms.forEach((term) =>
results.push(...Search.performObjectSearch(term, objectTerms))
);
// lookup as search terms in fulltext
results.push(...Search.performTermsSearch(searchTerms, excludedTerms));
// let the scorer override scores with a custom scoring function
if (Scorer.score) results.forEach((item) => (item[4] = Scorer.score(item)));
// now sort the results by score (in opposite order of appearance, since the
// display function below uses pop() to retrieve items) and then
// alphabetically
results.sort((a, b) => {
const leftScore = a[4];
const rightScore = b[4];
if (leftScore === rightScore) {
// same score: sort alphabetically
const leftTitle = a[1].toLowerCase();
const rightTitle = b[1].toLowerCase();
if (leftTitle === rightTitle) return 0;
return leftTitle > rightTitle ? -1 : 1; // inverted is intentional
}
return leftScore > rightScore ? 1 : -1;
});
// remove duplicate search results
// note the reversing of results, so that in the case of duplicates, the highest-scoring entry is kept
let seen = new Set();
results = results.reverse().reduce((acc, result) => {
// identity key: all fields except the score (index 4)
let resultStr = result.slice(0, 4).concat([result[5]]).map(v => String(v)).join(',');
if (!seen.has(resultStr)) {
acc.push(result);
seen.add(resultStr);
}
return acc;
}, []);
results = results.reverse();
// for debugging
//Search.lastresults = results.slice(); // a copy
// console.info("search results:", Search.lastresults);
// print the results
_displayNextItem(results, results.length, searchTerms, highlightTerms);
},
/**
 * search for object names
 *
 * Scans the object index for entries whose dotted full name contains
 * `object`; when other query terms exist, they must also appear in the
 * name/type/title haystack. Returns result tuples for query() to merge.
 */
performObjectSearch: (object, objectTerms) => {
const filenames = Search._index.filenames;
const docNames = Search._index.docnames;
const objects = Search._index.objects;
const objNames = Search._index.objnames;
const titles = Search._index.titles;
const results = [];
// match layout (as used below): [0]=doc index, [1]=object-type index,
// [2]=priority, [3]=anchor, [4]=short name
const objectSearchCallback = (prefix, match) => {
const name = match[4]
const fullname = (prefix ? prefix + "." : "") + name;
const fullnameLower = fullname.toLowerCase();
if (fullnameLower.indexOf(object) < 0) return;
let score = 0;
const parts = fullnameLower.split(".");
// check for different match types: exact matches of full name or
// "last name" (i.e. last dotted part)
if (fullnameLower === object || parts.slice(-1)[0] === object)
score += Scorer.objNameMatch;
else if (parts.slice(-1)[0].indexOf(object) > -1)
score += Scorer.objPartialMatch; // matches in last name
const objName = objNames[match[1]][2];
const title = titles[match[0]];
// If more than one term searched for, we require other words to be
// found in the name/title/description
const otherTerms = new Set(objectTerms);
otherTerms.delete(object);
if (otherTerms.size > 0) {
const haystack = `${prefix} ${name} ${objName} ${title}`.toLowerCase();
if (
[...otherTerms].some((otherTerm) => haystack.indexOf(otherTerm) < 0)
)
return;
}
// "" and "-" are index shorthands for derived anchors
let anchor = match[3];
if (anchor === "") anchor = fullname;
else if (anchor === "-") anchor = objNames[match[1]][1] + "-" + fullname;
const descr = objName + _(", in ") + title;
// add custom score for some objects according to scorer
if (Scorer.objPrio.hasOwnProperty(match[2]))
score += Scorer.objPrio[match[2]];
else score += Scorer.objPrioDefault;
results.push([
docNames[match[0]],
fullname,
"#" + anchor,
descr,
score,
filenames[match[0]],
]);
};
Object.keys(objects).forEach((prefix) =>
objects[prefix].forEach((array) =>
objectSearchCallback(prefix, array)
)
);
return results;
},
/**
* search for full-text terms in the index
*/
performTermsSearch: (searchTerms, excludedTerms) => {
// prepare search
const terms = Search._index.terms;
const titleTerms = Search._index.titleterms;
const filenames = Search._index.filenames;
const docNames = Search._index.docnames;
const titles = Search._index.titles;
const scoreMap = new Map();
const fileMap = new Map();
// perform the search on the required terms
searchTerms.forEach((word) => {
const files = [];
const arr = [
{ files: terms[word], score: Scorer.term },
{ files: titleTerms[word], score: Scorer.title },
];
// add support for partial matches
if (word.length > 2) {
const escapedWord = _escapeRegExp(word);
Object.keys(terms).forEach((term) => {
if (term.match(escapedWord) && !terms[word])
arr.push({ files: terms[term], score: Scorer.partialTerm });
});
Object.keys(titleTerms).forEach((term) => {
if (term.match(escapedWord) && !titleTerms[word])
arr.push({ files: titleTerms[word], score: Scorer.partialTitle });
});
}
// no match but word was a required one
if (arr.every((record) => record.files === undefined)) return;
// found search word in contents
arr.forEach((record) => {
if (record.files === undefined) return;
let recordFiles = record.files;
if (recordFiles.length === undefined) recordFiles = [recordFiles];
files.push(...recordFiles);
// set score for the word in each file
recordFiles.forEach((file) => {
if (!scoreMap.has(file)) scoreMap.set(file, {});
scoreMap.get(file)[word] = record.score;
});
});
// create the mapping
files.forEach((file) => {
if (fileMap.has(file) && fileMap.get(file).indexOf(word) === -1)
fileMap.get(file).push(word);
else fileMap.set(file, [word]);
});
});
// now check if the files don't contain excluded terms
const results = [];
for (const [file, wordList] of fileMap) {
// check if all requirements are matched
// as search terms with length < 3 are discarded
const filteredTermCount = [...searchTerms].filter(
(term) => term.length > 2
).length;
if (
wordList.length !== searchTerms.size &&
wordList.length !== filteredTermCount
)
continue;
// ensure that none of the excluded terms is in the search result
if (
[...excludedTerms].some(
(term) =>
terms[term] === file ||
titleTerms[term] === file ||
(terms[term] || []).includes(file) ||
(titleTerms[term] || []).includes(file)
)
)
break;
// select one (max) score for the file.
const score = Math.max(...wordList.map((w) => scoreMap.get(file)[w]));
// add result to the result list
results.push([
docNames[file],
titles[file],
"",
null,
score,
filenames[file],
]);
}
return results;
},
/**
* helper function to return a node containing the
* search summary for a given text. keywords is a list
* of stemmed words.
*/
makeSearchSummary: (htmlText, keywords) => {
const text = Search.htmlToText(htmlText);
if (text === "") return null;
const textLower = text.toLowerCase();
const actualStartPosition = [...keywords]
.map((k) => textLower.indexOf(k.toLowerCase()))
.filter((i) => i > -1)
.slice(-1)[0];
const startWithContext = Math.max(actualStartPosition - 120, 0);
const top = startWithContext === 0 ? "" : "...";
const tail = startWithContext + 240 < text.length ? "..." : "";
let summary = document.createElement("p");
summary.classList.add("context");
summary.textContent = top + text.substr(startWithContext, 240).trim() + tail;
return summary;
},
};
_ready(Search.init);
```
--------------------------------------------------------------------------------
/src/mcp_server_webcrawl/interactive/session.py:
--------------------------------------------------------------------------------
```python
import curses
import sys
import threading
import traceback
from pathlib import Path
from typing import Optional
from mcp_server_webcrawl.crawlers import get_crawler
from mcp_server_webcrawl.crawlers.base.crawler import BaseCrawler, BaseJsonApi
from mcp_server_webcrawl.interactive.search import SearchManager
from mcp_server_webcrawl.interactive.ui import ThemeDefinition, UiState, DocumentMode, UiFocusable, ViewBounds, safe_addstr
from mcp_server_webcrawl.interactive.views.base import BaseCursesView, OUTER_WIDTH_RIGHT_MARGIN
from mcp_server_webcrawl.interactive.views.document import SearchDocumentView
from mcp_server_webcrawl.interactive.views.requirements import RequirementsView
from mcp_server_webcrawl.interactive.views.results import SearchResultsView
from mcp_server_webcrawl.interactive.views.searchform import SearchFormView
from mcp_server_webcrawl.interactive.views.help import HelpView
from mcp_server_webcrawl.models.sites import SiteResult
# can be as low as 1, 50 feels a little laggy
CURSES_TIMEOUT_MS = 25

# rows reserved above/below the content area for the outer header/footer
LAYOUT_CONTENT_START_Y_OFFSET = 1
LAYOUT_CONTENT_END_Y_OFFSET = 1
# cap on the top pane height in the split (search form over results) layout
LAYOUT_SPLIT_PANE_MAX_HEIGHT = 10
# terminals at or below this height skip the help footer rendering
LAYOUT_MIN_HEIGHT_FOR_HELP = 2

# debug overlay sizing/placement knobs, consumed by
# InteractiveSession.__render_debug
DEBUG_MAX_LINES = 8
DEBUG_COMPACT_WIDTH_RATIO = 0.4
DEBUG_MIN_COMPACT_WIDTH = 30
DEBUG_COMPACT_THRESHOLD = 5
DEBUG_EXPANDED_MARGIN = 6
DEBUG_EXPANDED_START_X = 3
DEBUG_EXPANDED_BOTTOM_MARGIN = 3
DEBUG_COMPACT_BOTTOM_MARGIN = 2
DEBUG_MIN_START_Y = 1
DEBUG_MIN_START_Y_EXPANDED = 2

# cycling order for document display modes (markdown -> raw -> headers -> ...)
SEARCH_DOCUMENT_NEXT_MODE: dict[DocumentMode, DocumentMode] = {
    DocumentMode.MARKDOWN: DocumentMode.RAW,
    DocumentMode.RAW: DocumentMode.HEADERS,
    DocumentMode.HEADERS: DocumentMode.MARKDOWN
}

# results per search page — presumably consumed by SearchManager; not
# referenced in this module (TODO confirm against caller)
SEARCH_RESULT_LIMIT: int = 10
# minimum usable terminal dimensions — not referenced in this module;
# presumably enforced by a view (TODO confirm)
TERMINAL_MIN_HEIGHT: int = 8
TERMINAL_MIN_WIDTH: int = 40
class InteractiveSession:
    """
    Main session coordinator that manages the interactive terminal application.

    Owns the curses lifecycle, the crawler/data source, the view instances,
    and the UI state/focus transitions between them.
    """

    def __init__(self, crawler: str, datasrc: str):
        """
        Initialize the interactive session with crawler and data source.

        Args:
            crawler: crawler type name, resolved to a BaseCrawler class via get_crawler()
            datasrc: filesystem path to the crawler's data source
        """
        self.__input_crawler: str = crawler
        self.__input_datasrc: str = datasrc
        # theme name -> curses init_pair argument tuple (stored whole; [0] is
        # the pair number) — the previous dict[str, int] annotation was wrong
        self.__theme_map: dict[str, tuple] = {}
        self.__searchman: SearchManager = SearchManager(self)
        self.__ui_state: UiState = UiState.SEARCH_INIT
        self.__ui_focused: UiFocusable = UiFocusable.SEARCH_FORM
        self.__debug: list[str] = []
        # single lock shared by all debug accessors; the previous pattern of
        # `with threading.Lock():` created a fresh lock per call and provided
        # no mutual exclusion at all
        self.__debug_lock: threading.Lock = threading.Lock()
        self.__view__requirements = RequirementsView(self, crawler, datasrc)
        if self.__view__requirements.validated:
            crawl_model = get_crawler(crawler)
            if crawl_model is not None:
                self.__crawler: BaseCrawler = crawl_model(Path(datasrc))
                sites_api: BaseJsonApi = self.__crawler.get_sites_api()
                self.__sites: list[SiteResult] = sites_api.get_results()
            else:
                self.__crawler: BaseCrawler = None
                sites_api: BaseJsonApi = None
                self.__sites: list[SiteResult] = []
        else:
            crawl_model = None
            self.__crawler: BaseCrawler = None
            sites_api: BaseJsonApi = None
            self.__sites: list[SiteResult] = []
        self.__view__results = SearchResultsView(self)
        self.__view__document = SearchDocumentView(self)
        self.__view__searchform = SearchFormView(self, self.__sites)
        self.__view__help = HelpView(self)
        self.set_ui_state(UiState.SEARCH_INIT, UiFocusable.SEARCH_FORM)

    @property
    def ui_state(self) -> UiState:
        # current top-level UI state (search/results/document/help/requirements)
        return self.__ui_state

    @property
    def ui_focused(self) -> UiFocusable:
        # which pane currently receives keyboard input
        return self.__ui_focused

    @property
    def crawler(self) -> BaseCrawler:
        # may be None when requirements validation failed
        return self.__crawler

    @property
    def document(self) -> SearchDocumentView:
        return self.__view__document

    @property
    def results(self) -> SearchResultsView:
        return self.__view__results

    @property
    def searchform(self) -> SearchFormView:
        return self.__view__searchform

    @property
    def searchman(self) -> SearchManager:
        return self.__searchman

    @property
    def sites(self) -> list[SiteResult]:
        # defensive (shallow) copy so callers cannot mutate session state
        return self.__sites.copy()

    def debug_add(self, msg: str) -> None:
        """
        Add line of debug.
        """
        with self.__debug_lock:
            self.__debug.append(msg)

    def debug_clear(self) -> None:
        """
        Clear debug statements.
        """
        with self.__debug_lock:
            self.__debug.clear()

    def run(self) -> None:
        """
        Public interface to launch the interactive terminal application.

        Wraps the curses main loop; always cleans up the search manager.
        """
        try:
            curses.wrapper(self.__curses_main)
        except KeyboardInterrupt:
            pass  # clean exit, ctrl+c
        except Exception as ex:
            print(f"--interactive failure: {ex}\n{traceback.format_exc()}", file=sys.stderr)
        finally:
            self.searchman.cleanup()

    def set_ui_state(self, state: UiState, focus: Optional[UiFocusable] = None) -> None:
        """
        Transition between UI states cleanly.

        Args:
            state: target UI state
            focus: optional pane to receive focus; unchanged when None
        """
        self.__ui_state = state
        if focus is not None:
            self.__ui_focused = focus
        # reset focus flags, then grant focus to the pane implied by the state
        self.__view__results.set_focused(False)
        self.__view__searchform.set_focused(False)
        if state == UiState.SEARCH_INIT or (state == UiState.SEARCH_RESULTS and focus == UiFocusable.SEARCH_FORM):
            self.__view__searchform.set_focused(True)
        elif state == UiState.SEARCH_RESULTS:
            self.__view__results.set_focused(True)

    # used in requirements view to reset with user inputs over cmd args
    def set_init_input_args(self, crawler: str, datasrc: str) -> None:
        self.__input_crawler = crawler
        self.__input_datasrc = datasrc

    def set_init_crawler(self, crawler: BaseCrawler) -> None:
        self.__crawler = crawler

    def set_init_sites(self, sites: list[SiteResult]) -> None:
        # annotation fixed: callers pass SiteResult lists, not str
        self.__sites = sites

    # used in requirements to reset app
    def set_init_searchform(self, searchform: BaseCursesView) -> None:
        self.__view__searchform = searchform

    def __get_outer_screen(self, width: int, height: int) -> ViewBounds:
        """
        Get the outer screen bounds for the full terminal.
        """
        return ViewBounds(
            x=0,
            y=0,
            width=width - OUTER_WIDTH_RIGHT_MARGIN,
            height=height
        )

    def __get_inner_screen(self, width: int, height: int) -> ViewBounds:
        """
        Get the inner screen bounds for content area (between outer header and footer).
        """
        content_start_y = LAYOUT_CONTENT_START_Y_OFFSET
        content_end_y = height - LAYOUT_CONTENT_END_Y_OFFSET
        content_height = content_end_y - content_start_y
        return ViewBounds(
            x=0,
            y=content_start_y,  # after outer header
            width=width - OUTER_WIDTH_RIGHT_MARGIN,
            height=content_height
        )

    def __get_split_top(self, width: int, height: int) -> ViewBounds:
        """
        Get the top split screen bounds for dual-pane layout.
        """
        content_start_y = LAYOUT_CONTENT_START_Y_OFFSET
        content_height = height - 2
        # top pane is capped; never taller than half of the content area
        split_top_height = min(LAYOUT_SPLIT_PANE_MAX_HEIGHT, content_height // 2)
        return ViewBounds(
            x=0,
            y=content_start_y,
            width=width - OUTER_WIDTH_RIGHT_MARGIN,
            height=split_top_height
        )

    def __get_split_bottom(self, width: int, height: int) -> ViewBounds:
        """
        Get the bottom split screen bounds for dual-pane layout.

        Must mirror __get_split_top's arithmetic so the panes tile exactly.
        """
        content_start_y = LAYOUT_CONTENT_START_Y_OFFSET
        content_height = height - 2
        split_top_height = min(LAYOUT_SPLIT_PANE_MAX_HEIGHT, content_height // 2)
        split_bottom_height = content_height - split_top_height
        return ViewBounds(
            x=0,
            y=content_start_y + split_top_height,
            width=width - OUTER_WIDTH_RIGHT_MARGIN,
            height=split_bottom_height
        )

    def __curses_main(self, stdscr: curses.window) -> None:
        """
        Initialize curses environment and start main loop.
        """
        if curses.COLORS < 256:
            # display error in curses, dependable
            stdscr.addstr(0, 0, "--interactive mode requires a 256-color (or better) terminal")
            stdscr.refresh()
            stdscr.getch()  # wait for keypress
            sys.exit(1)
        # initialize curses style pairs
        curses.start_color()
        for theme in ThemeDefinition:
            self.__theme_map[theme.name] = theme.value
            curses.init_pair(*theme.value)
        # hide cursor, otherwise blinks at edge of last write
        curses.curs_set(0)
        # start main loop
        self.__interactive_loop(stdscr)

    def get_theme_color_pair(self, theme: ThemeDefinition) -> int | None:
        """
        Resolve a ThemeDefinition to its initialized curses color-pair attribute.

        Returns:
            the curses color pair, or None when the theme was never registered
            (i.e. __curses_main has not run)
        """
        pair = self.__theme_map.get(theme.name)
        if pair is None:
            return None
        return curses.color_pair(pair[0])

    def __get_help_text(self) -> str:
        """
        Get context-sensitive help text for the outer footer.
        """
        page_results: str = " | ←→ Page Results" if self.ui_focused == UiFocusable.SEARCH_RESULTS else ""
        search_results_enter: str = "Search" if self.__view__searchform.focused else "View Document"
        search_results_tab: str = "Results" if self.__view__searchform.focused else "Search Form"
        footers: dict[UiState, str] = {
            UiState.DOCUMENT: "↑↓: Scroll | PgUp/PgDn: Page | Home/End: Top/Bot | TAB: Mode | ESC: Back",
            UiState.HELP: "↑↓: Scroll | PgUp/PgDn: Page | Home/End: Top/Bot | ESC: Back",
            UiState.REQUIREMENTS: "ENTER: Load Interface | ↑↓: Navigate| ESC: Exit",
            UiState.SEARCH_INIT: "ENTER: Search | ↑↓: Navigate | F1: Search Help | ESC: Exit",
            UiState.SEARCH_RESULTS: f"ENTER: {search_results_enter} | ↑↓: Navigate{page_results} | TAB: {search_results_tab} | ESC: New Search",
        }
        return footers.get(self.__ui_state, "↑↓: Navigate | ESC: Exit")

    def __handle_F1(self) -> None:
        """
        Handle F1 key: open search help.
        """
        self.set_ui_state(UiState.HELP)

    def __handle_ESC(self) -> None:
        """
        Handle ESC key: step back one level, or exit from the top level.
        """
        if self.__ui_state == UiState.DOCUMENT:
            self.set_ui_state(UiState.SEARCH_RESULTS, UiFocusable.SEARCH_RESULTS)
        elif self.__ui_state in (UiState.SEARCH_RESULTS, UiState.HELP):
            self.set_ui_state(UiState.SEARCH_INIT, UiFocusable.SEARCH_FORM)
            self.searchform.clear_query()
        elif self.__ui_state in (UiState.SEARCH_INIT, UiState.REQUIREMENTS):
            sys.exit(0)

    def __handle_TAB(self) -> None:
        """
        Handle TAB key: toggle focus between search form and results.
        """
        if self.__ui_state == UiState.SEARCH_RESULTS:
            if self.__ui_focused == UiFocusable.SEARCH_FORM:
                self.set_ui_state(UiState.SEARCH_RESULTS, UiFocusable.SEARCH_RESULTS)
            else:
                self.set_ui_state(UiState.SEARCH_RESULTS, UiFocusable.SEARCH_FORM)

    def __interactive_loop(self, stdscr: curses.window) -> None:
        """
        Main input loop: render the active view, then dispatch one key.
        """
        try:
            stdscr.timeout(CURSES_TIMEOUT_MS)
            while True:
                self.searchman.check_pending()
                stdscr.clear()
                height, width = stdscr.getmaxyx()
                selected_sites = self.__view__searchform.get_selected_sites()
                if self.__ui_state == UiState.REQUIREMENTS or not self.__view__requirements.validated:
                    if self.__ui_state != UiState.REQUIREMENTS:
                        self.set_ui_state(UiState.REQUIREMENTS)
                    inner_screen = self.__get_inner_screen(width, height)
                    self.__view__requirements.draw_inner_header(stdscr, inner_screen, "Requirements:")
                    self.__view__requirements.set_bounds(inner_screen)
                    self.__view__requirements.render(stdscr)
                    self.__view__requirements.draw_inner_footer(stdscr, inner_screen, "Waiting on input")
                elif self.__ui_state == UiState.HELP:
                    inner_screen = self.__get_inner_screen(width, height)
                    self.__view__help.draw_inner_header(stdscr, inner_screen, "Search Help:")
                    self.__view__help.set_bounds(inner_screen)
                    self.__view__help.render(stdscr)
                    self.__view__help.draw_inner_footer(stdscr, inner_screen, "ESC to Exit Help")
                elif self.__ui_state == UiState.SEARCH_RESULTS and selected_sites:
                    # dual-pane: search form on top, results below
                    inner_screen_split_top = self.__get_split_top(width, height)
                    inner_screen_split_bottom = self.__get_split_bottom(width, height)
                    url: str = selected_sites[0].urls[0] if selected_sites and selected_sites[0].urls else ""
                    display_url: str = BaseCursesView.url_for_display(url)
                    self.__view__searchform.draw_inner_header(stdscr, inner_screen_split_top, "Search:")
                    self.__view__searchform.set_bounds(inner_screen_split_top)
                    self.__view__searchform.render(stdscr)
                    self.__view__searchform.draw_inner_footer(stdscr, inner_screen_split_top, f"Searching {display_url}")
                    self.__view__results.draw_inner_header(stdscr, inner_screen_split_bottom, "")
                    self.__view__results.set_bounds(inner_screen_split_bottom)
                    self.__view__results.render(stdscr)
                    self.__view__results.draw_inner_footer(stdscr, inner_screen_split_bottom, "")
                elif self.__ui_state == UiState.DOCUMENT:
                    inner_screen = self.__get_inner_screen(width, height)
                    url: str = self.__view__document.urls[0] if self.__view__document is not None and self.__view__document.urls else ""
                    display_url: str = BaseCursesView.url_for_display(url)
                    self.__view__document.set_focused(True)
                    self.__view__document.draw_inner_header(stdscr, inner_screen, f"URL: {display_url}")
                    self.__view__document.set_bounds(inner_screen)
                    self.__view__document.render(stdscr)
                    self.__view__document.draw_inner_footer(stdscr, inner_screen, "")
                else:
                    # aka self.__ui_state == UiState.SEARCH_INIT
                    inner_screen = self.__get_inner_screen(width, height)
                    self.__view__searchform.draw_inner_header(stdscr, inner_screen, "Search:")
                    selected_sites = self.__view__searchform.get_selected_sites()
                    first_hit = selected_sites[0] if selected_sites else None
                    url: str = first_hit.urls[0] if first_hit is not None and first_hit.urls else ""
                    display_url: str = BaseCursesView.url_for_display(url)
                    self.__view__searchform.set_bounds(inner_screen)
                    self.__view__searchform.render(stdscr)
                    self.__view__searchform.draw_inner_footer(stdscr, inner_screen, f"Searching {display_url}")
                if height > LAYOUT_MIN_HEIGHT_FOR_HELP:
                    help_text = self.__get_help_text()
                    self.__view__searchform.draw_outer_header(stdscr)
                    self.__view__searchform.draw_outer_footer(stdscr, help_text)
                self.__render_debug(stdscr)
                stdscr.refresh()
                key: int = stdscr.getch()
                if key == -1:  # timeout
                    continue
                elif key == ord('\t'):
                    self.__handle_TAB()
                elif key == curses.KEY_F1:
                    self.__handle_F1()
                elif key == 27:  # ESC
                    self.__handle_ESC()
                # route remaining input to the view owning focus; a True
                # return means the key was consumed
                if not self.__view__requirements.validated or self.__ui_state == UiState.REQUIREMENTS:
                    if self.__view__requirements.handle_input(key):
                        continue
                elif self.__ui_state == UiState.SEARCH_INIT or (
                    self.__ui_state == UiState.SEARCH_RESULTS
                    and self.__ui_focused == UiFocusable.SEARCH_FORM
                ):
                    if self.__view__searchform.handle_input(key):
                        continue
                elif self.__ui_state == UiState.SEARCH_RESULTS:
                    if self.__view__results.handle_input(key):
                        continue
                elif self.__ui_state == UiState.DOCUMENT:
                    if self.__view__document.handle_input(key):
                        continue
                elif self.__ui_state == UiState.HELP:
                    if self.__view__help.handle_input(key):
                        continue
        except Exception as ex:
            print(f"--interactive failure - {ex}\n{traceback.format_exc()}")
        finally:
            # restore blocking getch before curses.wrapper tears down
            stdscr.timeout(-1)

    def __render_debug(self, stdscr: curses.window) -> None:
        """
        Render debug info with adaptive sizing - compact for short messages,
        expanded for errors.
        """
        height, width = stdscr.getmaxyx()
        # snapshot under the shared lock, render outside it
        with self.__debug_lock:
            debug_lines = self.__debug[-DEBUG_MAX_LINES:].copy()
        if not debug_lines:
            return
        max_line_length = max(len(line) for line in debug_lines)
        compact_width = max(int(width * DEBUG_COMPACT_WIDTH_RATIO), DEBUG_MIN_COMPACT_WIDTH)
        # expand to (nearly) full width when any line would overflow the
        # compact box
        use_expanded = max_line_length > compact_width - DEBUG_COMPACT_THRESHOLD
        if use_expanded:
            debug_width: int = width - DEBUG_EXPANDED_MARGIN
            debug_start_x: int = DEBUG_EXPANDED_START_X
            debug_start_y: int = max(DEBUG_MIN_START_Y_EXPANDED, height - len(debug_lines) - DEBUG_EXPANDED_BOTTOM_MARGIN)
        else:
            debug_width = compact_width
            debug_start_x = width - debug_width - DEBUG_EXPANDED_START_X
            debug_start_y = height - len(debug_lines) - DEBUG_COMPACT_BOTTOM_MARGIN
        debug_start_y = max(DEBUG_MIN_START_Y, debug_start_y)
        debug_start_x = max(0, debug_start_x)
        for i, debug_line in enumerate(debug_lines):
            y_pos: int = debug_start_y + i
            if y_pos >= height - 1:
                break
            if debug_start_x >= 0 and y_pos > 0:
                display_line: str = debug_line[:debug_width]
                safe_addstr(stdscr, y_pos, debug_start_x, display_line, self.get_theme_color_pair(ThemeDefinition.HEADER_ACTIVE))
```
--------------------------------------------------------------------------------
/docs/_modules/mcp_server_webcrawl/templates/tests.html:
--------------------------------------------------------------------------------
```html
<!DOCTYPE html>
<html class="writer-html5" lang="en" data-content_root="../../../">
<head>
<meta charset="utf-8" />
<meta name="viewport" content="width=device-width, initial-scale=1.0" />
<title>mcp_server_webcrawl.templates.tests — mcp-server-webcrawl documentation</title>
<link rel="stylesheet" type="text/css" href="../../../_static/pygments.css?v=80d5e7a1" />
<link rel="stylesheet" type="text/css" href="../../../_static/css/theme.css?v=e59714d7" />
<script src="../../../_static/jquery.js?v=5d32c60e"></script>
<script src="../../../_static/_sphinx_javascript_frameworks_compat.js?v=2cd50e6c"></script>
<script src="../../../_static/documentation_options.js?v=5929fcd5"></script>
<script src="../../../_static/doctools.js?v=888ff710"></script>
<script src="../../../_static/sphinx_highlight.js?v=dc90522c"></script>
<script src="../../../_static/js/theme.js"></script>
<link rel="index" title="Index" href="../../../genindex.html" />
<link rel="search" title="Search" href="../../../search.html" />
</head>
<body class="wy-body-for-nav">
<div class="wy-grid-for-nav">
<nav data-toggle="wy-nav-shift" class="wy-nav-side">
<div class="wy-side-scroll">
<div class="wy-side-nav-search" >
<a href="../../../index.html" class="icon icon-home">
mcp-server-webcrawl
</a>
<div role="search">
<form id="rtd-search-form" class="wy-form" action="../../../search.html" method="get">
<input type="text" name="q" placeholder="Search docs" aria-label="Search docs" />
<input type="hidden" name="check_keywords" value="yes" />
<input type="hidden" name="area" value="default" />
</form>
</div>
</div><div class="wy-menu wy-menu-vertical" data-spy="affix" role="navigation" aria-label="Navigation menu">
<p class="caption" role="heading"><span class="caption-text">Contents:</span></p>
<ul>
<li class="toctree-l1"><a class="reference internal" href="../../../installation.html">Installation</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../guides.html">Setup Guides</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../usage.html">Usage</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../prompts.html">Prompt Routines</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../modules.html">mcp_server_webcrawl</a></li>
</ul>
</div>
</div>
</nav>
<section data-toggle="wy-nav-shift" class="wy-nav-content-wrap"><nav class="wy-nav-top" aria-label="Mobile navigation menu" >
<i data-toggle="wy-nav-top" class="fa fa-bars"></i>
<a href="../../../index.html">mcp-server-webcrawl</a>
</nav>
<div class="wy-nav-content">
<div class="rst-content">
<div role="navigation" aria-label="Page navigation">
<ul class="wy-breadcrumbs">
<li><a href="../../../index.html" class="icon icon-home" aria-label="Home"></a></li>
<li class="breadcrumb-item"><a href="../../index.html">Module code</a></li>
<li class="breadcrumb-item active">mcp_server_webcrawl.templates.tests</li>
<li class="wy-breadcrumbs-aside">
</li>
</ul>
<hr/>
</div>
<div role="main" class="document" itemscope="itemscope" itemtype="http://schema.org/Article">
<div itemprop="articleBody">
<h1>Source code for mcp_server_webcrawl.templates.tests</h1><div class="highlight"><pre>
<span></span><span class="kn">import</span> <span class="nn">re</span>
<span class="kn">import</span> <span class="nn">unittest</span>
<span class="kn">from</span> <span class="nn">importlib</span> <span class="kn">import</span> <span class="n">resources</span>
<span class="kn">from</span> <span class="nn">urllib.request</span> <span class="kn">import</span> <span class="n">urlopen</span>
<span class="kn">from</span> <span class="nn">mcp_server_webcrawl.utils.logger</span> <span class="kn">import</span> <span class="n">get_logger</span>
<span class="kn">from</span> <span class="nn">mcp_server_webcrawl.extras.markdown</span> <span class="kn">import</span> <span class="n">get_markdown</span>
<span class="n">logger</span> <span class="o">=</span> <span class="n">get_logger</span><span class="p">()</span>
<div class="viewcode-block" id="TemplateTests">
<a class="viewcode-back" href="../../../mcp_server_webcrawl.templates.html#mcp_server_webcrawl.templates.tests.TemplateTests">[docs]</a>
<span class="k">class</span> <span class="nc">TemplateTests</span><span class="p">(</span><span class="n">unittest</span><span class="o">.</span><span class="n">TestCase</span><span class="p">):</span>
<span class="w"> </span><span class="sd">"""</span>
<span class="sd"> Test suite for the custom HTML to markdown converter.</span>
<span class="sd"> Why custom? It's a bit faster, that is the only reason.</span>
<span class="sd"> Maximum load is 100 transforms (1 per result for a max result </span>
<span class="sd"> of 100), so speed matters. A default set is 20.</span>
<span class="sd"> This converter does a few things differently to tailor to LLM</span>
<span class="sd"> interaction.</span>
<span class="sd"> * aggressively removes images (html2text selectively renders)</span>
<span class="sd"> * links with block decendents will render like a <p> </span>
<span class="sd"> (html2text treats as <a><br>) </span>
<span class="sd"> """</span>
<div class="viewcode-block" id="TemplateTests.setUp">
<a class="viewcode-back" href="../../../mcp_server_webcrawl.templates.html#mcp_server_webcrawl.templates.tests.TemplateTests.setUp">[docs]</a>
<span class="k">def</span> <span class="nf">setUp</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
<span class="w"> </span><span class="sd">"""</span>
<span class="sd"> Set up the test environment with fixture data.</span>
<span class="sd"> """</span>
<span class="nb">super</span><span class="p">()</span><span class="o">.</span><span class="n">setUp</span><span class="p">()</span></div>
<div class="viewcode-block" id="TemplateTests.test_core_html">
<a class="viewcode-back" href="../../../mcp_server_webcrawl.templates.html#mcp_server_webcrawl.templates.tests.TemplateTests.test_core_html">[docs]</a>
<span class="k">def</span> <span class="nf">test_core_html</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
<span class="n">core_html</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="n">resources</span><span class="o">.</span><span class="n">read_text</span><span class="p">(</span><span class="s2">"mcp_server_webcrawl.templates"</span><span class="p">,</span> <span class="s2">"tests_core.html"</span><span class="p">)</span>
<span class="n">markdown</span> <span class="o">=</span> <span class="n">get_markdown</span><span class="p">(</span><span class="n">core_html</span><span class="p">)</span>
<span class="c1"># h1-6</span>
<span class="bp">self</span><span class="o">.</span><span class="n">assertIn</span><span class="p">(</span><span class="s2">"# Lorem Ipsum Dolor Sit Amet"</span><span class="p">,</span> <span class="n">markdown</span><span class="p">)</span>
<span class="bp">self</span><span class="o">.</span><span class="n">assertIn</span><span class="p">(</span><span class="s2">"## Consectetur Adipiscing Elit"</span><span class="p">,</span> <span class="n">markdown</span><span class="p">)</span>
<span class="bp">self</span><span class="o">.</span><span class="n">assertIn</span><span class="p">(</span><span class="s2">"### Nemo Enim Ipsam Voluptatem"</span><span class="p">,</span> <span class="n">markdown</span><span class="p">)</span>
<span class="bp">self</span><span class="o">.</span><span class="n">assertIn</span><span class="p">(</span><span class="s2">"#### Sed Quia Non Numquam"</span><span class="p">,</span> <span class="n">markdown</span><span class="p">)</span>
<span class="bp">self</span><span class="o">.</span><span class="n">assertIn</span><span class="p">(</span><span class="s2">"##### Nisi Ut Aliquid Ex Ea"</span><span class="p">,</span> <span class="n">markdown</span><span class="p">)</span>
<span class="bp">self</span><span class="o">.</span><span class="n">assertIn</span><span class="p">(</span><span class="s2">"###### At Vero Eos Et Accusamus"</span><span class="p">,</span> <span class="n">markdown</span><span class="p">)</span>
<span class="c1"># no content loss - key phrases should be preserved</span>
<span class="bp">self</span><span class="o">.</span><span class="n">assertIn</span><span class="p">(</span><span class="s2">"Lorem ipsum dolor sit amet"</span><span class="p">,</span> <span class="n">markdown</span><span class="p">)</span>
<span class="bp">self</span><span class="o">.</span><span class="n">assertIn</span><span class="p">(</span><span class="s2">"Definition List Example"</span><span class="p">,</span> <span class="n">markdown</span><span class="p">)</span>
<span class="bp">self</span><span class="o">.</span><span class="n">assertIn</span><span class="p">(</span><span class="s2">"More Text Elements"</span><span class="p">,</span> <span class="n">markdown</span><span class="p">)</span>
<span class="c1"># inline formatting (proper spacing)</span>
<span class="bp">self</span><span class="o">.</span><span class="n">assertIn</span><span class="p">(</span><span class="s2">"amet, **consectetur adipiscing elit**. Sed"</span><span class="p">,</span> <span class="n">markdown</span><span class="p">)</span>
<span class="bp">self</span><span class="o">.</span><span class="n">assertIn</span><span class="p">(</span><span class="s2">"laborum. **Sed ut perspiciatis** unde"</span><span class="p">,</span> <span class="n">markdown</span><span class="p">)</span>
<span class="bp">self</span><span class="o">.</span><span class="n">assertIn</span><span class="p">(</span><span class="s2">"consequat. *Duis aute irure dolor* in"</span><span class="p">,</span> <span class="n">markdown</span><span class="p">)</span>
<span class="bp">self</span><span class="o">.</span><span class="n">assertIn</span><span class="p">(</span><span class="s2">"laudantium. *Totam rem aperiam*, eaque"</span><span class="p">,</span> <span class="n">markdown</span><span class="p">)</span>
<span class="c1"># link formatting (proper spacing)</span>
<span class="bp">self</span><span class="o">.</span><span class="n">assertIn</span><span class="p">(</span><span class="s2">"veniam, quis nostrud exercitation ullamco"</span><span class="p">,</span> <span class="n">markdown</span><span class="p">)</span> <span class="c1"># Fragment links as plain text</span>
<span class="bp">self</span><span class="o">.</span><span class="n">assertIn</span><span class="p">(</span><span class="s2">"and a link back to top. Nam"</span><span class="p">,</span> <span class="n">markdown</span><span class="p">)</span>
<span class="c1"># list formatting</span>
<span class="bp">self</span><span class="o">.</span><span class="n">assertIn</span><span class="p">(</span><span class="s2">"* Similique sunt in culpa"</span><span class="p">,</span> <span class="n">markdown</span><span class="p">)</span>
<span class="bp">self</span><span class="o">.</span><span class="n">assertIn</span><span class="p">(</span><span class="s2">"1. Temporibus autem quibusdam"</span><span class="p">,</span> <span class="n">markdown</span><span class="p">)</span>
<span class="c1"># dl/dt</span>
<span class="bp">self</span><span class="o">.</span><span class="n">assertIn</span><span class="p">(</span><span class="s2">"**Lorem Ipsum**"</span><span class="p">,</span> <span class="n">markdown</span><span class="p">)</span>
<span class="bp">self</span><span class="o">.</span><span class="n">assertIn</span><span class="p">(</span><span class="s2">" Dolor sit amet, consectetur adipiscing elit"</span><span class="p">,</span> <span class="n">markdown</span><span class="p">)</span>
<span class="bp">self</span><span class="o">.</span><span class="n">assertIn</span><span class="p">(</span><span class="s2">"**Ut Enim**"</span><span class="p">,</span> <span class="n">markdown</span><span class="p">)</span>
<span class="bp">self</span><span class="o">.</span><span class="n">assertIn</span><span class="p">(</span><span class="s2">" Ad minim veniam, quis nostrud exercitation"</span><span class="p">,</span> <span class="n">markdown</span><span class="p">)</span>
<span class="bp">self</span><span class="o">.</span><span class="n">assertIn</span><span class="p">(</span><span class="s2">"**Duis Aute**"</span><span class="p">,</span> <span class="n">markdown</span><span class="p">)</span>
<span class="bp">self</span><span class="o">.</span><span class="n">assertIn</span><span class="p">(</span><span class="s2">" Irure dolor in reprehenderit in voluptate"</span><span class="p">,</span> <span class="n">markdown</span><span class="p">)</span>
<span class="c1"># table structure</span>
<span class="bp">self</span><span class="o">.</span><span class="n">assertIn</span><span class="p">(</span><span class="s2">"| Lorem | Ipsum | Dolor | Sit |"</span><span class="p">,</span> <span class="n">markdown</span><span class="p">)</span>
<span class="bp">self</span><span class="o">.</span><span class="n">assertIn</span><span class="p">(</span><span class="s2">"|---|---|---|---|"</span><span class="p">,</span> <span class="n">markdown</span><span class="p">)</span>
<span class="bp">self</span><span class="o">.</span><span class="n">assertIn</span><span class="p">(</span><span class="s2">"| Consectetur | Adipiscing | Elit | Sed |"</span><span class="p">,</span> <span class="n">markdown</span><span class="p">)</span>
<span class="c1"># code formatting</span>
<span class="bp">self</span><span class="o">.</span><span class="n">assertIn</span><span class="p">(</span><span class="s2">"Here we have some `inline code` and"</span><span class="p">,</span> <span class="n">markdown</span><span class="p">)</span>
<span class="bp">self</span><span class="o">.</span><span class="n">assertIn</span><span class="p">(</span><span class="s2">"```</span><span class="se">\n</span><span class="s2">function lorem() {</span><span class="se">\n</span><span class="s2"> return </span><span class="se">\"</span><span class="s2">ipsum dolor sit amet</span><span class="se">\"</span><span class="s2">;</span><span class="se">\n</span><span class="s2">}</span><span class="se">\n</span><span class="s2">```"</span><span class="p">,</span> <span class="n">markdown</span><span class="p">)</span>
<span class="c1"># blockquotes</span>
<span class="bp">self</span><span class="o">.</span><span class="n">assertIn</span><span class="p">(</span><span class="s2">"> </span><span class="se">\"</span><span class="s2">Sed ut perspiciatis unde omnis iste natus"</span><span class="p">,</span> <span class="n">markdown</span><span class="p">)</span>
<span class="c1"># horizontal rule</span>
<span class="bp">self</span><span class="o">.</span><span class="n">assertIn</span><span class="p">(</span><span class="s2">"---"</span><span class="p">,</span> <span class="n">markdown</span><span class="p">)</span>
<span class="c1"># no double spacing for inline elements</span>
<span class="bp">self</span><span class="o">.</span><span class="n">assertNotIn</span><span class="p">(</span><span class="s2">"** "</span><span class="p">,</span> <span class="n">markdown</span><span class="p">)</span> <span class="c1"># No double spaces after bold</span>
<span class="bp">self</span><span class="o">.</span><span class="n">assertNotIn</span><span class="p">(</span><span class="s2">" **"</span><span class="p">,</span> <span class="n">markdown</span><span class="p">)</span> <span class="c1"># No double spaces before bold</span>
<span class="bp">self</span><span class="o">.</span><span class="n">assertNotIn</span><span class="p">(</span><span class="s2">"* "</span><span class="p">,</span> <span class="n">markdown</span><span class="p">)</span> <span class="c1"># No double spaces after emphasis</span>
<span class="bp">self</span><span class="o">.</span><span class="n">assertNotIn</span><span class="p">(</span><span class="s2">" *"</span><span class="p">,</span> <span class="n">markdown</span><span class="p">)</span> <span class="c1"># No double spaces before emphasis</span>
<span class="c1"># structural integrity - count major elements</span>
<span class="n">heading_count</span> <span class="o">=</span> <span class="nb">len</span><span class="p">(</span><span class="n">re</span><span class="o">.</span><span class="n">findall</span><span class="p">(</span><span class="sa">r</span><span class="s2">"^#{1,6} "</span><span class="p">,</span> <span class="n">markdown</span><span class="p">,</span> <span class="n">re</span><span class="o">.</span><span class="n">MULTILINE</span><span class="p">))</span>
<span class="bp">self</span><span class="o">.</span><span class="n">assertEqual</span><span class="p">(</span><span class="n">heading_count</span><span class="p">,</span> <span class="mi">11</span><span class="p">,</span> <span class="s2">"Should have exactly 11 headings"</span><span class="p">)</span>
<span class="n">table_count</span> <span class="o">=</span> <span class="nb">len</span><span class="p">(</span><span class="n">re</span><span class="o">.</span><span class="n">findall</span><span class="p">(</span><span class="sa">r</span><span class="s2">"^\|.*\|$"</span><span class="p">,</span> <span class="n">markdown</span><span class="p">,</span> <span class="n">re</span><span class="o">.</span><span class="n">MULTILINE</span><span class="p">))</span>
<span class="bp">self</span><span class="o">.</span><span class="n">assertGreater</span><span class="p">(</span><span class="n">table_count</span><span class="p">,</span> <span class="mi">5</span><span class="p">,</span> <span class="s2">"Should have multiple table rows"</span><span class="p">)</span></div>
</div>
</pre></div>
</div>
</div>
<footer>
<hr/>
<div role="contentinfo">
<p>© Copyright 2025, pragmar.</p>
</div>
Built with <a href="https://www.sphinx-doc.org/">Sphinx</a> using a
<a href="https://github.com/readthedocs/sphinx_rtd_theme">theme</a>
provided by <a href="https://readthedocs.org">Read the Docs</a>.
</footer>
</div>
</div>
</section>
</div>
<script>
jQuery(function () {
SphinxRtdTheme.Navigation.enable(true);
});
</script>
</body>
</html>
```