This is page 6 of 35. Use http://codebase.md/pragmar/mcp_server_webcrawl?lines=true&page={x} to view the full context.
# Directory Structure
```
├── .gitignore
├── CONTRIBUTING.md
├── docs
│ ├── _images
│ │ ├── interactive.document.webp
│ │ ├── interactive.search.webp
│ │ └── mcpswc.svg
│ ├── _modules
│ │ ├── index.html
│ │ ├── mcp_server_webcrawl
│ │ │ ├── crawlers
│ │ │ │ ├── archivebox
│ │ │ │ │ ├── adapter.html
│ │ │ │ │ ├── crawler.html
│ │ │ │ │ └── tests.html
│ │ │ │ ├── base
│ │ │ │ │ ├── adapter.html
│ │ │ │ │ ├── api.html
│ │ │ │ │ ├── crawler.html
│ │ │ │ │ ├── indexed.html
│ │ │ │ │ └── tests.html
│ │ │ │ ├── httrack
│ │ │ │ │ ├── adapter.html
│ │ │ │ │ ├── crawler.html
│ │ │ │ │ └── tests.html
│ │ │ │ ├── interrobot
│ │ │ │ │ ├── adapter.html
│ │ │ │ │ ├── crawler.html
│ │ │ │ │ └── tests.html
│ │ │ │ ├── katana
│ │ │ │ │ ├── adapter.html
│ │ │ │ │ ├── crawler.html
│ │ │ │ │ └── tests.html
│ │ │ │ ├── siteone
│ │ │ │ │ ├── adapter.html
│ │ │ │ │ ├── crawler.html
│ │ │ │ │ └── tests.html
│ │ │ │ ├── warc
│ │ │ │ │ ├── adapter.html
│ │ │ │ │ ├── crawler.html
│ │ │ │ │ └── tests.html
│ │ │ │ └── wget
│ │ │ │ ├── adapter.html
│ │ │ │ ├── crawler.html
│ │ │ │ └── tests.html
│ │ │ ├── crawlers.html
│ │ │ ├── extras
│ │ │ │ ├── markdown.html
│ │ │ │ ├── regex.html
│ │ │ │ ├── snippets.html
│ │ │ │ ├── thumbnails.html
│ │ │ │ └── xpath.html
│ │ │ ├── interactive
│ │ │ │ ├── highlights.html
│ │ │ │ ├── search.html
│ │ │ │ ├── session.html
│ │ │ │ └── ui.html
│ │ │ ├── main.html
│ │ │ ├── models
│ │ │ │ ├── resources.html
│ │ │ │ └── sites.html
│ │ │ ├── templates
│ │ │ │ └── tests.html
│ │ │ ├── utils
│ │ │ │ ├── blobs.html
│ │ │ │ ├── cli.html
│ │ │ │ ├── logger.html
│ │ │ │ ├── querycache.html
│ │ │ │ ├── server.html
│ │ │ │ └── tools.html
│ │ │ └── utils.html
│ │ └── re.html
│ ├── _sources
│ │ ├── guides
│ │ │ ├── archivebox.rst.txt
│ │ │ ├── httrack.rst.txt
│ │ │ ├── interrobot.rst.txt
│ │ │ ├── katana.rst.txt
│ │ │ ├── siteone.rst.txt
│ │ │ ├── warc.rst.txt
│ │ │ └── wget.rst.txt
│ │ ├── guides.rst.txt
│ │ ├── index.rst.txt
│ │ ├── installation.rst.txt
│ │ ├── interactive.rst.txt
│ │ ├── mcp_server_webcrawl.crawlers.archivebox.rst.txt
│ │ ├── mcp_server_webcrawl.crawlers.base.rst.txt
│ │ ├── mcp_server_webcrawl.crawlers.httrack.rst.txt
│ │ ├── mcp_server_webcrawl.crawlers.interrobot.rst.txt
│ │ ├── mcp_server_webcrawl.crawlers.katana.rst.txt
│ │ ├── mcp_server_webcrawl.crawlers.rst.txt
│ │ ├── mcp_server_webcrawl.crawlers.siteone.rst.txt
│ │ ├── mcp_server_webcrawl.crawlers.warc.rst.txt
│ │ ├── mcp_server_webcrawl.crawlers.wget.rst.txt
│ │ ├── mcp_server_webcrawl.extras.rst.txt
│ │ ├── mcp_server_webcrawl.interactive.rst.txt
│ │ ├── mcp_server_webcrawl.models.rst.txt
│ │ ├── mcp_server_webcrawl.rst.txt
│ │ ├── mcp_server_webcrawl.templates.rst.txt
│ │ ├── mcp_server_webcrawl.utils.rst.txt
│ │ ├── modules.rst.txt
│ │ ├── prompts.rst.txt
│ │ └── usage.rst.txt
│ ├── _static
│ │ ├── _sphinx_javascript_frameworks_compat.js
│ │ ├── basic.css
│ │ ├── css
│ │ │ ├── badge_only.css
│ │ │ ├── fonts
│ │ │ │ ├── fontawesome-webfont.eot
│ │ │ │ ├── fontawesome-webfont.svg
│ │ │ │ ├── fontawesome-webfont.ttf
│ │ │ │ ├── fontawesome-webfont.woff
│ │ │ │ ├── fontawesome-webfont.woff2
│ │ │ │ ├── lato-bold-italic.woff
│ │ │ │ ├── lato-bold-italic.woff2
│ │ │ │ ├── lato-bold.woff
│ │ │ │ ├── lato-bold.woff2
│ │ │ │ ├── lato-normal-italic.woff
│ │ │ │ ├── lato-normal-italic.woff2
│ │ │ │ ├── lato-normal.woff
│ │ │ │ ├── lato-normal.woff2
│ │ │ │ ├── Roboto-Slab-Bold.woff
│ │ │ │ ├── Roboto-Slab-Bold.woff2
│ │ │ │ ├── Roboto-Slab-Regular.woff
│ │ │ │ └── Roboto-Slab-Regular.woff2
│ │ │ └── theme.css
│ │ ├── doctools.js
│ │ ├── documentation_options.js
│ │ ├── file.png
│ │ ├── fonts
│ │ │ ├── Lato
│ │ │ │ ├── lato-bold.eot
│ │ │ │ ├── lato-bold.ttf
│ │ │ │ ├── lato-bold.woff
│ │ │ │ ├── lato-bold.woff2
│ │ │ │ ├── lato-bolditalic.eot
│ │ │ │ ├── lato-bolditalic.ttf
│ │ │ │ ├── lato-bolditalic.woff
│ │ │ │ ├── lato-bolditalic.woff2
│ │ │ │ ├── lato-italic.eot
│ │ │ │ ├── lato-italic.ttf
│ │ │ │ ├── lato-italic.woff
│ │ │ │ ├── lato-italic.woff2
│ │ │ │ ├── lato-regular.eot
│ │ │ │ ├── lato-regular.ttf
│ │ │ │ ├── lato-regular.woff
│ │ │ │ └── lato-regular.woff2
│ │ │ └── RobotoSlab
│ │ │ ├── roboto-slab-v7-bold.eot
│ │ │ ├── roboto-slab-v7-bold.ttf
│ │ │ ├── roboto-slab-v7-bold.woff
│ │ │ ├── roboto-slab-v7-bold.woff2
│ │ │ ├── roboto-slab-v7-regular.eot
│ │ │ ├── roboto-slab-v7-regular.ttf
│ │ │ ├── roboto-slab-v7-regular.woff
│ │ │ └── roboto-slab-v7-regular.woff2
│ │ ├── images
│ │ │ ├── interactive.document.png
│ │ │ ├── interactive.document.webp
│ │ │ ├── interactive.search.png
│ │ │ ├── interactive.search.webp
│ │ │ └── mcpswc.svg
│ │ ├── jquery.js
│ │ ├── js
│ │ │ ├── badge_only.js
│ │ │ ├── theme.js
│ │ │ └── versions.js
│ │ ├── language_data.js
│ │ ├── minus.png
│ │ ├── plus.png
│ │ ├── pygments.css
│ │ ├── searchtools.js
│ │ └── sphinx_highlight.js
│ ├── .buildinfo
│ ├── .nojekyll
│ ├── genindex.html
│ ├── guides
│ │ ├── archivebox.html
│ │ ├── httrack.html
│ │ ├── interrobot.html
│ │ ├── katana.html
│ │ ├── siteone.html
│ │ ├── warc.html
│ │ └── wget.html
│ ├── guides.html
│ ├── index.html
│ ├── installation.html
│ ├── interactive.html
│ ├── mcp_server_webcrawl.crawlers.archivebox.html
│ ├── mcp_server_webcrawl.crawlers.base.html
│ ├── mcp_server_webcrawl.crawlers.html
│ ├── mcp_server_webcrawl.crawlers.httrack.html
│ ├── mcp_server_webcrawl.crawlers.interrobot.html
│ ├── mcp_server_webcrawl.crawlers.katana.html
│ ├── mcp_server_webcrawl.crawlers.siteone.html
│ ├── mcp_server_webcrawl.crawlers.warc.html
│ ├── mcp_server_webcrawl.crawlers.wget.html
│ ├── mcp_server_webcrawl.extras.html
│ ├── mcp_server_webcrawl.html
│ ├── mcp_server_webcrawl.interactive.html
│ ├── mcp_server_webcrawl.models.html
│ ├── mcp_server_webcrawl.templates.html
│ ├── mcp_server_webcrawl.utils.html
│ ├── modules.html
│ ├── objects.inv
│ ├── prompts.html
│ ├── py-modindex.html
│ ├── search.html
│ ├── searchindex.js
│ └── usage.html
├── LICENSE
├── MANIFEST.in
├── prompts
│ ├── audit404.md
│ ├── auditfiles.md
│ ├── auditperf.md
│ ├── auditseo.md
│ ├── gopher.md
│ ├── README.md
│ └── testsearch.md
├── pyproject.toml
├── README.md
├── setup.py
├── sphinx
│ ├── _static
│ │ └── images
│ │ ├── interactive.document.png
│ │ ├── interactive.document.webp
│ │ ├── interactive.search.png
│ │ ├── interactive.search.webp
│ │ └── mcpswc.svg
│ ├── _templates
│ │ └── layout.html
│ ├── conf.py
│ ├── guides
│ │ ├── archivebox.rst
│ │ ├── httrack.rst
│ │ ├── interrobot.rst
│ │ ├── katana.rst
│ │ ├── siteone.rst
│ │ ├── warc.rst
│ │ └── wget.rst
│ ├── guides.rst
│ ├── index.rst
│ ├── installation.rst
│ ├── interactive.rst
│ ├── make.bat
│ ├── Makefile
│ ├── mcp_server_webcrawl.crawlers.archivebox.rst
│ ├── mcp_server_webcrawl.crawlers.base.rst
│ ├── mcp_server_webcrawl.crawlers.httrack.rst
│ ├── mcp_server_webcrawl.crawlers.interrobot.rst
│ ├── mcp_server_webcrawl.crawlers.katana.rst
│ ├── mcp_server_webcrawl.crawlers.rst
│ ├── mcp_server_webcrawl.crawlers.siteone.rst
│ ├── mcp_server_webcrawl.crawlers.warc.rst
│ ├── mcp_server_webcrawl.crawlers.wget.rst
│ ├── mcp_server_webcrawl.extras.rst
│ ├── mcp_server_webcrawl.interactive.rst
│ ├── mcp_server_webcrawl.models.rst
│ ├── mcp_server_webcrawl.rst
│ ├── mcp_server_webcrawl.templates.rst
│ ├── mcp_server_webcrawl.utils.rst
│ ├── modules.rst
│ ├── prompts.rst
│ ├── readme.txt
│ └── usage.rst
└── src
└── mcp_server_webcrawl
├── __init__.py
├── crawlers
│ ├── __init__.py
│ ├── archivebox
│ │ ├── __init__.py
│ │ ├── adapter.py
│ │ ├── crawler.py
│ │ └── tests.py
│ ├── base
│ │ ├── __init__.py
│ │ ├── adapter.py
│ │ ├── api.py
│ │ ├── crawler.py
│ │ ├── indexed.py
│ │ └── tests.py
│ ├── httrack
│ │ ├── __init__.py
│ │ ├── adapter.py
│ │ ├── crawler.py
│ │ └── tests.py
│ ├── interrobot
│ │ ├── __init__.py
│ │ ├── adapter.py
│ │ ├── crawler.py
│ │ └── tests.py
│ ├── katana
│ │ ├── __init__.py
│ │ ├── adapter.py
│ │ ├── crawler.py
│ │ └── tests.py
│ ├── siteone
│ │ ├── __init__.py
│ │ ├── adapter.py
│ │ ├── crawler.py
│ │ └── tests.py
│ ├── warc
│ │ ├── __init__.py
│ │ ├── adapter.py
│ │ ├── crawler.py
│ │ └── tests.py
│ └── wget
│ ├── __init__.py
│ ├── adapter.py
│ ├── crawler.py
│ └── tests.py
├── extras
│ ├── __init__.py
│ ├── markdown.py
│ ├── regex.py
│ ├── snippets.py
│ ├── thumbnails.py
│ └── xpath.py
├── interactive
│ ├── __init__.py
│ ├── highlights.py
│ ├── search.py
│ ├── session.py
│ ├── ui.py
│ └── views
│ ├── base.py
│ ├── document.py
│ ├── help.py
│ ├── requirements.py
│ ├── results.py
│ └── searchform.py
├── main.py
├── models
│ ├── __init__.py
│ ├── base.py
│ ├── resources.py
│ └── sites.py
├── settings.py
├── templates
│ ├── __init__.py
│ ├── markdown.xslt
│ ├── tests_core.html
│ └── tests.py
└── utils
├── __init__.py
├── cli.py
├── logger.py
├── parser.py
├── parsetab.py
├── search.py
├── server.py
├── tests.py
└── tools.py
```
# Files
--------------------------------------------------------------------------------
/src/mcp_server_webcrawl/crawlers/siteone/adapter.py:
--------------------------------------------------------------------------------
```python
1 | import os
2 | import re
3 | import sqlite3
4 |
5 | from contextlib import closing
6 | from datetime import datetime, timezone
7 | from pathlib import Path
8 |
9 | from mcp_server_webcrawl.crawlers.base.adapter import (
10 | BaseManager,
11 | IndexState,
12 | IndexStatus,
13 | SitesGroup,
14 | INDEXED_BATCH_SIZE,
15 | INDEXED_BYTE_MULTIPLIER,
16 | INDEXED_RESOURCE_DEFAULT_PROTOCOL,
17 | INDEXED_TYPE_MAPPING,
18 | )
19 | from mcp_server_webcrawl.crawlers.base.indexed import IndexedManager
20 | from mcp_server_webcrawl.utils.logger import get_logger
21 | from mcp_server_webcrawl.models.resources import (
22 | ResourceResult,
23 | ResourceResultType,
24 | RESOURCES_LIMIT_DEFAULT,
25 | )
26 | from mcp_server_webcrawl.models.sites import (
27 | SiteResult,
28 | )
29 |
# Translates the lowercased "type" value recorded for a SiteOne log row into the
# ResourceResultType stored with the indexed resource. It is consulted by
# SiteOneManager._prepare_siteone_record; labels missing from this table fall
# back to the extension-based INDEXED_TYPE_MAPPING there.
SITEONE_LOG_TYPE_MAPPING = {
    "html": ResourceResultType.PAGE,
    # redirects are indexed as pages
    "redirect": ResourceResultType.PAGE,
    "image": ResourceResultType.IMAGE,
    "js": ResourceResultType.SCRIPT,
    "css": ResourceResultType.CSS,
    "video": ResourceResultType.VIDEO,
    "audio": ResourceResultType.AUDIO,
    "pdf": ResourceResultType.PDF,
    "other": ResourceResultType.OTHER,
    # fonts have no dedicated result type in this table and are filed under OTHER
    "font": ResourceResultType.OTHER,
}

# module-level logger used by the manager and the functions in this file
logger = get_logger()
44 |
class SiteOneManager(IndexedManager):
    """
    Manages SiteOne directory data in in-memory SQLite databases.
    Wraps wget archive format (shared by SiteOne and wget)
    Provides connection pooling and caching for efficient access.
    """

    # hash segment embedded in static asset filenames, e.g. "app.0123abcd.js";
    # compiled once instead of on every record
    _WGET_STATIC_PATTERN = re.compile(r"\.[0-9a-f]{8,}\.")

    def __init__(self) -> None:
        """Initialize the SiteOne manager with empty cache and statistics."""

        super().__init__()

    @staticmethod
    def _parse_log_row(parts: list[str], directory_name: str) -> tuple[str, dict]:
        """
        Convert one 10-column SiteOne log row into a (url, metadata) pair.

        Args:
            parts: the row split on "|" with each cell stripped
            directory_name: name of the site directory, used as the URL host part

        Returns:
            Tuple of (url, {"status", "type", "time", "size"})

        Raises:
            ValueError, IndexError: when a numeric column cannot be parsed
        """
        # the query string is dropped so the URL lines up with paths on disk
        parts_path = parts[3].split("?")[0]
        status = int(parts[4])
        url = f"{INDEXED_RESOURCE_DEFAULT_PROTOCOL}{directory_name}{parts_path}"

        # time is stored in milliseconds. The unit token is compared as a whole:
        # a plain substring test for "s" also matches "ms" and would inflate
        # millisecond values by a factor of 1000.
        time_tokens = parts[6].split()
        time_value = float(time_tokens[0])
        time_unit = time_tokens[1].lower() if len(time_tokens) > 1 else ""
        elapsed = int(time_value * (1000 if time_unit.startswith("s") else 1))

        # size collected for errors, os stat preferred
        size = 0
        size_tokens = parts[7].split()
        if size_tokens:
            size_unit = size_tokens[1].lower() if len(size_tokens) > 1 else "b"
            size = int(float(size_tokens[0]) * INDEXED_BYTE_MULTIPLIER.get(size_unit, 1))

        return url, {
            "status": status,
            "type": parts[5].lower(),
            "time": elapsed,
            "size": size,
        }

    def _extract_log_metadata(self, directory: Path) -> tuple[dict, dict]:
        """
        Extract metadata from SiteOne log files.

        The most recently modified "output.<directory name>.*.txt" file next to
        the site directory is read; rows with an HTTP status of 400-599 are
        returned separately from all other rows.

        Args:
            directory: path to the site directory

        Returns:
            Tuple of (success log data, error log data) dictionaries keyed by URL
        """
        directory_name: str = directory.name
        log_data: dict = {}
        log_http_error_data: dict = {}

        log_pattern: str = f"output.{directory_name}.*.txt"
        log_files = list(Path(directory.parent).glob(log_pattern))

        if not log_files:
            return log_data, log_http_error_data

        log_latest = max(log_files, key=lambda p: p.stat().st_mtime)

        try:
            # errors="replace": one undecodable byte must not abandon the rest of the log
            with open(log_latest, "r", encoding="utf-8", errors="replace") as log_file:
                for line in log_file:
                    parts = [part.strip() for part in line.split("|")]
                    if len(parts) == 10:
                        try:
                            url, entry = self._parse_log_row(parts, directory_name)
                        except (ValueError, IndexError, UnicodeDecodeError, KeyError):
                            # header, separator, or otherwise unparseable row
                            continue

                        if 400 <= entry["status"] < 600:
                            log_http_error_data[url] = entry
                        else:
                            log_data[url] = entry

                    elif line.strip() == "Redirected URLs":
                        # stop processing we're through HTTP requests
                        break
        except Exception as ex:
            logger.error(f"Error processing log file {log_latest}: {ex}")

        return log_data, log_http_error_data

    def _load_site_data(self, connection: sqlite3.Connection, directory: Path,
            site_id: int, index_state: IndexState | None = None) -> None:
        """
        Load a SiteOne directory into the database using batched insertions.

        Files found on disk are indexed first; HTTP error rows from the log that
        have no corresponding indexed URL are added afterwards as content-less
        records.

        Args:
            connection: SQLite connection
            directory: path to the SiteOne directory
            site_id: ID for the site
            index_state: IndexState object for tracking progress
        """

        if not directory.exists() or not directory.is_dir():
            logger.error(f"Directory not found or not a directory: {directory}")
            return

        if index_state is not None:
            index_state.set_status(IndexStatus.INDEXING)

        log_data, log_http_error_data = self._extract_log_metadata(directory)

        file_paths = []
        for root, _, files in os.walk(directory):
            for filename in files:
                if filename == "robots.txt" or (filename.startswith("output.") and filename.endswith(".txt")):
                    continue
                file_paths.append(Path(root) / filename)

        processed_urls = set()
        # prefix shared by every URL built from the log for this site
        url_prefix = f"{INDEXED_RESOURCE_DEFAULT_PROTOCOL}{directory.name}"

        with closing(connection.cursor()) as cursor:
            for i in range(0, len(file_paths), INDEXED_BATCH_SIZE):
                if index_state is not None and index_state.is_timeout():
                    index_state.set_status(IndexStatus.PARTIAL)
                    return

                batch_paths = file_paths[i:i+INDEXED_BATCH_SIZE]
                batch_insert_crawled: list[ResourceResult] = []
                file_contents = BaseManager.read_files(batch_paths)
                for file_path in batch_paths:
                    try:
                        result: ResourceResult | None = self._prepare_siteone_record(file_path,
                            site_id, directory, log_data, file_contents.get(file_path))
                        if result and result.url not in processed_urls:
                            batch_insert_crawled.append(result)
                            processed_urls.add(result.url)
                            if index_state is not None:
                                index_state.increment_processed()
                    except Exception as ex:
                        logger.error(f"Error processing file {file_path}: {ex}")

                self._execute_batch_insert(connection, cursor, batch_insert_crawled)

            # HTTP errors not already processed
            batch_insert_errors: list[ResourceResult] = []
            for url, meta in log_http_error_data.items():
                if url in processed_urls:
                    continue

                size = meta.get("size", 0)
                # No file exists for an errored request. Derive the path it would
                # have had from its URL rather than reusing the loop variable of
                # the file pass (unbound when the directory holds no files, and an
                # unrelated file otherwise).
                error_path = directory / url[len(url_prefix):].lstrip("/")
                result = ResourceResult(
                    id=BaseManager.string_to_id(url),
                    site=site_id,
                    url=url,
                    type=ResourceResultType.OTHER,
                    status=meta["status"],
                    headers=BaseManager.get_basic_headers(size, ResourceResultType.OTHER, error_path),
                    content="",     # no content
                    size=size,      # size from log
                    time=meta["time"]
                )
                batch_insert_errors.append(result)

                if index_state is not None:
                    index_state.increment_processed()

                # errors in batches too; start a new batch after each flush so
                # records are not inserted a second time
                if len(batch_insert_errors) >= INDEXED_BATCH_SIZE:
                    self._execute_batch_insert(connection, cursor, batch_insert_errors)
                    batch_insert_errors = []

            # insert any remaining error records
            if batch_insert_errors:
                self._execute_batch_insert(connection, cursor, batch_insert_errors)

        if index_state is not None and index_state.status == IndexStatus.INDEXING:
            index_state.set_status(IndexStatus.COMPLETE)

    def _prepare_siteone_record(self, file_path: Path, site_id: int, base_dir: Path,
            log_data: dict, content: str | None = None) -> ResourceResult | None:
        """
        Prepare a record for batch insertion from a SiteOne file.

        Args:
            file_path: path to the file
            site_id: id for the site
            base_dir: base directory for the capture
            log_data: dictionary of metadata from logs keyed by URL
            content: optional pre-loaded file content

        Returns:
            ResourceResult for the file, or None when the file is a redirect
            stub or processing fails (failures are logged)
        """
        try:
            # generate relative url path from file path (similar to wget)
            relative_path = file_path.relative_to(base_dir)
            url = f"{INDEXED_RESOURCE_DEFAULT_PROTOCOL}{base_dir.name}/{relative_path.as_posix()}"

            if file_path.is_file():
                file_stat = file_path.stat()
                file_size = file_stat.st_size
                file_created = datetime.fromtimestamp(file_stat.st_ctime, tz=timezone.utc)
                file_modified = datetime.fromtimestamp(file_stat.st_mtime, tz=timezone.utc)
            else:
                file_created = None
                file_modified = None
                file_size = 0

            decruftified_path = BaseManager.decruft_path(str(file_path))
            extension = Path(decruftified_path).suffix.lower()

            # Candidate log keys for this file, most specific first. dict.fromkeys
            # removes duplicates while keeping this order; a set would make the
            # lookup order (and so the chosen log entry) arbitrary.
            wget_aliases = list(dict.fromkeys([
                url,                                              # exact match first
                re.sub(self._WGET_STATIC_PATTERN, ".", url),      # static pattern
                url.replace(".html", ""),                         # file without extension (redirects)
                url.replace(".html", "/"),                        # directory style (targets)
                url.replace("index.html", ""),                    # index removal
            ]))

            # look up metadata from log if available, otherwise use defaults
            metadata = None
            for wget_alias in wget_aliases:
                metadata = log_data.get(wget_alias, None)
                if metadata is not None:
                    break

            if metadata is not None:
                # preventing duplicate html pages ./appstat.html and ./appstat/index.html
                # prefer index.html (actual content) over redirect stubs
                sorted_aliases = sorted(
                    [alias for alias in wget_aliases if log_data.get(alias) == metadata],
                    key=lambda x: (not x.endswith('index.html'), x))
                if sorted_aliases:
                    url = sorted_aliases[0]  # take the preferred one
            else:
                metadata = {}

            status_code = metadata.get("status", 200)
            response_time = metadata.get("time", 0)
            log_type = metadata.get("type", "").lower()

            # extension-based mapping is the fallback both when the log has no
            # type and when the logged type is not a known SiteOne label
            resource_type = INDEXED_TYPE_MAPPING.get(extension, ResourceResultType.OTHER)
            if log_type:
                resource_type = SITEONE_LOG_TYPE_MAPPING.get(log_type, resource_type)

            file_content = content
            if file_content is None:
                file_content = BaseManager.read_file_contents(file_path, resource_type)

            # skip redirect stub files left in SiteOne archive (duplicate, wait for real content)
            if status_code == 200 and file_content and '<meta http-equiv="refresh" content="0' in file_content:
                return None

            record = ResourceResult(
                id=BaseManager.string_to_id(url),
                site=site_id,
                created=file_created,
                modified=file_modified,
                url=url,
                type=resource_type,
                status=status_code,
                headers=BaseManager.get_basic_headers(file_size, resource_type, file_path),
                content=file_content,
                size=file_size,
                time=response_time  # possibly from log
            )
            return record
        except Exception as ex:
            logger.error(f"Error preparing record for file {file_path}: {ex}")
            return None
312 |
# Single module-level SiteOneManager instance, created at import time and used
# by get_sites() and get_resources() below.
manager: SiteOneManager = SiteOneManager()
314 |
def get_sites(
    datasrc: Path,
    ids: list[int] | None = None,
    fields: list[str] | None = None
) -> list[SiteResult]:
    """
    Report the site directories found under datasrc as sites.

    This is a thin wrapper: the lookup itself is performed by the module-level
    manager's get_sites_for_directories().

    Args:
        datasrc: path to the directory containing site subdirectories
        ids: optional list of site IDs to filter by
        fields: optional list of fields to include in the response

    Returns:
        List of SiteResult objects, one for each site directory

    Notes:
        NOTE(review): an empty list is expected when datasrc does not exist;
        that behavior lives in the manager and is not visible in this file.
    """
    site_results: list[SiteResult] = manager.get_sites_for_directories(datasrc, ids, fields)
    return site_results
335 |
def get_resources(
    datasrc: Path,
    sites: list[int] | None = None,
    query: str = "",
    fields: list[str] | None = None,
    sort: str | None = None,
    limit: int = RESOURCES_LIMIT_DEFAULT,
    offset: int = 0,
) -> tuple[list[ResourceResult], int, IndexState]:
    """
    Search the resources of SiteOne captures located under datasrc.

    The matching sites are resolved with get_sites(), bundled into a
    SitesGroup, and the query is delegated to the module-level manager.

    Args:
        datasrc: path to the directory containing SiteOne captures
        sites: optional list of site IDs to filter by
        query: search query string
        fields: optional list of fields to include in response
        sort: sort order for results
        limit: maximum number of results to return
        offset: number of results to skip for pagination

    Returns:
        Tuple of (list of ResourceResult objects, total count, IndexState)

    Raises:
        AssertionError: when no site matches
    """
    matched_sites: list[SiteResult] = get_sites(datasrc=datasrc, ids=sites)
    assert matched_sites, "At least one site is required to search"
    group = SitesGroup(datasrc, sites, [entry.path for entry in matched_sites])
    return manager.get_resources_for_sites_group(group, query, fields, sort, limit, offset)
365 |
```
--------------------------------------------------------------------------------
/src/mcp_server_webcrawl/utils/parser.py:
--------------------------------------------------------------------------------
```python
1 | import re
2 |
3 | from ply import lex
4 | from ply import yacc
5 | from logging import Logger
6 |
7 | from mcp_server_webcrawl.models.resources import RESOURCES_DEFAULT_FIELD_MAPPING
8 | from mcp_server_webcrawl.utils.logger import get_logger
9 |
# module-level logger; used below to report unknown fields and illegal characters
logger: Logger = get_logger()
11 |
class SearchSubquery:
    """
    One component of a structured search.

    Subqueries are collected into an ordered list, and that list is what the
    SQL query is built from.
    """

    def __init__(
        self,
        field: str | None,
        value: str | int,
        type: str,
        modifiers: list[str] | None,
        operator: str | None,
        comparator: str = "=",
        group: int | None = None,
    ):
        """
        Store the parts of a single subquery.

        Args:
            field: field to search, or None for fulltext search
            value: search value (string or integer)
            type: value type (term, phrase, wildcard, etc.)
            modifiers: list of modifiers applied to the query (e.g., 'NOT');
                a falsy value is stored as an empty list
            operator: boolean operator connecting to the next subquery
                ('AND', 'OR', or None); a falsy value is stored as None
            comparator: comparison operator for numerics ('=', '>', '>=', '<', '<=', '!=')
            group: identifier of the parenthesized group this subquery belongs
                to, or None when it is not grouped
        """
        # normalise the two optional inputs first, then store everything
        normalized_modifiers: list[str] = modifiers if modifiers else []
        normalized_operator: str | None = operator if operator else None

        self.field: str | None = field
        self.value: str | int = value
        self.type: str = type
        self.modifiers: list[str] = normalized_modifiers
        self.operator: str | None = normalized_operator
        self.comparator: str = comparator
        self.group: int | None = group

    def get_safe_sql_field(self, field: str) -> str:
        """
        Translate a query field name into its database column expression.

        Args:
            field: field name as written in the search query

        Returns:
            The entry of RESOURCES_DEFAULT_FIELD_MAPPING for the field

        Raises:
            Exception: when the field is not a key of the mapping
        """
        if field not in RESOURCES_DEFAULT_FIELD_MAPPING:
            logger.error(f"Field {field} failed to validate.")
            raise Exception(f"Unknown database field {field}")
        return RESOURCES_DEFAULT_FIELD_MAPPING[field]

    def to_dict(self) -> dict[str, str | int | list[str] | None]:
        """
        Convert SearchSubquery to dictionary representation.

        Returns:
            Dictionary containing all SearchSubquery attributes, in the order
            field, value, type, modifiers, operator, comparator, group
        """
        exported = ("field", "value", "type", "modifiers", "operator", "comparator", "group")
        return {name: getattr(self, name) for name in exported}
74 |
class SearchLexer:
    """
    PLY lexer definition for the search query language.

    PLY reads each t_* rule function's docstring as that token's regular
    expression, so those strings are part of the program and the rule functions
    below carry `#` comments instead of prose docstrings. The source marks
    several rules "precedence matters": the order in which the rule functions
    are defined is significant and must be kept.
    """

    # token names produced by this lexer (also reused as SearchParser.tokens)
    tokens = (
        "FIELD",         # e.g. url:, content:
        "QUOTED_STRING", # "hello world"
        "TERM",          # standard search term
        "WILDCARD",      # wildcards terms, e.g. search*
        "AND",
        "OR",
        "NOT",
        "LPAREN",        # (
        "RPAREN",        # )
        "COLON",         # :
        "COMPARATOR",    # :>=, :>, :<, etc.
        "COMP_OP",       # >=
        "URL_FIELD"
    )

    # field names accepted in front of a colon; t_FIELD rejects anything else
    valid_fields: list[str] = ["id", "url", "status", "type", "size", "headers", "content", "time"]

    t_LPAREN = r"\("
    t_RPAREN = r"\)"
    # characters skipped between tokens: space, tab, newline
    t_ignore = " \t\n"

    def __init__(self):
        # build the PLY lexer from the rules defined on this instance
        self.lexer = lex.lex(module=self)

    # colon immediately followed by a comparison operator, e.g. ":>="
    def t_COMPARATOR(self, token: lex.LexToken) -> lex.LexToken:
        r":(?:>=|>|<=|<|!=|=)"
        token.value = token.value[1:] # strip colon
        return token

    # a bare colon; defined after t_COMPARATOR so ":>=" is not split into ":" and ">="
    def t_COLON(self, token: lex.LexToken) -> lex.LexToken:
        r":"
        return token

    # double-quoted text with no embedded double quote; the quotes are removed
    def t_QUOTED_STRING(self, token: lex.LexToken) -> lex.LexToken:
        r'"[^"]*"'
        token.value = token.value[1:-1]
        return token

    # precedence matters
    # "url:" together with its value is consumed as one token, so characters
    # such as ':' and '/' inside the URL are not tokenized separately; the
    # token value becomes the tuple ("url", <text after the first colon>)
    def t_URL_FIELD(self, token: lex.LexToken) -> lex.LexToken:
        # this field must terminate not only on url end, but on parens
        r"url\s*:\s*((?:https?://)?[^\s()]+)"
        token.type = "URL_FIELD"
        url_value = token.value[token.value.find(':')+1:].strip()
        token.value = ("url", url_value)
        return token

    # precedence matters
    # an identifier counts as a field only when followed (after optional
    # whitespace) by a colon; the lookahead leaves the colon unconsumed
    def t_FIELD(self, token: lex.LexToken) -> lex.LexToken:
        r"[a-zA-Z_][a-zA-Z0-9_]*(?=\s*:)"
        if token.value not in self.valid_fields:
            # unknown field names abort lexing with a ValueError
            raise ValueError(f"Invalid field: {token.value}. Valid fields are: {', '.join(self.valid_fields)}")
        return token

    # the boolean keywords are matched in upper case only; \b keeps a longer
    # word that merely starts with the keyword (e.g. "ANDROID") from matching
    def t_AND(self, token: lex.LexToken) -> lex.LexToken:
        r"AND\b"
        return token

    def t_OR(self, token: lex.LexToken) -> lex.LexToken:
        r"OR\b"
        return token

    def t_NOT(self, token: lex.LexToken) -> lex.LexToken:
        r"NOT\b"
        return token

    # a term ending in '*'; the asterisk is removed from the token value
    def t_WILDCARD(self, token: lex.LexToken) -> lex.LexToken:
        r"[a-zA-Z0-9_\.\-\/\+]+\*"
        token.value = token.value[:-1]
        return token

    def t_TERM(self, token: lex.LexToken) -> lex.LexToken:
        r"[a-zA-Z0-9_\.\-\/\+]+"
        # dedicated t_AND, t_OR, t_NOT to handle those
        # this is fts5 workaround, -_ are tokenizer preserves
        # a term made of word characters joined by '-' or '_' is re-typed as
        # QUOTED_STRING, which the parser treats as a phrase
        if re.match(r"^[\w]+[\-_][\-_\w]+$", token.value, re.UNICODE):
            token.type = "QUOTED_STRING"
        return token

    # a comparison operator that is not attached to a colon
    def t_COMP_OP(self, token: lex.LexToken) -> lex.LexToken:
        r">=|>|<=|<|!=|="
        return token

    # called for input no rule matches: log the character and skip past it
    def t_error(self, token: lex.LexToken) -> None:
        logger.error(f"Illegal character '{token.value[0]}'")
        token.lexer.skip(1)
163 |
164 | class SearchParser:
165 | tokens = SearchLexer.tokens
166 |
167 | precedence = (
168 | ('right', 'NOT'),
169 | ('left', 'AND'),
170 | ('left', 'OR'),
171 | )
172 |
173 | numeric_fields: list[str] = ["id", "status", "size", "time"]
174 |
def __init__(self, lexer):
    """
    Keep the supplied lexer and build the PLY parser for this grammar.

    Args:
        lexer: lexer object stored on the instance as self.lexer
    """
    self.lexer = lexer
    # yacc reads the grammar from this instance (module=self); debug=False
    # turns off PLY's debugging output
    self.parser = yacc.yacc(module=self, debug=False)
178 |
def p_query(self, production: yacc.YaccProduction) -> None:
    """
    query : expression
    """
    # the docstring above is the grammar rule; a query is its single expression
    expression = production[1]
    production[0] = expression
184 |
def p_expression_binary(self, production: yacc.YaccProduction) -> None:
    """
    expression : expression AND expression
               | expression OR expression
               | expression NOT expression
    """
    # The docstring above is the grammar rule. Each operand is either a single
    # SearchSubquery or an already flattened list of them; the result is
    # always a flat list in which each subquery's operator links it to the next.
    left = production[1]
    operator = production[2]
    right = production[3]

    # A AND (NOT B) is rewritten as the binary form A NOT B: the NOT modifier
    # is removed from B and the connecting operator becomes NOT (handled
    # downstream as a set difference, left EXCEPT right)
    if (operator == "AND" and isinstance(right, list) and
            len(right) == 1 and "NOT" in right[0].modifiers):
        right[0].modifiers = [m for m in right[0].modifiers if m != "NOT"]
        operator = "NOT"

    left_is_list = isinstance(left, list)
    right_is_list = isinstance(right, list)

    if left_is_list:
        # the last subquery on the left carries the connecting operator
        if left:
            left[-1].operator = operator
        if right_is_list:
            production[0] = left + right
        else:
            # NOTE(review): for AND/OR the appended right-hand subquery also
            # receives the operator, while for NOT it receives None. This
            # asymmetry is kept exactly as it was.
            tail_operator = None if operator == "NOT" else operator
            production[0] = left + [self.__create_subquery(right, tail_operator)]
    elif right_is_list:
        production[0] = [self.__create_subquery(left, operator)] + right
    else:
        # both operands are single subqueries
        production[0] = [
            self.__create_subquery(left, operator),
            self.__create_subquery(right, None),
        ]
241 |
def p_expression_not(self, production: yacc.YaccProduction) -> None:
    """
    expression : NOT expression
    """
    # prefix (unary) NOT: every subquery of the operand gets a "NOT" modifier
    operand = production[2]

    if not isinstance(operand, list):
        # a single subquery is copied, negated, and wrapped in a list
        negated = self.__create_subquery(operand, None)
        negated.modifiers.append("NOT")
        production[0] = [negated]
        return

    for subquery in operand:
        subquery.modifiers.append("NOT")
    production[0] = operand
256 |
def p_expression_group(self, production: yacc.YaccProduction) -> None:
    """
    expression : LPAREN expression RPAREN
    """
    # The docstring above is the grammar rule. Every subquery inside the
    # parentheses is tagged with an identifier for this group.
    expr = production[2]

    # The identifier comes from a per-parser counter. id(production) cannot be
    # used for this: PLY passes the same YaccProduction object to every rule
    # during a parse, so it would give all groups of a query the same id.
    group_id = getattr(self, "_group_sequence", 0) + 1
    self._group_sequence = group_id

    # mark all subqueries in this expression with the group
    if isinstance(expr, list):
        for subquery in expr:
            subquery.group = group_id
    else:
        expr.group = group_id

    production[0] = expr
273 |
def p_expression_url_field(self, production: yacc.YaccProduction) -> None:
    """
    expression : URL_FIELD
    """
    # the lexer delivers URL_FIELD as a (field, value) tuple
    field, value = production[1]

    # a trailing * requests wildcard matching and is not part of the value
    is_wildcard = value.endswith('*')
    if is_wildcard:
        value = value[:-1]

    production[0] = SearchSubquery(
        field=field,
        value=value,
        type="wildcard" if is_wildcard else "term",
        modifiers=[],
        operator=None
    )
294 |
def p_value(self, production: yacc.YaccProduction) -> None:
    """
    value : TERM
          | WILDCARD
          | QUOTED_STRING
    """
    # the value type follows from the token type; anything that is not a
    # WILDCARD or QUOTED_STRING token is a plain term
    type_by_token = {"WILDCARD": "wildcard", "QUOTED_STRING": "phrase"}
    token_type = production.slice[1].type
    production[0] = {
        "value": production[1],
        "type": type_by_token.get(token_type, "term"),
    }
310 |
def p_expression_term(self, production: yacc.YaccProduction) -> None:
    """
    expression : value
    """
    # a bare value (no field prefix) becomes a fulltext subquery
    parsed = production[1]
    fulltext = SearchSubquery(
        field=None,  # no field means fulltext search
        value=parsed["value"],
        type=parsed["type"],
        modifiers=[],
        operator=None
    )
    production[0] = fulltext
324 |
325 | def p_expression_field_search(self, production: yacc.YaccProduction) -> None:
326 | """
327 | expression : FIELD COLON COMP_OP value
328 | | FIELD COLON value
329 | | FIELD COMPARATOR value
330 | """
331 | field = production[1]
332 |
333 | # determine comparator and value based on pattern
334 | if len(production) == 5: # FIELD COLON COMP_OP value
335 | comparator = production[3]
336 | value = production[4]
337 | elif len(production) == 4:
338 | # check second token, COLON or COMPARATOR
339 | if production[2] == ":": # FIELD COLON value
340 | comparator = "=" # default equals
341 | value = production[3]
342 | else:
343 | comparator = production[2]
344 | value = production[3]
345 |
346 | production[0] = self.__create_field_subquery(field, value, comparator)
347 |
348 | def __create_field_subquery(self, field: str, value_dict: dict[str, str] | str | int, comparator: str = "=") -> SearchSubquery:
349 | """
350 | Helper method to create SearchSubquery for field searches.
351 | Consolidates all the validation and conversion logic.
352 | """
353 |
354 | self.__validate_comparator_for_field(field, comparator)
355 | processed_value = self.__process_field_value(field, value_dict)
356 | value_type = value_dict.get("type", "term") if isinstance(value_dict, dict) else "term"
357 |
358 | return SearchSubquery(
359 | field=field,
360 | value=processed_value,
361 | type=value_type,
362 | modifiers=[],
363 | operator=None,
364 | comparator=comparator
365 | )
366 |
367 | def __create_subquery(self, term, operator: str | None):
368 | """
369 | Helper to create a SearchSubquery instance.
370 | """
371 | assert isinstance(term, SearchSubquery), "__create_subquery expected a SearchSubquery instance"
372 | return SearchSubquery(
373 | field=term.field,
374 | value=term.value,
375 | type=term.type,
376 | modifiers=term.modifiers.copy(),
377 | operator=operator,
378 | comparator=term.comparator,
379 | group=term.group,
380 | )
381 |
382 | def __process_field_value(
383 | self,
384 | field: str | None,
385 | value_dict: dict[str, str] | str | int,
386 | swap_values: dict[str, dict[str, str | int]] | None = None
387 | ) -> str | int | float:
388 | """
389 | Process and validate a field value with type conversion and swapping.
390 |
391 | Args:
392 | field: The field name (or None for fulltext)
393 | value_dict: Dictionary with 'value' and 'type' keys, or raw value
394 | swap_values: Optional dictionary for value replacement
395 |
396 | Returns:
397 | Processed value (string, int, or float)
398 | """
399 | if isinstance(value_dict, dict):
400 | value = value_dict["value"]
401 | else:
402 | value = value_dict # raw value
403 |
404 | if swap_values:
405 | swap_key = field if field else ""
406 | if swap_key in swap_values and value in swap_values[swap_key]:
407 | value = swap_values[swap_key][value]
408 |
409 | if field and field in self.numeric_fields:
410 | try:
411 | return int(value)
412 | except ValueError:
413 | try:
414 | return float(value)
415 | except ValueError:
416 | raise ValueError(f"Field {field} requires a numeric value, got: {value}")
417 |
418 | return value
419 |
420 | def __validate_comparator_for_field(self, field: str, comparator: str) -> None:
421 | """
422 | Validate that a comparator is appropriate for the given field.
423 |
424 | Args:
425 | field: The field name
426 | comparator: The comparison operator
427 |
428 | Raises:
429 | ValueError: If comparator is invalid for the field type
430 | """
431 | if comparator != "=" and field not in self.numeric_fields:
432 | raise ValueError(f"Comparison operator '{comparator}' can only be used with numeric fields")
433 |
434 | def p_error(self, production: yacc.YaccProduction | None) -> None:
435 | if production:
436 | logger.info(f"Syntax error at '{production.value}'")
437 | else:
438 | logger.info("Syntax error at EOF")
439 |
```
--------------------------------------------------------------------------------
/src/mcp_server_webcrawl/interactive/views/requirements.py:
--------------------------------------------------------------------------------
```python
1 | import curses
2 | import os
3 | import traceback
4 |
5 | from enum import Enum, auto
6 | from pathlib import Path
7 | from typing import TYPE_CHECKING
8 |
9 | from mcp_server_webcrawl.crawlers import VALID_CRAWLER_CHOICES, get_crawler
10 | from mcp_server_webcrawl.crawlers.base.api import BaseJsonApi
11 | from mcp_server_webcrawl.crawlers.base.crawler import BaseCrawler
12 | from mcp_server_webcrawl.interactive.ui import InputRadioGroup, InputText, ThemeDefinition, UiState
13 | from mcp_server_webcrawl.interactive.views.base import BaseCursesView
14 | from mcp_server_webcrawl.interactive.ui import safe_addstr
15 | from mcp_server_webcrawl.interactive.views.searchform import SearchFormView
16 |
17 | if TYPE_CHECKING:
18 | from mcp_server_webcrawl.interactive.session import InteractiveSession
19 |
20 | LAYOUT_BOX_MAX_WIDTH = 60  # upper bound on the datasrc input box width
21 | LAYOUT_BOX_MARGIN = 8  # columns subtracted from the view width when sizing that box
22 | VALIDATION_HEADER_X_OFFSET = 24  # x offset of the validation column from the view's left edge
23 | VALIDATION_TEXT_INDENT = 2  # extra x offset of each status row under the validation header
24 |
25 | class RequirementsFormField(Enum):  # the two field groups the form selection can rest on
26 |     DATASRC = auto()  # data source path text input
27 |     CRAWLER = auto()  # crawler radio group
28 |
29 | class RequirementsView(BaseCursesView):
30 | """
31 | Interactive requirements view for configuring crawler and data source.
32 | """
33 |
34 |     def __init__(self, session: 'InteractiveSession', crawler: str, datasrc: str):
35 |         """
36 |         Validate the supplied crawler/datasrc pair and build the form inputs.
37 |
38 |         Args:
39 |             session: The interactive session instance
40 |             crawler: Initial crawler type selection (None when not supplied)
41 |             datasrc: Initial data source path (None falls back to the cwd)
42 |         """
43 |         super().__init__(session)
44 |         self.__validated: bool = self.__validate(crawler, datasrc)
45 |         self.__form_selected_field: RequirementsFormField = RequirementsFormField.DATASRC
46 |         self.__form_selected_index: int = 0  # 0 is the datasrc input; radios start at 1
47 |
48 |         initial_datasrc: str = datasrc if datasrc is not None else self.__get_default_directory()
49 |         self.__datasrc_input: InputText = InputText(initial_value=initial_datasrc, label="Data Source Path")
50 |
51 |         self.__crawler_group: InputRadioGroup = InputRadioGroup("crawler")
52 |
53 |         if not self.__validated:  # explicit arguments win; autosensed values fill the gaps
54 |             detected_crawler: str | None
55 |             detected_datasrc: str | None
56 |             detected_crawler, detected_datasrc = self.__autosense_crawler_and_datasrc()
57 |             initial_crawler: str = crawler if crawler is not None else detected_crawler
58 |             initial_datasrc = datasrc if datasrc is not None else detected_datasrc
59 |             self.__set_initial_crawler_selection(initial_crawler)
60 |             self.__datasrc_input.set_value(initial_datasrc)
61 |             self._focused: bool = True  # NOTE(review): nesting assumed; listing lost its indentation - confirm
62 |
63 |     @property
64 |     def validated(self) -> bool:  # read-only: whether the crawler/datasrc pair currently counts as valid
65 |         return self.__validated
66 |
67 | def handle_input(self, key: int) -> bool:
68 | """
69 | Handle keyboard input for requirements form navigation and validation.
70 |
71 | Args:
72 | key: The curses key code from user input
73 |
74 | Returns:
75 | bool: True if the input was handled, False otherwise
76 | """
77 |
78 | handlers: dict[int, callable] = {
79 | curses.KEY_UP: self.__navigate_form_selection_up,
80 | curses.KEY_DOWN: self.__navigate_form_selection_down,
81 | ord('\t'): self.__handle_tab,
82 | ord(' '): self.__handle_spacebar,
83 | ord('\n'): self.__handle_enter,
84 | ord('\r'): self.__handle_enter,
85 | }
86 |
87 | handler = handlers.get(key)
88 | if handler:
89 | handler()
90 | return True
91 |
92 | if (self.__form_selected_field == RequirementsFormField.DATASRC and
93 | self.__form_selected_index == 0):
94 | return self.__datasrc_input.handle_input(key)
95 |
96 | return False
97 |
98 | def render(self, stdscr: curses.window) -> None:
99 | """
100 | Render the requirements form showing crawler selection and datasrc input.
101 |
102 | Args:
103 | stdscr: The curses window to draw on
104 | """
105 | xb: int = self.bounds.x
106 | yb: int = self.bounds.y
107 | y_current: int = yb + 2
108 | # y_max: int = yb + self.bounds.height
109 |
110 | safe_addstr(stdscr, y_current, xb + 2, "Data Source Path:", curses.A_BOLD)
111 | y_current += 1
112 |
113 | box_width: int = min(LAYOUT_BOX_MAX_WIDTH, self.bounds.width - LAYOUT_BOX_MARGIN)
114 | is_datasrc_selected: bool = (
115 | self.__form_selected_field == RequirementsFormField.DATASRC
116 | and self.__form_selected_index == 0
117 | )
118 | field_style: int
119 | if is_datasrc_selected:
120 | field_style = curses.A_REVERSE
121 | else:
122 | field_style = self.session.get_theme_color_pair(ThemeDefinition.INACTIVE_QUERY)
123 |
124 | self.__datasrc_input.render(stdscr, y_current, xb + 4, box_width,
125 | focused=is_datasrc_selected, style=field_style)
126 |
127 | y_current += 2
128 |
129 | crawler_y_start: int = y_current
130 |
131 | safe_addstr(stdscr, y_current, xb + 2, self.__crawler_group.label, curses.A_BOLD)
132 | y_current += 1
133 |
134 | for i, radio in enumerate(self.__crawler_group.radios):
135 | crawler_field_index: int = i + 1
136 | is_crawler_field_selected: bool = (self.__form_selected_field == RequirementsFormField.CRAWLER and
137 | self.__form_selected_index == crawler_field_index)
138 |
139 | radio.render(stdscr, y_current, xb + 4, crawler_field_index, 100, is_crawler_field_selected)
140 | y_current += 1
141 |
142 | validation_y: int = crawler_y_start
143 |
144 | selected_crawler: str = self.__crawler_group.value
145 | crawler_valid: bool = selected_crawler in VALID_CRAWLER_CHOICES
146 | crawler_symbol: str = "🗹" if crawler_valid else "☒"
147 |
148 | crawler_style: int
149 | if crawler_valid:
150 | crawler_style = curses.A_NORMAL
151 | else:
152 | crawler_style = self.session.get_theme_color_pair(ThemeDefinition.UI_ERROR)
153 |
154 | datasrc_path: str = self.__datasrc_input.value
155 | datasrc_path_obj: Path = Path(datasrc_path)
156 | datasrc_exists: bool = datasrc_path_obj.exists()
157 |
158 | datasrc_symbol: str
159 | datasrc_valid: bool
160 | if not datasrc_exists:
161 | datasrc_symbol = "☒"
162 | datasrc_valid = False
163 | else:
164 | is_correct_type: bool
165 | if selected_crawler in ("interrobot", "warc"):
166 | is_correct_type = datasrc_path_obj.is_file()
167 | else:
168 | is_correct_type = datasrc_path_obj.is_dir()
169 |
170 | datasrc_symbol = "🗹" if is_correct_type else "☒"
171 | datasrc_valid = is_correct_type
172 |
173 | datasrc_style: int
174 | if datasrc_valid:
175 | datasrc_style = curses.A_NORMAL
176 | else:
177 | datasrc_style = self.session.get_theme_color_pair(ThemeDefinition.UI_ERROR)
178 |
179 | validation_header: str = "Validation Status:"
180 | header_x: int = xb + VALIDATION_HEADER_X_OFFSET
181 | safe_addstr(stdscr, validation_y, header_x, validation_header, curses.A_BOLD)
182 | validation_y += 1
183 |
184 | validation_word_x: int = header_x
185 | crawler_text: str = f"{crawler_symbol} --crawler"
186 | safe_addstr(stdscr, validation_y, validation_word_x, " ", curses.A_NORMAL)
187 | safe_addstr(stdscr, validation_y, validation_word_x + VALIDATION_TEXT_INDENT, crawler_text, crawler_style)
188 | validation_y += 1
189 |
190 | datasrc_text: str = f"{datasrc_symbol} --datasrc"
191 | safe_addstr(stdscr, validation_y, validation_word_x, " ", curses.A_NORMAL)
192 | safe_addstr(stdscr, validation_y, validation_word_x + VALIDATION_TEXT_INDENT, datasrc_text, datasrc_style)
193 |
194 | def __autosense_crawler_and_datasrc(self) -> tuple[str, str] | tuple[None, None]:
195 | """
196 | Auto-detect crawler type and datasrc based on cwd and parent directory signatures.
197 |
198 | Returns:
199 | tuple: (crawler, datasrc) tuple or (None, None) if no match found
200 | """
201 | cwd: Path = Path(os.getcwd()).absolute()
202 |
203 | if list(cwd.glob("*.v2.db")):
204 | db_file: Path = next(cwd.glob("*.v2.db"))
205 | return ("interrobot", str(db_file))
206 |
207 | archive_directories: list[Path] = list(cwd.glob("*/archive"))
208 | if archive_directories:
209 | for archive_directory in archive_directories:
210 | timestamp_directories: list[Path] = [d for d in archive_directory.iterdir()
211 | if d.is_dir() and d.name.replace('.', '').isdigit()]
212 | if timestamp_directories:
213 | return ("archivebox", str(cwd))
214 |
215 | if list(cwd.glob("*/output.*.txt")):
216 | return ("siteone", str(cwd))
217 |
218 | if list(cwd.glob("*/hts-log.txt")) or list(cwd.glob("*/*/hts-log.txt")):
219 | return ("httrack", str(cwd))
220 |
221 | katana_files: list[Path] = list(cwd.glob("*/*/*.txt"))
222 | for f in katana_files:
223 | if len(f.stem) == 40 and all(c in '0123456789abcdef' for c in f.stem.lower()):
224 | return ("katana", str(cwd))
225 |
226 | warc_files: list[Path] = list(cwd.glob("*.warc.gz")) + list(cwd.glob("*.warc"))
227 | if warc_files:
228 | return ("warc", str(cwd))
229 |
230 | if list(cwd.glob("*/index.html")):
231 | return ("wget", str(cwd))
232 |
233 | return ("wget", self.__get_default_directory())
234 |
235 | def __get_default_directory(self) -> str:
236 | """
237 | Get the default directory path.
238 |
239 | Returns:
240 | str: The absolute path of the current working directory
241 | """
242 | return str(Path(os.getcwd()).absolute())
243 |
244 | def __handle_enter(self) -> None:
245 | """
246 | Handle ENTER key to revalidate in datasrc field or toggle in crawler field.
247 | """
248 | if self.__form_selected_field == RequirementsFormField.DATASRC:
249 | selected_crawler: str = self.__crawler_group.value
250 | self.__validated = self.__validate(selected_crawler, self.__datasrc_input.value)
251 | self.__update_session()
252 | if self.__validated:
253 | self.session.set_ui_state(UiState.SEARCH_INIT)
254 | elif self.__form_selected_field == RequirementsFormField.CRAWLER:
255 | crawler_index: int = self.__form_selected_index - 1
256 | if 0 <= crawler_index < len(self.__crawler_group.radios):
257 | self.__crawler_group.radios[crawler_index].next_state()
258 |
259 | def __handle_spacebar(self) -> None:
260 | """
261 | Handle spacebar to toggle crawler selection or add space to datasrc.
262 | """
263 | if self.__form_selected_field == RequirementsFormField.DATASRC:
264 | self.__datasrc_input.handle_input(ord(" "))
265 | elif self.__form_selected_field == RequirementsFormField.CRAWLER:
266 | crawler_index: int = self.__form_selected_index - 1
267 | if 0 <= crawler_index < len(self.__crawler_group.radios):
268 | self.__crawler_group.radios[crawler_index].next_state()
269 |
270 | def __handle_tab(self) -> None:
271 | """
272 | Handle TAB key to switch between field groups.
273 | """
274 | if self.__form_selected_field == RequirementsFormField.DATASRC:
275 | self.__form_selected_field = RequirementsFormField.CRAWLER
276 | self.__form_selected_index = 1
277 | else:
278 | self.__form_selected_field = RequirementsFormField.DATASRC
279 | self.__form_selected_index = 0
280 |
281 | def __navigate_form_selection_down(self) -> None:
282 | """
283 | Navigate down within current field or switch to next field group.
284 | """
285 | if self.__form_selected_field == RequirementsFormField.DATASRC:
286 | self.__form_selected_field = RequirementsFormField.CRAWLER
287 | self.__form_selected_index = 1
288 | elif self.__form_selected_field == RequirementsFormField.CRAWLER:
289 | if self.__form_selected_index < len(self.__crawler_group.radios):
290 | self.__form_selected_index += 1
291 | else:
292 | self.__form_selected_field = RequirementsFormField.DATASRC
293 | self.__form_selected_index = 0
294 |
295 | def __navigate_form_selection_up(self) -> None:
296 | """
297 | Navigate up within current field or switch to previous field group.
298 | """
299 | if self.__form_selected_field == RequirementsFormField.DATASRC:
300 | self.__form_selected_field = RequirementsFormField.CRAWLER
301 | self.__form_selected_index = len(self.__crawler_group.radios)
302 | elif self.__form_selected_field == RequirementsFormField.CRAWLER:
303 | if self.__form_selected_index > 1:
304 | self.__form_selected_index -= 1
305 | else:
306 | self.__form_selected_field = RequirementsFormField.DATASRC
307 | self.__form_selected_index = 0
308 |
309 | def __set_initial_crawler_selection(self, initial_crawler: str) -> None:
310 | """
311 | Set the initial crawler selection in the radio group.
312 |
313 | Args:
314 | initial_crawler: The crawler type to initially select
315 | """
316 | if initial_crawler in VALID_CRAWLER_CHOICES:
317 | crawler_index: int = VALID_CRAWLER_CHOICES.index(initial_crawler)
318 | if 0 <= crawler_index < len(self.__crawler_group.radios):
319 | self.__crawler_group.radios[crawler_index].next_state()
320 |
321 | def __update_session(self) -> None:
322 | """
323 | Update the session with current form values.
324 | """
325 | # push a new app configuration into the ui
326 | selected_crawler: str = self.__crawler_group.value
327 | self.session.set_init_input_args(selected_crawler, self.__datasrc_input.value)
328 | if self.__validated:
329 | try:
330 | crawl_model: BaseCrawler = get_crawler(selected_crawler)
331 | crawler: BaseCrawler = crawl_model(Path(self.__datasrc_input.value))
332 | self.session.set_init_crawler(crawler)
333 | sites_api: BaseJsonApi = self.session.crawler.get_sites_api()
334 | self.session.set_init_sites(sites_api.get_results())
335 | searchform: SearchFormView = SearchFormView(
336 | self.session,
337 | self.session.sites
338 | )
339 | self.session.set_init_searchform(searchform)
340 | except Exception as ex:
341 | self.session.debug_add(f"Error initializing crawler: {ex}\n{traceback.format_exc()}")
342 | self.__validated = False
343 |
344 | def __validate(self, crawler: str, datasrc: str) -> bool:
345 | """
346 | Validate crawler and datasrc combination.
347 |
348 | Args:
349 | crawler: The crawler type to validate
350 | datasrc: The data source path to validate
351 |
352 | Returns:
353 | bool: True if the combination is valid, False otherwise
354 | """
355 | if not isinstance(datasrc, str) or not isinstance(crawler, str):
356 | return False
357 |
358 | crawler_valid: bool = crawler in VALID_CRAWLER_CHOICES
359 |
360 | if datasrc in (None, ""):
361 | return False
362 |
363 | datasrc_path: Path = Path(datasrc)
364 | if not datasrc_path.exists():
365 | return False
366 |
367 | if crawler in ("interrobot", "warc"):
368 | datasrc_valid = datasrc_path.is_file()
369 | else:
370 | datasrc_valid = datasrc_path.is_dir()
371 |
372 | return crawler_valid and datasrc_valid
373 |
```
--------------------------------------------------------------------------------
/docs/_modules/mcp_server_webcrawl/crawlers/wget/tests.html:
--------------------------------------------------------------------------------
```html
1 |
2 |
3 | <!DOCTYPE html>
4 | <html class="writer-html5" lang="en" data-content_root="../../../../">
5 | <head>
6 | <meta charset="utf-8" />
7 | <meta name="viewport" content="width=device-width, initial-scale=1.0" />
8 | <title>mcp_server_webcrawl.crawlers.wget.tests — mcp-server-webcrawl documentation</title>
9 | <link rel="stylesheet" type="text/css" href="../../../../_static/pygments.css?v=80d5e7a1" />
10 | <link rel="stylesheet" type="text/css" href="../../../../_static/css/theme.css?v=e59714d7" />
11 |
12 |
13 | <script src="../../../../_static/jquery.js?v=5d32c60e"></script>
14 | <script src="../../../../_static/_sphinx_javascript_frameworks_compat.js?v=2cd50e6c"></script>
15 | <script src="../../../../_static/documentation_options.js?v=5929fcd5"></script>
16 | <script src="../../../../_static/doctools.js?v=888ff710"></script>
17 | <script src="../../../../_static/sphinx_highlight.js?v=dc90522c"></script>
18 | <script src="../../../../_static/js/theme.js"></script>
19 | <link rel="index" title="Index" href="../../../../genindex.html" />
20 | <link rel="search" title="Search" href="../../../../search.html" />
21 | </head>
22 |
23 | <body class="wy-body-for-nav">
24 | <div class="wy-grid-for-nav">
25 | <nav data-toggle="wy-nav-shift" class="wy-nav-side">
26 | <div class="wy-side-scroll">
27 | <div class="wy-side-nav-search" >
28 |
29 |
30 |
31 | <a href="../../../../index.html" class="icon icon-home">
32 | mcp-server-webcrawl
33 | </a>
34 | <div role="search">
35 | <form id="rtd-search-form" class="wy-form" action="../../../../search.html" method="get">
36 | <input type="text" name="q" placeholder="Search docs" aria-label="Search docs" />
37 | <input type="hidden" name="check_keywords" value="yes" />
38 | <input type="hidden" name="area" value="default" />
39 | </form>
40 | </div>
41 | </div><div class="wy-menu wy-menu-vertical" data-spy="affix" role="navigation" aria-label="Navigation menu">
42 | <p class="caption" role="heading"><span class="caption-text">Contents:</span></p>
43 | <ul>
44 | <li class="toctree-l1"><a class="reference internal" href="../../../../installation.html">Installation</a></li>
45 | <li class="toctree-l1"><a class="reference internal" href="../../../../guides.html">Setup Guides</a></li>
46 | <li class="toctree-l1"><a class="reference internal" href="../../../../usage.html">Usage</a></li>
47 | <li class="toctree-l1"><a class="reference internal" href="../../../../prompts.html">Prompt Routines</a></li>
48 | <li class="toctree-l1"><a class="reference internal" href="../../../../interactive.html">Interactive Mode</a></li>
49 | <li class="toctree-l1"><a class="reference internal" href="../../../../modules.html">mcp_server_webcrawl</a></li>
50 | </ul>
51 |
52 | </div>
53 | </div>
54 | </nav>
55 |
56 | <section data-toggle="wy-nav-shift" class="wy-nav-content-wrap"><nav class="wy-nav-top" aria-label="Mobile navigation menu" >
57 | <i data-toggle="wy-nav-top" class="fa fa-bars"></i>
58 | <a href="../../../../index.html">mcp-server-webcrawl</a>
59 | </nav>
60 |
61 | <div class="wy-nav-content">
62 | <div class="rst-content">
63 | <div role="navigation" aria-label="Page navigation">
64 | <ul class="wy-breadcrumbs">
65 | <li><a href="../../../../index.html" class="icon icon-home" aria-label="Home"></a></li>
66 | <li class="breadcrumb-item"><a href="../../../index.html">Module code</a></li>
67 | <li class="breadcrumb-item"><a href="../../crawlers.html">mcp_server_webcrawl.crawlers</a></li>
68 | <li class="breadcrumb-item active">mcp_server_webcrawl.crawlers.wget.tests</li>
69 | <li class="wy-breadcrumbs-aside">
70 | </li>
71 | </ul>
72 | <hr/>
73 | </div>
74 | <div role="main" class="document" itemscope="itemscope" itemtype="http://schema.org/Article">
75 | <div itemprop="articleBody">
76 |
77 | <h1>Source code for mcp_server_webcrawl.crawlers.wget.tests</h1><div class="highlight"><pre>
78 | <span></span><span class="kn">from</span> <span class="nn">mcp_server_webcrawl.crawlers</span> <span class="kn">import</span> <span class="n">get_fixture_directory</span>
79 | <span class="kn">from</span> <span class="nn">mcp_server_webcrawl.crawlers.wget.adapter</span> <span class="kn">import</span> <span class="n">WgetManager</span>
80 | <span class="kn">from</span> <span class="nn">mcp_server_webcrawl.crawlers.wget.crawler</span> <span class="kn">import</span> <span class="n">WgetCrawler</span>
81 | <span class="kn">from</span> <span class="nn">mcp_server_webcrawl.crawlers.base.tests</span> <span class="kn">import</span> <span class="n">BaseCrawlerTests</span>
82 | <span class="kn">from</span> <span class="nn">mcp_server_webcrawl.utils.logger</span> <span class="kn">import</span> <span class="n">get_logger</span>
83 |
84 | <span class="n">logger</span> <span class="o">=</span> <span class="n">get_logger</span><span class="p">()</span>
85 |
86 | <span class="n">EXAMPLE_SITE_ID</span> <span class="o">=</span> <span class="n">WgetManager</span><span class="o">.</span><span class="n">string_to_id</span><span class="p">(</span><span class="s2">"example.com"</span><span class="p">)</span>
87 | <span class="n">PRAGMAR_SITE_ID</span> <span class="o">=</span> <span class="n">WgetManager</span><span class="o">.</span><span class="n">string_to_id</span><span class="p">(</span><span class="s2">"pragmar.com"</span><span class="p">)</span>
88 |
89 | <div class="viewcode-block" id="WgetTests">
90 | <a class="viewcode-back" href="../../../../mcp_server_webcrawl.crawlers.wget.html#mcp_server_webcrawl.crawlers.wget.tests.WgetTests">[docs]</a>
91 | <span class="k">class</span> <span class="nc">WgetTests</span><span class="p">(</span><span class="n">BaseCrawlerTests</span><span class="p">):</span>
92 | <span class="w"> </span><span class="sd">"""</span>
93 | <span class="sd"> Test suite for the wget crawler implementation.</span>
94 | <span class="sd"> Uses all wrapped test methods from BaseCrawlerTests.</span>
95 | <span class="sd"> """</span>
96 |
97 | <div class="viewcode-block" id="WgetTests.setUp">
98 | <a class="viewcode-back" href="../../../../mcp_server_webcrawl.crawlers.wget.html#mcp_server_webcrawl.crawlers.wget.tests.WgetTests.setUp">[docs]</a>
99 | <span class="k">def</span> <span class="nf">setUp</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
100 | <span class="w"> </span><span class="sd">"""</span>
101 | <span class="sd"> Set up the test environment with fixture data.</span>
102 | <span class="sd"> """</span>
103 | <span class="nb">super</span><span class="p">()</span><span class="o">.</span><span class="n">setUp</span><span class="p">()</span>
104 | <span class="bp">self</span><span class="o">.</span><span class="n">_datasrc</span> <span class="o">=</span> <span class="n">get_fixture_directory</span><span class="p">()</span> <span class="o">/</span> <span class="s2">"wget"</span></div>
105 |
106 |
107 | <div class="viewcode-block" id="WgetTests.test_wget_pulse">
108 | <a class="viewcode-back" href="../../../../mcp_server_webcrawl.crawlers.wget.html#mcp_server_webcrawl.crawlers.wget.tests.WgetTests.test_wget_pulse">[docs]</a>
109 | <span class="k">def</span> <span class="nf">test_wget_pulse</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
110 | <span class="w"> </span><span class="sd">"""</span>
111 | <span class="sd"> Test basic crawler initialization.</span>
112 | <span class="sd"> """</span>
113 | <span class="n">crawler</span> <span class="o">=</span> <span class="n">WgetCrawler</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_datasrc</span><span class="p">)</span>
114 | <span class="bp">self</span><span class="o">.</span><span class="n">assertIsNotNone</span><span class="p">(</span><span class="n">crawler</span><span class="p">)</span>
115 | <span class="bp">self</span><span class="o">.</span><span class="n">assertTrue</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_datasrc</span><span class="o">.</span><span class="n">is_dir</span><span class="p">())</span></div>
116 |
117 |
118 | <div class="viewcode-block" id="WgetTests.test_wget_sites">
119 | <a class="viewcode-back" href="../../../../mcp_server_webcrawl.crawlers.wget.html#mcp_server_webcrawl.crawlers.wget.tests.WgetTests.test_wget_sites">[docs]</a>
120 | <span class="k">def</span> <span class="nf">test_wget_sites</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
121 | <span class="w"> </span><span class="sd">"""</span>
122 | <span class="sd"> Test site retrieval API functionality.</span>
123 | <span class="sd"> """</span>
124 | <span class="n">crawler</span> <span class="o">=</span> <span class="n">WgetCrawler</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_datasrc</span><span class="p">)</span>
125 | <span class="bp">self</span><span class="o">.</span><span class="n">run_pragmar_site_tests</span><span class="p">(</span><span class="n">crawler</span><span class="p">,</span> <span class="n">PRAGMAR_SITE_ID</span><span class="p">)</span></div>
126 |
127 |
128 | <div class="viewcode-block" id="WgetTests.test_wget_search">
129 | <a class="viewcode-back" href="../../../../mcp_server_webcrawl.crawlers.wget.html#mcp_server_webcrawl.crawlers.wget.tests.WgetTests.test_wget_search">[docs]</a>
130 | <span class="k">def</span> <span class="nf">test_wget_search</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
131 | <span class="w"> </span><span class="sd">"""</span>
132 | <span class="sd"> Test boolean search functionality</span>
133 | <span class="sd"> """</span>
134 | <span class="c1"># moved fixtures to own repo, lost some local media,</span>
135 | <span class="c1"># but checks out. wget fixture has no CSS/JS/etc.</span>
136 | <span class="c1"># HTML-only and just doesn't do well with the full array of</span>
137 | <span class="c1"># tests concerning fulltext, media, and mixed search result</span>
138 | <span class="c1"># counts. probably needs a reduced set of tests</span>
139 | <span class="c1"># self.run_pragmar_search_tests(crawler, PRAGMAR_SITE_ID)</span>
140 | <span class="k">return</span></div>
141 |
142 |
143 | <div class="viewcode-block" id="WgetTests.test_wget_resources">
144 | <a class="viewcode-back" href="../../../../mcp_server_webcrawl.crawlers.wget.html#mcp_server_webcrawl.crawlers.wget.tests.WgetTests.test_wget_resources">[docs]</a>
145 | <span class="k">def</span> <span class="nf">test_wget_resources</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
146 | <span class="w"> </span><span class="sd">"""</span>
147 | <span class="sd"> Test resource retrieval API functionality with various parameters.</span>
148 | <span class="sd"> """</span>
149 | <span class="n">crawler</span> <span class="o">=</span> <span class="n">WgetCrawler</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_datasrc</span><span class="p">)</span>
150 | <span class="bp">self</span><span class="o">.</span><span class="n">run_sites_resources_tests</span><span class="p">(</span><span class="n">crawler</span><span class="p">,</span> <span class="n">PRAGMAR_SITE_ID</span><span class="p">,</span> <span class="n">EXAMPLE_SITE_ID</span><span class="p">)</span></div>
151 |
152 |
153 |
154 | <div class="viewcode-block" id="WgetTests.test_wget_sorts">
155 | <a class="viewcode-back" href="../../../../mcp_server_webcrawl.crawlers.wget.html#mcp_server_webcrawl.crawlers.wget.tests.WgetTests.test_wget_sorts">[docs]</a>
156 | <span class="k">def</span> <span class="nf">test_wget_sorts</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
157 | <span class="w"> </span><span class="sd">"""</span>
158 | <span class="sd"> Test random sort functionality using the '?' sort parameter.</span>
159 | <span class="sd"> """</span>
160 | <span class="n">crawler</span> <span class="o">=</span> <span class="n">WgetCrawler</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_datasrc</span><span class="p">)</span>
161 | <span class="bp">self</span><span class="o">.</span><span class="n">run_pragmar_sort_tests</span><span class="p">(</span><span class="n">crawler</span><span class="p">,</span> <span class="n">PRAGMAR_SITE_ID</span><span class="p">)</span></div>
162 |
163 |
164 | <div class="viewcode-block" id="WgetTests.test_wget_content_parsing">
165 | <a class="viewcode-back" href="../../../../mcp_server_webcrawl.crawlers.wget.html#mcp_server_webcrawl.crawlers.wget.tests.WgetTests.test_wget_content_parsing">[docs]</a>
166 | <span class="k">def</span> <span class="nf">test_wget_content_parsing</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
167 | <span class="w"> </span><span class="sd">"""</span>
168 | <span class="sd"> Test content type detection and parsing.</span>
169 | <span class="sd"> """</span>
170 | <span class="n">crawler</span> <span class="o">=</span> <span class="n">WgetCrawler</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_datasrc</span><span class="p">)</span>
171 | <span class="bp">self</span><span class="o">.</span><span class="n">run_pragmar_content_tests</span><span class="p">(</span><span class="n">crawler</span><span class="p">,</span> <span class="n">PRAGMAR_SITE_ID</span><span class="p">,</span> <span class="kc">False</span><span class="p">)</span></div>
172 |
173 |
174 | <div class="viewcode-block" id="WgetTests.test_report">
175 | <a class="viewcode-back" href="../../../../mcp_server_webcrawl.crawlers.wget.html#mcp_server_webcrawl.crawlers.wget.tests.WgetTests.test_report">[docs]</a>
176 | <span class="k">def</span> <span class="nf">test_report</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
177 | <span class="w"> </span><span class="sd">"""</span>
178 | <span class="sd"> Run test report, save to data directory.</span>
179 | <span class="sd"> """</span>
180 | <span class="n">crawler</span> <span class="o">=</span> <span class="n">WgetCrawler</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_datasrc</span><span class="p">)</span>
181 | <span class="n">logger</span><span class="o">.</span><span class="n">info</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">run_pragmar_report</span><span class="p">(</span><span class="n">crawler</span><span class="p">,</span> <span class="n">PRAGMAR_SITE_ID</span><span class="p">,</span> <span class="s2">"wget"</span><span class="p">))</span></div>
182 | </div>
183 |
184 | </pre></div>
185 |
186 | </div>
187 | </div>
188 | <footer>
189 |
190 | <hr/>
191 |
192 | <div role="contentinfo">
193 | <p>© Copyright 2025, pragmar.</p>
194 | </div>
195 |
196 | Built with <a href="https://www.sphinx-doc.org/">Sphinx</a> using a
197 | <a href="https://github.com/readthedocs/sphinx_rtd_theme">theme</a>
198 | provided by <a href="https://readthedocs.org">Read the Docs</a>.
199 |
200 |
201 | </footer>
202 | </div>
203 | </div>
204 | </section>
205 | </div>
206 | <script>
207 | jQuery(function () {
208 | SphinxRtdTheme.Navigation.enable(true);
209 | });
210 | </script>
211 |
212 | </body>
213 | </html>
```
--------------------------------------------------------------------------------
/docs/_static/basic.css:
--------------------------------------------------------------------------------
```css
1 | /*
2 | * basic.css
3 | * ~~~~~~~~~
4 | *
5 | * Sphinx stylesheet -- basic theme.
6 | *
7 | * :copyright: Copyright 2007-2023 by the Sphinx team, see AUTHORS.
8 | * :license: BSD, see LICENSE for details.
9 | *
10 | */
11 |
12 | /* -- main layout ----------------------------------------------------------- */
13 |
14 | div.clearer {
15 | clear: both;
16 | }
17 |
18 | div.section::after {
19 | display: block;
20 | content: '';
21 | clear: left;
22 | }
23 |
24 | /* -- relbar ---------------------------------------------------------------- */
25 |
26 | div.related {
27 | width: 100%;
28 | font-size: 90%;
29 | }
30 |
31 | div.related h3 {
32 | display: none;
33 | }
34 |
35 | div.related ul {
36 | margin: 0;
37 | padding: 0 0 0 10px;
38 | list-style: none;
39 | }
40 |
41 | div.related li {
42 | display: inline;
43 | }
44 |
45 | div.related li.right {
46 | float: right;
47 | margin-right: 5px;
48 | }
49 |
50 | /* -- sidebar --------------------------------------------------------------- */
51 |
52 | div.sphinxsidebarwrapper {
53 | padding: 10px 5px 0 10px;
54 | }
55 |
56 | div.sphinxsidebar {
57 | float: left;
58 | width: 230px;
59 | margin-left: -100%;
60 | font-size: 90%;
61 | word-wrap: break-word; /* legacy alias of overflow-wrap */
62 | overflow-wrap: break-word;
63 | }
64 |
65 | div.sphinxsidebar ul {
66 | list-style: none;
67 | }
68 |
69 | div.sphinxsidebar ul ul,
70 | div.sphinxsidebar ul.want-points {
71 | margin-left: 20px;
72 | list-style: square;
73 | }
74 |
75 | div.sphinxsidebar ul ul {
76 | margin-top: 0;
77 | margin-bottom: 0;
78 | }
79 |
80 | div.sphinxsidebar form {
81 | margin-top: 10px;
82 | }
83 |
84 | div.sphinxsidebar input {
85 | border: 1px solid #98dbcc;
86 | font-family: sans-serif;
87 | font-size: 1em;
88 | }
89 |
90 | div.sphinxsidebar #searchbox form.search {
91 | overflow: hidden;
92 | }
93 |
94 | div.sphinxsidebar #searchbox input[type="text"] {
95 | float: left;
96 | width: 80%;
97 | padding: 0.25em;
98 | box-sizing: border-box;
99 | }
100 |
101 | div.sphinxsidebar #searchbox input[type="submit"] {
102 | float: left;
103 | width: 20%;
104 | border-left: none;
105 | padding: 0.25em;
106 | box-sizing: border-box;
107 | }
108 |
109 |
110 | img {
111 | border: 0;
112 | max-width: 100%;
113 | }
114 |
115 | /* -- search page ----------------------------------------------------------- */
116 |
117 | ul.search {
118 | margin: 10px 0 0 20px;
119 | padding: 0;
120 | }
121 |
122 | ul.search li {
123 | padding: 5px 0 5px 20px;
124 | background-image: url(file.png);
125 | background-repeat: no-repeat;
126 | background-position: 0 7px;
127 | }
128 |
129 | ul.search li a {
130 | font-weight: bold;
131 | }
132 |
133 | ul.search li p.context {
134 | color: #888;
135 | margin: 2px 0 0 30px;
136 | text-align: left;
137 | }
138 |
139 | ul.keywordmatches li.goodmatch a {
140 | font-weight: bold;
141 | }
142 |
143 | /* -- index page ------------------------------------------------------------ */
144 |
145 | table.contentstable {
146 | width: 90%;
147 | margin-left: auto;
148 | margin-right: auto;
149 | }
150 |
151 | table.contentstable p.biglink {
152 | line-height: 150%;
153 | }
154 |
155 | a.biglink {
156 | font-size: 1.3em;
157 | }
158 |
159 | span.linkdescr {
160 | font-style: italic;
161 | padding-top: 5px;
162 | font-size: 90%;
163 | }
164 |
165 | /* -- general index --------------------------------------------------------- */
166 |
167 | table.indextable {
168 | width: 100%;
169 | }
170 |
171 | table.indextable td {
172 | text-align: left;
173 | vertical-align: top;
174 | }
175 |
176 | table.indextable ul {
177 | margin-top: 0;
178 | margin-bottom: 0;
179 | list-style-type: none;
180 | }
181 |
182 | table.indextable > tbody > tr > td > ul {
183 | padding-left: 0em;
184 | }
185 |
186 | table.indextable tr.pcap {
187 | height: 10px;
188 | }
189 |
190 | table.indextable tr.cap {
191 | margin-top: 10px;
192 | background-color: #f2f2f2;
193 | }
194 |
195 | img.toggler {
196 | margin-right: 3px;
197 | margin-top: 3px;
198 | cursor: pointer;
199 | }
200 |
201 | div.modindex-jumpbox {
202 | border-top: 1px solid #ddd;
203 | border-bottom: 1px solid #ddd;
204 | margin: 1em 0 1em 0;
205 | padding: 0.4em;
206 | }
207 |
208 | div.genindex-jumpbox {
209 | border-top: 1px solid #ddd;
210 | border-bottom: 1px solid #ddd;
211 | margin: 1em 0 1em 0;
212 | padding: 0.4em;
213 | }
214 |
215 | /* -- domain module index --------------------------------------------------- */
216 |
217 | table.modindextable td {
218 | padding: 2px;
219 | border-collapse: collapse;
220 | }
221 |
222 | /* -- general body styles --------------------------------------------------- */
223 |
224 | div.body {
225 | min-width: 360px;
226 | max-width: 800px;
227 | }
228 |
229 | div.body p, div.body dd, div.body li, div.body blockquote {
230 | -moz-hyphens: auto;
231 | -ms-hyphens: auto;
232 | -webkit-hyphens: auto;
233 | hyphens: auto;
234 | }
235 |
236 | a.headerlink {
237 | visibility: hidden;
238 | }
239 |
240 | a:visited {
241 | color: #551A8B;
242 | }
243 |
244 | h1:hover > a.headerlink,
245 | h2:hover > a.headerlink,
246 | h3:hover > a.headerlink,
247 | h4:hover > a.headerlink,
248 | h5:hover > a.headerlink,
249 | h6:hover > a.headerlink,
250 | dt:hover > a.headerlink,
251 | caption:hover > a.headerlink,
252 | p.caption:hover > a.headerlink,
253 | div.code-block-caption:hover > a.headerlink {
254 | visibility: visible;
255 | }
256 |
257 | div.body p.caption {
258 | text-align: inherit;
259 | }
260 |
261 | div.body td {
262 | text-align: left;
263 | }
264 |
265 | .first {
266 | margin-top: 0 !important;
267 | }
268 |
269 | p.rubric {
270 | margin-top: 30px;
271 | font-weight: bold;
272 | }
273 |
274 | img.align-left, figure.align-left, .figure.align-left, object.align-left {
275 | clear: left;
276 | float: left;
277 | margin-right: 1em;
278 | }
279 |
280 | img.align-right, figure.align-right, .figure.align-right, object.align-right {
281 | clear: right;
282 | float: right;
283 | margin-left: 1em;
284 | }
285 |
286 | img.align-center, figure.align-center, .figure.align-center, object.align-center {
287 | display: block;
288 | margin-left: auto;
289 | margin-right: auto;
290 | }
291 |
292 | img.align-default, figure.align-default, .figure.align-default {
293 | display: block;
294 | margin-left: auto;
295 | margin-right: auto;
296 | }
297 |
298 | .align-left {
299 | text-align: left;
300 | }
301 |
302 | .align-center {
303 | text-align: center;
304 | }
305 |
306 | .align-default {
307 | text-align: center;
308 | }
309 |
310 | .align-right {
311 | text-align: right;
312 | }
313 |
314 | /* -- sidebars -------------------------------------------------------------- */
315 |
316 | div.sidebar,
317 | aside.sidebar {
318 | margin: 0 0 0.5em 1em;
319 | border: 1px solid #ddb;
320 | padding: 7px;
321 | background-color: #ffe;
322 | width: 40%;
323 | float: right;
324 | clear: right;
325 | overflow-x: auto;
326 | }
327 |
328 | p.sidebar-title {
329 | font-weight: bold;
330 | }
331 |
332 | nav.contents,
333 | aside.topic,
334 | div.admonition, div.topic, blockquote {
335 | clear: left;
336 | }
337 |
338 | /* -- topics ---------------------------------------------------------------- */
339 |
340 | nav.contents,
341 | aside.topic,
342 | div.topic {
343 | border: 1px solid #ccc;
344 | padding: 7px;
345 | margin: 10px 0 10px 0;
346 | }
347 |
348 | p.topic-title {
349 | font-size: 1.1em;
350 | font-weight: bold;
351 | margin-top: 10px;
352 | }
353 |
354 | /* -- admonitions ----------------------------------------------------------- */
355 |
356 | div.admonition {
357 | margin-top: 10px;
358 | margin-bottom: 10px;
359 | padding: 7px;
360 | }
361 |
362 | div.admonition dt {
363 | font-weight: bold;
364 | }
365 |
366 | p.admonition-title {
367 | margin: 0px 10px 5px 0px;
368 | font-weight: bold;
369 | }
370 |
371 | div.body p.centered {
372 | text-align: center;
373 | margin-top: 25px;
374 | }
375 |
376 | /* -- content of sidebars/topics/admonitions -------------------------------- */
377 |
378 | div.sidebar > :last-child,
379 | aside.sidebar > :last-child,
380 | nav.contents > :last-child,
381 | aside.topic > :last-child,
382 | div.topic > :last-child,
383 | div.admonition > :last-child {
384 | margin-bottom: 0;
385 | }
386 |
387 | div.sidebar::after,
388 | aside.sidebar::after,
389 | nav.contents::after,
390 | aside.topic::after,
391 | div.topic::after,
392 | div.admonition::after,
393 | blockquote::after {
394 | display: block;
395 | content: '';
396 | clear: both;
397 | }
398 |
399 | /* -- tables ---------------------------------------------------------------- */
400 |
401 | table.docutils {
402 | margin-top: 10px;
403 | margin-bottom: 10px;
404 | border: 0;
405 | border-collapse: collapse;
406 | }
407 |
408 | table.align-center {
409 | margin-left: auto;
410 | margin-right: auto;
411 | }
412 |
413 | table.align-default {
414 | margin-left: auto;
415 | margin-right: auto;
416 | }
417 |
418 | table caption span.caption-number {
419 | font-style: italic;
420 | }
421 |
422 | table caption span.caption-text { /* NOTE(review): rule has no declarations; presumably a placeholder for theme overrides, confirm before removing */
423 | }
424 |
425 | table.docutils td, table.docutils th {
426 | padding: 1px 8px 1px 5px;
427 | border-top: 0;
428 | border-left: 0;
429 | border-right: 0;
430 | border-bottom: 1px solid #aaa;
431 | }
432 |
433 | th {
434 | text-align: left;
435 | padding-right: 5px;
436 | }
437 |
438 | table.citation {
439 | border-left: solid 1px gray;
440 | margin-left: 1px;
441 | }
442 |
443 | table.citation td {
444 | border-bottom: none;
445 | }
446 |
447 | th > :first-child,
448 | td > :first-child {
449 | margin-top: 0px;
450 | }
451 |
452 | th > :last-child,
453 | td > :last-child {
454 | margin-bottom: 0px;
455 | }
456 |
457 | /* -- figures --------------------------------------------------------------- */
458 |
459 | div.figure, figure {
460 | margin: 0.5em;
461 | padding: 0.5em;
462 | }
463 |
464 | div.figure p.caption, figcaption {
465 | padding: 0.3em;
466 | }
467 |
468 | div.figure p.caption span.caption-number,
469 | figcaption span.caption-number {
470 | font-style: italic;
471 | }
472 |
473 | div.figure p.caption span.caption-text,
474 | figcaption span.caption-text { /* NOTE(review): rule has no declarations; presumably a placeholder for theme overrides, confirm before removing */
475 | }
476 |
477 | /* -- field list styles ----------------------------------------------------- */
478 |
479 | table.field-list td, table.field-list th {
480 | border: 0 !important;
481 | }
482 |
483 | .field-list ul {
484 | margin: 0;
485 | padding-left: 1em;
486 | }
487 |
488 | .field-list p {
489 | margin: 0;
490 | }
491 |
492 | .field-name {
493 | -moz-hyphens: manual;
494 | -ms-hyphens: manual;
495 | -webkit-hyphens: manual;
496 | hyphens: manual;
497 | }
498 |
499 | /* -- hlist styles ---------------------------------------------------------- */
500 |
501 | table.hlist {
502 | margin: 1em 0;
503 | }
504 |
505 | table.hlist td {
506 | vertical-align: top;
507 | }
508 |
509 | /* -- object description styles --------------------------------------------- */
510 |
511 | .sig {
512 | font-family: 'Consolas', 'Menlo', 'DejaVu Sans Mono', 'Bitstream Vera Sans Mono', monospace;
513 | }
514 |
515 | .sig-name, code.descname {
516 | background-color: transparent;
517 | font-weight: bold;
518 | }
519 |
520 | .sig-name {
521 | font-size: 1.1em;
522 | }
523 |
524 | code.descname {
525 | font-size: 1.2em;
526 | }
527 |
528 | .sig-prename, code.descclassname {
529 | background-color: transparent;
530 | }
531 |
532 | .optional {
533 | font-size: 1.3em;
534 | }
535 |
536 | .sig-paren {
537 | font-size: larger;
538 | }
539 |
540 | .sig-param.n {
541 | font-style: italic;
542 | }
543 |
544 | /* C++ specific styling */
545 |
546 | .sig-inline.c-texpr,
547 | .sig-inline.cpp-texpr {
548 | font-family: unset;
549 | }
550 |
551 | .sig.c .k, .sig.c .kt,
552 | .sig.cpp .k, .sig.cpp .kt {
553 | color: #0033B3;
554 | }
555 |
556 | .sig.c .m,
557 | .sig.cpp .m {
558 | color: #1750EB;
559 | }
560 |
561 | .sig.c .s, .sig.c .sc,
562 | .sig.cpp .s, .sig.cpp .sc {
563 | color: #067D17;
564 | }
565 |
566 |
567 | /* -- other body styles ----------------------------------------------------- */
568 |
569 | ol.arabic {
570 | list-style: decimal;
571 | }
572 |
573 | ol.loweralpha {
574 | list-style: lower-alpha;
575 | }
576 |
577 | ol.upperalpha {
578 | list-style: upper-alpha;
579 | }
580 |
581 | ol.lowerroman {
582 | list-style: lower-roman;
583 | }
584 |
585 | ol.upperroman {
586 | list-style: upper-roman;
587 | }
588 |
589 | :not(li) > ol > li:first-child > :first-child,
590 | :not(li) > ul > li:first-child > :first-child {
591 | margin-top: 0px;
592 | }
593 |
594 | :not(li) > ol > li:last-child > :last-child,
595 | :not(li) > ul > li:last-child > :last-child {
596 | margin-bottom: 0px;
597 | }
598 |
599 | ol.simple ol p,
600 | ol.simple ul p,
601 | ul.simple ol p,
602 | ul.simple ul p {
603 | margin-top: 0;
604 | }
605 |
606 | ol.simple > li:not(:first-child) > p,
607 | ul.simple > li:not(:first-child) > p {
608 | margin-top: 0;
609 | }
610 |
611 | ol.simple p,
612 | ul.simple p {
613 | margin-bottom: 0;
614 | }
615 |
616 | aside.footnote > span,
617 | div.citation > span {
618 | float: left;
619 | }
620 | aside.footnote > span:last-of-type,
621 | div.citation > span:last-of-type {
622 | padding-right: 0.5em;
623 | }
624 | aside.footnote > p {
625 | margin-left: 2em;
626 | }
627 | div.citation > p {
628 | margin-left: 4em;
629 | }
630 | aside.footnote > p:last-of-type,
631 | div.citation > p:last-of-type {
632 | margin-bottom: 0em;
633 | }
634 | aside.footnote > p:last-of-type::after,
635 | div.citation > p:last-of-type::after {
636 | display: block; content: ""; /* clearfix: `clear` has no effect on the default inline pseudo-element box */
637 | clear: both; /* end the last paragraph below the floated label spans */
638 | }
639 |
640 | dl.field-list {
641 | display: grid;
642 | grid-template-columns: fit-content(30%) auto;
643 | }
644 |
645 | dl.field-list > dt {
646 | font-weight: bold;
647 | word-break: break-word;
648 | padding-left: 0.5em;
649 | padding-right: 5px;
650 | }
651 |
652 | dl.field-list > dd {
653 | padding-left: 0.5em;
654 | margin-top: 0em;
655 | margin-left: 0em;
656 | margin-bottom: 0em;
657 | }
658 |
659 | dl {
660 | margin-bottom: 15px;
661 | }
662 |
663 | dd > :first-child {
664 | margin-top: 0px;
665 | }
666 |
667 | dd ul, dd table {
668 | margin-bottom: 10px;
669 | }
670 |
671 | dd {
672 | margin-top: 3px;
673 | margin-bottom: 10px;
674 | margin-left: 30px;
675 | }
676 |
677 | .sig dd {
678 | margin-top: 0px;
679 | margin-bottom: 0px;
680 | }
681 |
682 | .sig dl {
683 | margin-top: 0px;
684 | margin-bottom: 0px;
685 | }
686 |
687 | dl > dd:last-child,
688 | dl > dd:last-child > :last-child {
689 | margin-bottom: 0;
690 | }
691 |
692 | dt:target, span.highlighted {
693 | background-color: #fbe54e;
694 | }
695 |
696 | rect.highlighted {
697 | fill: #fbe54e;
698 | }
699 |
700 | dl.glossary dt {
701 | font-weight: bold;
702 | font-size: 1.1em;
703 | }
704 |
705 | .versionmodified {
706 | font-style: italic;
707 | }
708 |
709 | .system-message {
710 | background-color: #fda;
711 | padding: 5px;
712 | border: 3px solid red;
713 | }
714 |
715 | .footnote:target {
716 | background-color: #ffa;
717 | }
718 |
719 | .line-block {
720 | display: block;
721 | margin-top: 1em;
722 | margin-bottom: 1em;
723 | }
724 |
725 | .line-block .line-block {
726 | margin-top: 0;
727 | margin-bottom: 0;
728 | margin-left: 1.5em;
729 | }
730 |
731 | .guilabel, .menuselection {
732 | font-family: sans-serif;
733 | }
734 |
735 | .accelerator {
736 | text-decoration: underline;
737 | }
738 |
739 | .classifier {
740 | font-style: oblique;
741 | }
742 |
743 | .classifier:before {
744 | font-style: normal;
745 | margin: 0 0.5em;
746 | content: ":";
747 | display: inline-block;
748 | }
749 |
750 | abbr, acronym {
751 | border-bottom: dotted 1px;
752 | cursor: help;
753 | }
754 |
755 | .translated { /* faint green tint */
756 | background-color: rgba(207, 255, 207, 0.2);
757 | }
758 |
759 | .untranslated { /* faint red tint */
760 | background-color: rgba(255, 207, 207, 0.2);
761 | }
762 |
763 | /* -- code displays --------------------------------------------------------- */
764 |
765 | pre {
766 | overflow: auto;
767 | overflow-y: hidden; /* fixes display issues on Chrome browsers */
768 | }
769 |
770 | pre, div[class*="highlight-"] {
771 | clear: both;
772 | }
773 |
774 | span.pre {
775 | -moz-hyphens: none;
776 | -ms-hyphens: none;
777 | -webkit-hyphens: none;
778 | hyphens: none;
779 | white-space: nowrap;
780 | }
781 |
782 | div[class*="highlight-"] {
783 | margin: 1em 0;
784 | }
785 |
786 | td.linenos pre {
787 | border: 0;
788 | background-color: transparent;
789 | color: #aaa;
790 | }
791 |
792 | table.highlighttable {
793 | display: block;
794 | }
795 |
796 | table.highlighttable tbody {
797 | display: block;
798 | }
799 |
800 | table.highlighttable tr {
801 | display: flex;
802 | }
803 |
804 | table.highlighttable td {
805 | margin: 0;
806 | padding: 0;
807 | }
808 |
809 | table.highlighttable td.linenos {
810 | padding-right: 0.5em;
811 | }
812 |
813 | table.highlighttable td.code {
814 | flex: 1;
815 | overflow: hidden;
816 | }
817 |
818 | .highlight .hll {
819 | display: block;
820 | }
821 |
822 | div.highlight pre,
823 | table.highlighttable pre {
824 | margin: 0;
825 | }
826 |
827 | div.code-block-caption + div {
828 | margin-top: 0;
829 | }
830 |
831 | div.code-block-caption {
832 | margin-top: 1em;
833 | padding: 2px 5px;
834 | font-size: small;
835 | }
836 |
837 | div.code-block-caption code {
838 | background-color: transparent;
839 | }
840 |
841 | table.highlighttable td.linenos,
842 | span.linenos,
843 | div.highlight span.gp { /* gp = Pygments Generic.Prompt token */
844 | -webkit-user-select: text; /* fallback value; replaced by the next line wherever `none` is accepted */
845 | -webkit-user-select: none; /* WebKit/Blink prefix */
846 | -moz-user-select: none; /* Gecko prefix */
847 | -ms-user-select: none; /* IE10+ prefix */
848 | user-select: none; /* unprefixed standard property */
849 | }
850 |
851 | div.code-block-caption span.caption-number {
852 | padding: 0.1em 0.3em;
853 | font-style: italic;
854 | }
855 |
856 | div.code-block-caption span.caption-text { /* NOTE(review): rule has no declarations; presumably a placeholder for theme overrides, confirm before removing */
857 | }
858 |
859 | div.literal-block-wrapper {
860 | margin: 1em 0;
861 | }
862 |
863 | code.xref, a code {
864 | background-color: transparent;
865 | font-weight: bold;
866 | }
867 |
868 | h1 code, h2 code, h3 code, h4 code, h5 code, h6 code {
869 | background-color: transparent;
870 | }
871 |
872 | .viewcode-link {
873 | float: right;
874 | }
875 |
876 | .viewcode-back {
877 | float: right;
878 | font-family: sans-serif;
879 | }
880 |
881 | div.viewcode-block:target {
882 | margin: -1px -10px;
883 | padding: 0 10px;
884 | }
885 |
886 | /* -- math display ---------------------------------------------------------- */
887 |
888 | img.math {
889 | vertical-align: middle;
890 | }
891 |
892 | div.body div.math p {
893 | text-align: center;
894 | }
895 |
896 | span.eqno {
897 | float: right;
898 | }
899 |
900 | span.eqno a.headerlink {
901 | position: absolute;
902 | z-index: 1;
903 | }
904 |
905 | div.math:hover a.headerlink {
906 | visibility: visible;
907 | }
908 |
909 | /* -- printout stylesheet --------------------------------------------------- */
910 |
911 | @media print {
912 | div.document,
913 | div.documentwrapper,
914 | div.bodywrapper {
915 | margin: 0 !important;
916 | width: 100%;
917 | }
918 |
919 | div.sphinxsidebar,
920 | div.related,
921 | div.footer,
922 | #top-link {
923 | display: none;
924 | }
925 | }
```
--------------------------------------------------------------------------------
/docs/_modules/mcp_server_webcrawl/models/sites.html:
--------------------------------------------------------------------------------
```html
1 |
2 |
3 | <!DOCTYPE html>
4 | <html class="writer-html5" lang="en" data-content_root="../../../">
5 | <head>
6 | <meta charset="utf-8" />
7 | <meta name="viewport" content="width=device-width, initial-scale=1.0" />
8 | <title>mcp_server_webcrawl.models.sites — mcp-server-webcrawl documentation</title>
9 | <link rel="stylesheet" type="text/css" href="../../../_static/pygments.css?v=80d5e7a1" />
10 | <link rel="stylesheet" type="text/css" href="../../../_static/css/theme.css?v=e59714d7" />
11 |
12 |
13 | <script src="../../../_static/jquery.js?v=5d32c60e"></script>
14 | <script src="../../../_static/_sphinx_javascript_frameworks_compat.js?v=2cd50e6c"></script>
15 | <script src="../../../_static/documentation_options.js?v=5929fcd5"></script>
16 | <script src="../../../_static/doctools.js?v=888ff710"></script>
17 | <script src="../../../_static/sphinx_highlight.js?v=dc90522c"></script>
18 | <script src="../../../_static/js/theme.js"></script>
19 | <link rel="index" title="Index" href="../../../genindex.html" />
20 | <link rel="search" title="Search" href="../../../search.html" />
21 | </head>
22 |
23 | <body class="wy-body-for-nav">
24 | <div class="wy-grid-for-nav">
25 | <nav data-toggle="wy-nav-shift" class="wy-nav-side">
26 | <div class="wy-side-scroll">
27 | <div class="wy-side-nav-search" >
28 |
29 |
30 |
31 | <a href="../../../index.html" class="icon icon-home">
32 | mcp-server-webcrawl
33 | </a>
34 | <div role="search">
35 | <form id="rtd-search-form" class="wy-form" action="../../../search.html" method="get">
36 | <input type="text" name="q" placeholder="Search docs" aria-label="Search docs" />
37 | <input type="hidden" name="check_keywords" value="yes" />
38 | <input type="hidden" name="area" value="default" />
39 | </form>
40 | </div>
41 | </div><div class="wy-menu wy-menu-vertical" data-spy="affix" role="navigation" aria-label="Navigation menu">
42 | <p class="caption" role="heading"><span class="caption-text">Contents:</span></p>
43 | <ul>
44 | <li class="toctree-l1"><a class="reference internal" href="../../../installation.html">Installation</a></li>
45 | <li class="toctree-l1"><a class="reference internal" href="../../../guides.html">Setup Guides</a></li>
46 | <li class="toctree-l1"><a class="reference internal" href="../../../usage.html">Usage</a></li>
47 | <li class="toctree-l1"><a class="reference internal" href="../../../prompts.html">Prompt Routines</a></li>
48 | <li class="toctree-l1"><a class="reference internal" href="../../../interactive.html">Interactive Mode</a></li>
49 | <li class="toctree-l1"><a class="reference internal" href="../../../modules.html">mcp_server_webcrawl</a></li>
50 | </ul>
51 |
52 | </div>
53 | </div>
54 | </nav>
55 |
56 | <section data-toggle="wy-nav-shift" class="wy-nav-content-wrap"><nav class="wy-nav-top" aria-label="Mobile navigation menu" >
57 | <i data-toggle="wy-nav-top" class="fa fa-bars"></i>
58 | <a href="../../../index.html">mcp-server-webcrawl</a>
59 | </nav>
60 |
61 | <div class="wy-nav-content">
62 | <div class="rst-content">
63 | <div role="navigation" aria-label="Page navigation">
64 | <ul class="wy-breadcrumbs">
65 | <li><a href="../../../index.html" class="icon icon-home" aria-label="Home"></a></li>
66 | <li class="breadcrumb-item"><a href="../../index.html">Module code</a></li>
67 | <li class="breadcrumb-item active">mcp_server_webcrawl.models.sites</li>
68 | <li class="wy-breadcrumbs-aside">
69 | </li>
70 | </ul>
71 | <hr/>
72 | </div>
73 | <div role="main" class="document" itemscope="itemscope" itemtype="http://schema.org/Article">
74 | <div itemprop="articleBody">
75 |
76 | <h1>Source code for mcp_server_webcrawl.models.sites</h1><div class="highlight"><pre>
77 | <span></span><span class="kn">from</span> <span class="nn">datetime</span> <span class="kn">import</span> <span class="n">datetime</span>
78 | <span class="kn">from</span> <span class="nn">typing</span> <span class="kn">import</span> <span class="n">Final</span>
79 | <span class="kn">from</span> <span class="nn">pathlib</span> <span class="kn">import</span> <span class="n">Path</span>
80 | <span class="kn">from</span> <span class="nn">enum</span> <span class="kn">import</span> <span class="n">Enum</span>
81 |
82 | <span class="kn">from</span> <span class="nn">mcp_server_webcrawl.models.base</span> <span class="kn">import</span> <span class="n">BaseModel</span><span class="p">,</span> <span class="n">METADATA_VALUE_TYPE</span>
83 | <span class="kn">from</span> <span class="nn">mcp_server_webcrawl.utils</span> <span class="kn">import</span> <span class="n">to_isoformat_zulu</span>
84 |
85 | <div class="viewcode-block" id="SiteType">
86 | <a class="viewcode-back" href="../../../mcp_server_webcrawl.models.html#mcp_server_webcrawl.models.sites.SiteType">[docs]</a>
87 | <span class="k">class</span> <span class="nc">SiteType</span><span class="p">(</span><span class="n">Enum</span><span class="p">):</span>
88 | <span class="n">UNDEFINED</span> <span class="o">=</span> <span class="s2">"undefined"</span>
89 | <span class="n">CRAWLED_URL</span> <span class="o">=</span> <span class="s2">"url"</span>
90 | <span class="n">CRAWLED_LIST</span> <span class="o">=</span> <span class="s2">"list"</span></div>
91 |
92 |
93 | <span class="n">SITES_TOOL_NAME</span><span class="p">:</span> <span class="n">Final</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="s2">"webcrawl_sites"</span>
94 | <span class="n">SITES_FIELDS_BASE</span><span class="p">:</span> <span class="n">Final</span><span class="p">[</span><span class="nb">list</span><span class="p">[</span><span class="nb">str</span><span class="p">]]</span> <span class="o">=</span> <span class="p">[</span><span class="s2">"id"</span><span class="p">,</span> <span class="s2">"name"</span><span class="p">,</span> <span class="s2">"type"</span><span class="p">,</span> <span class="s2">"urls"</span><span class="p">]</span>
95 | <span class="n">SITES_FIELDS_DEFAULT</span><span class="p">:</span> <span class="n">Final</span><span class="p">[</span><span class="nb">list</span><span class="p">[</span><span class="nb">str</span><span class="p">]]</span> <span class="o">=</span> <span class="n">SITES_FIELDS_BASE</span> <span class="o">+</span> <span class="p">[</span><span class="s2">"created"</span><span class="p">,</span> <span class="s2">"modified"</span><span class="p">]</span>
96 |
97 | <div class="viewcode-block" id="SiteResult">
98 | <a class="viewcode-back" href="../../../mcp_server_webcrawl.models.html#mcp_server_webcrawl.models.sites.SiteResult">[docs]</a>
99 | <span class="k">class</span> <span class="nc">SiteResult</span><span class="p">(</span><span class="n">BaseModel</span><span class="p">):</span>
100 | <span class="w"> </span><span class="sd">"""</span>
101 | <span class="sd"> Represents a website or crawl directory result.</span>
102 | <span class="sd"> """</span>
103 |
104 | <div class="viewcode-block" id="SiteResult.__init__">
105 | <a class="viewcode-back" href="../../../mcp_server_webcrawl.models.html#mcp_server_webcrawl.models.sites.SiteResult.__init__">[docs]</a>
106 | <span class="k">def</span> <span class="fm">__init__</span><span class="p">(</span>
107 | <span class="bp">self</span><span class="p">,</span>
108 | <span class="nb">id</span><span class="p">:</span> <span class="nb">int</span><span class="p">,</span>
109 | <span class="n">name</span><span class="p">:</span> <span class="nb">str</span> <span class="o">|</span> <span class="kc">None</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
110 | <span class="nb">type</span><span class="p">:</span> <span class="n">SiteType</span> <span class="o">=</span> <span class="n">SiteType</span><span class="o">.</span><span class="n">CRAWLED_URL</span><span class="p">,</span>
111 | <span class="n">urls</span><span class="p">:</span> <span class="nb">list</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">|</span> <span class="kc">None</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
112 | <span class="n">path</span><span class="p">:</span> <span class="n">Path</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
113 | <span class="n">created</span><span class="p">:</span> <span class="n">datetime</span> <span class="o">|</span> <span class="kc">None</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
114 | <span class="n">modified</span><span class="p">:</span> <span class="n">datetime</span> <span class="o">|</span> <span class="kc">None</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
115 | <span class="n">robots</span><span class="p">:</span> <span class="nb">str</span> <span class="o">|</span> <span class="kc">None</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
116 | <span class="n">metadata</span><span class="p">:</span> <span class="nb">dict</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="n">METADATA_VALUE_TYPE</span><span class="p">]</span> <span class="o">|</span> <span class="kc">None</span> <span class="o">=</span> <span class="kc">None</span>
117 | <span class="p">):</span>
118 | <span class="w"> </span><span class="sd">"""</span>
119 | <span class="sd"> Initialize a SiteResult instance.</span>
120 |
121 | <span class="sd"> Args:</span>
122 | <span class="sd"> id: site identifier</span>
123 | <span class="sd"> name: site name, either a URL or a custom job</span>
124 | <span class="sd"> urls: site URL(s), multiple for list type crawls</span>
125 | <span class="sd"> path: path to site data, different from datasrc</span>
126 | <span class="sd"> created: creation timestamp</span>
127 | <span class="sd"> modified: last modification timestamp</span>
128 | <span class="sd"> robots: robots.txt content</span>
129 | <span class="sd"> metadata: additional metadata for the site</span>
130 | <span class="sd"> """</span>
131 | <span class="bp">self</span><span class="o">.</span><span class="n">id</span> <span class="o">=</span> <span class="nb">id</span>
132 | <span class="bp">self</span><span class="o">.</span><span class="n">name</span> <span class="o">=</span> <span class="n">name</span>
133 | <span class="bp">self</span><span class="o">.</span><span class="n">type</span> <span class="o">=</span> <span class="nb">type</span>
134 | <span class="bp">self</span><span class="o">.</span><span class="n">urls</span> <span class="o">=</span> <span class="n">urls</span>
135 | <span class="bp">self</span><span class="o">.</span><span class="n">path</span> <span class="o">=</span> <span class="n">path</span>
136 | <span class="bp">self</span><span class="o">.</span><span class="n">created</span> <span class="o">=</span> <span class="n">created</span>
137 | <span class="bp">self</span><span class="o">.</span><span class="n">modified</span> <span class="o">=</span> <span class="n">modified</span>
138 | <span class="bp">self</span><span class="o">.</span><span class="n">robots</span> <span class="o">=</span> <span class="n">robots</span>
139 | <span class="bp">self</span><span class="o">.</span><span class="n">metadata</span> <span class="o">=</span> <span class="n">metadata</span> <span class="ow">or</span> <span class="p">{}</span></div>
140 |
141 |
142 | <div class="viewcode-block" id="SiteResult.to_dict">
143 | <a class="viewcode-back" href="../../../mcp_server_webcrawl.models.html#mcp_server_webcrawl.models.sites.SiteResult.to_dict">[docs]</a>
144 | <span class="k">def</span> <span class="nf">to_dict</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-></span> <span class="nb">dict</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="n">METADATA_VALUE_TYPE</span><span class="p">]:</span>
145 | <span class="w"> </span><span class="sd">"""</span>
146 | <span class="sd"> Convert the object to a dictionary suitable for JSON serialization.</span>
147 | <span class="sd"> """</span>
148 | <span class="n">result</span><span class="p">:</span> <span class="nb">dict</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="n">METADATA_VALUE_TYPE</span><span class="p">]</span> <span class="o">=</span> <span class="p">{</span>
149 | <span class="s2">"id"</span><span class="p">:</span> <span class="bp">self</span><span class="o">.</span><span class="n">id</span><span class="p">,</span>
150 | <span class="s2">"name"</span><span class="p">:</span> <span class="bp">self</span><span class="o">.</span><span class="n">name</span><span class="p">,</span>
151 | <span class="s2">"type"</span><span class="p">:</span> <span class="bp">self</span><span class="o">.</span><span class="n">type</span><span class="o">.</span><span class="n">value</span><span class="p">,</span>
152 | <span class="s2">"urls"</span><span class="p">:</span> <span class="bp">self</span><span class="o">.</span><span class="n">urls</span><span class="p">,</span>
153 | <span class="s2">"created"</span><span class="p">:</span> <span class="n">to_isoformat_zulu</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">created</span><span class="p">)</span> <span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">created</span> <span class="k">else</span> <span class="kc">None</span><span class="p">,</span>
154 | <span class="s2">"modified"</span><span class="p">:</span> <span class="n">to_isoformat_zulu</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">modified</span><span class="p">)</span> <span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">modified</span> <span class="k">else</span> <span class="kc">None</span><span class="p">,</span>
155 | <span class="s2">"metadata"</span><span class="p">:</span> <span class="bp">self</span><span class="o">.</span><span class="n">metadata</span> <span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">metadata</span> <span class="k">else</span> <span class="kc">None</span><span class="p">,</span>
156 | <span class="p">}</span>
157 |
158 | <span class="k">return</span> <span class="p">{</span><span class="n">k</span><span class="p">:</span> <span class="n">v</span> <span class="k">for</span> <span class="n">k</span><span class="p">,</span> <span class="n">v</span> <span class="ow">in</span> <span class="n">result</span><span class="o">.</span><span class="n">items</span><span class="p">()</span> <span class="k">if</span> <span class="n">v</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span> <span class="ow">and</span> <span class="ow">not</span> <span class="p">(</span><span class="n">k</span> <span class="o">==</span> <span class="s2">"metadata"</span> <span class="ow">and</span> <span class="n">v</span> <span class="o">==</span> <span class="p">{})}</span></div>
159 | </div>
160 |
161 | </pre></div>
162 |
163 | </div>
164 | </div>
165 | <footer>
166 |
167 | <hr/>
168 |
169 | <div role="contentinfo">
170 | <p>© Copyright 2025, pragmar.</p>
171 | </div>
172 |
173 | Built with <a href="https://www.sphinx-doc.org/">Sphinx</a> using a
174 | <a href="https://github.com/readthedocs/sphinx_rtd_theme">theme</a>
175 | provided by <a href="https://readthedocs.org">Read the Docs</a>.
176 |
177 |
178 | </footer>
179 | </div>
180 | </div>
181 | </section>
182 | </div>
183 | <script>
184 | jQuery(function () {
185 | SphinxRtdTheme.Navigation.enable(true);
186 | });
187 | </script>
188 |
189 | </body>
190 | </html>
```
--------------------------------------------------------------------------------
/docs/_modules/mcp_server_webcrawl/crawlers/warc/tests.html:
--------------------------------------------------------------------------------
```html
1 |
2 |
3 | <!DOCTYPE html>
4 | <html class="writer-html5" lang="en" data-content_root="../../../../">
5 | <head>
6 | <meta charset="utf-8" />
7 | <meta name="viewport" content="width=device-width, initial-scale=1.0" />
8 | <title>mcp_server_webcrawl.crawlers.warc.tests — mcp-server-webcrawl documentation</title>
9 | <link rel="stylesheet" type="text/css" href="../../../../_static/pygments.css?v=80d5e7a1" />
10 | <link rel="stylesheet" type="text/css" href="../../../../_static/css/theme.css?v=e59714d7" />
11 |
12 |
13 | <script src="../../../../_static/jquery.js?v=5d32c60e"></script>
14 | <script src="../../../../_static/_sphinx_javascript_frameworks_compat.js?v=2cd50e6c"></script>
15 | <script src="../../../../_static/documentation_options.js?v=5929fcd5"></script>
16 | <script src="../../../../_static/doctools.js?v=888ff710"></script>
17 | <script src="../../../../_static/sphinx_highlight.js?v=dc90522c"></script>
18 | <script src="../../../../_static/js/theme.js"></script>
19 | <link rel="index" title="Index" href="../../../../genindex.html" />
20 | <link rel="search" title="Search" href="../../../../search.html" />
21 | </head>
22 |
23 | <body class="wy-body-for-nav">
24 | <div class="wy-grid-for-nav">
25 | <nav data-toggle="wy-nav-shift" class="wy-nav-side">
26 | <div class="wy-side-scroll">
27 | <div class="wy-side-nav-search" >
28 |
29 |
30 |
31 | <a href="../../../../index.html" class="icon icon-home">
32 | mcp-server-webcrawl
33 | </a>
34 | <div role="search">
35 | <form id="rtd-search-form" class="wy-form" action="../../../../search.html" method="get">
36 | <input type="text" name="q" placeholder="Search docs" aria-label="Search docs" />
37 | <input type="hidden" name="check_keywords" value="yes" />
38 | <input type="hidden" name="area" value="default" />
39 | </form>
40 | </div>
41 | </div><div class="wy-menu wy-menu-vertical" data-spy="affix" role="navigation" aria-label="Navigation menu">
42 | <p class="caption" role="heading"><span class="caption-text">Contents:</span></p>
43 | <ul>
44 | <li class="toctree-l1"><a class="reference internal" href="../../../../installation.html">Installation</a></li>
45 | <li class="toctree-l1"><a class="reference internal" href="../../../../guides.html">Setup Guides</a></li>
46 | <li class="toctree-l1"><a class="reference internal" href="../../../../usage.html">Usage</a></li>
47 | <li class="toctree-l1"><a class="reference internal" href="../../../../prompts.html">Prompt Routines</a></li>
48 | <li class="toctree-l1"><a class="reference internal" href="../../../../modules.html">mcp_server_webcrawl</a></li>
49 | </ul>
50 |
51 | </div>
52 | </div>
53 | </nav>
54 |
55 | <section data-toggle="wy-nav-shift" class="wy-nav-content-wrap"><nav class="wy-nav-top" aria-label="Mobile navigation menu" >
56 | <i data-toggle="wy-nav-top" class="fa fa-bars"></i>
57 | <a href="../../../../index.html">mcp-server-webcrawl</a>
58 | </nav>
59 |
60 | <div class="wy-nav-content">
61 | <div class="rst-content">
62 | <div role="navigation" aria-label="Page navigation">
63 | <ul class="wy-breadcrumbs">
64 | <li><a href="../../../../index.html" class="icon icon-home" aria-label="Home"></a></li>
65 | <li class="breadcrumb-item"><a href="../../../index.html">Module code</a></li>
66 | <li class="breadcrumb-item"><a href="../../crawlers.html">mcp_server_webcrawl.crawlers</a></li>
67 | <li class="breadcrumb-item active">mcp_server_webcrawl.crawlers.warc.tests</li>
68 | <li class="wy-breadcrumbs-aside">
69 | </li>
70 | </ul>
71 | <hr/>
72 | </div>
73 | <div role="main" class="document" itemscope="itemscope" itemtype="http://schema.org/Article">
74 | <div itemprop="articleBody">
75 |
76 | <h1>Source code for mcp_server_webcrawl.crawlers.warc.tests</h1><div class="highlight"><pre>
77 | <span></span><span class="kn">from</span> <span class="nn">mcp_server_webcrawl.crawlers.warc.crawler</span> <span class="kn">import</span> <span class="n">WarcCrawler</span>
78 | <span class="kn">from</span> <span class="nn">mcp_server_webcrawl.crawlers.warc.adapter</span> <span class="kn">import</span> <span class="n">WarcManager</span>
79 | <span class="kn">from</span> <span class="nn">mcp_server_webcrawl.crawlers.base.tests</span> <span class="kn">import</span> <span class="n">BaseCrawlerTests</span>
80 | <span class="kn">from</span> <span class="nn">mcp_server_webcrawl.crawlers</span> <span class="kn">import</span> <span class="n">get_fixture_directory</span>
81 | <span class="kn">from</span> <span class="nn">mcp_server_webcrawl.utils.logger</span> <span class="kn">import</span> <span class="n">get_logger</span>
82 |
83 | <span class="n">EXAMPLE_WARC_ID</span><span class="p">:</span> <span class="nb">int</span> <span class="o">=</span> <span class="n">WarcManager</span><span class="o">.</span><span class="n">string_to_id</span><span class="p">(</span><span class="s2">"example.warc.gz"</span><span class="p">)</span>
84 | <span class="n">PRAGMAR_WARC_ID</span><span class="p">:</span> <span class="nb">int</span> <span class="o">=</span> <span class="n">WarcManager</span><span class="o">.</span><span class="n">string_to_id</span><span class="p">(</span><span class="s2">"pragmar.warc.gz"</span><span class="p">)</span>
85 |
86 | <span class="n">logger</span> <span class="o">=</span> <span class="n">get_logger</span><span class="p">()</span>
87 |
88 | <div class="viewcode-block" id="WarcTests">
89 | <a class="viewcode-back" href="../../../../mcp_server_webcrawl.crawlers.warc.html#mcp_server_webcrawl.crawlers.warc.tests.WarcTests">[docs]</a>
90 | <span class="k">class</span> <span class="nc">WarcTests</span><span class="p">(</span><span class="n">BaseCrawlerTests</span><span class="p">):</span>
91 | <span class="w"> </span><span class="sd">"""</span>
92 | <span class="sd"> Test suite for the WARC crawler implementation.</span>
93 | <span class="sd"> Uses all wrapped test methods from BaseCrawlerTests.</span>
94 | <span class="sd"> """</span>
95 |
96 | <div class="viewcode-block" id="WarcTests.setUp">
97 | <a class="viewcode-back" href="../../../../mcp_server_webcrawl.crawlers.warc.html#mcp_server_webcrawl.crawlers.warc.tests.WarcTests.setUp">[docs]</a>
98 | <span class="k">def</span> <span class="nf">setUp</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
99 | <span class="w"> </span><span class="sd">"""</span>
100 | <span class="sd"> Set up the test environment with fixture data.</span>
101 | <span class="sd"> """</span>
102 | <span class="nb">super</span><span class="p">()</span><span class="o">.</span><span class="n">setUp</span><span class="p">()</span>
103 | <span class="bp">self</span><span class="o">.</span><span class="n">_datasrc</span> <span class="o">=</span> <span class="n">get_fixture_directory</span><span class="p">()</span> <span class="o">/</span> <span class="s2">"warc"</span></div>
104 |
105 |
106 | <div class="viewcode-block" id="WarcTests.test_warc_pulse">
107 | <a class="viewcode-back" href="../../../../mcp_server_webcrawl.crawlers.warc.html#mcp_server_webcrawl.crawlers.warc.tests.WarcTests.test_warc_pulse">[docs]</a>
108 | <span class="k">def</span> <span class="nf">test_warc_pulse</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
109 | <span class="w"> </span><span class="sd">"""</span>
110 | <span class="sd"> Test basic crawler initialization.</span>
111 | <span class="sd"> """</span>
112 | <span class="n">crawler</span> <span class="o">=</span> <span class="n">WarcCrawler</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_datasrc</span><span class="p">)</span>
113 | <span class="bp">self</span><span class="o">.</span><span class="n">assertIsNotNone</span><span class="p">(</span><span class="n">crawler</span><span class="p">)</span>
114 | <span class="bp">self</span><span class="o">.</span><span class="n">assertTrue</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_datasrc</span><span class="o">.</span><span class="n">is_dir</span><span class="p">())</span></div>
115 |
116 |
117 | <div class="viewcode-block" id="WarcTests.test_warc_sites">
118 | <a class="viewcode-back" href="../../../../mcp_server_webcrawl.crawlers.warc.html#mcp_server_webcrawl.crawlers.warc.tests.WarcTests.test_warc_sites">[docs]</a>
119 | <span class="k">def</span> <span class="nf">test_warc_sites</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
120 | <span class="w"> </span><span class="sd">"""</span>
121 | <span class="sd"> Test site retrieval API functionality.</span>
122 | <span class="sd"> """</span>
123 | <span class="n">crawler</span> <span class="o">=</span> <span class="n">WarcCrawler</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_datasrc</span><span class="p">)</span>
124 | <span class="bp">self</span><span class="o">.</span><span class="n">run_pragmar_site_tests</span><span class="p">(</span><span class="n">crawler</span><span class="p">,</span> <span class="n">PRAGMAR_WARC_ID</span><span class="p">)</span></div>
125 |
126 |
127 | <div class="viewcode-block" id="WarcTests.test_warc_search">
128 | <a class="viewcode-back" href="../../../../mcp_server_webcrawl.crawlers.warc.html#mcp_server_webcrawl.crawlers.warc.tests.WarcTests.test_warc_search">[docs]</a>
129 | <span class="k">def</span> <span class="nf">test_warc_search</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
130 | <span class="w"> </span><span class="sd">"""</span>
131 | <span class="sd"> Test boolean search functionality.</span>
132 | <span class="sd"> """</span>
133 | <span class="n">crawler</span> <span class="o">=</span> <span class="n">WarcCrawler</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_datasrc</span><span class="p">)</span>
134 | <span class="bp">self</span><span class="o">.</span><span class="n">run_pragmar_search_tests</span><span class="p">(</span><span class="n">crawler</span><span class="p">,</span> <span class="n">PRAGMAR_WARC_ID</span><span class="p">)</span></div>
135 |
136 |
137 | <div class="viewcode-block" id="WarcTests.test_warc_resources">
138 | <a class="viewcode-back" href="../../../../mcp_server_webcrawl.crawlers.warc.html#mcp_server_webcrawl.crawlers.warc.tests.WarcTests.test_warc_resources">[docs]</a>
139 | <span class="k">def</span> <span class="nf">test_warc_resources</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
140 | <span class="w"> </span><span class="sd">"""</span>
141 | <span class="sd"> Test resource retrieval API functionality with various parameters.</span>
142 | <span class="sd"> """</span>
143 | <span class="n">crawler</span> <span class="o">=</span> <span class="n">WarcCrawler</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_datasrc</span><span class="p">)</span>
144 | <span class="bp">self</span><span class="o">.</span><span class="n">run_sites_resources_tests</span><span class="p">(</span><span class="n">crawler</span><span class="p">,</span> <span class="n">PRAGMAR_WARC_ID</span><span class="p">,</span> <span class="n">EXAMPLE_WARC_ID</span><span class="p">)</span></div>
145 |
146 |
147 | <span class="c1"># the pragmar WARC fixture genuinely contains no images;</span>
148 | <span class="c1"># this may be the default behavior of wget WARC generation (unconfirmed),</span>
149 | <span class="c1"># so image handling is an untested blind spot for this crawler</span>
150 | <span class="c1"># def test_warc_images(self):</span>
151 | <span class="c1"># """</span>
152 | <span class="c1"># Test WARC-specific image handling and thumbnails.</span>
153 | <span class="c1"># """</span>
154 | <span class="c1"># crawler = WarcCrawler(self._datasrc)</span>
155 | <span class="c1"># self.run_pragmar_image_tests(crawler, PRAGMAR_WARC_ID)</span>
156 |
157 | <div class="viewcode-block" id="WarcTests.test_warc_sorts">
158 | <a class="viewcode-back" href="../../../../mcp_server_webcrawl.crawlers.warc.html#mcp_server_webcrawl.crawlers.warc.tests.WarcTests.test_warc_sorts">[docs]</a>
159 | <span class="k">def</span> <span class="nf">test_warc_sorts</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
160 | <span class="w"> </span><span class="sd">"""</span>
161 | <span class="sd"> Test random sort functionality using the '?' sort parameter.</span>
162 | <span class="sd"> """</span>
163 | <span class="n">crawler</span> <span class="o">=</span> <span class="n">WarcCrawler</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_datasrc</span><span class="p">)</span>
164 | <span class="bp">self</span><span class="o">.</span><span class="n">run_pragmar_sort_tests</span><span class="p">(</span><span class="n">crawler</span><span class="p">,</span> <span class="n">PRAGMAR_WARC_ID</span><span class="p">)</span></div>
165 |
166 |
167 | <div class="viewcode-block" id="WarcTests.test_warc_content_parsing">
168 | <a class="viewcode-back" href="../../../../mcp_server_webcrawl.crawlers.warc.html#mcp_server_webcrawl.crawlers.warc.tests.WarcTests.test_warc_content_parsing">[docs]</a>
169 | <span class="k">def</span> <span class="nf">test_warc_content_parsing</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
170 | <span class="w"> </span><span class="sd">"""</span>
171 | <span class="sd"> Test content type detection and parsing for WARC files.</span>
172 | <span class="sd"> """</span>
173 | <span class="n">crawler</span> <span class="o">=</span> <span class="n">WarcCrawler</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_datasrc</span><span class="p">)</span>
174 | <span class="bp">self</span><span class="o">.</span><span class="n">run_pragmar_content_tests</span><span class="p">(</span><span class="n">crawler</span><span class="p">,</span> <span class="n">PRAGMAR_WARC_ID</span><span class="p">,</span> <span class="kc">True</span><span class="p">)</span></div>
175 |
176 |
177 | <div class="viewcode-block" id="WarcTests.test_report">
178 | <a class="viewcode-back" href="../../../../mcp_server_webcrawl.crawlers.warc.html#mcp_server_webcrawl.crawlers.warc.tests.WarcTests.test_report">[docs]</a>
179 | <span class="k">def</span> <span class="nf">test_report</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
180 | <span class="w"> </span><span class="sd">"""</span>
181 | <span class="sd"> Run test report, save to data directory.</span>
182 | <span class="sd"> """</span>
183 | <span class="n">crawler</span> <span class="o">=</span> <span class="n">WarcCrawler</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_datasrc</span><span class="p">)</span>
184 | <span class="n">logger</span><span class="o">.</span><span class="n">info</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">run_pragmar_report</span><span class="p">(</span><span class="n">crawler</span><span class="p">,</span> <span class="n">PRAGMAR_WARC_ID</span><span class="p">,</span> <span class="s2">"WARC"</span><span class="p">))</span></div>
185 | </div>
186 |
187 | </pre></div>
188 |
189 | </div>
190 | </div>
191 | <footer>
192 |
193 | <hr/>
194 |
195 | <div role="contentinfo">
196 | <p>© Copyright 2025, pragmar.</p>
197 | </div>
198 |
199 | Built with <a href="https://www.sphinx-doc.org/">Sphinx</a> using a
200 | <a href="https://github.com/readthedocs/sphinx_rtd_theme">theme</a>
201 | provided by <a href="https://readthedocs.org">Read the Docs</a>.
202 |
203 |
204 | </footer>
205 | </div>
206 | </div>
207 | </section>
208 | </div>
209 | <script>
210 | jQuery(function () {
211 | SphinxRtdTheme.Navigation.enable(true);
212 | });
213 | </script>
214 |
215 | </body>
216 | </html>
```
--------------------------------------------------------------------------------
/docs/_modules/mcp_server_webcrawl/utils/logger.html:
--------------------------------------------------------------------------------
```html
1 |
2 |
3 | <!DOCTYPE html>
4 | <html class="writer-html5" lang="en" data-content_root="../../../">
5 | <head>
6 | <meta charset="utf-8" />
7 | <meta name="viewport" content="width=device-width, initial-scale=1.0" />
8 | <title>mcp_server_webcrawl.utils.logger — mcp-server-webcrawl documentation</title>
9 | <link rel="stylesheet" type="text/css" href="../../../_static/pygments.css?v=80d5e7a1" />
10 | <link rel="stylesheet" type="text/css" href="../../../_static/css/theme.css?v=e59714d7" />
11 |
12 |
13 | <script src="../../../_static/jquery.js?v=5d32c60e"></script>
14 | <script src="../../../_static/_sphinx_javascript_frameworks_compat.js?v=2cd50e6c"></script>
15 | <script src="../../../_static/documentation_options.js?v=5929fcd5"></script>
16 | <script src="../../../_static/doctools.js?v=888ff710"></script>
17 | <script src="../../../_static/sphinx_highlight.js?v=dc90522c"></script>
18 | <script src="../../../_static/js/theme.js"></script>
19 | <link rel="index" title="Index" href="../../../genindex.html" />
20 | <link rel="search" title="Search" href="../../../search.html" />
21 | </head>
22 |
23 | <body class="wy-body-for-nav">
24 | <div class="wy-grid-for-nav">
25 | <nav data-toggle="wy-nav-shift" class="wy-nav-side">
26 | <div class="wy-side-scroll">
27 | <div class="wy-side-nav-search" >
28 |
29 |
30 |
31 | <a href="../../../index.html" class="icon icon-home">
32 | mcp-server-webcrawl
33 | </a>
34 | <div role="search">
35 | <form id="rtd-search-form" class="wy-form" action="../../../search.html" method="get">
36 | <input type="text" name="q" placeholder="Search docs" aria-label="Search docs" />
37 | <input type="hidden" name="check_keywords" value="yes" />
38 | <input type="hidden" name="area" value="default" />
39 | </form>
40 | </div>
41 | </div><div class="wy-menu wy-menu-vertical" data-spy="affix" role="navigation" aria-label="Navigation menu">
42 | <p class="caption" role="heading"><span class="caption-text">Contents:</span></p>
43 | <ul>
44 | <li class="toctree-l1"><a class="reference internal" href="../../../installation.html">Installation</a></li>
45 | <li class="toctree-l1"><a class="reference internal" href="../../../guides.html">Setup Guides</a></li>
46 | <li class="toctree-l1"><a class="reference internal" href="../../../usage.html">Usage</a></li>
47 | <li class="toctree-l1"><a class="reference internal" href="../../../prompts.html">Prompt Routines</a></li>
48 | <li class="toctree-l1"><a class="reference internal" href="../../../interactive.html">Interactive Mode</a></li>
49 | <li class="toctree-l1"><a class="reference internal" href="../../../modules.html">mcp_server_webcrawl</a></li>
50 | </ul>
51 |
52 | </div>
53 | </div>
54 | </nav>
55 |
56 | <section data-toggle="wy-nav-shift" class="wy-nav-content-wrap"><nav class="wy-nav-top" aria-label="Mobile navigation menu" >
57 | <i data-toggle="wy-nav-top" class="fa fa-bars"></i>
58 | <a href="../../../index.html">mcp-server-webcrawl</a>
59 | </nav>
60 |
61 | <div class="wy-nav-content">
62 | <div class="rst-content">
63 | <div role="navigation" aria-label="Page navigation">
64 | <ul class="wy-breadcrumbs">
65 | <li><a href="../../../index.html" class="icon icon-home" aria-label="Home"></a></li>
66 | <li class="breadcrumb-item"><a href="../../index.html">Module code</a></li>
67 | <li class="breadcrumb-item"><a href="../utils.html">mcp_server_webcrawl.utils</a></li>
68 | <li class="breadcrumb-item active">mcp_server_webcrawl.utils.logger</li>
69 | <li class="wy-breadcrumbs-aside">
70 | </li>
71 | </ul>
72 | <hr/>
73 | </div>
74 | <div role="main" class="document" itemscope="itemscope" itemtype="http://schema.org/Article">
75 | <div itemprop="articleBody">
76 |
77 | <h1>Source code for mcp_server_webcrawl.utils.logger</h1><div class="highlight"><pre>
78 | <span></span><span class="kn">import</span> <span class="nn">logging</span>
79 | <span class="kn">from</span> <span class="nn">pathlib</span> <span class="kn">import</span> <span class="n">Path</span>
80 | <span class="kn">from</span> <span class="nn">typing</span> <span class="kn">import</span> <span class="n">Final</span>
81 |
82 | <span class="kn">import</span> <span class="nn">mcp_server_webcrawl.settings</span> <span class="k">as</span> <span class="nn">settings</span>
83 | <span class="kn">from</span> <span class="nn">mcp_server_webcrawl.settings</span> <span class="kn">import</span> <span class="n">DEBUG</span><span class="p">,</span> <span class="n">DATA_DIRECTORY</span>
84 |
85 | <span class="n">DEFAULT_LOG_KEY</span><span class="p">:</span> <span class="n">Final</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="s2">"mcp-server-webcrawl"</span>
86 | <span class="n">DEFAULT_LOG_PATH</span><span class="p">:</span> <span class="n">Final</span><span class="p">[</span><span class="n">Path</span><span class="p">]</span> <span class="o">=</span> <span class="n">DATA_DIRECTORY</span> <span class="o">/</span> <span class="s2">"mcp-server-webcrawl.log"</span>
87 | <span class="n">DEFAULT_LOG_LEVEL</span><span class="p">:</span> <span class="n">Final</span><span class="p">[</span><span class="nb">int</span><span class="p">]</span> <span class="o">=</span> <span class="n">logging</span><span class="o">.</span><span class="n">WARNING</span>
88 |
89 | <div class="viewcode-block" id="get_logger_configuration">
90 | <a class="viewcode-back" href="../../../mcp_server_webcrawl.utils.html#mcp_server_webcrawl.utils.logger.get_logger_configuration">[docs]</a>
91 | <span class="k">def</span> <span class="nf">get_logger_configuration</span><span class="p">()</span> <span class="o">-></span> <span class="nb">tuple</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="n">Path</span><span class="p">,</span> <span class="nb">int</span><span class="p">]:</span>
92 | <span class="w"> </span><span class="sd">"""</span>
93 | <span class="sd"> Get log name, path, and level (in that order)</span>
94 |
95 | <span class="sd"> Returns:</span>
96 | <span class="sd"> tuple[str, Path, int]: A tuple containing name, path, and level</span>
97 | <span class="sd"> """</span>
98 |
99 | <span class="n">log_level</span><span class="p">:</span> <span class="nb">int</span> <span class="o">=</span> <span class="n">logging</span><span class="o">.</span><span class="n">DEBUG</span> <span class="k">if</span> <span class="n">DEBUG</span> <span class="k">else</span> <span class="nb">getattr</span><span class="p">(</span><span class="n">settings</span><span class="p">,</span> <span class="s2">"LOG_LEVEL"</span><span class="p">,</span> <span class="n">DEFAULT_LOG_LEVEL</span><span class="p">)</span>
100 | <span class="n">log_path</span><span class="p">:</span> <span class="n">Path</span> <span class="o">=</span> <span class="nb">getattr</span><span class="p">(</span><span class="n">settings</span><span class="p">,</span> <span class="s2">"LOG_PATH"</span><span class="p">,</span> <span class="n">DEFAULT_LOG_PATH</span><span class="p">)</span>
101 |
102 | <span class="k">return</span> <span class="p">(</span><span class="n">DEFAULT_LOG_KEY</span><span class="p">,</span> <span class="n">log_path</span><span class="p">,</span> <span class="n">log_level</span><span class="p">)</span></div>
103 |
104 |
105 | <div class="viewcode-block" id="get_logger">
106 | <a class="viewcode-back" href="../../../mcp_server_webcrawl.utils.html#mcp_server_webcrawl.utils.logger.get_logger">[docs]</a>
107 | <span class="k">def</span> <span class="nf">get_logger</span><span class="p">()</span> <span class="o">-></span> <span class="n">logging</span><span class="o">.</span><span class="n">Logger</span><span class="p">:</span>
108 | <span class="w"> </span><span class="sd">"""</span>
109 | <span class="sd"> Get logger, usually in order to write to it</span>
110 |
111 | <span class="sd"> Returns:</span>
112 | <span class="sd"> Logger: a writable logging object (error/warn/info/debug)</span>
113 | <span class="sd"> """</span>
114 |
115 | <span class="p">(</span><span class="n">log_name</span><span class="p">,</span> <span class="n">_</span><span class="p">,</span> <span class="n">_</span><span class="p">)</span> <span class="o">=</span> <span class="n">get_logger_configuration</span><span class="p">()</span>
116 | <span class="k">return</span> <span class="n">logging</span><span class="o">.</span><span class="n">getLogger</span><span class="p">(</span><span class="n">log_name</span><span class="p">)</span></div>
117 |
118 |
119 | <div class="viewcode-block" id="initialize_logger">
120 | <a class="viewcode-back" href="../../../mcp_server_webcrawl.utils.html#mcp_server_webcrawl.utils.logger.initialize_logger">[docs]</a>
121 | <span class="k">def</span> <span class="nf">initialize_logger</span><span class="p">()</span> <span class="o">-></span> <span class="kc">None</span><span class="p">:</span>
122 | <span class="w"> </span><span class="sd">"""</span>
123 | <span class="sd"> Validate and set up logger for writing</span>
124 |
125 | <span class="sd"> Returns:</span>
126 | <span class="sd"> None</span>
127 | <span class="sd"> """</span>
128 |
129 | <span class="p">(</span><span class="n">log_name</span><span class="p">,</span> <span class="n">log_path</span><span class="p">,</span> <span class="n">log_level</span><span class="p">)</span> <span class="o">=</span> <span class="n">get_logger_configuration</span><span class="p">()</span>
130 | <span class="k">if</span> <span class="n">log_level</span> <span class="o">==</span> <span class="n">logging</span><span class="o">.</span><span class="n">NOTSET</span><span class="p">:</span>
131 | <span class="c1"># don't set up anything, named logging will effectively evaporate</span>
132 | <span class="k">return</span>
133 |
134 | <span class="k">assert</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">log_level</span><span class="p">,</span> <span class="nb">int</span><span class="p">)</span> <span class="ow">and</span> <span class="n">log_level</span> <span class="o">!=</span> <span class="mi">0</span><span class="p">,</span> <span class="s2">"LOG_LEVEL must be set"</span>
135 | <span class="k">assert</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">log_path</span><span class="p">,</span> <span class="n">Path</span><span class="p">),</span> <span class="s2">"LOG_PATH must be a Path object"</span>
136 | <span class="k">assert</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">log_name</span><span class="p">,</span> <span class="nb">str</span><span class="p">)</span> <span class="ow">and</span> <span class="n">log_name</span><span class="o">.</span><span class="n">strip</span><span class="p">()</span> <span class="o">!=</span> <span class="s2">""</span><span class="p">,</span> <span class="s2">"LOG_NAME must be a non-empty string"</span>
137 | <span class="k">assert</span> <span class="nb">all</span><span class="p">(</span><span class="n">c</span><span class="o">.</span><span class="n">isalpha</span><span class="p">()</span> <span class="ow">or</span> <span class="n">c</span> <span class="ow">in</span> <span class="s2">"-_"</span> <span class="k">for</span> <span class="n">c</span> <span class="ow">in</span> <span class="n">log_name</span><span class="p">),</span> <span class="s2">"LOG_NAME must contain only A-Z, a-z, hyphens, and underscores"</span>
138 |
139 | <span class="c1"># handle custom log paths differently, don't generate directories</span>
140 | <span class="k">if</span> <span class="s2">".mcp_server_webcrawl"</span> <span class="ow">in</span> <span class="nb">str</span><span class="p">(</span><span class="n">log_path</span><span class="p">):</span>
141 | <span class="n">log_path</span><span class="o">.</span><span class="n">parent</span><span class="o">.</span><span class="n">mkdir</span><span class="p">(</span><span class="n">parents</span><span class="o">=</span><span class="kc">True</span><span class="p">,</span> <span class="n">exist_ok</span><span class="o">=</span><span class="kc">True</span><span class="p">)</span>
142 | <span class="k">else</span><span class="p">:</span>
143 | <span class="k">assert</span> <span class="n">log_path</span><span class="o">.</span><span class="n">parent</span><span class="o">.</span><span class="n">exists</span><span class="p">()</span> <span class="ow">and</span> <span class="n">log_path</span><span class="o">.</span><span class="n">parent</span><span class="o">.</span><span class="n">is_dir</span><span class="p">(),</span> \
144 | <span class="sa">f</span><span class="s2">"Custom parent directory `</span><span class="si">{</span><span class="n">log_path</span><span class="o">.</span><span class="n">parent</span><span class="si">}</span><span class="s2">` does not exist or is not a directory"</span>
145 |
146 | <span class="n">logging</span><span class="o">.</span><span class="n">basicConfig</span><span class="p">(</span><span class="n">filename</span><span class="o">=</span><span class="nb">str</span><span class="p">(</span><span class="n">log_path</span><span class="p">),</span> <span class="n">filemode</span><span class="o">=</span><span class="s2">"w"</span><span class="p">,</span> <span class="n">level</span><span class="o">=</span><span class="n">log_level</span><span class="p">,</span>
147 | <span class="nb">format</span><span class="o">=</span><span class="s2">"</span><span class="si">%(asctime)s</span><span class="s2"> - </span><span class="si">%(name)s</span><span class="s2"> - </span><span class="si">%(levelname)s</span><span class="s2"> - </span><span class="si">%(message)s</span><span class="s2">"</span><span class="p">,</span>
148 | <span class="n">datefmt</span><span class="o">=</span><span class="s2">"%Y-%m-</span><span class="si">%d</span><span class="s2"> %H:%M:%S"</span><span class="p">,</span> <span class="n">encoding</span><span class="o">=</span><span class="s2">"utf-8"</span><span class="p">)</span>
149 |
150 | <span class="n">logger</span><span class="p">:</span> <span class="n">logging</span><span class="o">.</span><span class="n">Logger</span> <span class="o">=</span> <span class="n">logging</span><span class="o">.</span><span class="n">getLogger</span><span class="p">(</span><span class="n">log_name</span><span class="p">)</span>
151 |
152 | <span class="c1"># just set a few ops back, concurrent logger might not be ready</span>
153 | <span class="k">if</span> <span class="n">log_level</span> <span class="o"><=</span> <span class="n">logging</span><span class="o">.</span><span class="n">INFO</span><span class="p">:</span>
154 | <span class="n">logger</span><span class="o">.</span><span class="n">info</span><span class="p">(</span><span class="s2">"🖥️ starting webcrawl MCP server"</span><span class="p">)</span>
155 | <span class="n">log_extra</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="s2">"(Debug is True)"</span> <span class="k">if</span> <span class="n">DEBUG</span> <span class="k">else</span> <span class="s2">""</span>
156 | <span class="n">logger</span><span class="o">.</span><span class="n">info</span><span class="p">(</span><span class="sa">f</span><span class="s2">"log level set to </span><span class="si">{</span><span class="n">log_level</span><span class="si">}</span><span class="s2"> </span><span class="si">{</span><span class="n">log_extra</span><span class="si">}</span><span class="s2">"</span><span class="p">)</span></div>
157 |
158 | </pre></div>
159 |
160 | </div>
161 | </div>
162 | <footer>
163 |
164 | <hr/>
165 |
166 | <div role="contentinfo">
167 | <p>© Copyright 2025, pragmar.</p>
168 | </div>
169 |
170 | Built with <a href="https://www.sphinx-doc.org/">Sphinx</a> using a
171 | <a href="https://github.com/readthedocs/sphinx_rtd_theme">theme</a>
172 | provided by <a href="https://readthedocs.org">Read the Docs</a>.
173 |
174 |
175 | </footer>
176 | </div>
177 | </div>
178 | </section>
179 | </div>
180 | <script>
181 | jQuery(function () {
182 | SphinxRtdTheme.Navigation.enable(true);
183 | });
184 | </script>
185 |
186 | </body>
187 | </html>
```
--------------------------------------------------------------------------------
/docs/_modules/mcp_server_webcrawl/utils/querycache.html:
--------------------------------------------------------------------------------
```html
1 |
2 |
3 | <!DOCTYPE html>
4 | <html class="writer-html5" lang="en" data-content_root="../../../">
5 | <head>
6 | <meta charset="utf-8" />
7 | <meta name="viewport" content="width=device-width, initial-scale=1.0" />
8 | <title>mcp_server_webcrawl.utils.querycache — mcp-server-webcrawl documentation</title>
9 | <link rel="stylesheet" type="text/css" href="../../../_static/pygments.css?v=80d5e7a1" />
10 | <link rel="stylesheet" type="text/css" href="../../../_static/css/theme.css?v=e59714d7" />
11 |
12 |
13 | <script src="../../../_static/jquery.js?v=5d32c60e"></script>
14 | <script src="../../../_static/_sphinx_javascript_frameworks_compat.js?v=2cd50e6c"></script>
15 | <script src="../../../_static/documentation_options.js?v=5929fcd5"></script>
16 | <script src="../../../_static/doctools.js?v=888ff710"></script>
17 | <script src="../../../_static/sphinx_highlight.js?v=dc90522c"></script>
18 | <script src="../../../_static/js/theme.js"></script>
19 | <link rel="index" title="Index" href="../../../genindex.html" />
20 | <link rel="search" title="Search" href="../../../search.html" />
21 | </head>
22 |
23 | <body class="wy-body-for-nav">
24 | <div class="wy-grid-for-nav">
25 | <nav data-toggle="wy-nav-shift" class="wy-nav-side">
26 | <div class="wy-side-scroll">
27 | <div class="wy-side-nav-search" >
28 |
29 |
30 |
31 | <a href="../../../index.html" class="icon icon-home">
32 | mcp-server-webcrawl
33 | </a>
34 | <div role="search">
35 | <form id="rtd-search-form" class="wy-form" action="../../../search.html" method="get">
36 | <input type="text" name="q" placeholder="Search docs" aria-label="Search docs" />
37 | <input type="hidden" name="check_keywords" value="yes" />
38 | <input type="hidden" name="area" value="default" />
39 | </form>
40 | </div>
41 | </div><div class="wy-menu wy-menu-vertical" data-spy="affix" role="navigation" aria-label="Navigation menu">
42 | <p class="caption" role="heading"><span class="caption-text">Contents:</span></p>
43 | <ul>
44 | <li class="toctree-l1"><a class="reference internal" href="../../../installation.html">Installation</a></li>
45 | <li class="toctree-l1"><a class="reference internal" href="../../../guides.html">Setup Guides</a></li>
46 | <li class="toctree-l1"><a class="reference internal" href="../../../usage.html">Usage</a></li>
47 | <li class="toctree-l1"><a class="reference internal" href="../../../modules.html">mcp_server_webcrawl</a></li>
48 | </ul>
49 |
50 | </div>
51 | </div>
52 | </nav>
53 |
54 | <section data-toggle="wy-nav-shift" class="wy-nav-content-wrap"><nav class="wy-nav-top" aria-label="Mobile navigation menu" >
55 | <i data-toggle="wy-nav-top" class="fa fa-bars"></i>
56 | <a href="../../../index.html">mcp-server-webcrawl</a>
57 | </nav>
58 |
59 | <div class="wy-nav-content">
60 | <div class="rst-content">
61 | <div role="navigation" aria-label="Page navigation">
62 | <ul class="wy-breadcrumbs">
63 | <li><a href="../../../index.html" class="icon icon-home" aria-label="Home"></a></li>
64 | <li class="breadcrumb-item"><a href="../../index.html">Module code</a></li>
65 | <li class="breadcrumb-item"><a href="../utils.html">mcp_server_webcrawl.utils</a></li>
66 | <li class="breadcrumb-item active">mcp_server_webcrawl.utils.querycache</li>
67 | <li class="wy-breadcrumbs-aside">
68 | </li>
69 | </ul>
70 | <hr/>
71 | </div>
72 | <div role="main" class="document" itemscope="itemscope" itemtype="http://schema.org/Article">
73 | <div itemprop="articleBody">
74 |
75 | <h1>Source code for mcp_server_webcrawl.utils.querycache</h1><div class="highlight"><pre>
76 | <span></span><span class="kn">import</span> <span class="nn">hashlib</span>
77 | <span class="kn">import</span> <span class="nn">time</span>
78 |
79 | <div class="viewcode-block" id="QueryCountCache">
80 | <a class="viewcode-back" href="../../../mcp_server_webcrawl.utils.html#mcp_server_webcrawl.utils.querycache.QueryCountCache">[docs]</a>
81 | <span class="k">class</span> <span class="nc">QueryCountCache</span><span class="p">:</span>
82 | <span class="w"> </span><span class="sd">"""</span>
83 | <span class="sd"> A cache for storing total count results from database queries.</span>
84 | <span class="sd"> Only caches the count integer values, as these are reusable and light.</span>
85 | <span class="sd"> """</span>
86 | <div class="viewcode-block" id="QueryCountCache.__init__">
87 | <a class="viewcode-back" href="../../../mcp_server_webcrawl.utils.html#mcp_server_webcrawl.utils.querycache.QueryCountCache.__init__">[docs]</a>
88 | <span class="k">def</span> <span class="fm">__init__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="nb">max</span><span class="p">:</span> <span class="nb">int</span> <span class="o">=</span> <span class="mi">250</span><span class="p">,</span> <span class="n">ttl</span><span class="p">:</span> <span class="nb">int</span> <span class="o">=</span> <span class="mi">900</span><span class="p">):</span>
89 | <span class="w"> </span><span class="sd">"""</span>
90 | <span class="sd"> Initialize the query count cache.</span>
91 |
92 | <span class="sd"> Parameters:</span>
93 | <span class="sd"> max: Maximum number of entries to store in the cache</span>
94 | <span class="sd"> ttl: Time-to-live for cache entries in seconds</span>
95 | <span class="sd"> """</span>
96 | <span class="bp">self</span><span class="o">.</span><span class="n">_cache</span><span class="p">:</span> <span class="nb">dict</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="nb">int</span><span class="p">]</span> <span class="o">=</span> <span class="p">{}</span>
97 | <span class="bp">self</span><span class="o">.</span><span class="n">_timestamps</span><span class="p">:</span> <span class="nb">dict</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="nb">float</span><span class="p">]</span> <span class="o">=</span> <span class="p">{}</span>
98 | <span class="bp">self</span><span class="o">.</span><span class="n">_max</span> <span class="o">=</span> <span class="nb">max</span>
99 | <span class="bp">self</span><span class="o">.</span><span class="n">_ttl</span> <span class="o">=</span> <span class="n">ttl</span></div>
100 |
101 |
102 | <span class="k">def</span> <span class="nf">_hash_query</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">statement</span><span class="p">:</span> <span class="nb">str</span><span class="p">,</span> <span class="n">params</span><span class="p">:</span> <span class="nb">dict</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="nb">int</span> <span class="o">|</span> <span class="nb">str</span><span class="p">])</span> <span class="o">-></span> <span class="nb">str</span><span class="p">:</span>
103 | <span class="w"> </span><span class="sd">"""</span>
104 | <span class="sd"> Generate a hash key from a query statement and parameters.</span>
105 |
106 | <span class="sd"> Parameters:</span>
107 | <span class="sd"> statement: SQL statement</span>
108 | <span class="sd"> params: Query parameters</span>
109 |
110 | <span class="sd"> Returns:</span>
111 | <span class="sd"> MD5 hash of the combined query string</span>
112 | <span class="sd"> """</span>
113 | <span class="n">query</span> <span class="o">=</span> <span class="sa">f</span><span class="s2">"</span><span class="si">{</span><span class="n">statement</span><span class="si">}</span><span class="s2">:</span><span class="si">{</span><span class="nb">str</span><span class="p">(</span><span class="n">params</span><span class="p">)</span><span class="si">}</span><span class="s2">"</span>
114 | <span class="k">return</span> <span class="n">hashlib</span><span class="o">.</span><span class="n">md5</span><span class="p">(</span><span class="n">query</span><span class="o">.</span><span class="n">encode</span><span class="p">())</span><span class="o">.</span><span class="n">hexdigest</span><span class="p">()</span>
115 |
116 | <div class="viewcode-block" id="QueryCountCache.get">
117 | <a class="viewcode-back" href="../../../mcp_server_webcrawl.utils.html#mcp_server_webcrawl.utils.querycache.QueryCountCache.get">[docs]</a>
118 | <span class="k">def</span> <span class="nf">get</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">statement</span><span class="p">:</span> <span class="nb">str</span><span class="p">,</span> <span class="n">params</span><span class="p">:</span> <span class="nb">dict</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="nb">int</span> <span class="o">|</span> <span class="nb">str</span><span class="p">])</span> <span class="o">-></span> <span class="nb">int</span> <span class="o">|</span> <span class="kc">None</span><span class="p">:</span>
119 | <span class="w"> </span><span class="sd">"""</span>
120 | <span class="sd"> Get a cached count result if available and not expired.</span>
121 |
122 | <span class="sd"> Parameters:</span>
123 | <span class="sd"> statement: SQL statement</span>
124 | <span class="sd"> params: Query parameters</span>
125 |
126 | <span class="sd"> Returns:</span>
127 | <span class="sd"> Cached count value or None if not found or expired</span>
128 | <span class="sd"> """</span>
129 | <span class="n">key</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_hash_query</span><span class="p">(</span><span class="n">statement</span><span class="p">,</span> <span class="n">params</span><span class="p">)</span>
130 | <span class="k">if</span> <span class="n">key</span> <span class="ow">in</span> <span class="bp">self</span><span class="o">.</span><span class="n">_cache</span><span class="p">:</span>
131 | <span class="k">if</span> <span class="n">time</span><span class="o">.</span><span class="n">time</span><span class="p">()</span> <span class="o">-</span> <span class="bp">self</span><span class="o">.</span><span class="n">_timestamps</span><span class="p">[</span><span class="n">key</span><span class="p">]</span> <span class="o">></span> <span class="bp">self</span><span class="o">.</span><span class="n">_ttl</span><span class="p">:</span>
132 | <span class="k">del</span> <span class="bp">self</span><span class="o">.</span><span class="n">_cache</span><span class="p">[</span><span class="n">key</span><span class="p">]</span>
133 | <span class="k">del</span> <span class="bp">self</span><span class="o">.</span><span class="n">_timestamps</span><span class="p">[</span><span class="n">key</span><span class="p">]</span>
134 | <span class="k">return</span> <span class="kc">None</span>
135 | <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_cache</span><span class="p">[</span><span class="n">key</span><span class="p">]</span>
136 | <span class="k">return</span> <span class="kc">None</span></div>
137 |
138 |
139 | <div class="viewcode-block" id="QueryCountCache.set">
140 | <a class="viewcode-back" href="../../../mcp_server_webcrawl.utils.html#mcp_server_webcrawl.utils.querycache.QueryCountCache.set">[docs]</a>
141 | <span class="k">def</span> <span class="nf">set</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">statement</span><span class="p">:</span> <span class="nb">str</span><span class="p">,</span> <span class="n">params</span><span class="p">:</span> <span class="nb">dict</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="nb">int</span> <span class="o">|</span> <span class="nb">str</span><span class="p">],</span> <span class="n">count</span><span class="p">:</span> <span class="nb">int</span><span class="p">)</span> <span class="o">-></span> <span class="kc">None</span><span class="p">:</span>
142 | <span class="w"> </span><span class="sd">"""</span>
143 | <span class="sd"> Store a count result in the cache.</span>
144 |
145 | <span class="sd"> Parameters:</span>
146 | <span class="sd"> statement: SQL statement</span>
147 | <span class="sd"> params: Query parameters</span>
148 | <span class="sd"> count: Count value to cache</span>
149 | <span class="sd"> """</span>
150 | <span class="n">key</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_hash_query</span><span class="p">(</span><span class="n">statement</span><span class="p">,</span> <span class="n">params</span><span class="p">)</span>
151 |
152 | <span class="c1"># if cache is full, remove oldest entry</span>
153 | <span class="k">if</span> <span class="nb">len</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_cache</span><span class="p">)</span> <span class="o">>=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_max</span> <span class="ow">and</span> <span class="n">key</span> <span class="ow">not</span> <span class="ow">in</span> <span class="bp">self</span><span class="o">.</span><span class="n">_cache</span><span class="p">:</span>
154 | <span class="n">oldest_key</span> <span class="o">=</span> <span class="nb">min</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_timestamps</span><span class="p">,</span> <span class="n">key</span><span class="o">=</span><span class="k">lambda</span> <span class="n">k</span><span class="p">:</span> <span class="bp">self</span><span class="o">.</span><span class="n">_timestamps</span><span class="p">[</span><span class="n">k</span><span class="p">])</span>
155 | <span class="k">del</span> <span class="bp">self</span><span class="o">.</span><span class="n">_cache</span><span class="p">[</span><span class="n">oldest_key</span><span class="p">]</span>
156 | <span class="k">del</span> <span class="bp">self</span><span class="o">.</span><span class="n">_timestamps</span><span class="p">[</span><span class="n">oldest_key</span><span class="p">]</span>
157 |
158 | <span class="c1"># store new entry</span>
159 | <span class="bp">self</span><span class="o">.</span><span class="n">_cache</span><span class="p">[</span><span class="n">key</span><span class="p">]</span> <span class="o">=</span> <span class="n">count</span>
160 | <span class="bp">self</span><span class="o">.</span><span class="n">_timestamps</span><span class="p">[</span><span class="n">key</span><span class="p">]</span> <span class="o">=</span> <span class="n">time</span><span class="o">.</span><span class="n">time</span><span class="p">()</span></div>
161 |
162 |
163 | <div class="viewcode-block" id="QueryCountCache.clear">
164 | <a class="viewcode-back" href="../../../mcp_server_webcrawl.utils.html#mcp_server_webcrawl.utils.querycache.QueryCountCache.clear">[docs]</a>
165 | <span class="k">def</span> <span class="nf">clear</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-></span> <span class="kc">None</span><span class="p">:</span>
166 | <span class="w"> </span><span class="sd">"""</span>
167 | <span class="sd"> Clear all entries from the cache.</span>
168 | <span class="sd"> """</span>
169 | <span class="bp">self</span><span class="o">.</span><span class="n">_cache</span><span class="o">.</span><span class="n">clear</span><span class="p">()</span>
170 | <span class="bp">self</span><span class="o">.</span><span class="n">_timestamps</span><span class="o">.</span><span class="n">clear</span><span class="p">()</span></div>
171 | </div>
172 |
173 | </pre></div>
174 |
175 | </div>
176 | </div>
177 | <footer>
178 |
179 | <hr/>
180 |
181 | <div role="contentinfo">
182 | <p>© Copyright 2025, pragmar.</p>
183 | </div>
184 |
185 | Built with <a href="https://www.sphinx-doc.org/">Sphinx</a> using a
186 | <a href="https://github.com/readthedocs/sphinx_rtd_theme">theme</a>
187 | provided by <a href="https://readthedocs.org">Read the Docs</a>.
188 |
189 |
190 | </footer>
191 | </div>
192 | </div>
193 | </section>
194 | </div>
195 | <script>
196 | jQuery(function () {
197 | SphinxRtdTheme.Navigation.enable(true);
198 | });
199 | </script>
200 |
201 | </body>
202 | </html>
```