#
tokens: 49903/50000 9/216 files (page 6/35)
lines: on (toggle) GitHub
raw markdown copy reset
This is page 6 of 35. Use http://codebase.md/pragmar/mcp_server_webcrawl?lines=true&page={x} to view the full context.

# Directory Structure

```
├── .gitignore
├── CONTRIBUTING.md
├── docs
│   ├── _images
│   │   ├── interactive.document.webp
│   │   ├── interactive.search.webp
│   │   └── mcpswc.svg
│   ├── _modules
│   │   ├── index.html
│   │   ├── mcp_server_webcrawl
│   │   │   ├── crawlers
│   │   │   │   ├── archivebox
│   │   │   │   │   ├── adapter.html
│   │   │   │   │   ├── crawler.html
│   │   │   │   │   └── tests.html
│   │   │   │   ├── base
│   │   │   │   │   ├── adapter.html
│   │   │   │   │   ├── api.html
│   │   │   │   │   ├── crawler.html
│   │   │   │   │   ├── indexed.html
│   │   │   │   │   └── tests.html
│   │   │   │   ├── httrack
│   │   │   │   │   ├── adapter.html
│   │   │   │   │   ├── crawler.html
│   │   │   │   │   └── tests.html
│   │   │   │   ├── interrobot
│   │   │   │   │   ├── adapter.html
│   │   │   │   │   ├── crawler.html
│   │   │   │   │   └── tests.html
│   │   │   │   ├── katana
│   │   │   │   │   ├── adapter.html
│   │   │   │   │   ├── crawler.html
│   │   │   │   │   └── tests.html
│   │   │   │   ├── siteone
│   │   │   │   │   ├── adapter.html
│   │   │   │   │   ├── crawler.html
│   │   │   │   │   └── tests.html
│   │   │   │   ├── warc
│   │   │   │   │   ├── adapter.html
│   │   │   │   │   ├── crawler.html
│   │   │   │   │   └── tests.html
│   │   │   │   └── wget
│   │   │   │       ├── adapter.html
│   │   │   │       ├── crawler.html
│   │   │   │       └── tests.html
│   │   │   ├── crawlers.html
│   │   │   ├── extras
│   │   │   │   ├── markdown.html
│   │   │   │   ├── regex.html
│   │   │   │   ├── snippets.html
│   │   │   │   ├── thumbnails.html
│   │   │   │   └── xpath.html
│   │   │   ├── interactive
│   │   │   │   ├── highlights.html
│   │   │   │   ├── search.html
│   │   │   │   ├── session.html
│   │   │   │   └── ui.html
│   │   │   ├── main.html
│   │   │   ├── models
│   │   │   │   ├── resources.html
│   │   │   │   └── sites.html
│   │   │   ├── templates
│   │   │   │   └── tests.html
│   │   │   ├── utils
│   │   │   │   ├── blobs.html
│   │   │   │   ├── cli.html
│   │   │   │   ├── logger.html
│   │   │   │   ├── querycache.html
│   │   │   │   ├── server.html
│   │   │   │   └── tools.html
│   │   │   └── utils.html
│   │   └── re.html
│   ├── _sources
│   │   ├── guides
│   │   │   ├── archivebox.rst.txt
│   │   │   ├── httrack.rst.txt
│   │   │   ├── interrobot.rst.txt
│   │   │   ├── katana.rst.txt
│   │   │   ├── siteone.rst.txt
│   │   │   ├── warc.rst.txt
│   │   │   └── wget.rst.txt
│   │   ├── guides.rst.txt
│   │   ├── index.rst.txt
│   │   ├── installation.rst.txt
│   │   ├── interactive.rst.txt
│   │   ├── mcp_server_webcrawl.crawlers.archivebox.rst.txt
│   │   ├── mcp_server_webcrawl.crawlers.base.rst.txt
│   │   ├── mcp_server_webcrawl.crawlers.httrack.rst.txt
│   │   ├── mcp_server_webcrawl.crawlers.interrobot.rst.txt
│   │   ├── mcp_server_webcrawl.crawlers.katana.rst.txt
│   │   ├── mcp_server_webcrawl.crawlers.rst.txt
│   │   ├── mcp_server_webcrawl.crawlers.siteone.rst.txt
│   │   ├── mcp_server_webcrawl.crawlers.warc.rst.txt
│   │   ├── mcp_server_webcrawl.crawlers.wget.rst.txt
│   │   ├── mcp_server_webcrawl.extras.rst.txt
│   │   ├── mcp_server_webcrawl.interactive.rst.txt
│   │   ├── mcp_server_webcrawl.models.rst.txt
│   │   ├── mcp_server_webcrawl.rst.txt
│   │   ├── mcp_server_webcrawl.templates.rst.txt
│   │   ├── mcp_server_webcrawl.utils.rst.txt
│   │   ├── modules.rst.txt
│   │   ├── prompts.rst.txt
│   │   └── usage.rst.txt
│   ├── _static
│   │   ├── _sphinx_javascript_frameworks_compat.js
│   │   ├── basic.css
│   │   ├── css
│   │   │   ├── badge_only.css
│   │   │   ├── fonts
│   │   │   │   ├── fontawesome-webfont.eot
│   │   │   │   ├── fontawesome-webfont.svg
│   │   │   │   ├── fontawesome-webfont.ttf
│   │   │   │   ├── fontawesome-webfont.woff
│   │   │   │   ├── fontawesome-webfont.woff2
│   │   │   │   ├── lato-bold-italic.woff
│   │   │   │   ├── lato-bold-italic.woff2
│   │   │   │   ├── lato-bold.woff
│   │   │   │   ├── lato-bold.woff2
│   │   │   │   ├── lato-normal-italic.woff
│   │   │   │   ├── lato-normal-italic.woff2
│   │   │   │   ├── lato-normal.woff
│   │   │   │   ├── lato-normal.woff2
│   │   │   │   ├── Roboto-Slab-Bold.woff
│   │   │   │   ├── Roboto-Slab-Bold.woff2
│   │   │   │   ├── Roboto-Slab-Regular.woff
│   │   │   │   └── Roboto-Slab-Regular.woff2
│   │   │   └── theme.css
│   │   ├── doctools.js
│   │   ├── documentation_options.js
│   │   ├── file.png
│   │   ├── fonts
│   │   │   ├── Lato
│   │   │   │   ├── lato-bold.eot
│   │   │   │   ├── lato-bold.ttf
│   │   │   │   ├── lato-bold.woff
│   │   │   │   ├── lato-bold.woff2
│   │   │   │   ├── lato-bolditalic.eot
│   │   │   │   ├── lato-bolditalic.ttf
│   │   │   │   ├── lato-bolditalic.woff
│   │   │   │   ├── lato-bolditalic.woff2
│   │   │   │   ├── lato-italic.eot
│   │   │   │   ├── lato-italic.ttf
│   │   │   │   ├── lato-italic.woff
│   │   │   │   ├── lato-italic.woff2
│   │   │   │   ├── lato-regular.eot
│   │   │   │   ├── lato-regular.ttf
│   │   │   │   ├── lato-regular.woff
│   │   │   │   └── lato-regular.woff2
│   │   │   └── RobotoSlab
│   │   │       ├── roboto-slab-v7-bold.eot
│   │   │       ├── roboto-slab-v7-bold.ttf
│   │   │       ├── roboto-slab-v7-bold.woff
│   │   │       ├── roboto-slab-v7-bold.woff2
│   │   │       ├── roboto-slab-v7-regular.eot
│   │   │       ├── roboto-slab-v7-regular.ttf
│   │   │       ├── roboto-slab-v7-regular.woff
│   │   │       └── roboto-slab-v7-regular.woff2
│   │   ├── images
│   │   │   ├── interactive.document.png
│   │   │   ├── interactive.document.webp
│   │   │   ├── interactive.search.png
│   │   │   ├── interactive.search.webp
│   │   │   └── mcpswc.svg
│   │   ├── jquery.js
│   │   ├── js
│   │   │   ├── badge_only.js
│   │   │   ├── theme.js
│   │   │   └── versions.js
│   │   ├── language_data.js
│   │   ├── minus.png
│   │   ├── plus.png
│   │   ├── pygments.css
│   │   ├── searchtools.js
│   │   └── sphinx_highlight.js
│   ├── .buildinfo
│   ├── .nojekyll
│   ├── genindex.html
│   ├── guides
│   │   ├── archivebox.html
│   │   ├── httrack.html
│   │   ├── interrobot.html
│   │   ├── katana.html
│   │   ├── siteone.html
│   │   ├── warc.html
│   │   └── wget.html
│   ├── guides.html
│   ├── index.html
│   ├── installation.html
│   ├── interactive.html
│   ├── mcp_server_webcrawl.crawlers.archivebox.html
│   ├── mcp_server_webcrawl.crawlers.base.html
│   ├── mcp_server_webcrawl.crawlers.html
│   ├── mcp_server_webcrawl.crawlers.httrack.html
│   ├── mcp_server_webcrawl.crawlers.interrobot.html
│   ├── mcp_server_webcrawl.crawlers.katana.html
│   ├── mcp_server_webcrawl.crawlers.siteone.html
│   ├── mcp_server_webcrawl.crawlers.warc.html
│   ├── mcp_server_webcrawl.crawlers.wget.html
│   ├── mcp_server_webcrawl.extras.html
│   ├── mcp_server_webcrawl.html
│   ├── mcp_server_webcrawl.interactive.html
│   ├── mcp_server_webcrawl.models.html
│   ├── mcp_server_webcrawl.templates.html
│   ├── mcp_server_webcrawl.utils.html
│   ├── modules.html
│   ├── objects.inv
│   ├── prompts.html
│   ├── py-modindex.html
│   ├── search.html
│   ├── searchindex.js
│   └── usage.html
├── LICENSE
├── MANIFEST.in
├── prompts
│   ├── audit404.md
│   ├── auditfiles.md
│   ├── auditperf.md
│   ├── auditseo.md
│   ├── gopher.md
│   ├── README.md
│   └── testsearch.md
├── pyproject.toml
├── README.md
├── setup.py
├── sphinx
│   ├── _static
│   │   └── images
│   │       ├── interactive.document.png
│   │       ├── interactive.document.webp
│   │       ├── interactive.search.png
│   │       ├── interactive.search.webp
│   │       └── mcpswc.svg
│   ├── _templates
│   │   └── layout.html
│   ├── conf.py
│   ├── guides
│   │   ├── archivebox.rst
│   │   ├── httrack.rst
│   │   ├── interrobot.rst
│   │   ├── katana.rst
│   │   ├── siteone.rst
│   │   ├── warc.rst
│   │   └── wget.rst
│   ├── guides.rst
│   ├── index.rst
│   ├── installation.rst
│   ├── interactive.rst
│   ├── make.bat
│   ├── Makefile
│   ├── mcp_server_webcrawl.crawlers.archivebox.rst
│   ├── mcp_server_webcrawl.crawlers.base.rst
│   ├── mcp_server_webcrawl.crawlers.httrack.rst
│   ├── mcp_server_webcrawl.crawlers.interrobot.rst
│   ├── mcp_server_webcrawl.crawlers.katana.rst
│   ├── mcp_server_webcrawl.crawlers.rst
│   ├── mcp_server_webcrawl.crawlers.siteone.rst
│   ├── mcp_server_webcrawl.crawlers.warc.rst
│   ├── mcp_server_webcrawl.crawlers.wget.rst
│   ├── mcp_server_webcrawl.extras.rst
│   ├── mcp_server_webcrawl.interactive.rst
│   ├── mcp_server_webcrawl.models.rst
│   ├── mcp_server_webcrawl.rst
│   ├── mcp_server_webcrawl.templates.rst
│   ├── mcp_server_webcrawl.utils.rst
│   ├── modules.rst
│   ├── prompts.rst
│   ├── readme.txt
│   └── usage.rst
└── src
    └── mcp_server_webcrawl
        ├── __init__.py
        ├── crawlers
        │   ├── __init__.py
        │   ├── archivebox
        │   │   ├── __init__.py
        │   │   ├── adapter.py
        │   │   ├── crawler.py
        │   │   └── tests.py
        │   ├── base
        │   │   ├── __init__.py
        │   │   ├── adapter.py
        │   │   ├── api.py
        │   │   ├── crawler.py
        │   │   ├── indexed.py
        │   │   └── tests.py
        │   ├── httrack
        │   │   ├── __init__.py
        │   │   ├── adapter.py
        │   │   ├── crawler.py
        │   │   └── tests.py
        │   ├── interrobot
        │   │   ├── __init__.py
        │   │   ├── adapter.py
        │   │   ├── crawler.py
        │   │   └── tests.py
        │   ├── katana
        │   │   ├── __init__.py
        │   │   ├── adapter.py
        │   │   ├── crawler.py
        │   │   └── tests.py
        │   ├── siteone
        │   │   ├── __init__.py
        │   │   ├── adapter.py
        │   │   ├── crawler.py
        │   │   └── tests.py
        │   ├── warc
        │   │   ├── __init__.py
        │   │   ├── adapter.py
        │   │   ├── crawler.py
        │   │   └── tests.py
        │   └── wget
        │       ├── __init__.py
        │       ├── adapter.py
        │       ├── crawler.py
        │       └── tests.py
        ├── extras
        │   ├── __init__.py
        │   ├── markdown.py
        │   ├── regex.py
        │   ├── snippets.py
        │   ├── thumbnails.py
        │   └── xpath.py
        ├── interactive
        │   ├── __init__.py
        │   ├── highlights.py
        │   ├── search.py
        │   ├── session.py
        │   ├── ui.py
        │   └── views
        │       ├── base.py
        │       ├── document.py
        │       ├── help.py
        │       ├── requirements.py
        │       ├── results.py
        │       └── searchform.py
        ├── main.py
        ├── models
        │   ├── __init__.py
        │   ├── base.py
        │   ├── resources.py
        │   └── sites.py
        ├── settings.py
        ├── templates
        │   ├── __init__.py
        │   ├── markdown.xslt
        │   ├── tests_core.html
        │   └── tests.py
        └── utils
            ├── __init__.py
            ├── cli.py
            ├── logger.py
            ├── parser.py
            ├── parsetab.py
            ├── search.py
            ├── server.py
            ├── tests.py
            └── tools.py
```

# Files

--------------------------------------------------------------------------------
/src/mcp_server_webcrawl/crawlers/siteone/adapter.py:
--------------------------------------------------------------------------------

```python
  1 | import os
  2 | import re
  3 | import sqlite3
  4 | 
  5 | from contextlib import closing
  6 | from datetime import datetime, timezone
  7 | from pathlib import Path
  8 | 
  9 | from mcp_server_webcrawl.crawlers.base.adapter import (
 10 |     BaseManager,
 11 |     IndexState,
 12 |     IndexStatus,
 13 |     SitesGroup,
 14 |     INDEXED_BATCH_SIZE,
 15 |     INDEXED_BYTE_MULTIPLIER,
 16 |     INDEXED_RESOURCE_DEFAULT_PROTOCOL,
 17 |     INDEXED_TYPE_MAPPING,
 18 | )
 19 | from mcp_server_webcrawl.crawlers.base.indexed import IndexedManager
 20 | from mcp_server_webcrawl.utils.logger import get_logger
 21 | from mcp_server_webcrawl.models.resources import (
 22 |     ResourceResult,
 23 |     ResourceResultType,
 24 |     RESOURCES_LIMIT_DEFAULT,
 25 | )
 26 | from mcp_server_webcrawl.models.sites import (
 27 |     SiteResult,
 28 | )
 29 | 
# Maps the lowercased "type" value parsed from a SiteOne log row to a
# ResourceResultType. Redirect rows are treated as pages; fonts have no
# dedicated type here and are mapped to OTHER.
SITEONE_LOG_TYPE_MAPPING = {
    "html": ResourceResultType.PAGE,
    "redirect": ResourceResultType.PAGE,
    "image": ResourceResultType.IMAGE,
    "js": ResourceResultType.SCRIPT,
    "css": ResourceResultType.CSS,
    "video": ResourceResultType.VIDEO,
    "audio": ResourceResultType.AUDIO,
    "pdf": ResourceResultType.PDF,
    "other": ResourceResultType.OTHER,
    "font": ResourceResultType.OTHER,
}

# module-level logger shared by everything in this adapter
logger = get_logger()
 44 | 
 45 | class SiteOneManager(IndexedManager):
 46 |     """
 47 |     Manages SiteOne directory data in in-memory SQLite databases.
 48 |     Wraps wget archive format (shared by SiteOne and wget)
 49 |     Provides connection pooling and caching for efficient access.
 50 |     """
 51 | 
 52 |     def __init__(self) -> None:
 53 |         """Initialize the SiteOne manager with empty cache and statistics."""
 54 | 
 55 |         super().__init__()
 56 | 
 57 |     def _extract_log_metadata(self, directory: Path) -> tuple[dict, dict]:
 58 |         """
 59 |         Extract metadata from SiteOne log files.
 60 | 
 61 |         Args:
 62 |             directory: path to the site directory
 63 | 
 64 |         Returns:
 65 |             Tuple of (success log data, error log data) dictionaries
 66 |         """
 67 |         directory_name: str = directory.name
 68 |         log_data = {}
 69 |         log_http_error_data = {}
 70 | 
 71 |         log_pattern: str = f"output.{directory_name}.*.txt"
 72 |         log_files = list(Path(directory.parent).glob(log_pattern))
 73 | 
 74 |         if not log_files:
 75 |             return log_data, log_http_error_data
 76 | 
 77 |         log_latest = max(log_files, key=lambda p: p.stat().st_mtime)
 78 | 
 79 |         try:
 80 |             with open(log_latest, "r", encoding="utf-8") as log_file:
 81 |                 for line in log_file:
 82 |                     parts = [part.strip() for part in line.split("|")]
 83 |                     if len(parts) == 10:
 84 |                         parts_path = parts[3].split("?")[0]
 85 |                         try:
 86 |                             status = int(parts[4])
 87 |                             url = f"{INDEXED_RESOURCE_DEFAULT_PROTOCOL}{directory_name}{parts_path}"
 88 |                             time_str = parts[6].split()[0]
 89 |                             time = int(float(time_str) * (1000 if "s" in parts[6] else 1))
 90 | 
 91 |                             # size collected for errors, os stat preferred
 92 |                             size_str = parts[7].strip()
 93 |                             size = 0
 94 |                             if size_str:
 95 |                                 size_value = float(size_str.split()[0])
 96 |                                 size_unit = size_str.split()[1].lower() if len(size_str.split()) > 1 else "b"
 97 |                                 multiplier = INDEXED_BYTE_MULTIPLIER.get(size_unit, 1)
 98 |                                 size = int(size_value * multiplier)
 99 | 
100 |                             if 400 <= status < 600:
101 |                                 log_http_error_data[url] = {
102 |                                     "status": status,
103 |                                     "type": parts[5].lower(),
104 |                                     "time": time,
105 |                                     "size": size,
106 |                                 }
107 |                             else:
108 |                                 log_data[url] = {
109 |                                     "status": status,
110 |                                     "type": parts[5].lower(),
111 |                                     "time": time,
112 |                                     "size": size,
113 |                                 }
114 | 
115 |                         except (ValueError, IndexError, UnicodeDecodeError, KeyError):
116 |                             continue
117 | 
118 |                     elif line.strip() == "Redirected URLs":
119 |                         # stop processing we're through HTTP requests
120 |                         break
121 |         except Exception as ex:
122 |             logger.error(f"Error processing log file {log_latest}: {ex}")
123 | 
124 |         return log_data, log_http_error_data
125 | 
126 |     def _load_site_data(self, connection: sqlite3.Connection, directory: Path,
127 |             site_id: int, index_state: IndexState = None) -> None:
128 |         """
129 |         Load a SiteOne directory into the database with parallel processing and batch insertions.
130 | 
131 |         Args:
132 |             connection: SQLite connection
133 |             directory: path to the SiteOne directory
134 |             site_id: ID for the site
135 |             index_state: IndexState object for tracking progress
136 |         """
137 | 
138 |         if not directory.exists() or not directory.is_dir():
139 |             logger.error(f"Directory not found or not a directory: {directory}")
140 |             return
141 | 
142 |         if index_state is not None:
143 |             index_state.set_status(IndexStatus.INDEXING)
144 | 
145 |         log_data, log_http_error_data = self._extract_log_metadata(directory)
146 | 
147 |         file_paths = []
148 |         for root, _, files in os.walk(directory):
149 |             for filename in files:
150 |                 if filename == "robots.txt" or (filename.startswith("output.") and filename.endswith(".txt")):
151 |                     continue
152 |                 file_paths.append(Path(root) / filename)
153 | 
154 |         processed_urls = set()
155 | 
156 |         with closing(connection.cursor()) as cursor:
157 |             for i in range(0, len(file_paths), INDEXED_BATCH_SIZE):
158 |                 if index_state is not None and index_state.is_timeout():
159 |                     index_state.set_status(IndexStatus.PARTIAL)
160 |                     return
161 | 
162 |                 batch_paths = file_paths[i:i+INDEXED_BATCH_SIZE]
163 |                 batch_insert_crawled: list[ResourceResult] = []
164 |                 file_contents = BaseManager.read_files(batch_paths)
165 |                 for file_path in batch_paths:
166 |                     try:
167 |                         result: ResourceResult | None = self._prepare_siteone_record(file_path,
168 |                                 site_id, directory, log_data, file_contents.get(file_path))
169 |                         if result and result.url not in processed_urls:
170 |                             batch_insert_crawled.append(result)
171 |                             processed_urls.add(result.url)
172 |                             if index_state is not None:
173 |                                 index_state.increment_processed()
174 |                     except Exception as ex:
175 |                         logger.error(f"Error processing file {file_path}: {ex}")
176 | 
177 |                 self._execute_batch_insert(connection, cursor, batch_insert_crawled)
178 | 
179 |             # HTTP errors not already processed
180 |             batch_insert_errors: list[ResourceResult] = []
181 |             for url, meta in log_http_error_data.items():
182 |                 if url not in processed_urls:
183 |                     size = meta.get("size", 0)
184 |                     result = ResourceResult(
185 |                         id=BaseManager.string_to_id(url),
186 |                         site=site_id,
187 |                         url=url,
188 |                         type=ResourceResultType.OTHER,
189 |                         status=meta["status"],
190 |                         headers=BaseManager.get_basic_headers(size, ResourceResultType.OTHER, file_path),
191 |                         content="",     # no content
192 |                         size=size,      # size from log
193 |                         time=meta["time"]
194 |                     )
195 |                     batch_insert_errors.append(result)
196 | 
197 |                     if index_state is not None:
198 |                         index_state.increment_processed()
199 | 
200 |                     # errors in batches too
201 |                     if len(batch_insert_errors) >= INDEXED_BATCH_SIZE:
202 |                         self._execute_batch_insert(connection, cursor, batch_insert_errors)
203 | 
204 |             # insert any remaining error records
205 |             if batch_insert_errors:
206 |                 self._execute_batch_insert(connection, cursor, batch_insert_errors)
207 | 
208 |             if index_state is not None and index_state.status == IndexStatus.INDEXING:
209 |                 index_state.set_status(IndexStatus.COMPLETE)
210 | 
211 |     def _prepare_siteone_record(self, file_path: Path, site_id: int, base_dir: Path,
212 |                             log_data: dict, content: str = None) -> ResourceResult | None:
213 |         """
214 |         Prepare a record for batch insertion from a SiteOne file.
215 | 
216 |         Args:
217 |             file_path: path to the file
218 |             site_id: id for the site
219 |             base_dir: base directory for the capture
220 |             log_data: dictionary of metadata from logs keyed by URL
221 |             content: optional pre-loaded file content
222 | 
223 |         Returns:
224 |             Tuple of (record tuple, URL) or None if processing fails
225 |         """
226 |         try:
227 |             # generate relative url path from file path (similar to wget)
228 |             relative_path = file_path.relative_to(base_dir)
229 |             url = f"{INDEXED_RESOURCE_DEFAULT_PROTOCOL}{base_dir.name}/{str(relative_path).replace(os.sep, '/')}"
230 | 
231 |             if file_path.is_file():
232 |                 file_stat = file_path.stat()
233 |                 file_size = file_stat.st_size
234 |                 file_created = datetime.fromtimestamp(file_stat.st_ctime, tz=timezone.utc)
235 |                 file_modified = datetime.fromtimestamp(file_stat.st_mtime, tz=timezone.utc)
236 |             else:
237 |                 file_created = None
238 |                 file_modified = None
239 |                 file_size = 0
240 | 
241 |             decruftified_path = BaseManager.decruft_path(str(file_path))
242 |             extension = Path(decruftified_path).suffix.lower()
243 |             wget_static_pattern = re.compile(r"\.[0-9a-f]{8,}\.")
244 | 
245 |             # look up metadata from log if available, otherwise use defaults
246 |             metadata = None
247 |             wget_aliases = list(set([
248 |                 url,                                   # exact match first
249 |                 re.sub(wget_static_pattern, ".", url), # static pattern
250 |                 url.replace(".html", ""),              # file without extension (redirects)
251 |                 url.replace(".html", "/"),             # directory style (targets)
252 |                 url.replace("index.html", ""),         # index removal
253 |             ]))
254 | 
255 |             for wget_alias in wget_aliases:
256 |                 metadata = log_data.get(wget_alias, None)
257 |                 if metadata is not None:
258 |                     break
259 | 
260 |             if metadata is not None:
261 |                 # preventing duplicate html pages ./appstat.html and ./appstat/index.html
262 |                 # prefer index.html (actual content) over redirect stubs
263 |                 canonical_url = None
264 |                 # Sort aliases to prefer index.html files over redirect stubs
265 |                 sorted_aliases = sorted([alias for alias in wget_aliases if log_data.get(alias) == metadata],
266 |                                     key=lambda x: (not x.endswith('index.html'), x))
267 | 
268 |                 if sorted_aliases:
269 |                     canonical_url = sorted_aliases[0]  # Take the preferred one
270 |                     url = canonical_url
271 |             else:
272 |                 metadata = {}
273 | 
274 |             status_code = metadata.get("status", 200)
275 |             response_time = metadata.get("time", 0)
276 |             log_type = metadata.get("type", "").lower()
277 | 
278 |             if log_type:
279 |                 # no type for redirects, but more often than not
280 |                 # redirection to another page
281 |                 resource_type = SITEONE_LOG_TYPE_MAPPING.get(log_type,
282 |                         INDEXED_TYPE_MAPPING.get(extension, ResourceResultType.OTHER))
283 |             else:
284 |                 # fallback to extension-based mapping
285 |                 resource_type = INDEXED_TYPE_MAPPING.get(extension, ResourceResultType.OTHER)
286 | 
287 |             file_content = content
288 |             if file_content is None:
289 |                 file_content = BaseManager.read_file_contents(file_path, resource_type)
290 | 
291 |             # skip redirect stub files left in SiteOne archive (duplicate, wait for real content)
292 |             if status_code == 200 and file_content and '<meta http-equiv="refresh" content="0' in file_content:
293 |                 return None
294 | 
295 |             record = ResourceResult(
296 |                 id=BaseManager.string_to_id(url),
297 |                 site=site_id,
298 |                 created=file_created,
299 |                 modified=file_modified,
300 |                 url=url,
301 |                 type=resource_type,
302 |                 status=status_code,
303 |                 headers=BaseManager.get_basic_headers(file_size, resource_type, file_path),
304 |                 content=file_content,
305 |                 size=file_size,
306 |                 time=response_time  # possibly from log
307 |             )
308 |             return record
309 |         except Exception as ex:
310 |             logger.error(f"Error preparing record for file {file_path}: {ex}")
311 |             return None
312 | 
# module-level SiteOneManager instance used by get_sites() and get_resources() below
manager: SiteOneManager = SiteOneManager()
314 | 
def get_sites(
        datasrc: Path,
        ids: list[int] | None = None,
        fields: list[str] | None = None
    ) -> list[SiteResult]:
    """
    Present the site directories found under datasrc as sites.

    This is a thin wrapper; the lookup itself is delegated to the
    module-level manager's get_sites_for_directories().

    Args:
        datasrc: path to the directory containing site subdirectories
        ids: optional list of site IDs to filter by
        fields: optional list of fields to include in the response

    Returns:
        List of SiteResult objects, one for each site directory
    """
    site_results = manager.get_sites_for_directories(datasrc, ids, fields)
    return site_results
335 | 
def get_resources(
    datasrc: Path,
    sites: list[int] | None = None,
    query: str = "",
    fields: list[str] | None = None,
    sort: str | None = None,
    limit: int = RESOURCES_LIMIT_DEFAULT,
    offset: int = 0,
) -> tuple[list[ResourceResult], int, IndexState]:
    """
    Get resources from SiteOne directories using in-memory SQLite.

    Args:
        datasrc: path to the directory containing SiteOne captures
        sites: optional list of site IDs to filter by
        query: search query string
        fields: optional list of fields to include in response
        sort: sort order for results
        limit: maximum number of results to return
        offset: number of results to skip for pagination

    Returns:
        Tuple of (list of ResourceResult objects, total count, IndexState)

    Raises:
        AssertionError: if no site matches the requested ids
    """
    sites_results: list[SiteResult] = get_sites(datasrc=datasrc, ids=sites)
    if not sites_results:
        # raised explicitly instead of via `assert` so the check is not
        # stripped when Python runs with -O; the exception type is unchanged
        raise AssertionError("At least one site is required to search")
    site_paths = [site.path for site in sites_results]
    sites_group = SitesGroup(datasrc, sites, site_paths)
    return manager.get_resources_for_sites_group(sites_group, query, fields, sort, limit, offset)
365 | 
```

--------------------------------------------------------------------------------
/src/mcp_server_webcrawl/utils/parser.py:
--------------------------------------------------------------------------------

```python
  1 | import re
  2 | 
  3 | from ply import lex
  4 | from ply import yacc
  5 | from logging import Logger
  6 | 
  7 | from mcp_server_webcrawl.models.resources import RESOURCES_DEFAULT_FIELD_MAPPING
  8 | from mcp_server_webcrawl.utils.logger import get_logger
  9 | 
 10 | logger: Logger = get_logger()
 11 | 
 12 | class SearchSubquery:
 13 |     """
 14 |     Subquery component in a structured search.
 15 | 
 16 |     These are grouped into an ordered list, and are the basis the SQL query.
 17 |     """
 18 | 
 19 |     def __init__(
 20 |         self,
 21 |         field: str | None,
 22 |         value: str | int,
 23 |         type: str,
 24 |         modifiers: list[str] | None,
 25 |         operator: str | None,
 26 |         comparator: str = "=",
 27 |         group: int | None = None,
 28 |     ):
 29 |         """
 30 |         Initialize a SearchSubquery instance.
 31 | 
 32 |         Args:
 33 |             field: field to search, or None for fulltext search
 34 |             value: search value (string or integer)
 35 |             type: value type (term, phrase, wildcard, etc.)
 36 |             modifiers: list of modifiers applied to the query (e.g., 'NOT')
 37 |             operator: boolean operator connecting to the next subquery ('AND', 'OR', or None)
 38 |             comparator: comparison operator for numerics ('=', '>', '>=', '<', '<=', '!=')
 39 |         """
 40 |         self.field: str | None = field
 41 |         self.value: str | int = value
 42 |         self.type: str = type
 43 |         self.modifiers: list[str] = modifiers or []
 44 |         self.operator: str | None = operator or None
 45 |         self.comparator: str = comparator
 46 |         self.group: int | None = group
 47 | 
 48 |     def get_safe_sql_field(self, field: str) -> str:
 49 |         if field in RESOURCES_DEFAULT_FIELD_MAPPING:
 50 |             return RESOURCES_DEFAULT_FIELD_MAPPING[field]
 51 |         else:
 52 |             logger.error(f"Field {field} failed to validate.")
 53 |             raise Exception(f"Unknown database field {field}")
 54 | 
 55 |     def to_dict(self) -> dict[str, str | int | list[str] | None]:
 56 |         """
 57 |         Convert SearchSubquery to dictionary representation.
 58 | 
 59 |         Args:
 60 |             field: Field name to use in the dictionary (overrides self.field)
 61 | 
 62 |         Returns:
 63 |             Dictionary containing all SearchSubquery attributes
 64 |         """
 65 |         return {
 66 |             "field": self.field,
 67 |             "value": self.value,
 68 |             "type": self.type,
 69 |             "modifiers": self.modifiers,
 70 |             "operator": self.operator,
 71 |             "comparator": self.comparator,
 72 |             "group": self.group,
 73 |         }
 74 | 
 75 | class SearchLexer:
 76 |     tokens = (
 77 |         "FIELD",         # e.g. url:, content:
 78 |         "QUOTED_STRING", # "hello world"
 79 |         "TERM",          # standard search term
 80 |         "WILDCARD",      # wildcard terms, e.g. search*
 81 |         "AND",
 82 |         "OR",
 83 |         "NOT",
 84 |         "LPAREN",        # (
 85 |         "RPAREN",        # )
 86 |         "COLON",         # :
 87 |         "COMPARATOR",    # :>=, :>, :<, etc.
 88 |         "COMP_OP",       # >=
 89 |         "URL_FIELD"
 90 |     )
 91 | 
 92 |     valid_fields: list[str] = ["id", "url", "status", "type", "size", "headers", "content", "time"]
 93 | 
 94 |     t_LPAREN = r"\("
 95 |     t_RPAREN = r"\)"
 96 |     t_ignore = " \t\n"
 97 | 
 98 |     def __init__(self):
 99 |         self.lexer = lex.lex(module=self)
100 | 
101 |     def t_COMPARATOR(self, token: lex.LexToken) -> lex.LexToken:
102 |         r":(?:>=|>|<=|<|!=|=)"
103 |         token.value = token.value[1:]  # strip colon
104 |         return token
105 | 
106 |     def t_COLON(self, token: lex.LexToken) -> lex.LexToken:
107 |         r":"
108 |         return token
109 | 
110 |     def t_QUOTED_STRING(self, token: lex.LexToken) -> lex.LexToken:
111 |         r'"[^"]*"'
112 |         token.value = token.value[1:-1]
113 |         return token
114 | 
115 |     # precedence matters
116 |     def t_URL_FIELD(self, token: lex.LexToken) -> lex.LexToken:
117 |         # this field must terminate not only on url end, but on parens
118 |         r"url\s*:\s*((?:https?://)?[^\s()]+)"
119 |         token.type = "URL_FIELD"
120 |         url_value = token.value[token.value.find(':')+1:].strip()
121 |         token.value = ("url", url_value)
122 |         return token
123 | 
124 |     # precedence matters
125 |     def t_FIELD(self, token: lex.LexToken) -> lex.LexToken:
126 |         r"[a-zA-Z_][a-zA-Z0-9_]*(?=\s*:)"
127 |         if token.value not in self.valid_fields:
128 |             raise ValueError(f"Invalid field: {token.value}. Valid fields are: {', '.join(self.valid_fields)}")
129 |         return token
130 | 
131 |     def t_AND(self, token: lex.LexToken) -> lex.LexToken:
132 |         r"AND\b"
133 |         return token
134 | 
135 |     def t_OR(self, token: lex.LexToken) -> lex.LexToken:
136 |         r"OR\b"
137 |         return token
138 | 
139 |     def t_NOT(self, token: lex.LexToken) -> lex.LexToken:
140 |         r"NOT\b"
141 |         return token
142 | 
143 |     def t_WILDCARD(self, token: lex.LexToken) -> lex.LexToken:
144 |         r"[a-zA-Z0-9_\.\-\/\+]+\*"
145 |         token.value = token.value[:-1]
146 |         return token
147 | 
148 |     def t_TERM(self, token: lex.LexToken) -> lex.LexToken:
149 |         r"[a-zA-Z0-9_\.\-\/\+]+"
150 |         # AND, OR, NOT are handled by the dedicated t_AND, t_OR, t_NOT rules defined above
151 |         # fts5 workaround: a term with word chars joined by - or _ is retyped QUOTED_STRING (a phrase)
152 |         if re.match(r"^[\w]+[\-_][\-_\w]+$", token.value, re.UNICODE):
153 |             token.type = "QUOTED_STRING"
154 |         return token
155 | 
156 |     def t_COMP_OP(self, token: lex.LexToken) -> lex.LexToken:
157 |         r">=|>|<=|<|!=|="
158 |         return token
159 | 
160 |     def t_error(self, token: lex.LexToken) -> None:
161 |         logger.error(f"Illegal character '{token.value[0]}'")
162 |         token.lexer.skip(1)
163 | 
164 | class SearchParser:
165 |     tokens = SearchLexer.tokens
166 | 
167 |     precedence = (
168 |         ('right', 'NOT'),
169 |         ('left', 'AND'),
170 |         ('left', 'OR'),
171 |     )
172 | 
173 |     numeric_fields: list[str] = ["id", "status", "size", "time"]
174 | 
175 |     def __init__(self, lexer):
176 |         self.lexer = lexer
177 |         self.parser = yacc.yacc(module=self, debug=False)
178 | 
179 |     def p_query(self, production: yacc.YaccProduction) -> None:
180 |         """
181 |         query : expression
182 |         """
183 |         production[0] = production[1]
184 | 
185 |     def p_expression_binary(self, production: yacc.YaccProduction) -> None:
186 |         """
187 |         expression : expression AND expression
188 |                     | expression OR expression
189 |                     | expression NOT expression
190 |         """
191 | 
192 |         operator = production[2]
193 |         left = production[1]
194 |         right = production[3]
195 | 
196 |         # special handling for AND NOT pattern
197 |         # A AND (NOT B), treat it like A NOT B
198 |         if (operator == "AND" and isinstance(right, list) and
199 |                 len(right) == 1 and "NOT" in right[0].modifiers):
200 |             # convert AND (NOT B) to binary NOT
201 |             # remove NOT modifiers
202 |             right[0].modifiers = [m for m in right[0].modifiers if m != "NOT"]
203 |             operator = "NOT"
204 | 
205 |         if operator == "NOT":
206 |             # NOT handled as set difference, left EXCEPT right
207 |             # mark this as a special NOT relationship
208 |             if isinstance(left, list) and isinstance(right, list):
209 |                 if left:
210 |                     left[-1].operator = "NOT"
211 |                 production[0] = left + right
212 |             elif isinstance(left, list):
213 |                 if left:
214 |                     left[-1].operator = "NOT"
215 |                 production[0] = left + [self.__create_subquery(right, None)]
216 |             elif isinstance(right, list):
217 |                 production[0] = [self.__create_subquery(left, "NOT")] + right
218 |             else:
219 |                 # both terms, subqueries for both
220 |                 production[0] = [
221 |                     self.__create_subquery(left, "NOT"),
222 |                     self.__create_subquery(right, None)
223 |                 ]
224 |         else:
225 |             # plain AND / OR: record the operator on the left side, then concatenate
226 |             if isinstance(left, list) and isinstance(right, list):
227 |                 if left:
228 |                     left[-1].operator = operator
229 |                 production[0] = left + right
230 |             elif isinstance(left, list):
231 |                 if left:
232 |                     left[-1].operator = operator
233 |                 production[0] = left + [self.__create_subquery(right, operator)]
234 |             elif isinstance(right, list):
235 |                 production[0] = [self.__create_subquery(left, operator)] + right
236 |             else:
237 |                 production[0] = [
238 |                     self.__create_subquery(left, operator),
239 |                     self.__create_subquery(right, None)
240 |                 ]
241 | 
242 |     def p_expression_not(self, production: yacc.YaccProduction) -> None:
243 |         """
244 |         expression : NOT expression
245 |         """
246 |         # handle unary NOT (prefix NOT)
247 |         expr = production[2]
248 |         if isinstance(expr, list):
249 |             for item in expr:
250 |                 item.modifiers.append("NOT")
251 |             production[0] = expr
252 |         else:
253 |             subquery = self.__create_subquery(expr, None)
254 |             subquery.modifiers.append("NOT")
255 |             production[0] = [subquery]
256 | 
257 |     def p_expression_group(self, production: yacc.YaccProduction) -> None:
258 |         """
259 |         expression : LPAREN expression RPAREN
260 |         """
261 |         # production[0] = production[2]
262 |         expr = production[2]
263 |         group_id = id(production)  # group ID; NOTE(review): unique only if PLY passes a distinct object per reduction - confirm
264 | 
265 |         # Mark all subqueries in this expression with the group
266 |         if isinstance(expr, list):
267 |             for subquery in expr:
268 |                 subquery.group = group_id
269 |         else:
270 |             expr.group = group_id
271 | 
272 |         production[0] = expr
273 | 
274 |     def p_expression_url_field(self, production: yacc.YaccProduction) -> None:
275 |         """
276 |         expression : URL_FIELD
277 |         """
278 | 
279 |         field, value = production[1]  # Unpack the tuple (field, value)
280 | 
281 |         # check if URL ends with * for wildcard matching
282 |         value_type = "term"
283 |         if value.endswith('*'):
284 |             value = value[:-1]  # remove wildcard
285 |             value_type = "wildcard"
286 | 
287 |         production[0] = SearchSubquery(
288 |             field=field,
289 |             value=value,
290 |             type=value_type,
291 |             modifiers=[],
292 |             operator=None
293 |         )
294 | 
295 |     def p_value(self, production: yacc.YaccProduction) -> None:
296 |         """
297 |         value : TERM
298 |               | WILDCARD
299 |               | QUOTED_STRING
300 |         """
301 |         value = production[1]
302 |         value_type = "term"
303 | 
304 |         if production.slice[1].type == "WILDCARD":
305 |             value_type = "wildcard"
306 |         elif production.slice[1].type == "QUOTED_STRING":
307 |             value_type = "phrase"
308 | 
309 |         production[0] = {"value": value, "type": value_type}
310 | 
311 |     def p_expression_term(self, production: yacc.YaccProduction) -> None:
312 |         """
313 |         expression : value
314 |         """
315 | 
316 |         term = production[1]
317 |         production[0] = SearchSubquery(
318 |             field=None,  # no field means fulltext search
319 |             value=term["value"],
320 |             type=term["type"],
321 |             modifiers=[],
322 |             operator=None
323 |         )
324 | 
325 |     def p_expression_field_search(self, production: yacc.YaccProduction) -> None:
326 |         """
327 |         expression : FIELD COLON COMP_OP value
328 |                 | FIELD COLON value
329 |                 | FIELD COMPARATOR value
330 |         """
331 |         field = production[1]
332 | 
333 |         # determine comparator and value based on pattern
334 |         if len(production) == 5:  # FIELD COLON COMP_OP value
335 |             comparator = production[3]
336 |             value = production[4]
337 |         elif len(production) == 4:
338 |             # check second token, COLON or COMPARATOR
339 |             if production[2] == ":":  # FIELD COLON value
340 |                 comparator = "="  # default equals
341 |                 value = production[3]
342 |             else:
343 |                 comparator = production[2]
344 |                 value = production[3]
345 | 
346 |         production[0] = self.__create_field_subquery(field, value, comparator)
347 | 
348 |     def __create_field_subquery(self, field: str, value_dict: dict[str, str] | str | int, comparator: str = "=") -> SearchSubquery:
349 |         """
350 |         Helper method to create SearchSubquery for field searches.
351 |         Consolidates all the validation and conversion logic.
352 |         """
353 | 
354 |         self.__validate_comparator_for_field(field, comparator)
355 |         processed_value = self.__process_field_value(field, value_dict)
356 |         value_type = value_dict.get("type", "term") if isinstance(value_dict, dict) else "term"
357 | 
358 |         return SearchSubquery(
359 |             field=field,
360 |             value=processed_value,
361 |             type=value_type,
362 |             modifiers=[],
363 |             operator=None,
364 |             comparator=comparator
365 |         )
366 | 
367 |     def __create_subquery(self, term, operator: str | None):
368 |         """
369 |         Helper to create a SearchSubquery instance.
370 |         """
371 |         assert isinstance(term, SearchSubquery), "__create_subquery expected a SearchSubquery instance"
372 |         return SearchSubquery(
373 |             field=term.field,
374 |             value=term.value,
375 |             type=term.type,
376 |             modifiers=term.modifiers.copy(),
377 |             operator=operator,
378 |             comparator=term.comparator,
379 |             group=term.group,
380 |         )
381 | 
382 |     def __process_field_value(
383 |         self,
384 |         field: str | None,
385 |         value_dict: dict[str, str] | str | int,
386 |         swap_values: dict[str, dict[str, str | int]] | None = None
387 |     ) -> str | int | float:
388 |         """
389 |         Process and validate a field value with type conversion and swapping.
390 | 
391 |         Args:
392 |             field: The field name (or None for fulltext)
393 |             value_dict: Dictionary with 'value' and 'type' keys, or raw value
394 |             swap_values: Optional dictionary for value replacement
395 | 
396 |         Returns:
397 |             Processed value (string, int, or float)
398 |         """
399 |         if isinstance(value_dict, dict):
400 |             value = value_dict["value"]
401 |         else:
402 |             value = value_dict # raw value
403 | 
404 |         if swap_values:
405 |             swap_key = field if field else ""
406 |             if swap_key in swap_values and value in swap_values[swap_key]:
407 |                 value = swap_values[swap_key][value]
408 | 
409 |         if field and field in self.numeric_fields:
410 |             try:
411 |                 return int(value)
412 |             except ValueError:
413 |                 try:
414 |                     return float(value)
415 |                 except ValueError:
416 |                     raise ValueError(f"Field {field} requires a numeric value, got: {value}")
417 | 
418 |         return value
419 | 
420 |     def __validate_comparator_for_field(self, field: str, comparator: str) -> None:
421 |         """
422 |         Validate that a comparator is appropriate for the given field.
423 | 
424 |         Args:
425 |             field: The field name
426 |             comparator: The comparison operator
427 | 
428 |         Raises:
429 |             ValueError: If comparator is invalid for the field type
430 |         """
431 |         if comparator != "=" and field not in self.numeric_fields:
432 |             raise ValueError(f"Comparison operator '{comparator}' can only be used with numeric fields")
433 | 
434 |     def p_error(self, production: yacc.YaccProduction | None) -> None:
435 |         if production:
436 |             logger.info(f"Syntax error at '{production.value}'")
437 |         else:
438 |             logger.info("Syntax error at EOF")
439 | 
```

--------------------------------------------------------------------------------
/src/mcp_server_webcrawl/interactive/views/requirements.py:
--------------------------------------------------------------------------------

```python
  1 | import curses
  2 | import os
  3 | import traceback
  4 | 
  5 | from enum import Enum, auto
  6 | from pathlib import Path
  7 | from typing import TYPE_CHECKING
  8 | 
  9 | from mcp_server_webcrawl.crawlers import VALID_CRAWLER_CHOICES, get_crawler
 10 | from mcp_server_webcrawl.crawlers.base.api import BaseJsonApi
 11 | from mcp_server_webcrawl.crawlers.base.crawler import BaseCrawler
 12 | from mcp_server_webcrawl.interactive.ui import InputRadioGroup, InputText, ThemeDefinition, UiState
 13 | from mcp_server_webcrawl.interactive.views.base import BaseCursesView
 14 | from mcp_server_webcrawl.interactive.ui import safe_addstr
 15 | from mcp_server_webcrawl.interactive.views.searchform import SearchFormView
 16 | 
 17 | if TYPE_CHECKING:
 18 |     from mcp_server_webcrawl.interactive.session import InteractiveSession
 19 | 
 20 | LAYOUT_BOX_MAX_WIDTH = 60
 21 | LAYOUT_BOX_MARGIN = 8
 22 | VALIDATION_HEADER_X_OFFSET = 24
 23 | VALIDATION_TEXT_INDENT = 2
 24 | 
 25 | class RequirementsFormField(Enum):
 26 |     DATASRC = auto()
 27 |     CRAWLER = auto()
 28 | 
 29 | class RequirementsView(BaseCursesView):
 30 |     """
 31 |     Interactive requirements view for configuring crawler and data source.
 32 |     """
 33 | 
 34 |     def __init__(self, session: 'InteractiveSession', crawler: str, datasrc: str):
 35 |         """
 36 |         Initialize the requirements view.
 37 |         
 38 |         Args:
 39 |             session: The interactive session instance
 40 |             crawler: Initial crawler type selection
 41 |             datasrc: Initial data source path
 42 |         """
 43 |         super().__init__(session)
 44 |         self.__validated: bool = self.__validate(crawler, datasrc)
 45 |         self.__form_selected_field: RequirementsFormField = RequirementsFormField.DATASRC
 46 |         self.__form_selected_index: int = 0
 47 | 
 48 |         initial_datasrc: str = datasrc if datasrc is not None else self.__get_default_directory()
 49 |         self.__datasrc_input: InputText = InputText(initial_value=initial_datasrc, label="Data Source Path")
 50 | 
 51 |         self.__crawler_group: InputRadioGroup = InputRadioGroup("crawler")
 52 | 
 53 |         if not self.__validated:
 54 |             detected_crawler: str | None
 55 |             detected_datasrc: str | None
 56 |             detected_crawler, detected_datasrc = self.__autosense_crawler_and_datasrc()
 57 |             initial_crawler: str = crawler if crawler is not None else detected_crawler
 58 |             initial_datasrc = datasrc if datasrc is not None else detected_datasrc
 59 |             self.__set_initial_crawler_selection(initial_crawler)
 60 |             self.__datasrc_input.set_value(initial_datasrc)
 61 |             self._focused: bool = True
 62 | 
 63 |     @property
 64 |     def validated(self) -> bool:
 65 |         return self.__validated
 66 | 
 67 |     def handle_input(self, key: int) -> bool:
 68 |         """
 69 |         Handle keyboard input for requirements form navigation and validation.
 70 |         
 71 |         Args:
 72 |             key: The curses key code from user input
 73 |             
 74 |         Returns:
 75 |             bool: True if the input was handled, False otherwise
 76 |         """
 77 | 
 78 |         handlers: dict[int, callable] = {
 79 |             curses.KEY_UP: self.__navigate_form_selection_up,
 80 |             curses.KEY_DOWN: self.__navigate_form_selection_down,
 81 |             ord('\t'): self.__handle_tab,
 82 |             ord(' '): self.__handle_spacebar,
 83 |             ord('\n'): self.__handle_enter,
 84 |             ord('\r'): self.__handle_enter,
 85 |         }
 86 | 
 87 |         handler = handlers.get(key)
 88 |         if handler:
 89 |             handler()
 90 |             return True
 91 | 
 92 |         if (self.__form_selected_field == RequirementsFormField.DATASRC and
 93 |             self.__form_selected_index == 0):
 94 |             return self.__datasrc_input.handle_input(key)
 95 | 
 96 |         return False
 97 | 
 98 |     def render(self, stdscr: curses.window) -> None:
 99 |         """
100 |         Render the requirements form showing crawler selection and datasrc input.
101 |         
102 |         Args:
103 |             stdscr: The curses window to draw on
104 |         """
105 |         xb: int = self.bounds.x
106 |         yb: int = self.bounds.y
107 |         y_current: int = yb + 2
108 |         # y_max: int = yb + self.bounds.height
109 | 
110 |         safe_addstr(stdscr, y_current, xb + 2, "Data Source Path:", curses.A_BOLD)
111 |         y_current += 1
112 | 
113 |         box_width: int = min(LAYOUT_BOX_MAX_WIDTH, self.bounds.width - LAYOUT_BOX_MARGIN)
114 |         is_datasrc_selected: bool = (
115 |                 self.__form_selected_field == RequirementsFormField.DATASRC
116 |                 and self.__form_selected_index == 0
117 |         )
118 |         field_style: int
119 |         if is_datasrc_selected:
120 |             field_style = curses.A_REVERSE
121 |         else:
122 |             field_style = self.session.get_theme_color_pair(ThemeDefinition.INACTIVE_QUERY)
123 | 
124 |         self.__datasrc_input.render(stdscr, y_current, xb + 4, box_width,
125 |                 focused=is_datasrc_selected, style=field_style)
126 | 
127 |         y_current += 2
128 | 
129 |         crawler_y_start: int = y_current
130 | 
131 |         safe_addstr(stdscr, y_current, xb + 2, self.__crawler_group.label, curses.A_BOLD)
132 |         y_current += 1
133 | 
134 |         for i, radio in enumerate(self.__crawler_group.radios):
135 |             crawler_field_index: int = i + 1
136 |             is_crawler_field_selected: bool = (self.__form_selected_field == RequirementsFormField.CRAWLER and
137 |                     self.__form_selected_index == crawler_field_index)
138 | 
139 |             radio.render(stdscr, y_current, xb + 4, crawler_field_index, 100, is_crawler_field_selected)
140 |             y_current += 1
141 | 
142 |         validation_y: int = crawler_y_start
143 | 
144 |         selected_crawler: str = self.__crawler_group.value
145 |         crawler_valid: bool = selected_crawler in VALID_CRAWLER_CHOICES
146 |         crawler_symbol: str = "🗹" if crawler_valid else "☒"
147 | 
148 |         crawler_style: int
149 |         if crawler_valid:
150 |             crawler_style = curses.A_NORMAL
151 |         else:
152 |             crawler_style = self.session.get_theme_color_pair(ThemeDefinition.UI_ERROR)
153 | 
154 |         datasrc_path: str = self.__datasrc_input.value
155 |         datasrc_path_obj: Path = Path(datasrc_path)
156 |         datasrc_exists: bool = datasrc_path_obj.exists()
157 | 
158 |         datasrc_symbol: str
159 |         datasrc_valid: bool
160 |         if not datasrc_exists:
161 |             datasrc_symbol = "☒"
162 |             datasrc_valid = False
163 |         else:
164 |             is_correct_type: bool
165 |             if selected_crawler in ("interrobot", "warc"):
166 |                 is_correct_type = datasrc_path_obj.is_file()
167 |             else:
168 |                 is_correct_type = datasrc_path_obj.is_dir()
169 | 
170 |             datasrc_symbol = "🗹" if is_correct_type else "☒"
171 |             datasrc_valid = is_correct_type
172 | 
173 |         datasrc_style: int
174 |         if datasrc_valid:
175 |             datasrc_style = curses.A_NORMAL
176 |         else:
177 |             datasrc_style = self.session.get_theme_color_pair(ThemeDefinition.UI_ERROR)
178 | 
179 |         validation_header: str = "Validation Status:"
180 |         header_x: int = xb + VALIDATION_HEADER_X_OFFSET
181 |         safe_addstr(stdscr, validation_y, header_x, validation_header, curses.A_BOLD)
182 |         validation_y += 1
183 | 
184 |         validation_word_x: int = header_x
185 |         crawler_text: str = f"{crawler_symbol}  --crawler"
186 |         safe_addstr(stdscr, validation_y, validation_word_x, "  ", curses.A_NORMAL)
187 |         safe_addstr(stdscr, validation_y, validation_word_x + VALIDATION_TEXT_INDENT, crawler_text, crawler_style)
188 |         validation_y += 1
189 | 
190 |         datasrc_text: str = f"{datasrc_symbol}  --datasrc"
191 |         safe_addstr(stdscr, validation_y, validation_word_x, "  ", curses.A_NORMAL)
192 |         safe_addstr(stdscr, validation_y, validation_word_x + VALIDATION_TEXT_INDENT, datasrc_text, datasrc_style)
193 | 
194 |     def __autosense_crawler_and_datasrc(self) -> tuple[str, str] | tuple[None, None]:
195 |         """
196 |         Auto-detect crawler type and datasrc from file signatures in cwd and its subdirectories.
197 |         
198 |         Returns:
199 |             tuple: (crawler, datasrc); ("wget", cwd) is the fallback when no signature matches
200 |         """
201 |         cwd: Path = Path(os.getcwd()).absolute()
202 | 
203 |         if list(cwd.glob("*.v2.db")):
204 |             db_file: Path = next(cwd.glob("*.v2.db"))
205 |             return ("interrobot", str(db_file))
206 | 
207 |         archive_directories: list[Path] = list(cwd.glob("*/archive"))
208 |         if archive_directories:
209 |             for archive_directory in archive_directories:
210 |                 timestamp_directories: list[Path] = [d for d in archive_directory.iterdir()
211 |                         if d.is_dir() and d.name.replace('.', '').isdigit()]
212 |                 if timestamp_directories:
213 |                     return ("archivebox", str(cwd))
214 | 
215 |         if list(cwd.glob("*/output.*.txt")):
216 |             return ("siteone", str(cwd))
217 | 
218 |         if list(cwd.glob("*/hts-log.txt")) or list(cwd.glob("*/*/hts-log.txt")):
219 |             return ("httrack", str(cwd))
220 | 
221 |         katana_files: list[Path] = list(cwd.glob("*/*/*.txt"))
222 |         for f in katana_files:
223 |             if len(f.stem) == 40 and all(c in '0123456789abcdef' for c in f.stem.lower()):
224 |                 return ("katana", str(cwd))
225 | 
226 |         warc_files: list[Path] = list(cwd.glob("*.warc.gz")) + list(cwd.glob("*.warc"))
227 |         if warc_files:
228 |             return ("warc", str(cwd))
229 | 
230 |         if list(cwd.glob("*/index.html")):
231 |             return ("wget", str(cwd))
232 | 
233 |         return ("wget", self.__get_default_directory())
234 | 
235 |     def __get_default_directory(self) -> str:
236 |         """
237 |         Get the default directory path.
238 |         
239 |         Returns:
240 |             str: The absolute path of the current working directory
241 |         """
242 |         return str(Path(os.getcwd()).absolute())
243 | 
244 |     def __handle_enter(self) -> None:
245 |         """
246 |         Handle ENTER key to revalidate in datasrc field or toggle in crawler field.
247 |         """
248 |         if self.__form_selected_field == RequirementsFormField.DATASRC:
249 |             selected_crawler: str = self.__crawler_group.value
250 |             self.__validated = self.__validate(selected_crawler, self.__datasrc_input.value)
251 |             self.__update_session()
252 |             if self.__validated:
253 |                 self.session.set_ui_state(UiState.SEARCH_INIT)
254 |         elif self.__form_selected_field == RequirementsFormField.CRAWLER:
255 |             crawler_index: int = self.__form_selected_index - 1
256 |             if 0 <= crawler_index < len(self.__crawler_group.radios):
257 |                 self.__crawler_group.radios[crawler_index].next_state()
258 | 
259 |     def __handle_spacebar(self) -> None:
260 |         """
261 |         Handle spacebar to toggle crawler selection or add space to datasrc.
262 |         """
263 |         if self.__form_selected_field == RequirementsFormField.DATASRC:
264 |             self.__datasrc_input.handle_input(ord(" "))
265 |         elif self.__form_selected_field == RequirementsFormField.CRAWLER:
266 |             crawler_index: int = self.__form_selected_index - 1
267 |             if 0 <= crawler_index < len(self.__crawler_group.radios):
268 |                 self.__crawler_group.radios[crawler_index].next_state()
269 | 
270 |     def __handle_tab(self) -> None:
271 |         """
272 |         Handle TAB key to switch between field groups.
273 |         """
274 |         if self.__form_selected_field == RequirementsFormField.DATASRC:
275 |             self.__form_selected_field = RequirementsFormField.CRAWLER
276 |             self.__form_selected_index = 1
277 |         else:
278 |             self.__form_selected_field = RequirementsFormField.DATASRC
279 |             self.__form_selected_index = 0
280 | 
281 |     def __navigate_form_selection_down(self) -> None:
282 |         """
283 |         Navigate down within current field or switch to next field group.
284 |         """
285 |         if self.__form_selected_field == RequirementsFormField.DATASRC:
286 |             self.__form_selected_field = RequirementsFormField.CRAWLER
287 |             self.__form_selected_index = 1
288 |         elif self.__form_selected_field == RequirementsFormField.CRAWLER:
289 |             if self.__form_selected_index < len(self.__crawler_group.radios):
290 |                 self.__form_selected_index += 1
291 |             else:
292 |                 self.__form_selected_field = RequirementsFormField.DATASRC
293 |                 self.__form_selected_index = 0
294 | 
295 |     def __navigate_form_selection_up(self) -> None:
296 |         """
297 |         Navigate up within current field or switch to previous field group.
298 |         """
299 |         if self.__form_selected_field == RequirementsFormField.DATASRC:
300 |             self.__form_selected_field = RequirementsFormField.CRAWLER
301 |             self.__form_selected_index = len(self.__crawler_group.radios)
302 |         elif self.__form_selected_field == RequirementsFormField.CRAWLER:
303 |             if self.__form_selected_index > 1:
304 |                 self.__form_selected_index -= 1
305 |             else:
306 |                 self.__form_selected_field = RequirementsFormField.DATASRC
307 |                 self.__form_selected_index = 0
308 | 
309 |     def __set_initial_crawler_selection(self, initial_crawler: str) -> None:
310 |         """
311 |         Set the initial crawler selection in the radio group.
312 |         
313 |         Args:
314 |             initial_crawler: The crawler type to initially select
315 |         """
316 |         if initial_crawler in VALID_CRAWLER_CHOICES:
317 |             crawler_index: int = VALID_CRAWLER_CHOICES.index(initial_crawler)
318 |             if 0 <= crawler_index < len(self.__crawler_group.radios):
319 |                 self.__crawler_group.radios[crawler_index].next_state()
320 | 
321 |     def __update_session(self) -> None:
322 |         """
323 |         Update the session with current form values.
324 |         """
325 |         # push the new app configuration into the session
326 |         selected_crawler: str = self.__crawler_group.value
327 |         self.session.set_init_input_args(selected_crawler, self.__datasrc_input.value)
328 |         if self.__validated:
329 |             try:
330 |                 crawl_model: BaseCrawler = get_crawler(selected_crawler)
331 |                 crawler: BaseCrawler = crawl_model(Path(self.__datasrc_input.value))
332 |                 self.session.set_init_crawler(crawler)
333 |                 sites_api: BaseJsonApi = self.session.crawler.get_sites_api()
334 |                 self.session.set_init_sites(sites_api.get_results())
335 |                 searchform: SearchFormView = SearchFormView(
336 |                     self.session,
337 |                     self.session.sites
338 |                 )
339 |                 self.session.set_init_searchform(searchform)
340 |             except Exception as ex:
341 |                 self.session.debug_add(f"Error initializing crawler: {ex}\n{traceback.format_exc()}")
342 |                 self.__validated = False
343 | 
344 |     def __validate(self, crawler: str, datasrc: str) -> bool:
345 |         """
346 |         Validate crawler and datasrc combination.
347 |         
348 |         Args:
349 |             crawler: The crawler type to validate
350 |             datasrc: The data source path to validate
351 |             
352 |         Returns:
353 |             bool: True if the combination is valid, False otherwise
354 |         """
355 |         if not isinstance(datasrc, str) or not isinstance(crawler, str):
356 |             return False
357 | 
358 |         crawler_valid: bool = crawler in VALID_CRAWLER_CHOICES
359 | 
360 |         if datasrc in (None, ""):
361 |             return False
362 | 
363 |         datasrc_path: Path = Path(datasrc)
364 |         if not datasrc_path.exists():
365 |             return False
366 | 
367 |         if crawler in ("interrobot", "warc"):
368 |             datasrc_valid = datasrc_path.is_file()
369 |         else:
370 |             datasrc_valid = datasrc_path.is_dir()
371 | 
372 |         return crawler_valid and datasrc_valid
373 | 
```

--------------------------------------------------------------------------------
/docs/_modules/mcp_server_webcrawl/crawlers/wget/tests.html:
--------------------------------------------------------------------------------

```html
  1 | 
  2 | 
  3 | <!DOCTYPE html>
  4 | <html class="writer-html5" lang="en" data-content_root="../../../../">
  5 | <head>
  6 |   <meta charset="utf-8" />
  7 |   <meta name="viewport" content="width=device-width, initial-scale=1.0" />
  8 |   <title>mcp_server_webcrawl.crawlers.wget.tests &mdash; mcp-server-webcrawl  documentation</title>
  9 |       <link rel="stylesheet" type="text/css" href="../../../../_static/pygments.css?v=80d5e7a1" />
 10 |       <link rel="stylesheet" type="text/css" href="../../../../_static/css/theme.css?v=e59714d7" />
 11 | 
 12 |   
 13 |       <script src="../../../../_static/jquery.js?v=5d32c60e"></script>
 14 |       <script src="../../../../_static/_sphinx_javascript_frameworks_compat.js?v=2cd50e6c"></script>
 15 |       <script src="../../../../_static/documentation_options.js?v=5929fcd5"></script>
 16 |       <script src="../../../../_static/doctools.js?v=888ff710"></script>
 17 |       <script src="../../../../_static/sphinx_highlight.js?v=dc90522c"></script>
 18 |     <script src="../../../../_static/js/theme.js"></script>
 19 |     <link rel="index" title="Index" href="../../../../genindex.html" />
 20 |     <link rel="search" title="Search" href="../../../../search.html" /> 
 21 | </head>
 22 | 
 23 | <body class="wy-body-for-nav"> 
 24 |   <div class="wy-grid-for-nav">
 25 |     <nav data-toggle="wy-nav-shift" class="wy-nav-side">
 26 |       <div class="wy-side-scroll">
 27 |         <div class="wy-side-nav-search" >
 28 | 
 29 |           
 30 |           
 31 |           <a href="../../../../index.html" class="icon icon-home">
 32 |             mcp-server-webcrawl
 33 |           </a>
 34 | <div role="search">
 35 |   <form id="rtd-search-form" class="wy-form" action="../../../../search.html" method="get">
 36 |     <input type="text" name="q" placeholder="Search docs" aria-label="Search docs" />
 37 |     <input type="hidden" name="check_keywords" value="yes" />
 38 |     <input type="hidden" name="area" value="default" />
 39 |   </form>
 40 | </div>
 41 |         </div><div class="wy-menu wy-menu-vertical" data-spy="affix" role="navigation" aria-label="Navigation menu">
 42 |               <p class="caption" role="heading"><span class="caption-text">Contents:</span></p>
 43 | <ul>
 44 | <li class="toctree-l1"><a class="reference internal" href="../../../../installation.html">Installation</a></li>
 45 | <li class="toctree-l1"><a class="reference internal" href="../../../../guides.html">Setup Guides</a></li>
 46 | <li class="toctree-l1"><a class="reference internal" href="../../../../usage.html">Usage</a></li>
 47 | <li class="toctree-l1"><a class="reference internal" href="../../../../prompts.html">Prompt Routines</a></li>
 48 | <li class="toctree-l1"><a class="reference internal" href="../../../../interactive.html">Interactive Mode</a></li>
 49 | <li class="toctree-l1"><a class="reference internal" href="../../../../modules.html">mcp_server_webcrawl</a></li>
 50 | </ul>
 51 | 
 52 |         </div>
 53 |       </div>
 54 |     </nav>
 55 | 
 56 |     <section data-toggle="wy-nav-shift" class="wy-nav-content-wrap"><nav class="wy-nav-top" aria-label="Mobile navigation menu" >
 57 |           <i data-toggle="wy-nav-top" class="fa fa-bars"></i>
 58 |           <a href="../../../../index.html">mcp-server-webcrawl</a>
 59 |       </nav>
 60 | 
 61 |       <div class="wy-nav-content">
 62 |         <div class="rst-content">
 63 |           <div role="navigation" aria-label="Page navigation">
 64 |   <ul class="wy-breadcrumbs">
 65 |       <li><a href="../../../../index.html" class="icon icon-home" aria-label="Home"></a></li>
 66 |           <li class="breadcrumb-item"><a href="../../../index.html">Module code</a></li>
 67 |           <li class="breadcrumb-item"><a href="../../crawlers.html">mcp_server_webcrawl.crawlers</a></li>
 68 |       <li class="breadcrumb-item active">mcp_server_webcrawl.crawlers.wget.tests</li>
 69 |       <li class="wy-breadcrumbs-aside">
 70 |       </li>
 71 |   </ul>
 72 |   <hr/>
 73 | </div>
 74 |           <div role="main" class="document" itemscope="itemscope" itemtype="http://schema.org/Article">
 75 |            <div itemprop="articleBody">
 76 |              
 77 |   <h1>Source code for mcp_server_webcrawl.crawlers.wget.tests</h1><div class="highlight"><pre>
 78 | <span></span><span class="kn">from</span> <span class="nn">mcp_server_webcrawl.crawlers</span> <span class="kn">import</span> <span class="n">get_fixture_directory</span>
 79 | <span class="kn">from</span> <span class="nn">mcp_server_webcrawl.crawlers.wget.adapter</span> <span class="kn">import</span> <span class="n">WgetManager</span>
 80 | <span class="kn">from</span> <span class="nn">mcp_server_webcrawl.crawlers.wget.crawler</span> <span class="kn">import</span> <span class="n">WgetCrawler</span>
 81 | <span class="kn">from</span> <span class="nn">mcp_server_webcrawl.crawlers.base.tests</span> <span class="kn">import</span> <span class="n">BaseCrawlerTests</span>
 82 | <span class="kn">from</span> <span class="nn">mcp_server_webcrawl.utils.logger</span> <span class="kn">import</span> <span class="n">get_logger</span>
 83 | 
 84 | <span class="n">logger</span> <span class="o">=</span> <span class="n">get_logger</span><span class="p">()</span>
 85 | 
 86 | <span class="n">EXAMPLE_SITE_ID</span> <span class="o">=</span> <span class="n">WgetManager</span><span class="o">.</span><span class="n">string_to_id</span><span class="p">(</span><span class="s2">&quot;example.com&quot;</span><span class="p">)</span>
 87 | <span class="n">PRAGMAR_SITE_ID</span> <span class="o">=</span> <span class="n">WgetManager</span><span class="o">.</span><span class="n">string_to_id</span><span class="p">(</span><span class="s2">&quot;pragmar.com&quot;</span><span class="p">)</span>
 88 | 
 89 | <div class="viewcode-block" id="WgetTests">
 90 | <a class="viewcode-back" href="../../../../mcp_server_webcrawl.crawlers.wget.html#mcp_server_webcrawl.crawlers.wget.tests.WgetTests">[docs]</a>
 91 | <span class="k">class</span> <span class="nc">WgetTests</span><span class="p">(</span><span class="n">BaseCrawlerTests</span><span class="p">):</span>
 92 | <span class="w">    </span><span class="sd">&quot;&quot;&quot;</span>
 93 | <span class="sd">    Test suite for the wget crawler implementation.</span>
 94 | <span class="sd">    Uses all wrapped test methods from BaseCrawlerTests.</span>
 95 | <span class="sd">    &quot;&quot;&quot;</span>
 96 | 
 97 | <div class="viewcode-block" id="WgetTests.setUp">
 98 | <a class="viewcode-back" href="../../../../mcp_server_webcrawl.crawlers.wget.html#mcp_server_webcrawl.crawlers.wget.tests.WgetTests.setUp">[docs]</a>
 99 |     <span class="k">def</span> <span class="nf">setUp</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
100 | <span class="w">        </span><span class="sd">&quot;&quot;&quot;</span>
101 | <span class="sd">        Set up the test environment with fixture data.</span>
102 | <span class="sd">        &quot;&quot;&quot;</span>
103 |         <span class="nb">super</span><span class="p">()</span><span class="o">.</span><span class="n">setUp</span><span class="p">()</span>
104 |         <span class="bp">self</span><span class="o">.</span><span class="n">_datasrc</span> <span class="o">=</span> <span class="n">get_fixture_directory</span><span class="p">()</span> <span class="o">/</span> <span class="s2">&quot;wget&quot;</span></div>
105 | 
106 | 
107 | <div class="viewcode-block" id="WgetTests.test_wget_pulse">
108 | <a class="viewcode-back" href="../../../../mcp_server_webcrawl.crawlers.wget.html#mcp_server_webcrawl.crawlers.wget.tests.WgetTests.test_wget_pulse">[docs]</a>
109 |     <span class="k">def</span> <span class="nf">test_wget_pulse</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
110 | <span class="w">        </span><span class="sd">&quot;&quot;&quot;</span>
111 | <span class="sd">        Test basic crawler initialization.</span>
112 | <span class="sd">        &quot;&quot;&quot;</span>
113 |         <span class="n">crawler</span> <span class="o">=</span> <span class="n">WgetCrawler</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_datasrc</span><span class="p">)</span>
114 |         <span class="bp">self</span><span class="o">.</span><span class="n">assertIsNotNone</span><span class="p">(</span><span class="n">crawler</span><span class="p">)</span>
115 |         <span class="bp">self</span><span class="o">.</span><span class="n">assertTrue</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_datasrc</span><span class="o">.</span><span class="n">is_dir</span><span class="p">())</span></div>
116 | 
117 | 
118 | <div class="viewcode-block" id="WgetTests.test_wget_sites">
119 | <a class="viewcode-back" href="../../../../mcp_server_webcrawl.crawlers.wget.html#mcp_server_webcrawl.crawlers.wget.tests.WgetTests.test_wget_sites">[docs]</a>
120 |     <span class="k">def</span> <span class="nf">test_wget_sites</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
121 | <span class="w">        </span><span class="sd">&quot;&quot;&quot;</span>
122 | <span class="sd">        Test site retrieval API functionality.</span>
123 | <span class="sd">        &quot;&quot;&quot;</span>
124 |         <span class="n">crawler</span> <span class="o">=</span> <span class="n">WgetCrawler</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_datasrc</span><span class="p">)</span>
125 |         <span class="bp">self</span><span class="o">.</span><span class="n">run_pragmar_site_tests</span><span class="p">(</span><span class="n">crawler</span><span class="p">,</span> <span class="n">PRAGMAR_SITE_ID</span><span class="p">)</span></div>
126 | 
127 | 
128 | <div class="viewcode-block" id="WgetTests.test_wget_search">
129 | <a class="viewcode-back" href="../../../../mcp_server_webcrawl.crawlers.wget.html#mcp_server_webcrawl.crawlers.wget.tests.WgetTests.test_wget_search">[docs]</a>
130 |     <span class="k">def</span> <span class="nf">test_wget_search</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
131 | <span class="w">        </span><span class="sd">&quot;&quot;&quot;</span>
132 | <span class="sd">        Test boolean search functionality.</span>
133 | <span class="sd">        &quot;&quot;&quot;</span>
134 |         <span class="c1"># moved fixtures to own repo, lost some local media,</span>
135 |         <span class="c1"># but checks out. wget fixture has no CSS/JS/etc.</span>
136 |         <span class="c1"># HTML-only and just doesn&#39;t do well with the full array of</span>
137 |         <span class="c1"># tests concerning fulltext, media, and mixed search result</span>
138 |         <span class="c1"># counts. probably needs a reduced set of tests</span>
139 |         <span class="c1"># self.run_pragmar_search_tests(crawler, PRAGMAR_SITE_ID)</span>
140 |         <span class="k">return</span></div>
141 | 
142 | 
143 | <div class="viewcode-block" id="WgetTests.test_wget_resources">
144 | <a class="viewcode-back" href="../../../../mcp_server_webcrawl.crawlers.wget.html#mcp_server_webcrawl.crawlers.wget.tests.WgetTests.test_wget_resources">[docs]</a>
145 |     <span class="k">def</span> <span class="nf">test_wget_resources</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
146 | <span class="w">        </span><span class="sd">&quot;&quot;&quot;</span>
147 | <span class="sd">        Test resource retrieval API functionality with various parameters.</span>
148 | <span class="sd">        &quot;&quot;&quot;</span>
149 |         <span class="n">crawler</span> <span class="o">=</span> <span class="n">WgetCrawler</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_datasrc</span><span class="p">)</span>
150 |         <span class="bp">self</span><span class="o">.</span><span class="n">run_sites_resources_tests</span><span class="p">(</span><span class="n">crawler</span><span class="p">,</span> <span class="n">PRAGMAR_SITE_ID</span><span class="p">,</span> <span class="n">EXAMPLE_SITE_ID</span><span class="p">)</span></div>
151 | 
152 | 
153 | 
154 | <div class="viewcode-block" id="WgetTests.test_wget_sorts">
155 | <a class="viewcode-back" href="../../../../mcp_server_webcrawl.crawlers.wget.html#mcp_server_webcrawl.crawlers.wget.tests.WgetTests.test_wget_sorts">[docs]</a>
156 |     <span class="k">def</span> <span class="nf">test_wget_sorts</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
157 | <span class="w">        </span><span class="sd">&quot;&quot;&quot;</span>
158 | <span class="sd">        Test random sort functionality using the &#39;?&#39; sort parameter.</span>
159 | <span class="sd">        &quot;&quot;&quot;</span>
160 |         <span class="n">crawler</span> <span class="o">=</span> <span class="n">WgetCrawler</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_datasrc</span><span class="p">)</span>
161 |         <span class="bp">self</span><span class="o">.</span><span class="n">run_pragmar_sort_tests</span><span class="p">(</span><span class="n">crawler</span><span class="p">,</span> <span class="n">PRAGMAR_SITE_ID</span><span class="p">)</span></div>
162 | 
163 | 
164 | <div class="viewcode-block" id="WgetTests.test_wget_content_parsing">
165 | <a class="viewcode-back" href="../../../../mcp_server_webcrawl.crawlers.wget.html#mcp_server_webcrawl.crawlers.wget.tests.WgetTests.test_wget_content_parsing">[docs]</a>
166 |     <span class="k">def</span> <span class="nf">test_wget_content_parsing</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
167 | <span class="w">        </span><span class="sd">&quot;&quot;&quot;</span>
168 | <span class="sd">        Test content type detection and parsing.</span>
169 | <span class="sd">        &quot;&quot;&quot;</span>
170 |         <span class="n">crawler</span> <span class="o">=</span> <span class="n">WgetCrawler</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_datasrc</span><span class="p">)</span>
171 |         <span class="bp">self</span><span class="o">.</span><span class="n">run_pragmar_content_tests</span><span class="p">(</span><span class="n">crawler</span><span class="p">,</span> <span class="n">PRAGMAR_SITE_ID</span><span class="p">,</span> <span class="kc">False</span><span class="p">)</span></div>
172 | 
173 | 
174 | <div class="viewcode-block" id="WgetTests.test_report">
175 | <a class="viewcode-back" href="../../../../mcp_server_webcrawl.crawlers.wget.html#mcp_server_webcrawl.crawlers.wget.tests.WgetTests.test_report">[docs]</a>
176 |     <span class="k">def</span> <span class="nf">test_report</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
177 | <span class="w">        </span><span class="sd">&quot;&quot;&quot;</span>
178 | <span class="sd">        Run test report, save to data directory.</span>
179 | <span class="sd">        &quot;&quot;&quot;</span>
180 |         <span class="n">crawler</span> <span class="o">=</span> <span class="n">WgetCrawler</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_datasrc</span><span class="p">)</span>
181 |         <span class="n">logger</span><span class="o">.</span><span class="n">info</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">run_pragmar_report</span><span class="p">(</span><span class="n">crawler</span><span class="p">,</span> <span class="n">PRAGMAR_SITE_ID</span><span class="p">,</span> <span class="s2">&quot;wget&quot;</span><span class="p">))</span></div>
182 | </div>
183 | 
184 | </pre></div>
185 | 
186 |            </div>
187 |           </div>
188 |           <footer>
189 | 
190 |   <hr/>
191 | 
192 |   <div role="contentinfo">
193 |     <p>&#169; Copyright 2025, pragmar.</p>
194 |   </div>
195 | 
196 |   Built with <a href="https://www.sphinx-doc.org/">Sphinx</a> using a
197 |     <a href="https://github.com/readthedocs/sphinx_rtd_theme">theme</a>
198 |     provided by <a href="https://readthedocs.org">Read the Docs</a>.
199 |    
200 | 
201 | </footer>
202 |         </div>
203 |       </div>
204 |     </section>
205 |   </div>
206 |   <script>
207 |       jQuery(function () {
208 |           SphinxRtdTheme.Navigation.enable(true);
209 |       });
210 |   </script> 
211 | 
212 | </body>
213 | </html>
```

--------------------------------------------------------------------------------
/docs/_static/basic.css:
--------------------------------------------------------------------------------

```css
  1 | /*
  2 |  * basic.css
  3 |  * ~~~~~~~~~
  4 |  *
  5 |  * Sphinx stylesheet -- basic theme.
  6 |  *
  7 |  * :copyright: Copyright 2007-2023 by the Sphinx team, see AUTHORS.
  8 |  * :license: BSD, see LICENSE for details.
  9 |  *
 10 |  */
 11 | 
 12 | /* -- main layout ----------------------------------------------------------- */
 13 | 
 14 | div.clearer {
 15 |     clear: both;
 16 | }
 17 | 
 18 | div.section::after {
 19 |     display: block;
 20 |     content: '';
 21 |     clear: left;
 22 | }
 23 | 
 24 | /* -- relbar ---------------------------------------------------------------- */
 25 | 
 26 | div.related {
 27 |     width: 100%;
 28 |     font-size: 90%;
 29 | }
 30 | 
 31 | div.related h3 {
 32 |     display: none;
 33 | }
 34 | 
 35 | div.related ul {
 36 |     margin: 0;
 37 |     padding: 0 0 0 10px;
 38 |     list-style: none;
 39 | }
 40 | 
 41 | div.related li {
 42 |     display: inline;
 43 | }
 44 | 
 45 | div.related li.right {
 46 |     float: right;
 47 |     margin-right: 5px;
 48 | }
 49 | 
 50 | /* -- sidebar --------------------------------------------------------------- */
 51 | 
 52 | div.sphinxsidebarwrapper {
 53 |     padding: 10px 5px 0 10px;
 54 | }
 55 | 
 56 | div.sphinxsidebar {
 57 |     float: left;
 58 |     width: 230px;
 59 |     margin-left: -100%;
 60 |     font-size: 90%;
 61 |     word-wrap: break-word;
 62 |     overflow-wrap : break-word;
 63 | }
 64 | 
 65 | div.sphinxsidebar ul {
 66 |     list-style: none;
 67 | }
 68 | 
 69 | div.sphinxsidebar ul ul,
 70 | div.sphinxsidebar ul.want-points {
 71 |     margin-left: 20px;
 72 |     list-style: square;
 73 | }
 74 | 
 75 | div.sphinxsidebar ul ul {
 76 |     margin-top: 0;
 77 |     margin-bottom: 0;
 78 | }
 79 | 
 80 | div.sphinxsidebar form {
 81 |     margin-top: 10px;
 82 | }
 83 | 
 84 | div.sphinxsidebar input {
 85 |     border: 1px solid #98dbcc;
 86 |     font-family: sans-serif;
 87 |     font-size: 1em;
 88 | }
 89 | 
 90 | div.sphinxsidebar #searchbox form.search {
 91 |     overflow: hidden;
 92 | }
 93 | 
 94 | div.sphinxsidebar #searchbox input[type="text"] {
 95 |     float: left;
 96 |     width: 80%;
 97 |     padding: 0.25em;
 98 |     box-sizing: border-box;
 99 | }
100 | 
101 | div.sphinxsidebar #searchbox input[type="submit"] {
102 |     float: left;
103 |     width: 20%;
104 |     border-left: none;
105 |     padding: 0.25em;
106 |     box-sizing: border-box;
107 | }
108 | 
109 | 
110 | img {
111 |     border: 0;
112 |     max-width: 100%;
113 | }
114 | 
115 | /* -- search page ----------------------------------------------------------- */
116 | 
117 | ul.search {
118 |     margin: 10px 0 0 20px;
119 |     padding: 0;
120 | }
121 | 
122 | ul.search li {
123 |     padding: 5px 0 5px 20px;
124 |     background-image: url(file.png);
125 |     background-repeat: no-repeat;
126 |     background-position: 0 7px;
127 | }
128 | 
129 | ul.search li a {
130 |     font-weight: bold;
131 | }
132 | 
133 | ul.search li p.context {
134 |     color: #888;
135 |     margin: 2px 0 0 30px;
136 |     text-align: left;
137 | }
138 | 
139 | ul.keywordmatches li.goodmatch a {
140 |     font-weight: bold;
141 | }
142 | 
143 | /* -- index page ------------------------------------------------------------ */
144 | 
145 | table.contentstable {
146 |     width: 90%;
147 |     margin-left: auto;
148 |     margin-right: auto;
149 | }
150 | 
151 | table.contentstable p.biglink {
152 |     line-height: 150%;
153 | }
154 | 
155 | a.biglink {
156 |     font-size: 1.3em;
157 | }
158 | 
159 | span.linkdescr {
160 |     font-style: italic;
161 |     padding-top: 5px;
162 |     font-size: 90%;
163 | }
164 | 
165 | /* -- general index --------------------------------------------------------- */
166 | 
167 | table.indextable {
168 |     width: 100%;
169 | }
170 | 
171 | table.indextable td {
172 |     text-align: left;
173 |     vertical-align: top;
174 | }
175 | 
176 | table.indextable ul {
177 |     margin-top: 0;
178 |     margin-bottom: 0;
179 |     list-style-type: none;
180 | }
181 | 
182 | table.indextable > tbody > tr > td > ul {
183 |     padding-left: 0em;
184 | }
185 | 
186 | table.indextable tr.pcap {
187 |     height: 10px;
188 | }
189 | 
190 | table.indextable tr.cap {
191 |     margin-top: 10px;
192 |     background-color: #f2f2f2;
193 | }
194 | 
195 | img.toggler {
196 |     margin-right: 3px;
197 |     margin-top: 3px;
198 |     cursor: pointer;
199 | }
200 | 
201 | div.modindex-jumpbox {
202 |     border-top: 1px solid #ddd;
203 |     border-bottom: 1px solid #ddd;
204 |     margin: 1em 0 1em 0;
205 |     padding: 0.4em;
206 | }
207 | 
208 | div.genindex-jumpbox {
209 |     border-top: 1px solid #ddd;
210 |     border-bottom: 1px solid #ddd;
211 |     margin: 1em 0 1em 0;
212 |     padding: 0.4em;
213 | }
214 | 
215 | /* -- domain module index --------------------------------------------------- */
216 | 
217 | table.modindextable td {
218 |     padding: 2px;
219 |     border-collapse: collapse;
220 | }
221 | 
222 | /* -- general body styles --------------------------------------------------- */
223 | 
224 | div.body {
225 |     min-width: 360px;
226 |     max-width: 800px;
227 | }
228 | 
229 | div.body p, div.body dd, div.body li, div.body blockquote {
230 |     -moz-hyphens: auto;
231 |     -ms-hyphens: auto;
232 |     -webkit-hyphens: auto;
233 |     hyphens: auto;
234 | }
235 | 
236 | a.headerlink {
237 |     visibility: hidden;
238 | }
239 | 
240 | a:visited {
241 |     color: #551A8B;
242 | }
243 | 
244 | h1:hover > a.headerlink,
245 | h2:hover > a.headerlink,
246 | h3:hover > a.headerlink,
247 | h4:hover > a.headerlink,
248 | h5:hover > a.headerlink,
249 | h6:hover > a.headerlink,
250 | dt:hover > a.headerlink,
251 | caption:hover > a.headerlink,
252 | p.caption:hover > a.headerlink,
253 | div.code-block-caption:hover > a.headerlink {
254 |     visibility: visible;
255 | }
256 | 
257 | div.body p.caption {
258 |     text-align: inherit;
259 | }
260 | 
261 | div.body td {
262 |     text-align: left;
263 | }
264 | 
265 | .first {
266 |     margin-top: 0 !important;
267 | }
268 | 
269 | p.rubric {
270 |     margin-top: 30px;
271 |     font-weight: bold;
272 | }
273 | 
274 | img.align-left, figure.align-left, .figure.align-left, object.align-left {
275 |     clear: left;
276 |     float: left;
277 |     margin-right: 1em;
278 | }
279 | 
280 | img.align-right, figure.align-right, .figure.align-right, object.align-right {
281 |     clear: right;
282 |     float: right;
283 |     margin-left: 1em;
284 | }
285 | 
286 | img.align-center, figure.align-center, .figure.align-center, object.align-center {
287 |   display: block;
288 |   margin-left: auto;
289 |   margin-right: auto;
290 | }
291 | 
292 | img.align-default, figure.align-default, .figure.align-default {
293 |   display: block;
294 |   margin-left: auto;
295 |   margin-right: auto;
296 | }
297 | 
298 | .align-left {
299 |     text-align: left;
300 | }
301 | 
302 | .align-center {
303 |     text-align: center;
304 | }
305 | 
306 | .align-default {
307 |     text-align: center;
308 | }
309 | 
310 | .align-right {
311 |     text-align: right;
312 | }
313 | 
314 | /* -- sidebars -------------------------------------------------------------- */
315 | 
316 | div.sidebar,
317 | aside.sidebar {
318 |     margin: 0 0 0.5em 1em;
319 |     border: 1px solid #ddb;
320 |     padding: 7px;
321 |     background-color: #ffe;
322 |     width: 40%;
323 |     float: right;
324 |     clear: right;
325 |     overflow-x: auto;
326 | }
327 | 
328 | p.sidebar-title {
329 |     font-weight: bold;
330 | }
331 | 
332 | nav.contents,
333 | aside.topic,
334 | div.admonition, div.topic, blockquote {
335 |     clear: left;
336 | }
337 | 
338 | /* -- topics ---------------------------------------------------------------- */
339 | 
340 | nav.contents,
341 | aside.topic,
342 | div.topic {
343 |     border: 1px solid #ccc;
344 |     padding: 7px;
345 |     margin: 10px 0 10px 0;
346 | }
347 | 
348 | p.topic-title {
349 |     font-size: 1.1em;
350 |     font-weight: bold;
351 |     margin-top: 10px;
352 | }
353 | 
354 | /* -- admonitions ----------------------------------------------------------- */
355 | 
356 | div.admonition {
357 |     margin-top: 10px;
358 |     margin-bottom: 10px;
359 |     padding: 7px;
360 | }
361 | 
362 | div.admonition dt {
363 |     font-weight: bold;
364 | }
365 | 
366 | p.admonition-title {
367 |     margin: 0px 10px 5px 0px;
368 |     font-weight: bold;
369 | }
370 | 
371 | div.body p.centered {
372 |     text-align: center;
373 |     margin-top: 25px;
374 | }
375 | 
376 | /* -- content of sidebars/topics/admonitions -------------------------------- */
377 | 
378 | div.sidebar > :last-child,
379 | aside.sidebar > :last-child,
380 | nav.contents > :last-child,
381 | aside.topic > :last-child,
382 | div.topic > :last-child,
383 | div.admonition > :last-child {
384 |     margin-bottom: 0;
385 | }
386 | 
387 | div.sidebar::after,
388 | aside.sidebar::after,
389 | nav.contents::after,
390 | aside.topic::after,
391 | div.topic::after,
392 | div.admonition::after,
393 | blockquote::after {
394 |     display: block;
395 |     content: '';
396 |     clear: both;
397 | }
398 | 
399 | /* -- tables ---------------------------------------------------------------- */
400 | 
401 | table.docutils {
402 |     margin-top: 10px;
403 |     margin-bottom: 10px;
404 |     border: 0;
405 |     border-collapse: collapse;
406 | }
407 | 
408 | table.align-center {
409 |     margin-left: auto;
410 |     margin-right: auto;
411 | }
412 | 
413 | table.align-default {
414 |     margin-left: auto;
415 |     margin-right: auto;
416 | }
417 | 
418 | table caption span.caption-number {
419 |     font-style: italic;
420 | }
421 | 
422 | table caption span.caption-text {
423 | }
424 | 
425 | table.docutils td, table.docutils th {
426 |     padding: 1px 8px 1px 5px;
427 |     border-top: 0;
428 |     border-left: 0;
429 |     border-right: 0;
430 |     border-bottom: 1px solid #aaa;
431 | }
432 | 
433 | th {
434 |     text-align: left;
435 |     padding-right: 5px;
436 | }
437 | 
438 | table.citation {
439 |     border-left: solid 1px gray;
440 |     margin-left: 1px;
441 | }
442 | 
443 | table.citation td {
444 |     border-bottom: none;
445 | }
446 | 
447 | th > :first-child,
448 | td > :first-child {
449 |     margin-top: 0px;
450 | }
451 | 
452 | th > :last-child,
453 | td > :last-child {
454 |     margin-bottom: 0px;
455 | }
456 | 
457 | /* -- figures --------------------------------------------------------------- */
458 | 
459 | div.figure, figure {
460 |     margin: 0.5em;
461 |     padding: 0.5em;
462 | }
463 | 
464 | div.figure p.caption, figcaption {
465 |     padding: 0.3em;
466 | }
467 | 
468 | div.figure p.caption span.caption-number,
469 | figcaption span.caption-number {
470 |     font-style: italic;
471 | }
472 | 
473 | div.figure p.caption span.caption-text,
474 | figcaption span.caption-text {
475 | }
476 | 
477 | /* -- field list styles ----------------------------------------------------- */
478 | 
479 | table.field-list td, table.field-list th {
480 |     border: 0 !important;
481 | }
482 | 
483 | .field-list ul {
484 |     margin: 0;
485 |     padding-left: 1em;
486 | }
487 | 
488 | .field-list p {
489 |     margin: 0;
490 | }
491 | 
492 | .field-name {
493 |     -moz-hyphens: manual;
494 |     -ms-hyphens: manual;
495 |     -webkit-hyphens: manual;
496 |     hyphens: manual;
497 | }
498 | 
499 | /* -- hlist styles ---------------------------------------------------------- */
500 | 
501 | table.hlist {
502 |     margin: 1em 0;
503 | }
504 | 
505 | table.hlist td {
506 |     vertical-align: top;
507 | }
508 | 
509 | /* -- object description styles --------------------------------------------- */
510 | 
511 | .sig {
512 | 	font-family: 'Consolas', 'Menlo', 'DejaVu Sans Mono', 'Bitstream Vera Sans Mono', monospace;
513 | }
514 | 
515 | .sig-name, code.descname {
516 |     background-color: transparent;
517 |     font-weight: bold;
518 | }
519 | 
520 | .sig-name {
521 | 	font-size: 1.1em;
522 | }
523 | 
524 | code.descname {
525 |     font-size: 1.2em;
526 | }
527 | 
528 | .sig-prename, code.descclassname {
529 |     background-color: transparent;
530 | }
531 | 
532 | .optional {
533 |     font-size: 1.3em;
534 | }
535 | 
536 | .sig-paren {
537 |     font-size: larger;
538 | }
539 | 
540 | .sig-param.n {
541 | 	font-style: italic;
542 | }
543 | 
544 | /* C++ specific styling */
545 | 
546 | .sig-inline.c-texpr,
547 | .sig-inline.cpp-texpr {
548 | 	font-family: unset;
549 | }
550 | 
551 | .sig.c   .k, .sig.c   .kt,
552 | .sig.cpp .k, .sig.cpp .kt {
553 | 	color: #0033B3;
554 | }
555 | 
556 | .sig.c   .m,
557 | .sig.cpp .m {
558 | 	color: #1750EB;
559 | }
560 | 
561 | .sig.c   .s, .sig.c   .sc,
562 | .sig.cpp .s, .sig.cpp .sc {
563 | 	color: #067D17;
564 | }
565 | 
566 | 
567 | /* -- other body styles ----------------------------------------------------- */
568 | 
569 | ol.arabic {
570 |     list-style: decimal;
571 | }
572 | 
573 | ol.loweralpha {
574 |     list-style: lower-alpha;
575 | }
576 | 
577 | ol.upperalpha {
578 |     list-style: upper-alpha;
579 | }
580 | 
581 | ol.lowerroman {
582 |     list-style: lower-roman;
583 | }
584 | 
585 | ol.upperroman {
586 |     list-style: upper-roman;
587 | }
588 | 
589 | :not(li) > ol > li:first-child > :first-child,
590 | :not(li) > ul > li:first-child > :first-child {
591 |     margin-top: 0px;
592 | }
593 | 
594 | :not(li) > ol > li:last-child > :last-child,
595 | :not(li) > ul > li:last-child > :last-child {
596 |     margin-bottom: 0px;
597 | }
598 | 
599 | ol.simple ol p,
600 | ol.simple ul p,
601 | ul.simple ol p,
602 | ul.simple ul p {
603 |     margin-top: 0;
604 | }
605 | 
606 | ol.simple > li:not(:first-child) > p,
607 | ul.simple > li:not(:first-child) > p {
608 |     margin-top: 0;
609 | }
610 | 
611 | ol.simple p,
612 | ul.simple p {
613 |     margin-bottom: 0;
614 | }
615 | 
616 | aside.footnote > span,
617 | div.citation > span {
618 |     float: left;
619 | }
620 | aside.footnote > span:last-of-type,
621 | div.citation > span:last-of-type {
622 |   padding-right: 0.5em;
623 | }
624 | aside.footnote > p {
625 |   margin-left: 2em;
626 | }
627 | div.citation > p {
628 |   margin-left: 4em;
629 | }
630 | aside.footnote > p:last-of-type,
631 | div.citation > p:last-of-type {
632 |     margin-bottom: 0em;
633 | }
634 | aside.footnote > p:last-of-type:after,
635 | div.citation > p:last-of-type:after {
636 |     content: "";
637 |     clear: both;
638 | }
639 | 
640 | dl.field-list {
641 |     display: grid;
642 |     grid-template-columns: fit-content(30%) auto;
643 | }
644 | 
645 | dl.field-list > dt {
646 |     font-weight: bold;
647 |     word-break: break-word;
648 |     padding-left: 0.5em;
649 |     padding-right: 5px;
650 | }
651 | 
652 | dl.field-list > dd {
653 |     padding-left: 0.5em;
654 |     margin-top: 0em;
655 |     margin-left: 0em;
656 |     margin-bottom: 0em;
657 | }
658 | 
659 | dl {
660 |     margin-bottom: 15px;
661 | }
662 | 
663 | dd > :first-child {
664 |     margin-top: 0px;
665 | }
666 | 
667 | dd ul, dd table {
668 |     margin-bottom: 10px;
669 | }
670 | 
671 | dd {
672 |     margin-top: 3px;
673 |     margin-bottom: 10px;
674 |     margin-left: 30px;
675 | }
676 | 
677 | .sig dd {
678 |     margin-top: 0px;
679 |     margin-bottom: 0px;
680 | }
681 | 
682 | .sig dl {
683 |     margin-top: 0px;
684 |     margin-bottom: 0px;
685 | }
686 | 
687 | dl > dd:last-child,
688 | dl > dd:last-child > :last-child {
689 |     margin-bottom: 0;
690 | }
691 | 
692 | dt:target, span.highlighted {
693 |     background-color: #fbe54e;
694 | }
695 | 
696 | rect.highlighted {
697 |     fill: #fbe54e;
698 | }
699 | 
700 | dl.glossary dt {
701 |     font-weight: bold;
702 |     font-size: 1.1em;
703 | }
704 | 
705 | .versionmodified {
706 |     font-style: italic;
707 | }
708 | 
709 | .system-message {
710 |     background-color: #fda;
711 |     padding: 5px;
712 |     border: 3px solid red;
713 | }
714 | 
715 | .footnote:target  {
716 |     background-color: #ffa;
717 | }
718 | 
719 | .line-block {
720 |     display: block;
721 |     margin-top: 1em;
722 |     margin-bottom: 1em;
723 | }
724 | 
725 | .line-block .line-block {
726 |     margin-top: 0;
727 |     margin-bottom: 0;
728 |     margin-left: 1.5em;
729 | }
730 | 
731 | .guilabel, .menuselection {
732 |     font-family: sans-serif;
733 | }
734 | 
735 | .accelerator {
736 |     text-decoration: underline;
737 | }
738 | 
739 | .classifier {
740 |     font-style: oblique;
741 | }
742 | 
743 | .classifier:before {
744 |     font-style: normal;
745 |     margin: 0 0.5em;
746 |     content: ":";
747 |     display: inline-block;
748 | }
749 | 
750 | abbr, acronym {
751 |     border-bottom: dotted 1px;
752 |     cursor: help;
753 | }
754 | 
755 | .translated {
756 |     background-color: rgba(207, 255, 207, 0.2)
757 | }
758 | 
759 | .untranslated {
760 |     background-color: rgba(255, 207, 207, 0.2)
761 | }
762 | 
763 | /* -- code displays --------------------------------------------------------- */
764 | 
765 | pre {
766 |     overflow: auto;
767 |     overflow-y: hidden;  /* fixes display issues on Chrome browsers */
768 | }
769 | 
770 | pre, div[class*="highlight-"] {
771 |     clear: both;
772 | }
773 | 
774 | span.pre {
775 |     -moz-hyphens: none;
776 |     -ms-hyphens: none;
777 |     -webkit-hyphens: none;
778 |     hyphens: none;
779 |     white-space: nowrap;
780 | }
781 | 
782 | div[class*="highlight-"] {
783 |     margin: 1em 0;
784 | }
785 | 
786 | td.linenos pre {
787 |     border: 0;
788 |     background-color: transparent;
789 |     color: #aaa;
790 | }
791 | 
792 | table.highlighttable {
793 |     display: block;
794 | }
795 | 
796 | table.highlighttable tbody {
797 |     display: block;
798 | }
799 | 
800 | table.highlighttable tr {
801 |     display: flex;
802 | }
803 | 
804 | table.highlighttable td {
805 |     margin: 0;
806 |     padding: 0;
807 | }
808 | 
809 | table.highlighttable td.linenos {
810 |     padding-right: 0.5em;
811 | }
812 | 
813 | table.highlighttable td.code {
814 |     flex: 1;
815 |     overflow: hidden;
816 | }
817 | 
818 | .highlight .hll {
819 |     display: block;
820 | }
821 | 
822 | div.highlight pre,
823 | table.highlighttable pre {
824 |     margin: 0;
825 | }
826 | 
827 | div.code-block-caption + div {
828 |     margin-top: 0;
829 | }
830 | 
831 | div.code-block-caption {
832 |     margin-top: 1em;
833 |     padding: 2px 5px;
834 |     font-size: small;
835 | }
836 | 
837 | div.code-block-caption code {
838 |     background-color: transparent;
839 | }
840 | 
841 | table.highlighttable td.linenos,
842 | span.linenos,
843 | div.highlight span.gp {  /* gp: Generic.Prompt */
844 |   user-select: none;
845 |   -webkit-user-select: text; /* Safari fallback only */
846 |   -webkit-user-select: none; /* Chrome/Safari */
847 |   -moz-user-select: none; /* Firefox */
848 |   -ms-user-select: none; /* IE10+ */
849 | }
850 | 
851 | div.code-block-caption span.caption-number {
852 |     padding: 0.1em 0.3em;
853 |     font-style: italic;
854 | }
855 | 
856 | div.code-block-caption span.caption-text {
857 | }
858 | 
859 | div.literal-block-wrapper {
860 |     margin: 1em 0;
861 | }
862 | 
863 | code.xref, a code {
864 |     background-color: transparent;
865 |     font-weight: bold;
866 | }
867 | 
868 | h1 code, h2 code, h3 code, h4 code, h5 code, h6 code {
869 |     background-color: transparent;
870 | }
871 | 
872 | .viewcode-link {
873 |     float: right;
874 | }
875 | 
876 | .viewcode-back {
877 |     float: right;
878 |     font-family: sans-serif;
879 | }
880 | 
881 | div.viewcode-block:target {
882 |     margin: -1px -10px;
883 |     padding: 0 10px;
884 | }
885 | 
886 | /* -- math display ---------------------------------------------------------- */
887 | 
888 | img.math {
889 |     vertical-align: middle;
890 | }
891 | 
892 | div.body div.math p {
893 |     text-align: center;
894 | }
895 | 
896 | span.eqno {
897 |     float: right;
898 | }
899 | 
900 | span.eqno a.headerlink {
901 |     position: absolute;
902 |     z-index: 1;
903 | }
904 | 
905 | div.math:hover a.headerlink {
906 |     visibility: visible;
907 | }
908 | 
909 | /* -- printout stylesheet --------------------------------------------------- */
910 | 
911 | @media print {
912 |     div.document,
913 |     div.documentwrapper,
914 |     div.bodywrapper {
915 |         margin: 0 !important;
916 |         width: 100%;
917 |     }
918 | 
919 |     div.sphinxsidebar,
920 |     div.related,
921 |     div.footer,
922 |     #top-link {
923 |         display: none;
924 |     }
925 | }
```

--------------------------------------------------------------------------------
/docs/_modules/mcp_server_webcrawl/models/sites.html:
--------------------------------------------------------------------------------

```html
  1 | 
  2 | 
  3 | <!DOCTYPE html>
  4 | <html class="writer-html5" lang="en" data-content_root="../../../">
  5 | <head>
  6 |   <meta charset="utf-8" />
  7 |   <meta name="viewport" content="width=device-width, initial-scale=1.0" />
  8 |   <title>mcp_server_webcrawl.models.sites &mdash; mcp-server-webcrawl  documentation</title>
  9 |       <link rel="stylesheet" type="text/css" href="../../../_static/pygments.css?v=80d5e7a1" />
 10 |       <link rel="stylesheet" type="text/css" href="../../../_static/css/theme.css?v=e59714d7" />
 11 | 
 12 |   
 13 |       <script src="../../../_static/jquery.js?v=5d32c60e"></script>
 14 |       <script src="../../../_static/_sphinx_javascript_frameworks_compat.js?v=2cd50e6c"></script>
 15 |       <script src="../../../_static/documentation_options.js?v=5929fcd5"></script>
 16 |       <script src="../../../_static/doctools.js?v=888ff710"></script>
 17 |       <script src="../../../_static/sphinx_highlight.js?v=dc90522c"></script>
 18 |     <script src="../../../_static/js/theme.js"></script>
 19 |     <link rel="index" title="Index" href="../../../genindex.html" />
 20 |     <link rel="search" title="Search" href="../../../search.html" /> 
 21 | </head>
 22 | 
 23 | <body class="wy-body-for-nav"> 
 24 |   <div class="wy-grid-for-nav">
 25 |     <nav data-toggle="wy-nav-shift" class="wy-nav-side">
 26 |       <div class="wy-side-scroll">
 27 |         <div class="wy-side-nav-search" >
 28 | 
 29 |           
 30 |           
 31 |           <a href="../../../index.html" class="icon icon-home">
 32 |             mcp-server-webcrawl
 33 |           </a>
 34 | <div role="search">
 35 |   <form id="rtd-search-form" class="wy-form" action="../../../search.html" method="get">
 36 |     <input type="text" name="q" placeholder="Search docs" aria-label="Search docs" />
 37 |     <input type="hidden" name="check_keywords" value="yes" />
 38 |     <input type="hidden" name="area" value="default" />
 39 |   </form>
 40 | </div>
 41 |         </div><div class="wy-menu wy-menu-vertical" data-spy="affix" role="navigation" aria-label="Navigation menu">
 42 |               <p class="caption" role="heading"><span class="caption-text">Contents:</span></p>
 43 | <ul>
 44 | <li class="toctree-l1"><a class="reference internal" href="../../../installation.html">Installation</a></li>
 45 | <li class="toctree-l1"><a class="reference internal" href="../../../guides.html">Setup Guides</a></li>
 46 | <li class="toctree-l1"><a class="reference internal" href="../../../usage.html">Usage</a></li>
 47 | <li class="toctree-l1"><a class="reference internal" href="../../../prompts.html">Prompt Routines</a></li>
 48 | <li class="toctree-l1"><a class="reference internal" href="../../../interactive.html">Interactive Mode</a></li>
 49 | <li class="toctree-l1"><a class="reference internal" href="../../../modules.html">mcp_server_webcrawl</a></li>
 50 | </ul>
 51 | 
 52 |         </div>
 53 |       </div>
 54 |     </nav>
 55 | 
 56 |     <section data-toggle="wy-nav-shift" class="wy-nav-content-wrap"><nav class="wy-nav-top" aria-label="Mobile navigation menu" >
 57 |           <i data-toggle="wy-nav-top" class="fa fa-bars"></i>
 58 |           <a href="../../../index.html">mcp-server-webcrawl</a>
 59 |       </nav>
 60 | 
 61 |       <div class="wy-nav-content">
 62 |         <div class="rst-content">
 63 |           <div role="navigation" aria-label="Page navigation">
 64 |   <ul class="wy-breadcrumbs">
 65 |       <li><a href="../../../index.html" class="icon icon-home" aria-label="Home"></a></li>
 66 |           <li class="breadcrumb-item"><a href="../../index.html">Module code</a></li>
 67 |       <li class="breadcrumb-item active">mcp_server_webcrawl.models.sites</li>
 68 |       <li class="wy-breadcrumbs-aside">
 69 |       </li>
 70 |   </ul>
 71 |   <hr/>
 72 | </div>
 73 |           <div role="main" class="document" itemscope="itemscope" itemtype="http://schema.org/Article">
 74 |            <div itemprop="articleBody">
 75 |              
 76 |   <h1>Source code for mcp_server_webcrawl.models.sites</h1><div class="highlight"><pre>
 77 | <span></span><span class="kn">from</span> <span class="nn">datetime</span> <span class="kn">import</span> <span class="n">datetime</span>
 78 | <span class="kn">from</span> <span class="nn">typing</span> <span class="kn">import</span> <span class="n">Final</span>
 79 | <span class="kn">from</span> <span class="nn">pathlib</span> <span class="kn">import</span> <span class="n">Path</span>
 80 | <span class="kn">from</span> <span class="nn">enum</span> <span class="kn">import</span> <span class="n">Enum</span>
 81 | 
 82 | <span class="kn">from</span> <span class="nn">mcp_server_webcrawl.models.base</span> <span class="kn">import</span> <span class="n">BaseModel</span><span class="p">,</span> <span class="n">METADATA_VALUE_TYPE</span>
 83 | <span class="kn">from</span> <span class="nn">mcp_server_webcrawl.utils</span> <span class="kn">import</span> <span class="n">to_isoformat_zulu</span>
 84 | 
 85 | <div class="viewcode-block" id="SiteType">
 86 | <a class="viewcode-back" href="../../../mcp_server_webcrawl.models.html#mcp_server_webcrawl.models.sites.SiteType">[docs]</a>
 87 | <span class="k">class</span> <span class="nc">SiteType</span><span class="p">(</span><span class="n">Enum</span><span class="p">):</span>
 88 |     <span class="n">UNDEFINED</span> <span class="o">=</span> <span class="s2">&quot;undefined&quot;</span>
 89 |     <span class="n">CRAWLED_URL</span> <span class="o">=</span> <span class="s2">&quot;url&quot;</span>
 90 |     <span class="n">CRAWLED_LIST</span> <span class="o">=</span> <span class="s2">&quot;list&quot;</span></div>
 91 | 
 92 | 
 93 | <span class="n">SITES_TOOL_NAME</span><span class="p">:</span> <span class="n">Final</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="s2">&quot;webcrawl_sites&quot;</span>
 94 | <span class="n">SITES_FIELDS_BASE</span><span class="p">:</span> <span class="n">Final</span><span class="p">[</span><span class="nb">list</span><span class="p">[</span><span class="nb">str</span><span class="p">]]</span> <span class="o">=</span> <span class="p">[</span><span class="s2">&quot;id&quot;</span><span class="p">,</span> <span class="s2">&quot;name&quot;</span><span class="p">,</span> <span class="s2">&quot;type&quot;</span><span class="p">,</span> <span class="s2">&quot;urls&quot;</span><span class="p">]</span>
 95 | <span class="n">SITES_FIELDS_DEFAULT</span><span class="p">:</span> <span class="n">Final</span><span class="p">[</span><span class="nb">list</span><span class="p">[</span><span class="nb">str</span><span class="p">]]</span> <span class="o">=</span> <span class="n">SITES_FIELDS_BASE</span> <span class="o">+</span> <span class="p">[</span><span class="s2">&quot;created&quot;</span><span class="p">,</span> <span class="s2">&quot;modified&quot;</span><span class="p">]</span>
 96 | 
 97 | <div class="viewcode-block" id="SiteResult">
 98 | <a class="viewcode-back" href="../../../mcp_server_webcrawl.models.html#mcp_server_webcrawl.models.sites.SiteResult">[docs]</a>
 99 | <span class="k">class</span> <span class="nc">SiteResult</span><span class="p">(</span><span class="n">BaseModel</span><span class="p">):</span>
100 | <span class="w">    </span><span class="sd">&quot;&quot;&quot;</span>
101 | <span class="sd">    Represents a website or crawl directory result.</span>
102 | <span class="sd">    &quot;&quot;&quot;</span>
103 | 
104 | <div class="viewcode-block" id="SiteResult.__init__">
105 | <a class="viewcode-back" href="../../../mcp_server_webcrawl.models.html#mcp_server_webcrawl.models.sites.SiteResult.__init__">[docs]</a>
106 |     <span class="k">def</span> <span class="fm">__init__</span><span class="p">(</span>
107 |         <span class="bp">self</span><span class="p">,</span>
108 |         <span class="nb">id</span><span class="p">:</span> <span class="nb">int</span><span class="p">,</span>
109 |         <span class="n">name</span><span class="p">:</span> <span class="nb">str</span> <span class="o">|</span> <span class="kc">None</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
110 |         <span class="nb">type</span><span class="p">:</span> <span class="n">SiteType</span> <span class="o">=</span> <span class="n">SiteType</span><span class="o">.</span><span class="n">CRAWLED_URL</span><span class="p">,</span>
111 |         <span class="n">urls</span><span class="p">:</span> <span class="nb">list</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">|</span> <span class="kc">None</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
112 |         <span class="n">path</span><span class="p">:</span> <span class="n">Path</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
113 |         <span class="n">created</span><span class="p">:</span> <span class="n">datetime</span> <span class="o">|</span> <span class="kc">None</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
114 |         <span class="n">modified</span><span class="p">:</span> <span class="n">datetime</span> <span class="o">|</span> <span class="kc">None</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
115 |         <span class="n">robots</span><span class="p">:</span> <span class="nb">str</span> <span class="o">|</span> <span class="kc">None</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
116 |         <span class="n">metadata</span><span class="p">:</span> <span class="nb">dict</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="n">METADATA_VALUE_TYPE</span><span class="p">]</span> <span class="o">|</span> <span class="kc">None</span> <span class="o">=</span> <span class="kc">None</span>
117 |     <span class="p">):</span>
118 | <span class="w">        </span><span class="sd">&quot;&quot;&quot;</span>
119 | <span class="sd">        Initialize a SiteResult instance.</span>
120 | 
121 | <span class="sd">        Args:</span>
122 | <span class="sd">            id: site identifier</span>
123 | <span class="sd">            name: site name, either a URL or a custom job</span>
124 | <span class="sd">            urls: site URL(s), multiple for list type crawls</span>
125 | <span class="sd">            path: path to site data, different from datasrc</span>
126 | <span class="sd">            created: creation timestamp</span>
127 | <span class="sd">            modified: last modification timestamp</span>
128 | <span class="sd">            robots: robots.txt content</span>
129 | <span class="sd">            metadata: additional metadata for the site</span>
130 | <span class="sd">        &quot;&quot;&quot;</span>
131 |         <span class="bp">self</span><span class="o">.</span><span class="n">id</span> <span class="o">=</span> <span class="nb">id</span>
132 |         <span class="bp">self</span><span class="o">.</span><span class="n">name</span> <span class="o">=</span> <span class="n">name</span>
133 |         <span class="bp">self</span><span class="o">.</span><span class="n">type</span> <span class="o">=</span> <span class="nb">type</span>
134 |         <span class="bp">self</span><span class="o">.</span><span class="n">urls</span> <span class="o">=</span> <span class="n">urls</span>
135 |         <span class="bp">self</span><span class="o">.</span><span class="n">path</span> <span class="o">=</span> <span class="n">path</span>
136 |         <span class="bp">self</span><span class="o">.</span><span class="n">created</span> <span class="o">=</span> <span class="n">created</span>
137 |         <span class="bp">self</span><span class="o">.</span><span class="n">modified</span> <span class="o">=</span> <span class="n">modified</span>
138 |         <span class="bp">self</span><span class="o">.</span><span class="n">robots</span> <span class="o">=</span> <span class="n">robots</span>
139 |         <span class="bp">self</span><span class="o">.</span><span class="n">metadata</span> <span class="o">=</span> <span class="n">metadata</span> <span class="ow">or</span> <span class="p">{}</span></div>
140 | 
141 | 
142 | <div class="viewcode-block" id="SiteResult.to_dict">
143 | <a class="viewcode-back" href="../../../mcp_server_webcrawl.models.html#mcp_server_webcrawl.models.sites.SiteResult.to_dict">[docs]</a>
144 |     <span class="k">def</span> <span class="nf">to_dict</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="nb">dict</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="n">METADATA_VALUE_TYPE</span><span class="p">]:</span>
145 | <span class="w">        </span><span class="sd">&quot;&quot;&quot;</span>
146 | <span class="sd">        Convert the object to a dictionary suitable for JSON serialization.</span>
147 | <span class="sd">        &quot;&quot;&quot;</span>
148 |         <span class="n">result</span><span class="p">:</span> <span class="nb">dict</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="n">METADATA_VALUE_TYPE</span><span class="p">]</span> <span class="o">=</span> <span class="p">{</span>
149 |             <span class="s2">&quot;id&quot;</span><span class="p">:</span> <span class="bp">self</span><span class="o">.</span><span class="n">id</span><span class="p">,</span>
150 |             <span class="s2">&quot;name&quot;</span><span class="p">:</span> <span class="bp">self</span><span class="o">.</span><span class="n">name</span><span class="p">,</span>
151 |             <span class="s2">&quot;type&quot;</span><span class="p">:</span> <span class="bp">self</span><span class="o">.</span><span class="n">type</span><span class="o">.</span><span class="n">value</span><span class="p">,</span>
152 |             <span class="s2">&quot;urls&quot;</span><span class="p">:</span> <span class="bp">self</span><span class="o">.</span><span class="n">urls</span><span class="p">,</span>
153 |             <span class="s2">&quot;created&quot;</span><span class="p">:</span> <span class="n">to_isoformat_zulu</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">created</span><span class="p">)</span> <span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">created</span> <span class="k">else</span> <span class="kc">None</span><span class="p">,</span>
154 |             <span class="s2">&quot;modified&quot;</span><span class="p">:</span> <span class="n">to_isoformat_zulu</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">modified</span><span class="p">)</span> <span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">modified</span> <span class="k">else</span> <span class="kc">None</span><span class="p">,</span>
155 |             <span class="s2">&quot;metadata&quot;</span><span class="p">:</span> <span class="bp">self</span><span class="o">.</span><span class="n">metadata</span> <span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">metadata</span> <span class="k">else</span> <span class="kc">None</span><span class="p">,</span>
156 |         <span class="p">}</span>
157 | 
158 |         <span class="k">return</span> <span class="p">{</span><span class="n">k</span><span class="p">:</span> <span class="n">v</span> <span class="k">for</span> <span class="n">k</span><span class="p">,</span> <span class="n">v</span> <span class="ow">in</span> <span class="n">result</span><span class="o">.</span><span class="n">items</span><span class="p">()</span> <span class="k">if</span> <span class="n">v</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span> <span class="ow">and</span> <span class="ow">not</span> <span class="p">(</span><span class="n">k</span> <span class="o">==</span> <span class="s2">&quot;metadata&quot;</span> <span class="ow">and</span> <span class="n">v</span> <span class="o">==</span> <span class="p">{})}</span></div>
159 | </div>
160 | 
161 | </pre></div>
162 | 
163 |            </div>
164 |           </div>
165 |           <footer>
166 | 
167 |   <hr/>
168 | 
169 |   <div role="contentinfo">
170 |     <p>&#169; Copyright 2025, pragmar.</p>
171 |   </div>
172 | 
173 |   Built with <a href="https://www.sphinx-doc.org/">Sphinx</a> using a
174 |     <a href="https://github.com/readthedocs/sphinx_rtd_theme">theme</a>
175 |     provided by <a href="https://readthedocs.org">Read the Docs</a>.
176 |    
177 | 
178 | </footer>
179 |         </div>
180 |       </div>
181 |     </section>
182 |   </div>
183 |   <script>
184 |       jQuery(function () {
185 |           SphinxRtdTheme.Navigation.enable(true);
186 |       });
187 |   </script> 
188 | 
189 | </body>
190 | </html>
```

--------------------------------------------------------------------------------
/docs/_modules/mcp_server_webcrawl/crawlers/warc/tests.html:
--------------------------------------------------------------------------------

```html
  1 | 
  2 | 
  3 | <!DOCTYPE html>
  4 | <html class="writer-html5" lang="en" data-content_root="../../../../">
  5 | <head>
  6 |   <meta charset="utf-8" />
  7 |   <meta name="viewport" content="width=device-width, initial-scale=1.0" />
  8 |   <title>mcp_server_webcrawl.crawlers.warc.tests &mdash; mcp-server-webcrawl  documentation</title>
  9 |       <link rel="stylesheet" type="text/css" href="../../../../_static/pygments.css?v=80d5e7a1" />
 10 |       <link rel="stylesheet" type="text/css" href="../../../../_static/css/theme.css?v=e59714d7" />
 11 | 
 12 |   
 13 |       <script src="../../../../_static/jquery.js?v=5d32c60e"></script>
 14 |       <script src="../../../../_static/_sphinx_javascript_frameworks_compat.js?v=2cd50e6c"></script>
 15 |       <script src="../../../../_static/documentation_options.js?v=5929fcd5"></script>
 16 |       <script src="../../../../_static/doctools.js?v=888ff710"></script>
 17 |       <script src="../../../../_static/sphinx_highlight.js?v=dc90522c"></script>
 18 |     <script src="../../../../_static/js/theme.js"></script>
 19 |     <link rel="index" title="Index" href="../../../../genindex.html" />
 20 |     <link rel="search" title="Search" href="../../../../search.html" /> 
 21 | </head>
 22 | 
 23 | <body class="wy-body-for-nav"> 
 24 |   <div class="wy-grid-for-nav">
 25 |     <nav data-toggle="wy-nav-shift" class="wy-nav-side">
 26 |       <div class="wy-side-scroll">
 27 |         <div class="wy-side-nav-search" >
 28 | 
 29 |           
 30 |           
 31 |           <a href="../../../../index.html" class="icon icon-home">
 32 |             mcp-server-webcrawl
 33 |           </a>
 34 | <div role="search">
 35 |   <form id="rtd-search-form" class="wy-form" action="../../../../search.html" method="get">
 36 |     <input type="text" name="q" placeholder="Search docs" aria-label="Search docs" />
 37 |     <input type="hidden" name="check_keywords" value="yes" />
 38 |     <input type="hidden" name="area" value="default" />
 39 |   </form>
 40 | </div>
 41 |         </div><div class="wy-menu wy-menu-vertical" data-spy="affix" role="navigation" aria-label="Navigation menu">
 42 |               <p class="caption" role="heading"><span class="caption-text">Contents:</span></p>
 43 | <ul>
 44 | <li class="toctree-l1"><a class="reference internal" href="../../../../installation.html">Installation</a></li>
 45 | <li class="toctree-l1"><a class="reference internal" href="../../../../guides.html">Setup Guides</a></li>
 46 | <li class="toctree-l1"><a class="reference internal" href="../../../../usage.html">Usage</a></li>
 47 | <li class="toctree-l1"><a class="reference internal" href="../../../../prompts.html">Prompt Routines</a></li>
 48 | <li class="toctree-l1"><a class="reference internal" href="../../../../modules.html">mcp_server_webcrawl</a></li>
 49 | </ul>
 50 | 
 51 |         </div>
 52 |       </div>
 53 |     </nav>
 54 | 
 55 |     <section data-toggle="wy-nav-shift" class="wy-nav-content-wrap"><nav class="wy-nav-top" aria-label="Mobile navigation menu" >
 56 |           <i data-toggle="wy-nav-top" class="fa fa-bars"></i>
 57 |           <a href="../../../../index.html">mcp-server-webcrawl</a>
 58 |       </nav>
 59 | 
 60 |       <div class="wy-nav-content">
 61 |         <div class="rst-content">
 62 |           <div role="navigation" aria-label="Page navigation">
 63 |   <ul class="wy-breadcrumbs">
 64 |       <li><a href="../../../../index.html" class="icon icon-home" aria-label="Home"></a></li>
 65 |           <li class="breadcrumb-item"><a href="../../../index.html">Module code</a></li>
 66 |           <li class="breadcrumb-item"><a href="../../crawlers.html">mcp_server_webcrawl.crawlers</a></li>
 67 |       <li class="breadcrumb-item active">mcp_server_webcrawl.crawlers.warc.tests</li>
 68 |       <li class="wy-breadcrumbs-aside">
 69 |       </li>
 70 |   </ul>
 71 |   <hr/>
 72 | </div>
 73 |           <div role="main" class="document" itemscope="itemscope" itemtype="http://schema.org/Article">
 74 |            <div itemprop="articleBody">
 75 |              
 76 |   <h1>Source code for mcp_server_webcrawl.crawlers.warc.tests</h1><div class="highlight"><pre>
 77 | <span></span><span class="kn">from</span> <span class="nn">mcp_server_webcrawl.crawlers.warc.crawler</span> <span class="kn">import</span> <span class="n">WarcCrawler</span>
 78 | <span class="kn">from</span> <span class="nn">mcp_server_webcrawl.crawlers.warc.adapter</span> <span class="kn">import</span> <span class="n">WarcManager</span>
 79 | <span class="kn">from</span> <span class="nn">mcp_server_webcrawl.crawlers.base.tests</span> <span class="kn">import</span> <span class="n">BaseCrawlerTests</span>
 80 | <span class="kn">from</span> <span class="nn">mcp_server_webcrawl.crawlers</span> <span class="kn">import</span> <span class="n">get_fixture_directory</span>
 81 | <span class="kn">from</span> <span class="nn">mcp_server_webcrawl.utils.logger</span> <span class="kn">import</span> <span class="n">get_logger</span>
 82 | 
 83 | <span class="n">EXAMPLE_WARC_ID</span><span class="p">:</span> <span class="nb">int</span> <span class="o">=</span> <span class="n">WarcManager</span><span class="o">.</span><span class="n">string_to_id</span><span class="p">(</span><span class="s2">&quot;example.warc.gz&quot;</span><span class="p">)</span>
 84 | <span class="n">PRAGMAR_WARC_ID</span><span class="p">:</span> <span class="nb">int</span> <span class="o">=</span> <span class="n">WarcManager</span><span class="o">.</span><span class="n">string_to_id</span><span class="p">(</span><span class="s2">&quot;pragmar.warc.gz&quot;</span><span class="p">)</span>
 85 | 
 86 | <span class="n">logger</span> <span class="o">=</span> <span class="n">get_logger</span><span class="p">()</span>
 87 | 
 88 | <div class="viewcode-block" id="WarcTests">
 89 | <a class="viewcode-back" href="../../../../mcp_server_webcrawl.crawlers.warc.html#mcp_server_webcrawl.crawlers.warc.tests.WarcTests">[docs]</a>
 90 | <span class="k">class</span> <span class="nc">WarcTests</span><span class="p">(</span><span class="n">BaseCrawlerTests</span><span class="p">):</span>
 91 | <span class="w">    </span><span class="sd">&quot;&quot;&quot;</span>
 92 | <span class="sd">    Test suite for the WARC crawler implementation.</span>
 93 | <span class="sd">    Uses all wrapped test methods from BaseCrawlerTests.</span>
 94 | <span class="sd">    &quot;&quot;&quot;</span>
 95 | 
 96 | <div class="viewcode-block" id="WarcTests.setUp">
 97 | <a class="viewcode-back" href="../../../../mcp_server_webcrawl.crawlers.warc.html#mcp_server_webcrawl.crawlers.warc.tests.WarcTests.setUp">[docs]</a>
 98 |     <span class="k">def</span> <span class="nf">setUp</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
 99 | <span class="w">        </span><span class="sd">&quot;&quot;&quot;</span>
100 | <span class="sd">        Set up the test environment with fixture data.</span>
101 | <span class="sd">        &quot;&quot;&quot;</span>
102 |         <span class="nb">super</span><span class="p">()</span><span class="o">.</span><span class="n">setUp</span><span class="p">()</span>
103 |         <span class="bp">self</span><span class="o">.</span><span class="n">_datasrc</span> <span class="o">=</span> <span class="n">get_fixture_directory</span><span class="p">()</span> <span class="o">/</span> <span class="s2">&quot;warc&quot;</span></div>
104 | 
105 | 
106 | <div class="viewcode-block" id="WarcTests.test_warc_pulse">
107 | <a class="viewcode-back" href="../../../../mcp_server_webcrawl.crawlers.warc.html#mcp_server_webcrawl.crawlers.warc.tests.WarcTests.test_warc_pulse">[docs]</a>
108 |     <span class="k">def</span> <span class="nf">test_warc_pulse</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
109 | <span class="w">        </span><span class="sd">&quot;&quot;&quot;</span>
110 | <span class="sd">        Test basic crawler initialization.</span>
111 | <span class="sd">        &quot;&quot;&quot;</span>
112 |         <span class="n">crawler</span> <span class="o">=</span> <span class="n">WarcCrawler</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_datasrc</span><span class="p">)</span>
113 |         <span class="bp">self</span><span class="o">.</span><span class="n">assertIsNotNone</span><span class="p">(</span><span class="n">crawler</span><span class="p">)</span>
114 |         <span class="bp">self</span><span class="o">.</span><span class="n">assertTrue</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_datasrc</span><span class="o">.</span><span class="n">is_dir</span><span class="p">())</span></div>
115 | 
116 | 
117 | <div class="viewcode-block" id="WarcTests.test_warc_sites">
118 | <a class="viewcode-back" href="../../../../mcp_server_webcrawl.crawlers.warc.html#mcp_server_webcrawl.crawlers.warc.tests.WarcTests.test_warc_sites">[docs]</a>
119 |     <span class="k">def</span> <span class="nf">test_warc_sites</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
120 | <span class="w">        </span><span class="sd">&quot;&quot;&quot;</span>
121 | <span class="sd">        Test site retrieval API functionality.</span>
122 | <span class="sd">        &quot;&quot;&quot;</span>
123 |         <span class="n">crawler</span> <span class="o">=</span> <span class="n">WarcCrawler</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_datasrc</span><span class="p">)</span>
124 |         <span class="bp">self</span><span class="o">.</span><span class="n">run_pragmar_site_tests</span><span class="p">(</span><span class="n">crawler</span><span class="p">,</span> <span class="n">PRAGMAR_WARC_ID</span><span class="p">)</span></div>
125 | 
126 | 
127 | <div class="viewcode-block" id="WarcTests.test_warc_search">
128 | <a class="viewcode-back" href="../../../../mcp_server_webcrawl.crawlers.warc.html#mcp_server_webcrawl.crawlers.warc.tests.WarcTests.test_warc_search">[docs]</a>
129 |     <span class="k">def</span> <span class="nf">test_warc_search</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
130 | <span class="w">        </span><span class="sd">&quot;&quot;&quot;</span>
131 | <span class="sd">        Test boolean search functionality.</span>
132 | <span class="sd">        &quot;&quot;&quot;</span>
133 |         <span class="n">crawler</span> <span class="o">=</span> <span class="n">WarcCrawler</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_datasrc</span><span class="p">)</span>
134 |         <span class="bp">self</span><span class="o">.</span><span class="n">run_pragmar_search_tests</span><span class="p">(</span><span class="n">crawler</span><span class="p">,</span> <span class="n">PRAGMAR_WARC_ID</span><span class="p">)</span></div>
135 | 
136 | 
137 | <div class="viewcode-block" id="WarcTests.test_warc_resources">
138 | <a class="viewcode-back" href="../../../../mcp_server_webcrawl.crawlers.warc.html#mcp_server_webcrawl.crawlers.warc.tests.WarcTests.test_warc_resources">[docs]</a>
139 |     <span class="k">def</span> <span class="nf">test_warc_resources</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
140 | <span class="w">        </span><span class="sd">&quot;&quot;&quot;</span>
141 | <span class="sd">        Test resource retrieval API functionality with various parameters.</span>
142 | <span class="sd">        &quot;&quot;&quot;</span>
143 |         <span class="n">crawler</span> <span class="o">=</span> <span class="n">WarcCrawler</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_datasrc</span><span class="p">)</span>
144 |         <span class="bp">self</span><span class="o">.</span><span class="n">run_sites_resources_tests</span><span class="p">(</span><span class="n">crawler</span><span class="p">,</span> <span class="n">PRAGMAR_WARC_ID</span><span class="p">,</span> <span class="n">EXAMPLE_WARC_ID</span><span class="p">)</span></div>
145 | 
146 | 
147 |     <span class="c1"># the pragmar WARC fixture legitimately contains no images;</span>
148 |     <span class="c1"># this may be the default behavior of wget WARC generation (unconfirmed),</span>
149 |     <span class="c1"># so image handling is an untested blind spot for this crawler</span>
150 |     <span class="c1"># def test_warc_images(self):</span>
151 |     <span class="c1">#     &quot;&quot;&quot;</span>
152 |     <span class="c1">#     Test WARC image handling and thumbnails.</span>
153 |     <span class="c1">#     &quot;&quot;&quot;</span>
154 |     <span class="c1">#     crawler = WarcCrawler(self._datasrc)</span>
155 |     <span class="c1">#     self.run_pragmar_image_tests(crawler, PRAGMAR_WARC_ID)</span>
156 | 
157 | <div class="viewcode-block" id="WarcTests.test_warc_sorts">
158 | <a class="viewcode-back" href="../../../../mcp_server_webcrawl.crawlers.warc.html#mcp_server_webcrawl.crawlers.warc.tests.WarcTests.test_warc_sorts">[docs]</a>
159 |     <span class="k">def</span> <span class="nf">test_warc_sorts</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
160 | <span class="w">        </span><span class="sd">&quot;&quot;&quot;</span>
161 | <span class="sd">        Test random sort functionality using the &#39;?&#39; sort parameter.</span>
162 | <span class="sd">        &quot;&quot;&quot;</span>
163 |         <span class="n">crawler</span> <span class="o">=</span> <span class="n">WarcCrawler</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_datasrc</span><span class="p">)</span>
164 |         <span class="bp">self</span><span class="o">.</span><span class="n">run_pragmar_sort_tests</span><span class="p">(</span><span class="n">crawler</span><span class="p">,</span> <span class="n">PRAGMAR_WARC_ID</span><span class="p">)</span></div>
165 | 
166 | 
167 | <div class="viewcode-block" id="WarcTests.test_warc_content_parsing">
168 | <a class="viewcode-back" href="../../../../mcp_server_webcrawl.crawlers.warc.html#mcp_server_webcrawl.crawlers.warc.tests.WarcTests.test_warc_content_parsing">[docs]</a>
169 |     <span class="k">def</span> <span class="nf">test_warc_content_parsing</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
170 | <span class="w">        </span><span class="sd">&quot;&quot;&quot;</span>
171 | <span class="sd">        Test content type detection and parsing for WARC files.</span>
172 | <span class="sd">        &quot;&quot;&quot;</span>
173 |         <span class="n">crawler</span> <span class="o">=</span> <span class="n">WarcCrawler</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_datasrc</span><span class="p">)</span>
174 |         <span class="bp">self</span><span class="o">.</span><span class="n">run_pragmar_content_tests</span><span class="p">(</span><span class="n">crawler</span><span class="p">,</span> <span class="n">PRAGMAR_WARC_ID</span><span class="p">,</span> <span class="kc">True</span><span class="p">)</span></div>
175 | 
176 | 
177 | <div class="viewcode-block" id="WarcTests.test_report">
178 | <a class="viewcode-back" href="../../../../mcp_server_webcrawl.crawlers.warc.html#mcp_server_webcrawl.crawlers.warc.tests.WarcTests.test_report">[docs]</a>
179 |     <span class="k">def</span> <span class="nf">test_report</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
180 | <span class="w">        </span><span class="sd">&quot;&quot;&quot;</span>
181 | <span class="sd">        Run test report, save to data directory.</span>
182 | <span class="sd">        &quot;&quot;&quot;</span>
183 |         <span class="n">crawler</span> <span class="o">=</span> <span class="n">WarcCrawler</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_datasrc</span><span class="p">)</span>
184 |         <span class="n">logger</span><span class="o">.</span><span class="n">info</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">run_pragmar_report</span><span class="p">(</span><span class="n">crawler</span><span class="p">,</span> <span class="n">PRAGMAR_WARC_ID</span><span class="p">,</span> <span class="s2">&quot;WARC&quot;</span><span class="p">))</span></div>
185 | </div>
186 | 
187 | </pre></div>
188 | 
189 |            </div>
190 |           </div>
191 |           <footer>
192 | 
193 |   <hr/>
194 | 
195 |   <div role="contentinfo">
196 |     <p>&#169; Copyright 2025, pragmar.</p>
197 |   </div>
198 | 
199 |   Built with <a href="https://www.sphinx-doc.org/">Sphinx</a> using a
200 |     <a href="https://github.com/readthedocs/sphinx_rtd_theme">theme</a>
201 |     provided by <a href="https://readthedocs.org">Read the Docs</a>.
202 |    
203 | 
204 | </footer>
205 |         </div>
206 |       </div>
207 |     </section>
208 |   </div>
209 |   <script>
210 |       jQuery(function () {
211 |           SphinxRtdTheme.Navigation.enable(true);
212 |       });
213 |   </script> 
214 | 
215 | </body>
216 | </html>
```

--------------------------------------------------------------------------------
/docs/_modules/mcp_server_webcrawl/utils/logger.html:
--------------------------------------------------------------------------------

```html
  1 | 
  2 | 
  3 | <!DOCTYPE html>
  4 | <html class="writer-html5" lang="en" data-content_root="../../../">
  5 | <head>
  6 |   <meta charset="utf-8" />
  7 |   <meta name="viewport" content="width=device-width, initial-scale=1.0" />
  8 |   <title>mcp_server_webcrawl.utils.logger &mdash; mcp-server-webcrawl  documentation</title>
  9 |       <link rel="stylesheet" type="text/css" href="../../../_static/pygments.css?v=80d5e7a1" />
 10 |       <link rel="stylesheet" type="text/css" href="../../../_static/css/theme.css?v=e59714d7" />
 11 | 
 12 |   
 13 |       <script src="../../../_static/jquery.js?v=5d32c60e"></script>
 14 |       <script src="../../../_static/_sphinx_javascript_frameworks_compat.js?v=2cd50e6c"></script>
 15 |       <script src="../../../_static/documentation_options.js?v=5929fcd5"></script>
 16 |       <script src="../../../_static/doctools.js?v=888ff710"></script>
 17 |       <script src="../../../_static/sphinx_highlight.js?v=dc90522c"></script>
 18 |     <script src="../../../_static/js/theme.js"></script>
 19 |     <link rel="index" title="Index" href="../../../genindex.html" />
 20 |     <link rel="search" title="Search" href="../../../search.html" /> 
 21 | </head>
 22 | 
 23 | <body class="wy-body-for-nav"> 
 24 |   <div class="wy-grid-for-nav">
 25 |     <nav data-toggle="wy-nav-shift" class="wy-nav-side">
 26 |       <div class="wy-side-scroll">
 27 |         <div class="wy-side-nav-search" >
 28 | 
 29 |           
 30 |           
 31 |           <a href="../../../index.html" class="icon icon-home">
 32 |             mcp-server-webcrawl
 33 |           </a>
 34 | <div role="search">
 35 |   <form id="rtd-search-form" class="wy-form" action="../../../search.html" method="get">
 36 |     <input type="text" name="q" placeholder="Search docs" aria-label="Search docs" />
 37 |     <input type="hidden" name="check_keywords" value="yes" />
 38 |     <input type="hidden" name="area" value="default" />
 39 |   </form>
 40 | </div>
 41 |         </div><div class="wy-menu wy-menu-vertical" data-spy="affix" role="navigation" aria-label="Navigation menu">
 42 |               <p class="caption" role="heading"><span class="caption-text">Contents:</span></p>
 43 | <ul>
 44 | <li class="toctree-l1"><a class="reference internal" href="../../../installation.html">Installation</a></li>
 45 | <li class="toctree-l1"><a class="reference internal" href="../../../guides.html">Setup Guides</a></li>
 46 | <li class="toctree-l1"><a class="reference internal" href="../../../usage.html">Usage</a></li>
 47 | <li class="toctree-l1"><a class="reference internal" href="../../../prompts.html">Prompt Routines</a></li>
 48 | <li class="toctree-l1"><a class="reference internal" href="../../../interactive.html">Interactive Mode</a></li>
 49 | <li class="toctree-l1"><a class="reference internal" href="../../../modules.html">mcp_server_webcrawl</a></li>
 50 | </ul>
 51 | 
 52 |         </div>
 53 |       </div>
 54 |     </nav>
 55 | 
 56 |     <section data-toggle="wy-nav-shift" class="wy-nav-content-wrap"><nav class="wy-nav-top" aria-label="Mobile navigation menu" >
 57 |           <i data-toggle="wy-nav-top" class="fa fa-bars"></i>
 58 |           <a href="../../../index.html">mcp-server-webcrawl</a>
 59 |       </nav>
 60 | 
 61 |       <div class="wy-nav-content">
 62 |         <div class="rst-content">
 63 |           <div role="navigation" aria-label="Page navigation">
 64 |   <ul class="wy-breadcrumbs">
 65 |       <li><a href="../../../index.html" class="icon icon-home" aria-label="Home"></a></li>
 66 |           <li class="breadcrumb-item"><a href="../../index.html">Module code</a></li>
 67 |           <li class="breadcrumb-item"><a href="../utils.html">mcp_server_webcrawl.utils</a></li>
 68 |       <li class="breadcrumb-item active">mcp_server_webcrawl.utils.logger</li>
 69 |       <li class="wy-breadcrumbs-aside">
 70 |       </li>
 71 |   </ul>
 72 |   <hr/>
 73 | </div>
 74 |           <div role="main" class="document" itemscope="itemscope" itemtype="http://schema.org/Article">
 75 |            <div itemprop="articleBody">
 76 |              
 77 |   <h1>Source code for mcp_server_webcrawl.utils.logger</h1><div class="highlight"><pre>
 78 | <span></span><span class="kn">import</span> <span class="nn">logging</span>
 79 | <span class="kn">from</span> <span class="nn">pathlib</span> <span class="kn">import</span> <span class="n">Path</span>
 80 | <span class="kn">from</span> <span class="nn">typing</span> <span class="kn">import</span> <span class="n">Final</span>
 81 | 
 82 | <span class="kn">import</span> <span class="nn">mcp_server_webcrawl.settings</span> <span class="k">as</span> <span class="nn">settings</span>
 83 | <span class="kn">from</span> <span class="nn">mcp_server_webcrawl.settings</span> <span class="kn">import</span> <span class="n">DEBUG</span><span class="p">,</span> <span class="n">DATA_DIRECTORY</span>
 84 | 
 85 | <span class="n">DEFAULT_LOG_KEY</span><span class="p">:</span> <span class="n">Final</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="s2">&quot;mcp-server-webcrawl&quot;</span>
 86 | <span class="n">DEFAULT_LOG_PATH</span><span class="p">:</span> <span class="n">Final</span><span class="p">[</span><span class="n">Path</span><span class="p">]</span> <span class="o">=</span> <span class="n">DATA_DIRECTORY</span> <span class="o">/</span> <span class="s2">&quot;mcp-server-webcrawl.log&quot;</span>
 87 | <span class="n">DEFAULT_LOG_LEVEL</span><span class="p">:</span> <span class="n">Final</span><span class="p">[</span><span class="nb">int</span><span class="p">]</span> <span class="o">=</span> <span class="n">logging</span><span class="o">.</span><span class="n">WARNING</span>
 88 | 
 89 | <div class="viewcode-block" id="get_logger_configuration">
 90 | <a class="viewcode-back" href="../../../mcp_server_webcrawl.utils.html#mcp_server_webcrawl.utils.logger.get_logger_configuration">[docs]</a>
 91 | <span class="k">def</span> <span class="nf">get_logger_configuration</span><span class="p">()</span> <span class="o">-&gt;</span> <span class="nb">tuple</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="n">Path</span><span class="p">,</span> <span class="nb">int</span><span class="p">]:</span>
 92 | <span class="w">    </span><span class="sd">&quot;&quot;&quot;</span>
 93 | <span class="sd">    Get log name, path, and level (in that order)</span>
 94 | 
 95 | <span class="sd">    Returns:</span>
 96 | <span class="sd">        tuple[str, Path, int]: A tuple containing name, path, and level</span>
 97 | <span class="sd">    &quot;&quot;&quot;</span>
 98 | 
 99 |     <span class="n">log_level</span><span class="p">:</span> <span class="nb">int</span> <span class="o">=</span> <span class="n">logging</span><span class="o">.</span><span class="n">DEBUG</span> <span class="k">if</span> <span class="n">DEBUG</span> <span class="k">else</span> <span class="nb">getattr</span><span class="p">(</span><span class="n">settings</span><span class="p">,</span> <span class="s2">&quot;LOG_LEVEL&quot;</span><span class="p">,</span> <span class="n">DEFAULT_LOG_LEVEL</span><span class="p">)</span>
100 |     <span class="n">log_path</span><span class="p">:</span> <span class="n">Path</span> <span class="o">=</span> <span class="nb">getattr</span><span class="p">(</span><span class="n">settings</span><span class="p">,</span> <span class="s2">&quot;LOG_PATH&quot;</span><span class="p">,</span> <span class="n">DEFAULT_LOG_PATH</span><span class="p">)</span>
101 | 
102 |     <span class="k">return</span> <span class="p">(</span><span class="n">DEFAULT_LOG_KEY</span><span class="p">,</span> <span class="n">log_path</span><span class="p">,</span> <span class="n">log_level</span><span class="p">)</span></div>
103 | 
104 | 
105 | <div class="viewcode-block" id="get_logger">
106 | <a class="viewcode-back" href="../../../mcp_server_webcrawl.utils.html#mcp_server_webcrawl.utils.logger.get_logger">[docs]</a>
107 | <span class="k">def</span> <span class="nf">get_logger</span><span class="p">()</span> <span class="o">-&gt;</span> <span class="n">logging</span><span class="o">.</span><span class="n">Logger</span><span class="p">:</span>
108 | <span class="w">    </span><span class="sd">&quot;&quot;&quot;</span>
109 | <span class="sd">    Get logger, usually in order to write to it</span>
110 | 
111 | <span class="sd">    Returns:</span>
112 | <span class="sd">        Logger: a writable logging object (error/warn/info/debug)</span>
113 | <span class="sd">    &quot;&quot;&quot;</span>
114 | 
115 |     <span class="p">(</span><span class="n">log_name</span><span class="p">,</span> <span class="n">_</span><span class="p">,</span> <span class="n">_</span><span class="p">)</span> <span class="o">=</span> <span class="n">get_logger_configuration</span><span class="p">()</span>
116 |     <span class="k">return</span> <span class="n">logging</span><span class="o">.</span><span class="n">getLogger</span><span class="p">(</span><span class="n">log_name</span><span class="p">)</span></div>
117 | 
118 | 
119 | <div class="viewcode-block" id="initialize_logger">
120 | <a class="viewcode-back" href="../../../mcp_server_webcrawl.utils.html#mcp_server_webcrawl.utils.logger.initialize_logger">[docs]</a>
121 | <span class="k">def</span> <span class="nf">initialize_logger</span><span class="p">()</span> <span class="o">-&gt;</span> <span class="kc">None</span><span class="p">:</span>
122 | <span class="w">    </span><span class="sd">&quot;&quot;&quot;</span>
123 | <span class="sd">    Validate and set up logger for writing</span>
124 | 
125 | <span class="sd">    Returns:</span>
126 | <span class="sd">        None</span>
127 | <span class="sd">    &quot;&quot;&quot;</span>
128 | 
129 |     <span class="p">(</span><span class="n">log_name</span><span class="p">,</span> <span class="n">log_path</span><span class="p">,</span> <span class="n">log_level</span><span class="p">)</span> <span class="o">=</span> <span class="n">get_logger_configuration</span><span class="p">()</span>
130 |     <span class="k">if</span> <span class="n">log_level</span> <span class="o">==</span> <span class="n">logging</span><span class="o">.</span><span class="n">NOTSET</span><span class="p">:</span>
131 |         <span class="c1"># don&#39;t set up anything, named logging will effectively evaporate</span>
132 |         <span class="k">return</span>
133 | 
134 |     <span class="k">assert</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">log_level</span><span class="p">,</span> <span class="nb">int</span><span class="p">)</span> <span class="ow">and</span> <span class="n">log_level</span> <span class="o">!=</span> <span class="mi">0</span><span class="p">,</span> <span class="s2">&quot;LOG_LEVEL must be set&quot;</span>
135 |     <span class="k">assert</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">log_path</span><span class="p">,</span> <span class="n">Path</span><span class="p">),</span> <span class="s2">&quot;LOG_PATH must be a Path object&quot;</span>
136 |     <span class="k">assert</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">log_name</span><span class="p">,</span> <span class="nb">str</span><span class="p">)</span> <span class="ow">and</span> <span class="n">log_name</span><span class="o">.</span><span class="n">strip</span><span class="p">()</span> <span class="o">!=</span> <span class="s2">&quot;&quot;</span><span class="p">,</span> <span class="s2">&quot;LOG_NAME must be a non-empty string&quot;</span>
137 |     <span class="k">assert</span> <span class="nb">all</span><span class="p">(</span><span class="n">c</span><span class="o">.</span><span class="n">isalpha</span><span class="p">()</span> <span class="ow">or</span> <span class="n">c</span> <span class="ow">in</span> <span class="s2">&quot;-_&quot;</span> <span class="k">for</span> <span class="n">c</span> <span class="ow">in</span> <span class="n">log_name</span><span class="p">),</span> <span class="s2">&quot;LOG_NAME must contain only A-Z, a-z, hyphens, and underscores&quot;</span>
138 | 
139 |     <span class="c1"># handle custom log paths differently, don&#39;t generate directories</span>
140 |     <span class="k">if</span> <span class="s2">&quot;.mcp_server_webcrawl&quot;</span> <span class="ow">in</span> <span class="nb">str</span><span class="p">(</span><span class="n">log_path</span><span class="p">):</span>
141 |         <span class="n">log_path</span><span class="o">.</span><span class="n">parent</span><span class="o">.</span><span class="n">mkdir</span><span class="p">(</span><span class="n">parents</span><span class="o">=</span><span class="kc">True</span><span class="p">,</span> <span class="n">exist_ok</span><span class="o">=</span><span class="kc">True</span><span class="p">)</span>
142 |     <span class="k">else</span><span class="p">:</span>
143 |         <span class="k">assert</span> <span class="n">log_path</span><span class="o">.</span><span class="n">parent</span><span class="o">.</span><span class="n">exists</span><span class="p">()</span> <span class="ow">and</span> <span class="n">log_path</span><span class="o">.</span><span class="n">parent</span><span class="o">.</span><span class="n">is_dir</span><span class="p">(),</span> \
144 |             <span class="sa">f</span><span class="s2">&quot;Custom parent directory `</span><span class="si">{</span><span class="n">log_path</span><span class="o">.</span><span class="n">parent</span><span class="si">}</span><span class="s2">` does not exist or is not a directory&quot;</span>
145 | 
146 |     <span class="n">logging</span><span class="o">.</span><span class="n">basicConfig</span><span class="p">(</span><span class="n">filename</span><span class="o">=</span><span class="nb">str</span><span class="p">(</span><span class="n">log_path</span><span class="p">),</span> <span class="n">filemode</span><span class="o">=</span><span class="s2">&quot;w&quot;</span><span class="p">,</span> <span class="n">level</span><span class="o">=</span><span class="n">log_level</span><span class="p">,</span>
147 |         <span class="nb">format</span><span class="o">=</span><span class="s2">&quot;</span><span class="si">%(asctime)s</span><span class="s2"> - </span><span class="si">%(name)s</span><span class="s2"> - </span><span class="si">%(levelname)s</span><span class="s2"> - </span><span class="si">%(message)s</span><span class="s2">&quot;</span><span class="p">,</span>
148 |         <span class="n">datefmt</span><span class="o">=</span><span class="s2">&quot;%Y-%m-</span><span class="si">%d</span><span class="s2"> %H:%M:%S&quot;</span><span class="p">,</span> <span class="n">encoding</span><span class="o">=</span><span class="s2">&quot;utf-8&quot;</span><span class="p">)</span>
149 | 
150 |     <span class="n">logger</span><span class="p">:</span> <span class="n">logging</span><span class="o">.</span><span class="n">Logger</span> <span class="o">=</span> <span class="n">logging</span><span class="o">.</span><span class="n">getLogger</span><span class="p">(</span><span class="n">log_name</span><span class="p">)</span>
151 | 
152 |     <span class="c1"># logging was configured only a few operations ago; a concurrent logger might not be ready yet</span>
153 |     <span class="k">if</span> <span class="n">log_level</span> <span class="o">&lt;=</span> <span class="n">logging</span><span class="o">.</span><span class="n">INFO</span><span class="p">:</span>
154 |         <span class="n">logger</span><span class="o">.</span><span class="n">info</span><span class="p">(</span><span class="s2">&quot;🖥️ starting webcrawl MCP server&quot;</span><span class="p">)</span>
155 |         <span class="n">log_extra</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="s2">&quot;(Debug is True)&quot;</span> <span class="k">if</span> <span class="n">DEBUG</span> <span class="k">else</span> <span class="s2">&quot;&quot;</span>
156 |         <span class="n">logger</span><span class="o">.</span><span class="n">info</span><span class="p">(</span><span class="sa">f</span><span class="s2">&quot;log level set to </span><span class="si">{</span><span class="n">log_level</span><span class="si">}</span><span class="s2"> </span><span class="si">{</span><span class="n">log_extra</span><span class="si">}</span><span class="s2">&quot;</span><span class="p">)</span></div>
157 | 
158 | </pre></div>
159 | 
160 |            </div>
161 |           </div>
162 |           <footer>
163 | 
164 |   <hr/>
165 | 
166 |   <div role="contentinfo">
167 |     <p>&#169; Copyright 2025, pragmar.</p>
168 |   </div>
169 | 
170 |   Built with <a href="https://www.sphinx-doc.org/">Sphinx</a> using a
171 |     <a href="https://github.com/readthedocs/sphinx_rtd_theme">theme</a>
172 |     provided by <a href="https://readthedocs.org">Read the Docs</a>.
173 |    
174 | 
175 | </footer>
176 |         </div>
177 |       </div>
178 |     </section>
179 |   </div>
180 |   <script>
181 |       jQuery(function () {
182 |           SphinxRtdTheme.Navigation.enable(true);
183 |       });
184 |   </script> 
185 | 
186 | </body>
187 | </html>
```

--------------------------------------------------------------------------------
/docs/_modules/mcp_server_webcrawl/utils/querycache.html:
--------------------------------------------------------------------------------

```html
  1 | 
  2 | 
  3 | <!DOCTYPE html>
  4 | <html class="writer-html5" lang="en" data-content_root="../../../">
  5 | <head>
  6 |   <meta charset="utf-8" />
  7 |   <meta name="viewport" content="width=device-width, initial-scale=1.0" />
  8 |   <title>mcp_server_webcrawl.utils.querycache &mdash; mcp-server-webcrawl  documentation</title>
  9 |       <link rel="stylesheet" type="text/css" href="../../../_static/pygments.css?v=80d5e7a1" />
 10 |       <link rel="stylesheet" type="text/css" href="../../../_static/css/theme.css?v=e59714d7" />
 11 | 
 12 |   
 13 |       <script src="../../../_static/jquery.js?v=5d32c60e"></script>
 14 |       <script src="../../../_static/_sphinx_javascript_frameworks_compat.js?v=2cd50e6c"></script>
 15 |       <script src="../../../_static/documentation_options.js?v=5929fcd5"></script>
 16 |       <script src="../../../_static/doctools.js?v=888ff710"></script>
 17 |       <script src="../../../_static/sphinx_highlight.js?v=dc90522c"></script>
 18 |     <script src="../../../_static/js/theme.js"></script>
 19 |     <link rel="index" title="Index" href="../../../genindex.html" />
 20 |     <link rel="search" title="Search" href="../../../search.html" /> 
 21 | </head>
 22 | 
 23 | <body class="wy-body-for-nav"> 
 24 |   <div class="wy-grid-for-nav">
 25 |     <nav data-toggle="wy-nav-shift" class="wy-nav-side">
 26 |       <div class="wy-side-scroll">
 27 |         <div class="wy-side-nav-search" >
 28 | 
 29 |           
 30 |           
 31 |           <a href="../../../index.html" class="icon icon-home">
 32 |             mcp-server-webcrawl
 33 |           </a>
 34 | <div role="search">
 35 |   <form id="rtd-search-form" class="wy-form" action="../../../search.html" method="get">
 36 |     <input type="text" name="q" placeholder="Search docs" aria-label="Search docs" />
 37 |     <input type="hidden" name="check_keywords" value="yes" />
 38 |     <input type="hidden" name="area" value="default" />
 39 |   </form>
 40 | </div>
 41 |         </div><div class="wy-menu wy-menu-vertical" data-spy="affix" role="navigation" aria-label="Navigation menu">
 42 |               <p class="caption" role="heading"><span class="caption-text">Contents:</span></p>
 43 | <ul>
 44 | <li class="toctree-l1"><a class="reference internal" href="../../../installation.html">Installation</a></li>
 45 | <li class="toctree-l1"><a class="reference internal" href="../../../guides.html">Setup Guides</a></li>
 46 | <li class="toctree-l1"><a class="reference internal" href="../../../usage.html">Usage</a></li>
 47 | <li class="toctree-l1"><a class="reference internal" href="../../../modules.html">mcp_server_webcrawl</a></li>
 48 | </ul>
 49 | 
 50 |         </div>
 51 |       </div>
 52 |     </nav>
 53 | 
 54 |     <section data-toggle="wy-nav-shift" class="wy-nav-content-wrap"><nav class="wy-nav-top" aria-label="Mobile navigation menu" >
 55 |           <i data-toggle="wy-nav-top" class="fa fa-bars"></i>
 56 |           <a href="../../../index.html">mcp-server-webcrawl</a>
 57 |       </nav>
 58 | 
 59 |       <div class="wy-nav-content">
 60 |         <div class="rst-content">
 61 |           <div role="navigation" aria-label="Page navigation">
 62 |   <ul class="wy-breadcrumbs">
 63 |       <li><a href="../../../index.html" class="icon icon-home" aria-label="Home"></a></li>
 64 |           <li class="breadcrumb-item"><a href="../../index.html">Module code</a></li>
 65 |           <li class="breadcrumb-item"><a href="../utils.html">mcp_server_webcrawl.utils</a></li>
 66 |       <li class="breadcrumb-item active">mcp_server_webcrawl.utils.querycache</li>
 67 |       <li class="wy-breadcrumbs-aside">
 68 |       </li>
 69 |   </ul>
 70 |   <hr/>
 71 | </div>
 72 |           <div role="main" class="document" itemscope="itemscope" itemtype="http://schema.org/Article">
 73 |            <div itemprop="articleBody">
 74 |              
 75 |   <h1>Source code for mcp_server_webcrawl.utils.querycache</h1><div class="highlight"><pre>
 76 | <span></span><span class="kn">import</span> <span class="nn">hashlib</span>
 77 | <span class="kn">import</span> <span class="nn">time</span>
 78 | 
 79 | <div class="viewcode-block" id="QueryCountCache">
 80 | <a class="viewcode-back" href="../../../mcp_server_webcrawl.utils.html#mcp_server_webcrawl.utils.querycache.QueryCountCache">[docs]</a>
 81 | <span class="k">class</span> <span class="nc">QueryCountCache</span><span class="p">:</span>
 82 | <span class="w">    </span><span class="sd">&quot;&quot;&quot;</span>
 83 | <span class="sd">    A cache for storing total count results from database queries.</span>
 84 | <span class="sd">    Only caches the count integer values, as these are reusable and light.</span>
 85 | <span class="sd">    &quot;&quot;&quot;</span>
 86 | <div class="viewcode-block" id="QueryCountCache.__init__">
 87 | <a class="viewcode-back" href="../../../mcp_server_webcrawl.utils.html#mcp_server_webcrawl.utils.querycache.QueryCountCache.__init__">[docs]</a>
 88 |     <span class="k">def</span> <span class="fm">__init__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="nb">max</span><span class="p">:</span> <span class="nb">int</span> <span class="o">=</span> <span class="mi">250</span><span class="p">,</span> <span class="n">ttl</span><span class="p">:</span> <span class="nb">int</span> <span class="o">=</span> <span class="mi">900</span><span class="p">):</span>
 89 | <span class="w">        </span><span class="sd">&quot;&quot;&quot;</span>
 90 | <span class="sd">        Initialize the query count cache.</span>
 91 | 
 92 | <span class="sd">        Parameters:</span>
 93 | <span class="sd">            max: Maximum number of entries to store in the cache</span>
 94 | <span class="sd">            ttl: Time-to-live for cache entries in seconds</span>
 95 | <span class="sd">        &quot;&quot;&quot;</span>
 96 |         <span class="bp">self</span><span class="o">.</span><span class="n">_cache</span><span class="p">:</span> <span class="nb">dict</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="nb">int</span><span class="p">]</span> <span class="o">=</span> <span class="p">{}</span>
 97 |         <span class="bp">self</span><span class="o">.</span><span class="n">_timestamps</span><span class="p">:</span> <span class="nb">dict</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="nb">float</span><span class="p">]</span> <span class="o">=</span> <span class="p">{}</span>
 98 |         <span class="bp">self</span><span class="o">.</span><span class="n">_max</span> <span class="o">=</span> <span class="nb">max</span>
 99 |         <span class="bp">self</span><span class="o">.</span><span class="n">_ttl</span> <span class="o">=</span> <span class="n">ttl</span></div>
100 | 
101 | 
102 |     <span class="k">def</span> <span class="nf">_hash_query</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">statement</span><span class="p">:</span> <span class="nb">str</span><span class="p">,</span> <span class="n">params</span><span class="p">:</span> <span class="nb">dict</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="nb">int</span> <span class="o">|</span> <span class="nb">str</span><span class="p">])</span> <span class="o">-&gt;</span> <span class="nb">str</span><span class="p">:</span>
103 | <span class="w">        </span><span class="sd">&quot;&quot;&quot;</span>
104 | <span class="sd">        Generate a hash key from a query statement and parameters.</span>
105 | 
106 | <span class="sd">        Parameters:</span>
107 | <span class="sd">            statement: SQL statement</span>
108 | <span class="sd">            params: Query parameters</span>
109 | 
110 | <span class="sd">        Returns:</span>
111 | <span class="sd">            MD5 hash of the combined query string</span>
112 | <span class="sd">        &quot;&quot;&quot;</span>
113 |         <span class="n">query</span> <span class="o">=</span> <span class="sa">f</span><span class="s2">&quot;</span><span class="si">{</span><span class="n">statement</span><span class="si">}</span><span class="s2">:</span><span class="si">{</span><span class="nb">str</span><span class="p">(</span><span class="n">params</span><span class="p">)</span><span class="si">}</span><span class="s2">&quot;</span>
114 |         <span class="k">return</span> <span class="n">hashlib</span><span class="o">.</span><span class="n">md5</span><span class="p">(</span><span class="n">query</span><span class="o">.</span><span class="n">encode</span><span class="p">())</span><span class="o">.</span><span class="n">hexdigest</span><span class="p">()</span>
115 | 
116 | <div class="viewcode-block" id="QueryCountCache.get">
117 | <a class="viewcode-back" href="../../../mcp_server_webcrawl.utils.html#mcp_server_webcrawl.utils.querycache.QueryCountCache.get">[docs]</a>
118 |     <span class="k">def</span> <span class="nf">get</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">statement</span><span class="p">:</span> <span class="nb">str</span><span class="p">,</span> <span class="n">params</span><span class="p">:</span> <span class="nb">dict</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="nb">int</span> <span class="o">|</span> <span class="nb">str</span><span class="p">])</span> <span class="o">-&gt;</span> <span class="nb">int</span> <span class="o">|</span> <span class="kc">None</span><span class="p">:</span>
119 | <span class="w">        </span><span class="sd">&quot;&quot;&quot;</span>
120 | <span class="sd">        Get a cached count result if available and not expired.</span>
121 | 
122 | <span class="sd">        Parameters:</span>
123 | <span class="sd">            statement: SQL statement</span>
124 | <span class="sd">            params: Query parameters</span>
125 | 
126 | <span class="sd">        Returns:</span>
127 | <span class="sd">            Cached count value or None if not found or expired</span>
128 | <span class="sd">        &quot;&quot;&quot;</span>
129 |         <span class="n">key</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_hash_query</span><span class="p">(</span><span class="n">statement</span><span class="p">,</span> <span class="n">params</span><span class="p">)</span>
130 |         <span class="k">if</span> <span class="n">key</span> <span class="ow">in</span> <span class="bp">self</span><span class="o">.</span><span class="n">_cache</span><span class="p">:</span>
131 |             <span class="k">if</span> <span class="n">time</span><span class="o">.</span><span class="n">time</span><span class="p">()</span> <span class="o">-</span> <span class="bp">self</span><span class="o">.</span><span class="n">_timestamps</span><span class="p">[</span><span class="n">key</span><span class="p">]</span> <span class="o">&gt;</span> <span class="bp">self</span><span class="o">.</span><span class="n">_ttl</span><span class="p">:</span>
132 |                 <span class="k">del</span> <span class="bp">self</span><span class="o">.</span><span class="n">_cache</span><span class="p">[</span><span class="n">key</span><span class="p">]</span>
133 |                 <span class="k">del</span> <span class="bp">self</span><span class="o">.</span><span class="n">_timestamps</span><span class="p">[</span><span class="n">key</span><span class="p">]</span>
134 |                 <span class="k">return</span> <span class="kc">None</span>
135 |             <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_cache</span><span class="p">[</span><span class="n">key</span><span class="p">]</span>
136 |         <span class="k">return</span> <span class="kc">None</span></div>
137 | 
138 | 
139 | <div class="viewcode-block" id="QueryCountCache.set">
140 | <a class="viewcode-back" href="../../../mcp_server_webcrawl.utils.html#mcp_server_webcrawl.utils.querycache.QueryCountCache.set">[docs]</a>
141 |     <span class="k">def</span> <span class="nf">set</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">statement</span><span class="p">:</span> <span class="nb">str</span><span class="p">,</span> <span class="n">params</span><span class="p">:</span> <span class="nb">dict</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="nb">int</span> <span class="o">|</span> <span class="nb">str</span><span class="p">],</span> <span class="n">count</span><span class="p">:</span> <span class="nb">int</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="kc">None</span><span class="p">:</span>
142 | <span class="w">        </span><span class="sd">&quot;&quot;&quot;</span>
143 | <span class="sd">        Store a count result in the cache.</span>
144 | 
145 | <span class="sd">        Parameters:</span>
146 | <span class="sd">            statement: SQL statement</span>
147 | <span class="sd">            params: Query parameters</span>
148 | <span class="sd">            count: Count value to cache</span>
149 | <span class="sd">        &quot;&quot;&quot;</span>
150 |         <span class="n">key</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_hash_query</span><span class="p">(</span><span class="n">statement</span><span class="p">,</span> <span class="n">params</span><span class="p">)</span>
151 | 
152 |         <span class="c1"># if cache is full, remove oldest entry</span>
153 |         <span class="k">if</span> <span class="nb">len</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_cache</span><span class="p">)</span> <span class="o">&gt;=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_max</span> <span class="ow">and</span> <span class="n">key</span> <span class="ow">not</span> <span class="ow">in</span> <span class="bp">self</span><span class="o">.</span><span class="n">_cache</span><span class="p">:</span>
154 |             <span class="n">oldest_key</span> <span class="o">=</span> <span class="nb">min</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_timestamps</span><span class="p">,</span> <span class="n">key</span><span class="o">=</span><span class="k">lambda</span> <span class="n">k</span><span class="p">:</span> <span class="bp">self</span><span class="o">.</span><span class="n">_timestamps</span><span class="p">[</span><span class="n">k</span><span class="p">])</span>
155 |             <span class="k">del</span> <span class="bp">self</span><span class="o">.</span><span class="n">_cache</span><span class="p">[</span><span class="n">oldest_key</span><span class="p">]</span>
156 |             <span class="k">del</span> <span class="bp">self</span><span class="o">.</span><span class="n">_timestamps</span><span class="p">[</span><span class="n">oldest_key</span><span class="p">]</span>
157 | 
158 |         <span class="c1"># store new entry</span>
159 |         <span class="bp">self</span><span class="o">.</span><span class="n">_cache</span><span class="p">[</span><span class="n">key</span><span class="p">]</span> <span class="o">=</span> <span class="n">count</span>
160 |         <span class="bp">self</span><span class="o">.</span><span class="n">_timestamps</span><span class="p">[</span><span class="n">key</span><span class="p">]</span> <span class="o">=</span> <span class="n">time</span><span class="o">.</span><span class="n">time</span><span class="p">()</span></div>
161 | 
162 | 
163 | <div class="viewcode-block" id="QueryCountCache.clear">
164 | <a class="viewcode-back" href="../../../mcp_server_webcrawl.utils.html#mcp_server_webcrawl.utils.querycache.QueryCountCache.clear">[docs]</a>
165 |     <span class="k">def</span> <span class="nf">clear</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="kc">None</span><span class="p">:</span>
166 | <span class="w">        </span><span class="sd">&quot;&quot;&quot;</span>
167 | <span class="sd">        Clear all entries from the cache.</span>
168 | <span class="sd">        &quot;&quot;&quot;</span>
169 |         <span class="bp">self</span><span class="o">.</span><span class="n">_cache</span><span class="o">.</span><span class="n">clear</span><span class="p">()</span>
170 |         <span class="bp">self</span><span class="o">.</span><span class="n">_timestamps</span><span class="o">.</span><span class="n">clear</span><span class="p">()</span></div>
171 | </div>
172 | 
173 | </pre></div>
174 | 
175 |            </div>
176 |           </div>
177 |           <footer>
178 | 
179 |   <hr/>
180 | 
181 |   <div role="contentinfo">
182 |     <p>&#169; Copyright 2025, pragmar.</p>
183 |   </div>
184 | 
185 |   Built with <a href="https://www.sphinx-doc.org/">Sphinx</a> using a
186 |     <a href="https://github.com/readthedocs/sphinx_rtd_theme">theme</a>
187 |     provided by <a href="https://readthedocs.org">Read the Docs</a>.
188 |    
189 | 
190 | </footer>
191 |         </div>
192 |       </div>
193 |     </section>
194 |   </div>
195 |   <script>
196 |       jQuery(function () {
197 |           SphinxRtdTheme.Navigation.enable(true);
198 |       });
199 |   </script> 
200 | 
201 | </body>
202 | </html>
```
Page 6/35 · First · Prev · Next · Last