This is page 4 of 35. Use http://codebase.md/pragmar/mcp_server_webcrawl?lines=true&page={x} to view the full context.
# Directory Structure
```
├── .gitignore
├── CONTRIBUTING.md
├── docs
│ ├── _images
│ │ ├── interactive.document.webp
│ │ ├── interactive.search.webp
│ │ └── mcpswc.svg
│ ├── _modules
│ │ ├── index.html
│ │ ├── mcp_server_webcrawl
│ │ │ ├── crawlers
│ │ │ │ ├── archivebox
│ │ │ │ │ ├── adapter.html
│ │ │ │ │ ├── crawler.html
│ │ │ │ │ └── tests.html
│ │ │ │ ├── base
│ │ │ │ │ ├── adapter.html
│ │ │ │ │ ├── api.html
│ │ │ │ │ ├── crawler.html
│ │ │ │ │ ├── indexed.html
│ │ │ │ │ └── tests.html
│ │ │ │ ├── httrack
│ │ │ │ │ ├── adapter.html
│ │ │ │ │ ├── crawler.html
│ │ │ │ │ └── tests.html
│ │ │ │ ├── interrobot
│ │ │ │ │ ├── adapter.html
│ │ │ │ │ ├── crawler.html
│ │ │ │ │ └── tests.html
│ │ │ │ ├── katana
│ │ │ │ │ ├── adapter.html
│ │ │ │ │ ├── crawler.html
│ │ │ │ │ └── tests.html
│ │ │ │ ├── siteone
│ │ │ │ │ ├── adapter.html
│ │ │ │ │ ├── crawler.html
│ │ │ │ │ └── tests.html
│ │ │ │ ├── warc
│ │ │ │ │ ├── adapter.html
│ │ │ │ │ ├── crawler.html
│ │ │ │ │ └── tests.html
│ │ │ │ └── wget
│ │ │ │ ├── adapter.html
│ │ │ │ ├── crawler.html
│ │ │ │ └── tests.html
│ │ │ ├── crawlers.html
│ │ │ ├── extras
│ │ │ │ ├── markdown.html
│ │ │ │ ├── regex.html
│ │ │ │ ├── snippets.html
│ │ │ │ ├── thumbnails.html
│ │ │ │ └── xpath.html
│ │ │ ├── interactive
│ │ │ │ ├── highlights.html
│ │ │ │ ├── search.html
│ │ │ │ ├── session.html
│ │ │ │ └── ui.html
│ │ │ ├── main.html
│ │ │ ├── models
│ │ │ │ ├── resources.html
│ │ │ │ └── sites.html
│ │ │ ├── templates
│ │ │ │ └── tests.html
│ │ │ ├── utils
│ │ │ │ ├── blobs.html
│ │ │ │ ├── cli.html
│ │ │ │ ├── logger.html
│ │ │ │ ├── querycache.html
│ │ │ │ ├── server.html
│ │ │ │ └── tools.html
│ │ │ └── utils.html
│ │ └── re.html
│ ├── _sources
│ │ ├── guides
│ │ │ ├── archivebox.rst.txt
│ │ │ ├── httrack.rst.txt
│ │ │ ├── interrobot.rst.txt
│ │ │ ├── katana.rst.txt
│ │ │ ├── siteone.rst.txt
│ │ │ ├── warc.rst.txt
│ │ │ └── wget.rst.txt
│ │ ├── guides.rst.txt
│ │ ├── index.rst.txt
│ │ ├── installation.rst.txt
│ │ ├── interactive.rst.txt
│ │ ├── mcp_server_webcrawl.crawlers.archivebox.rst.txt
│ │ ├── mcp_server_webcrawl.crawlers.base.rst.txt
│ │ ├── mcp_server_webcrawl.crawlers.httrack.rst.txt
│ │ ├── mcp_server_webcrawl.crawlers.interrobot.rst.txt
│ │ ├── mcp_server_webcrawl.crawlers.katana.rst.txt
│ │ ├── mcp_server_webcrawl.crawlers.rst.txt
│ │ ├── mcp_server_webcrawl.crawlers.siteone.rst.txt
│ │ ├── mcp_server_webcrawl.crawlers.warc.rst.txt
│ │ ├── mcp_server_webcrawl.crawlers.wget.rst.txt
│ │ ├── mcp_server_webcrawl.extras.rst.txt
│ │ ├── mcp_server_webcrawl.interactive.rst.txt
│ │ ├── mcp_server_webcrawl.models.rst.txt
│ │ ├── mcp_server_webcrawl.rst.txt
│ │ ├── mcp_server_webcrawl.templates.rst.txt
│ │ ├── mcp_server_webcrawl.utils.rst.txt
│ │ ├── modules.rst.txt
│ │ ├── prompts.rst.txt
│ │ └── usage.rst.txt
│ ├── _static
│ │ ├── _sphinx_javascript_frameworks_compat.js
│ │ ├── basic.css
│ │ ├── css
│ │ │ ├── badge_only.css
│ │ │ ├── fonts
│ │ │ │ ├── fontawesome-webfont.eot
│ │ │ │ ├── fontawesome-webfont.svg
│ │ │ │ ├── fontawesome-webfont.ttf
│ │ │ │ ├── fontawesome-webfont.woff
│ │ │ │ ├── fontawesome-webfont.woff2
│ │ │ │ ├── lato-bold-italic.woff
│ │ │ │ ├── lato-bold-italic.woff2
│ │ │ │ ├── lato-bold.woff
│ │ │ │ ├── lato-bold.woff2
│ │ │ │ ├── lato-normal-italic.woff
│ │ │ │ ├── lato-normal-italic.woff2
│ │ │ │ ├── lato-normal.woff
│ │ │ │ ├── lato-normal.woff2
│ │ │ │ ├── Roboto-Slab-Bold.woff
│ │ │ │ ├── Roboto-Slab-Bold.woff2
│ │ │ │ ├── Roboto-Slab-Regular.woff
│ │ │ │ └── Roboto-Slab-Regular.woff2
│ │ │ └── theme.css
│ │ ├── doctools.js
│ │ ├── documentation_options.js
│ │ ├── file.png
│ │ ├── fonts
│ │ │ ├── Lato
│ │ │ │ ├── lato-bold.eot
│ │ │ │ ├── lato-bold.ttf
│ │ │ │ ├── lato-bold.woff
│ │ │ │ ├── lato-bold.woff2
│ │ │ │ ├── lato-bolditalic.eot
│ │ │ │ ├── lato-bolditalic.ttf
│ │ │ │ ├── lato-bolditalic.woff
│ │ │ │ ├── lato-bolditalic.woff2
│ │ │ │ ├── lato-italic.eot
│ │ │ │ ├── lato-italic.ttf
│ │ │ │ ├── lato-italic.woff
│ │ │ │ ├── lato-italic.woff2
│ │ │ │ ├── lato-regular.eot
│ │ │ │ ├── lato-regular.ttf
│ │ │ │ ├── lato-regular.woff
│ │ │ │ └── lato-regular.woff2
│ │ │ └── RobotoSlab
│ │ │ ├── roboto-slab-v7-bold.eot
│ │ │ ├── roboto-slab-v7-bold.ttf
│ │ │ ├── roboto-slab-v7-bold.woff
│ │ │ ├── roboto-slab-v7-bold.woff2
│ │ │ ├── roboto-slab-v7-regular.eot
│ │ │ ├── roboto-slab-v7-regular.ttf
│ │ │ ├── roboto-slab-v7-regular.woff
│ │ │ └── roboto-slab-v7-regular.woff2
│ │ ├── images
│ │ │ ├── interactive.document.png
│ │ │ ├── interactive.document.webp
│ │ │ ├── interactive.search.png
│ │ │ ├── interactive.search.webp
│ │ │ └── mcpswc.svg
│ │ ├── jquery.js
│ │ ├── js
│ │ │ ├── badge_only.js
│ │ │ ├── theme.js
│ │ │ └── versions.js
│ │ ├── language_data.js
│ │ ├── minus.png
│ │ ├── plus.png
│ │ ├── pygments.css
│ │ ├── searchtools.js
│ │ └── sphinx_highlight.js
│ ├── .buildinfo
│ ├── .nojekyll
│ ├── genindex.html
│ ├── guides
│ │ ├── archivebox.html
│ │ ├── httrack.html
│ │ ├── interrobot.html
│ │ ├── katana.html
│ │ ├── siteone.html
│ │ ├── warc.html
│ │ └── wget.html
│ ├── guides.html
│ ├── index.html
│ ├── installation.html
│ ├── interactive.html
│ ├── mcp_server_webcrawl.crawlers.archivebox.html
│ ├── mcp_server_webcrawl.crawlers.base.html
│ ├── mcp_server_webcrawl.crawlers.html
│ ├── mcp_server_webcrawl.crawlers.httrack.html
│ ├── mcp_server_webcrawl.crawlers.interrobot.html
│ ├── mcp_server_webcrawl.crawlers.katana.html
│ ├── mcp_server_webcrawl.crawlers.siteone.html
│ ├── mcp_server_webcrawl.crawlers.warc.html
│ ├── mcp_server_webcrawl.crawlers.wget.html
│ ├── mcp_server_webcrawl.extras.html
│ ├── mcp_server_webcrawl.html
│ ├── mcp_server_webcrawl.interactive.html
│ ├── mcp_server_webcrawl.models.html
│ ├── mcp_server_webcrawl.templates.html
│ ├── mcp_server_webcrawl.utils.html
│ ├── modules.html
│ ├── objects.inv
│ ├── prompts.html
│ ├── py-modindex.html
│ ├── search.html
│ ├── searchindex.js
│ └── usage.html
├── LICENSE
├── MANIFEST.in
├── prompts
│ ├── audit404.md
│ ├── auditfiles.md
│ ├── auditperf.md
│ ├── auditseo.md
│ ├── gopher.md
│ ├── README.md
│ └── testsearch.md
├── pyproject.toml
├── README.md
├── setup.py
├── sphinx
│ ├── _static
│ │ └── images
│ │ ├── interactive.document.png
│ │ ├── interactive.document.webp
│ │ ├── interactive.search.png
│ │ ├── interactive.search.webp
│ │ └── mcpswc.svg
│ ├── _templates
│ │ └── layout.html
│ ├── conf.py
│ ├── guides
│ │ ├── archivebox.rst
│ │ ├── httrack.rst
│ │ ├── interrobot.rst
│ │ ├── katana.rst
│ │ ├── siteone.rst
│ │ ├── warc.rst
│ │ └── wget.rst
│ ├── guides.rst
│ ├── index.rst
│ ├── installation.rst
│ ├── interactive.rst
│ ├── make.bat
│ ├── Makefile
│ ├── mcp_server_webcrawl.crawlers.archivebox.rst
│ ├── mcp_server_webcrawl.crawlers.base.rst
│ ├── mcp_server_webcrawl.crawlers.httrack.rst
│ ├── mcp_server_webcrawl.crawlers.interrobot.rst
│ ├── mcp_server_webcrawl.crawlers.katana.rst
│ ├── mcp_server_webcrawl.crawlers.rst
│ ├── mcp_server_webcrawl.crawlers.siteone.rst
│ ├── mcp_server_webcrawl.crawlers.warc.rst
│ ├── mcp_server_webcrawl.crawlers.wget.rst
│ ├── mcp_server_webcrawl.extras.rst
│ ├── mcp_server_webcrawl.interactive.rst
│ ├── mcp_server_webcrawl.models.rst
│ ├── mcp_server_webcrawl.rst
│ ├── mcp_server_webcrawl.templates.rst
│ ├── mcp_server_webcrawl.utils.rst
│ ├── modules.rst
│ ├── prompts.rst
│ ├── readme.txt
│ └── usage.rst
└── src
└── mcp_server_webcrawl
├── __init__.py
├── crawlers
│ ├── __init__.py
│ ├── archivebox
│ │ ├── __init__.py
│ │ ├── adapter.py
│ │ ├── crawler.py
│ │ └── tests.py
│ ├── base
│ │ ├── __init__.py
│ │ ├── adapter.py
│ │ ├── api.py
│ │ ├── crawler.py
│ │ ├── indexed.py
│ │ └── tests.py
│ ├── httrack
│ │ ├── __init__.py
│ │ ├── adapter.py
│ │ ├── crawler.py
│ │ └── tests.py
│ ├── interrobot
│ │ ├── __init__.py
│ │ ├── adapter.py
│ │ ├── crawler.py
│ │ └── tests.py
│ ├── katana
│ │ ├── __init__.py
│ │ ├── adapter.py
│ │ ├── crawler.py
│ │ └── tests.py
│ ├── siteone
│ │ ├── __init__.py
│ │ ├── adapter.py
│ │ ├── crawler.py
│ │ └── tests.py
│ ├── warc
│ │ ├── __init__.py
│ │ ├── adapter.py
│ │ ├── crawler.py
│ │ └── tests.py
│ └── wget
│ ├── __init__.py
│ ├── adapter.py
│ ├── crawler.py
│ └── tests.py
├── extras
│ ├── __init__.py
│ ├── markdown.py
│ ├── regex.py
│ ├── snippets.py
│ ├── thumbnails.py
│ └── xpath.py
├── interactive
│ ├── __init__.py
│ ├── highlights.py
│ ├── search.py
│ ├── session.py
│ ├── ui.py
│ └── views
│ ├── base.py
│ ├── document.py
│ ├── help.py
│ ├── requirements.py
│ ├── results.py
│ └── searchform.py
├── main.py
├── models
│ ├── __init__.py
│ ├── base.py
│ ├── resources.py
│ └── sites.py
├── settings.py
├── templates
│ ├── __init__.py
│ ├── markdown.xslt
│ ├── tests_core.html
│ └── tests.py
└── utils
├── __init__.py
├── cli.py
├── logger.py
├── parser.py
├── parsetab.py
├── search.py
├── server.py
├── tests.py
└── tools.py
```
# Files
--------------------------------------------------------------------------------
/src/mcp_server_webcrawl/interactive/search.py:
--------------------------------------------------------------------------------
```python
1 | import hashlib
2 | import threading
3 | from concurrent.futures import ThreadPoolExecutor, Future
4 | from datetime import datetime
5 | from typing import Optional, TYPE_CHECKING
6 |
7 | from mcp_server_webcrawl.crawlers.base.crawler import BaseJsonApi
8 | from mcp_server_webcrawl.interactive.ui import UiFocusable, UiState
9 | from mcp_server_webcrawl.models.resources import ResourceResult
10 |
11 | if TYPE_CHECKING:
12 | from mcp_server_webcrawl.interactive.session import InteractiveSession
13 |
14 | SEARCH_DEBOUNCE_DELAY_SECONDS = 0.2
15 | SEARCH_RESULT_LIMIT: int = 10
16 |
17 | class SearchManager:
18 | """
19 | Manages search operations including async search and debouncing.
20 | Works with session's controlled interface - never touches private state directly.
21 | """
22 |
23 | def __init__(self, session: 'InteractiveSession'):
24 | self.__session: 'InteractiveSession' = session
25 | self.__search_last_state_hash: str = ""
26 | self.__search_timer: Optional[threading.Timer] = None
27 | self.__executor = ThreadPoolExecutor(max_workers=2, thread_name_prefix="SearchManager")
28 | self.__search_lock: threading.RLock = threading.RLock()
29 | self.__search_in_progress: bool = False
30 | self.__active_search_future: Optional[Future] = None
31 | self.__pending_results: Optional[list[ResourceResult]] = None
32 | self.__pending_indexer_status: str = ""
33 | self.__pending_indexer_processed: int = 0
34 | self.__pending_indexer_duration: float = 0
35 | self.__pending_total: int = 0
36 |
37 | def autosearch(self, immediate: bool = False) -> None:
38 | """
39 | Trigger search with optional immediate execution.
40 |
41 | Args:
42 | immediate: If True, execute search synchronously without debouncing.
43 | If False, use debounced async execution (default).
44 | """
45 | current_state_hash: str = self.__get_input_hash()
46 |
47 | if not immediate and current_state_hash == self.__search_last_state_hash:
48 | return
49 |
50 | self.__search_last_state_hash = current_state_hash
51 | self.cancel_pending()
52 |
53 | if immediate:
54 | self.__execute_search_immediate()
55 | else:
56 | self.__search_timer = threading.Timer(SEARCH_DEBOUNCE_DELAY_SECONDS, self.__execute_debounced_search)
57 | self.__search_timer.start()
58 |
59 | def cancel_pending(self) -> None:
60 | """
61 | Cancel any pending search timer.
62 | """
63 | if self.__search_timer is not None:
64 | self.__search_timer.cancel()
65 | self.__search_timer = None
66 |
67 | with self.__search_lock:
68 | if self.__active_search_future is not None:
69 | self.__active_search_future.cancel()
70 | self.__active_search_future = None
71 |
72 | def check_pending(self) -> None:
73 | """
74 | Check if there are pending search results and update the UI.
75 | """
76 | with self.__search_lock:
77 | if self.__pending_results is not None:
78 | self.__session.results.update(self.__pending_results, self.__pending_total, self.__pending_indexer_status,
79 | self.__pending_indexer_processed, self.__pending_indexer_duration)
80 | self.__pending_results = None
81 | self.__pending_total = 0
82 | self.__pending_indexer_processed = 0
83 | self.__pending_indexer_duration = 0
84 |
85 | def cleanup(self) -> None:
86 | """
87 | Clean up any pending operations.
88 | """
89 | self.cancel_pending()
90 | self.__executor.shutdown(wait=True)
91 |
92 | def has_pending(self) -> bool:
93 | """
94 | Check if there's a pending debounced search.
95 | """
96 | return self.__search_timer is not None
97 |
98 | def is_searching(self) -> bool:
99 | """
100 | Check if a search is currently in progress or on a timer.
101 | """
102 | with self.__search_lock:
103 | return self.__search_in_progress or self.__search_timer is not None
104 |
105 | def __background_search(self) -> None:
106 | """
107 | Execute search in background thread and store results.
108 | """
109 | with self.__search_lock:
110 | self.__search_in_progress = True
111 |
112 | self.__session.searchform.set_search_attempted()
113 | results, total_results, index_status, index_processed_count, index_duration_value = self.__execute_search_query()
114 | self.__set_pending_results(results, total_results, index_status, index_processed_count, index_duration_value, False)
115 |
116 | def __build_search_query(self, base_query: str) -> str:
117 | """
118 | Build the final search query with filter applied (if present).
119 | """
120 | if self.__session.searchform.filter == "html":
121 | if base_query.strip():
122 | return f"(type: html) AND {base_query}"
123 | else:
124 | return "type: html"
125 | else:
126 | return base_query
127 |
128 | def __execute_debounced_search(self) -> None:
129 | """
130 | Execute search after debounce delay in separate thread.
131 | """
132 | current_state_hash: str = self.__get_input_hash()
133 | if current_state_hash != self.__search_last_state_hash:
134 | return
135 |
136 | # show split view on results
137 | if self.__session.ui_focused == UiFocusable.SEARCH_RESULTS:
138 | self.__session.set_ui_state(UiState.SEARCH_RESULTS, UiFocusable.SEARCH_RESULTS)
139 | else:
140 | self.__session.set_ui_state(UiState.SEARCH_RESULTS, UiFocusable.SEARCH_FORM)
141 |
142 | self.__search_timer = None
143 | with self.__search_lock:
144 | self.__active_search_future = self.__executor.submit(self.__background_search)
145 |
146 | def __execute_search_immediate(self) -> None:
147 | """
148 | Execute search immediately on main thread (for ENTER key).
149 | """
150 | self.__session.searchform.set_search_attempted()
151 |
152 | self.__set_pending_results(None, 0, "", -1, -1, False)
153 | self.__session.results.clear()
154 | results, total_results, index_status, index_processed_count, index_duration_value = self.__execute_search_query()
155 | self.__set_pending_results(results, total_results, index_status, index_processed_count, index_duration_value, False)
156 |
157 | def __execute_search_query(self) -> tuple[list[ResourceResult], int, str, int, float]:
158 | """
159 | Centralized search execution logic shared by both sync and async paths.
160 |
161 | Returns:
162 | tuple: (results, total_results, index_status, index_processed_count, index_duration_value)
163 | """
164 | api: BaseJsonApi | None = self.__get_results(offset=self.__session.searchform.offset)
165 |
166 | if api is None:
167 | return [], 0, 0, 0
168 |
169 | results: list[ResourceResult] = api.get_results()
170 | total_results: int = api.total
171 |
172 | index_status: str = ""
173 | index_processed_count: int = -1
174 | index_duration_value: float = -1
175 |
176 | if api.meta_index is not None:
177 | if "status" in api.meta_index:
178 | index_status = api.meta_index["status"]
179 | if "processed" in api.meta_index:
180 | index_processed_count = api.meta_index["processed"]
181 |
182 | if "duration" in api.meta_index:
183 | index_duration_string: str = api.meta_index["duration"] or ""
184 | if index_duration_string:
185 | try:
186 | dt: datetime = datetime.strptime(index_duration_string, "%H:%M:%S.%f")
187 | index_duration_value = dt.hour * 3600 + dt.minute * 60 + dt.second + dt.microsecond / 1000000
188 | except ValueError:
189 | index_duration_value = 0
190 |
191 | return results, total_results, index_status, index_processed_count, index_duration_value
192 |
193 | def __get_input_hash(self) -> str:
194 | """
195 | Generate a hash representing the complete current search state.
196 | """
197 | query: str = self.__session.searchform.query.strip()
198 | selected_sites = self.__session.searchform.get_selected_sites()
199 | selected_sites_ids: list[int] = [s.id for s in selected_sites]
200 | filter: str = str(self.__session.searchform.filter)
201 | sort: str = str(self.__session.searchform.sort)
202 | offset: int = self.__session.searchform.offset
203 | limit: int = self.__session.searchform.limit
204 | search_state: str = f"{query}|{selected_sites_ids}|{filter}|{offset}|{limit}|{sort}"
205 | return hashlib.md5(search_state.encode()).hexdigest()
206 |
207 | def __get_results(self, offset: int = 0) -> BaseJsonApi | None:
208 | """
209 | Execute search with given offset and return API response object.
210 | Centralizes the API call logic used by both sync and async search paths.
211 |
212 | Args:
213 | offset: Starting position for search results pagination
214 |
215 | Returns:
216 | BaseJsonApi: API response object containing search results and metadata
217 | """
218 | selected_site_ids: list[int] = self.__get_selected_site_ids()
219 |
220 | query: str = self.__build_search_query(self.__session.searchform.query)
221 | sort: str = self.__session.searchform.sort
222 | query_api: BaseJsonApi = self.__session.crawler.get_resources_api(
223 | sites=selected_site_ids if selected_site_ids else None,
224 | query=query,
225 | fields=["size", "status"],
226 | offset=offset,
227 | limit=SEARCH_RESULT_LIMIT,
228 | extras=["snippets"],
229 | sort=sort
230 | )
231 |
232 | return query_api
233 |
234 | def __get_selected_site_ids(self) -> list[int]:
235 | """
236 | Get list of selected site IDs using property access.
237 | """
238 | selected_sites = self.__session.searchform.get_selected_sites()
239 | return [site.id for site in selected_sites]
240 |
241 | def __set_pending_results(self, results, total_results, index_status, index_processed_count, index_duration_value, search_in_progress) -> None:
242 | try:
243 | with self.__search_lock:
244 | self.__pending_results = results
245 | self.__pending_total = total_results
246 | self.__pending_indexer_status = index_status
247 | self.__pending_indexer_processed = index_processed_count
248 | self.__pending_indexer_duration = index_duration_value
249 | self.__search_in_progress = search_in_progress
250 |
251 | except Exception as ex:
252 | with self.__search_lock:
253 | self.__session.results.clear()
254 | self.__search_in_progress = False
255 |
```
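For orientation, the timer-plus-executor debounce that `SearchManager.autosearch` builds on can be reduced to a few lines. The sketch below is illustrative only and not part of the package; `Debouncer` and `run_query` are hypothetical names, and the 0.2 second delay simply mirrors `SEARCH_DEBOUNCE_DELAY_SECONDS`.

```python
# Minimal, standalone sketch of the debounce-then-submit pattern (hypothetical names).
import threading
import time
from concurrent.futures import ThreadPoolExecutor
from typing import Callable, Optional

class Debouncer:
    def __init__(self, delay_seconds: float = 0.2):
        self.__delay = delay_seconds
        self.__timer: Optional[threading.Timer] = None
        self.__executor = ThreadPoolExecutor(max_workers=1)
        self.__lock = threading.Lock()

    def trigger(self, fn: Callable[[], None]) -> None:
        """Restart the timer; fn runs on the executor once input goes quiet."""
        with self.__lock:
            if self.__timer is not None:
                self.__timer.cancel()
            self.__timer = threading.Timer(self.__delay, lambda: self.__executor.submit(fn))
            self.__timer.start()

    def shutdown(self) -> None:
        with self.__lock:
            if self.__timer is not None:
                self.__timer.cancel()
        self.__executor.shutdown(wait=True)

def run_query() -> None:
    print("searching...")  # stand-in for the real resources API call

if __name__ == "__main__":
    debouncer = Debouncer(delay_seconds=0.2)
    for _ in range(5):              # five rapid "keystrokes"...
        debouncer.trigger(run_query)
    time.sleep(0.5)                 # ...collapse into a single search
    debouncer.shutdown()
```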
--------------------------------------------------------------------------------
/src/mcp_server_webcrawl/extras/thumbnails.py:
--------------------------------------------------------------------------------
```python
1 | import os
2 | import aiohttp
3 | import asyncio
4 | import base64
5 | import concurrent
6 | import hashlib
7 | import io
8 | import re
9 | import threading
10 | import traceback
11 |
12 | from datetime import datetime, timedelta
13 | from pathlib import Path
14 | from urllib.parse import ParseResult, urlparse
15 | from PIL import Image
16 |
17 | from mcp_server_webcrawl.settings import DATA_DIRECTORY
18 | from mcp_server_webcrawl.utils.logger import get_logger
19 |
20 | HTTP_THREADS: int = 8
21 | ALLOWED_THUMBNAIL_TYPES = {".jpg", ".jpeg", ".png", ".gif", ".bmp", ".webp"}
22 | MAX_THUMBNAIL_BYTES = 2 * 1024 * 1024 # 2MB cap
23 |
24 | logger = get_logger()
25 |
26 | class ThumbnailManager:
27 | """
28 | Manages thumbnail generation and caching for image files and URLs.
29 | """
30 |
31 | def __init__(self):
32 | DATA_DIRECTORY.mkdir(parents=True, exist_ok=True)
33 | assert DATA_DIRECTORY.is_dir(), f"DATA_DIRECTORY {DATA_DIRECTORY} is not a directory"
34 | self.__temp_directory: Path = DATA_DIRECTORY / "thumb"
35 | if not self.__temp_directory.is_dir():
36 | self.__temp_directory.mkdir(parents=True, exist_ok=True)
37 | os.chmod(self.__temp_directory, 0o700)
38 |
39 | def __md5(self, path: str) -> str:
40 | return hashlib.md5(path.encode()).hexdigest()
41 |
42 | def __is_valid_url(self, path: str) -> tuple[bool, ParseResult | None]:
43 | try:
44 | result = urlparse(path)
45 | return all([result.scheme, result.netloc]), result
46 | except:
47 | return False, None
48 |
49 | def __is_valid_file(self, path: str) -> bool:
50 | return Path(path).is_file()
51 |
52 | def __get_temp_file(self, key: str) -> Path:
53 | return self.__temp_directory / f"{key}.webp"
54 |
55 | def __get_extension(self, path: str) -> str | None:
56 | ext = Path(path).suffix.lower()
57 | if ext:
58 | return ext
59 |
60 | # try to parse extension from the path
61 | is_valid, parsed = self.__is_valid_url(path)
62 | if is_valid:
63 | path_parts = parsed.path.split("/")
64 | if path_parts:
65 | last_part = path_parts[-1]
66 | if "." in last_part:
67 | return "." + last_part.split(".")[-1].lower()
68 |
69 | return None
70 |
71 | def __is_allowed_type(self, path: str) -> bool:
72 | ext = self.__get_extension(path)
73 | return ext in ALLOWED_THUMBNAIL_TYPES if ext else False
74 |
75 | def __clean_thumbs_directory(self):
76 | try:
77 | md5_pattern: re.Pattern = re.compile(r"^[0-9a-f]{32}$")
78 | cutoff_time: datetime = datetime.now() - timedelta(hours=4)
79 | deleted_count: int = 0
80 | for file_path in self.__temp_directory.glob("*"):
81 | if not file_path.is_file():
82 | continue
83 | if not md5_pattern.match(file_path.name):
84 | continue
85 | file_mtime = datetime.fromtimestamp(file_path.stat().st_mtime)
86 | if file_mtime < cutoff_time:
87 | file_path.unlink()
88 | deleted_count += 1
89 | logger.info(f"Temporary file cleanup complete: {deleted_count} files deleted")
90 | except Exception as ex:
91 | logger.error(
92 | f"Error during temporary file cleanup: {str(ex)}\n{traceback.format_exc()}"
93 | )
94 |
95 | def __check_content_length(self, headers) -> bool:
96 | """Helper to check if content length is acceptable"""
97 | if "Content-Length" in headers:
98 | content_length = int(headers["Content-Length"])
99 | if content_length > MAX_THUMBNAIL_BYTES:
100 | logger.info(
101 | f"Skipping large file ({content_length} bytes > "
102 | f"{MAX_THUMBNAIL_BYTES} bytes)"
103 | )
104 | return False
105 | return True
106 |
107 | async def __fetch_url(
108 | self, session: aiohttp.ClientSession, url: str, key: str
109 | ) -> str | None:
110 | temp_file = self.__get_temp_file(key)
111 | try:
112 | # check HEAD to get Content-Length without downloading
113 | async with session.head(url, timeout=1, allow_redirects=True) as head_response:
114 | if head_response.status == 200 and not self.__check_content_length(head_response.headers):
115 | return None
116 |
117 | async with session.get(url, timeout=2) as response:
118 | if response.status != 200:
119 | return None
120 |
121 | if not self.__check_content_length(response.headers):
122 | return None
123 |
124 | # stream the content with a size limit
125 | content = bytearray()
126 | chunk_size = 8192 # 8KB chunks
127 | total_size = 0
128 |
129 | async for chunk in response.content.iter_chunked(chunk_size):
130 | total_size += len(chunk)
131 | if total_size > MAX_THUMBNAIL_BYTES:
132 | logger.info(
133 | f"Download exceeded size limit of {MAX_THUMBNAIL_BYTES} bytes "
134 | f"while streaming"
135 | )
136 | return None
137 | content.extend(chunk)
138 |
139 | return self.__process_image_data(bytes(content), temp_file)
140 | except (aiohttp.ClientError, asyncio.TimeoutError) as ex:
141 | # http is the wild west, keep chugging
142 | logger.debug(f"HTTP error: {str(ex)}")
143 | return None
144 |
145 | def __process_image_data(self, data: bytes, temp_file: Path) -> str | None:
146 | """Process image data, save to temp file, and return base64 encoding"""
147 | thumbnail = self.__create_webp_thumbnail(data)
148 | if thumbnail is not None:
149 | temp_file.write_bytes(thumbnail)
150 | return base64.b64encode(thumbnail).decode("utf-8")
151 | return None
152 |
153 | async def __get_file(self, path: str, key: str) -> str | None:
154 | try:
155 | file_path = Path(path)
156 | content = file_path.read_bytes()
157 | temp_file = self.__get_temp_file(key)
158 | return self.__process_image_data(content, temp_file)
159 | except Exception as ex:
160 | logger.debug(f"File error: {str(ex)}")
161 | return None
162 |
163 | async def __process_path(
164 | self,
165 | session: aiohttp.ClientSession,
166 | path: str,
167 | results: dict[str, str | None],
168 | metrics: dict[str, int]
169 | ) -> None:
170 | key: str = self.__md5(path)
171 | temp_file: Path = self.__get_temp_file(key)
172 |
173 | is_valid_url, _ = self.__is_valid_url(path)
174 | valid_file: bool = self.__is_valid_file(path)
175 |
176 | if not (is_valid_url or valid_file) or not self.__is_allowed_type(path):
177 | return
178 |
179 | # cache hit
180 | if temp_file.exists():
181 | content: bytes = temp_file.read_bytes()
182 | results[path] = base64.b64encode(content).decode("utf-8")
183 | metrics["total_cached"] += 1
184 | return
185 |
186 | result: str | None = await self.__fetch_url(session, path, key) if is_valid_url else await self.__get_file(path, key)
187 | results[path] = result
188 | if result is None:
189 | metrics["total_errors"] += 1
190 | else:
191 | metrics["total_returned"] += 1
192 |
193 | async def __get_blobs_async(self, paths: list[str]) -> dict[str, str | None]:
194 | results = {path: None for path in paths}
195 | metrics = {
196 | "total_requested": len(paths),
197 | "total_returned": 0,
198 | "total_errors": 0,
199 | "total_cached": 0
200 | }
201 |
202 | async with aiohttp.ClientSession() as session:
203 | # Process tasks in batches of HTTP_THREADS
204 | for i in range(0, len(paths), HTTP_THREADS):
205 | batch_paths = paths[i:i + HTTP_THREADS]
206 | batch_tasks = [
207 | self.__process_path(session, path, results, metrics)
208 | for path in batch_paths
209 | ]
210 | await asyncio.gather(*batch_tasks)
211 |
212 | logger.info(
213 | f"Found {metrics['total_requested']}, fetched {metrics['total_returned']} "
214 | f"({metrics['total_errors']} errors, {metrics['total_cached']} cached)"
215 | )
216 |
217 | return results
218 |
219 | def __create_webp_thumbnail(self, image_data: bytes, size: int = 512) -> bytes | None:
220 | img = None
221 | try:
222 | img = Image.open(io.BytesIO(image_data))
223 | width, height = img.size
224 | max_dimension = max(width, height)
225 |
226 | if max_dimension > size:
227 | if width > height:
228 | new_width = size
229 | new_height = int(height * (new_width / width))
230 | else:
231 | new_height = size
232 | new_width = int(width * (new_height / height))
233 | img = img.resize((new_width, new_height), Image.LANCZOS)
234 |
235 | output = io.BytesIO()
236 | img.save(
237 | output,
238 | format="WEBP",
239 | quality=20,
240 | optimize=True,
241 | method=6 # highest compression
242 | )
243 | return output.getvalue()
244 | except Exception as ex:
245 | logger.error(f"Error creating WebP thumbnail: {str(ex)}\n{traceback.format_exc()}")
246 | return None
247 | finally:
248 | if img is not None:
249 | img.close()
250 |
251 | def get_thumbnails(self, paths: list[str]) -> dict[str, str | None]:
252 | """
253 | Convert URLs or file paths to base64 encoded strings.
254 |
255 | Args:
256 | paths: List of URLs or file paths to convert
257 |
258 | Returns:
259 | Dictionary mapping paths to their base64 representation or None if failed
260 | """
261 | assert paths is not None, "paths must be a list[str]"
262 |
263 | def run_in_thread():
264 | loop = asyncio.new_event_loop()
265 | asyncio.set_event_loop(loop)
266 | try:
267 | return loop.run_until_complete(self.__get_blobs_async(paths))
268 | finally:
269 | loop.close()
270 |
271 | try:
272 | with concurrent.futures.ThreadPoolExecutor() as executor:
273 | future = executor.submit(run_in_thread)
274 | results = future.result(timeout=5)
275 |
276 | # start cleanup in a background thread
277 | cleanup_thread = threading.Thread(target=self.__clean_thumbs_directory)
278 | cleanup_thread.daemon = True
279 | cleanup_thread.start()
280 |
281 | return results
282 | except Exception as ex:
283 | logger.error(f"Error fetching thumbnails: {ex}\n{traceback.format_exc()}")
284 | return {path: None for path in paths}
285 |
```
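The public entry point above is `ThumbnailManager.get_thumbnails`, which accepts URLs and local file paths and returns base64-encoded WebP data (or `None` on failure). A minimal usage sketch follows; the image paths are hypothetical placeholders.

```python
# Example usage of ThumbnailManager.get_thumbnails; paths are placeholders.
import base64
from pathlib import Path

from mcp_server_webcrawl.extras.thumbnails import ThumbnailManager

manager = ThumbnailManager()
thumbnails = manager.get_thumbnails([
    "/path/to/archive/images/logo.png",      # local file
    "https://example.com/media/banner.jpg",  # remote URL (2MB cap applies)
])

for path, encoded in thumbnails.items():
    if encoded is None:
        print(f"skipped or failed: {path}")
        continue
    # each value is a base64-encoded WebP thumbnail
    out = Path("thumb_" + Path(path).name).with_suffix(".webp")
    out.write_bytes(base64.b64decode(encoded))
    print(f"wrote {out}")
```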
--------------------------------------------------------------------------------
/prompts/auditseo.md:
--------------------------------------------------------------------------------
```markdown
1 | # Website SEO Audit Test Instructions
2 |
3 | ## Query Sequence
4 |
5 | ### 1. Identify Target Domain & Homepage
6 |
7 | **FIRST:** Get available sites and let user choose:
8 | ```
9 | webcrawl_sites() - get all available domains
10 | ```
11 |
12 | **THEN:** Find homepage with sorted URL approach:
13 | ```
14 | query: type: html AND url: [target_site_domain]
15 | limit: 1
16 | sites: [target_site_id]
17 | fields: ["content"]
18 | sort: +url
19 | ```
20 |
21 | **Extract exact domain** from homepage URL for filtering (e.g., `example.com`)
22 |
23 | ### 2. Get Domain-Specific Structure Overview
24 |
25 | Use the arguments collected thus far to query a listing of representative pages. This is a large set, so keep the API fields empty to reduce tokens.
26 |
27 | ```
28 | query: type: html AND url: [target_site_domain]
29 | limit: 100
30 | sites: [target_site_id]
31 | fields: []
32 | sort: +url
33 | ```
34 |
35 | **Purpose:** Get homepage first, then analyze remaining 99 pages from the specified domain only to identify page templates and URL patterns.
36 |
37 | ### 3. Analyze URL Patterns
38 |
39 | From the 100 results, put the homepage ID aside, then identify:
40 |
41 | - **Directory distribution:** different directories indicate site sections
42 | - **URL patterns:** `/`, `/blog/`, `/blog/post/1/`, `/products/product/`, `/feature/title`, etc.
43 | - **Content types:** articles, directories, categories, profiles, press releases, tools
44 | - **Homepage identification:** Look for root domain URLs or shortest paths
45 |
46 | If you feel the 100 results are not representative of the website (e.g. 1 homepage and 99 product pages), you can rerun the same query with RANDOM sort (?) and LIMIT 100 to see a sampled page set. Since RANDOM sort leads to non-repeatable behavior in the audit, use it sparingly, and only when necessary to get a more diverse set of page templates.
47 |
48 | ### 4. Select Representative Sample
49 |
50 | Choose 4 more pages covering a diverse set of templates, identified by unique URL pattern and prioritized by page count:
51 |
52 | Important: cycle webpage content one page at a time to avoid hitting response size limits.
53 |
54 | ```
55 | query: id: [page_id]
56 | fields: ["content"]
57 | limit: 1
58 | sites: [target_site_id]
59 | ```
60 |
61 | If the first result's page size is manageable, you can try 2 at a time. But don't get greedy.
62 |
63 | ```
64 | query: id: [page1_id] OR id: [page2_id]
65 | fields: ["content"]
66 | limit: 2
67 | sites: [target_site_id]
68 | ```
69 |
70 | **Sample selection strategy:**
71 |
72 | - 1 homepage (if identifiable)
73 | - 1-2 category pages (blog, products, news)
74 | - 1-3 detail pages (profiles, archives)
75 | - if limited pages, take what you can get
76 |
77 | ### 5. Analyze Each Page Type
78 |
79 | For each sampled page, extract and analyze using the provided analysis framework.
80 |
81 | ### 6. Offer to Expand or Edit Selected Reference Pages
82 |
83 | Upon audit completion, the user may desire to expand the surface area of the test, or audit specific pages. Give them this opportunity as the final word post-report.
84 |
85 | ### 7. Offer Advanced Analysis or Tool Research
86 |
87 | After completing the main audit report, offer the user two additional options:
88 | - **Detailed Analysis:** More comprehensive investigation of specific SEO issues or page types
89 | - **Tool Research:** Research and recommend specific tools to address identified SEO problems
90 |
91 | ## SEO Elements Analysis Framework
92 |
93 | ### Title Tag Analysis
94 |
95 | **Extract:** `<title>` content
96 | **Check for:**
97 | - **Length:** 30-60 characters optimal (Google displays ~60)
98 | - **Uniqueness:** No duplicate titles across pages
99 | - **Keyword inclusion:** Primary keywords in first 50 characters
100 | - **Brand consistency:** Proper brand suffix usage (e.g., "- NASA")
101 | - **Descriptiveness:** Clear, specific page purpose
102 | - **Keyword stuffing:** Excessive keyword repetition
103 |
104 | ### Meta Description Analysis
105 |
106 | **Extract:** `<meta name="description" content="...">`
107 | **Check for:**
108 | - **Length:** 120-158 characters optimal
109 | - **Completeness:** No truncated sentences
110 | - **Uniqueness:** No duplicate descriptions
111 | - **Call-to-action:** Encouraging click-through
112 | - **Keyword relevance:** Natural keyword inclusion
113 | - **Missing descriptions:** Pages without meta descriptions
114 |
115 | ### Header Structure Analysis
116 |
117 | **Extract:** `<h1>`, `<h2>`, `<h3>`, `<h4>`, `<h5>`, `<h6>` tags
118 |
119 | **Check for:**
120 | - **H1 uniqueness:** Single H1 per page (SEO best practice)
121 | - **H1 relevance:** Matches title tag intent
122 | - **Logical hierarchy:** Proper H1→H2→H3 structure
123 | - **Keyword optimization:** Headers include relevant keywords naturally
124 | - **Length appropriateness:** Headers not too long/short
125 | - **Missing H1:** Pages without primary headers
126 |
127 | ### Content Quality Indicators
128 |
129 | **Analyze for:**
130 | - **Keyword density:** 1-3% for primary keywords (not stuffing)
131 | - **Content length:** Sufficient depth for topic coverage
132 | - **Readability:** Clear, scannable content structure
133 | - **Internal linking:** Proper cross-references to related site content
134 | - **Image alt text:** Descriptive alt attributes (check `<img alt="">`)
135 | - **Duplicate content:** Similar content across multiple pages
136 |
137 | ### Technical SEO Elements
138 |
139 | **Extract and verify:**
140 | - **Canonical URLs:** `<link rel="canonical">`
141 | - **Open Graph tags:** og:title, og:description, og:image
142 | - **Schema markup:** JSON-LD structured data
143 | - **Language declarations:** `<html lang="en">` attributes
144 | - **Mobile viewport:** `<meta name="viewport">` tag
145 |
146 | ## Common SEO Issues to Identify
147 |
148 | ### High Priority Issues
149 |
150 | 1. **Missing H1 tags** or multiple H1s per page
151 | 2. **Duplicate title tags** across different pages
152 | 3. **Missing meta descriptions** (search engines will generate their own snippets)
153 | 4. **Title/description length violations** (truncation in SERPs)
154 | 5. **Broken header hierarchy** (H3 before H2, etc.)
155 |
156 | ### Medium Priority Issues
157 |
158 | 1. **Generic titles** ("Page Title" or "Untitled")
159 | 2. **Keyword stuffing** in titles, descriptions, or headers
160 | 3. **Inconsistent brand suffixes** (some pages missing the brand suffix)
161 | 4. **Overly long headers** (H1 > 70 characters)
162 | 5. **Missing alt text** on images
163 |
164 | ### Low Priority Issues
165 |
166 | 1. **Suboptimal keyword placement** in headers
167 | 2. **Minor length optimizations** for titles/descriptions
168 | 3. **Header structure improvements** (adding H2s for better organization)
169 |
170 | ## Page Type Categorization
171 |
172 | ### Homepage/Landing Pages
173 |
174 | - **Expectation:** Strong H1, compelling meta description, comprehensive title
175 | - **Common issues:** Generic titles, keyword stuffing attempts
176 |
177 | ### Mission/Technical Pages
178 |
179 | - **Expectation:** Technical accuracy, proper header hierarchy for complex content
180 | - **Common issues:** Missing H1s, overly technical meta descriptions
181 |
182 | ### Blog/News Articles
183 |
184 | - **Expectation:** Date relevance, engaging headlines as H1s
185 | - **Common issues:** Duplicate meta descriptions, poor header structure
186 |
187 | ### Gallery/Media Pages
188 |
189 | - **Expectation:** Descriptive titles, image-focused meta descriptions
190 | - **Common issues:** Generic titles like "Image Gallery", missing alt text
191 |
192 | ### Documentation Pages
193 |
194 | - **Expectation:** Clear navigation headers, searchable content
195 | - **Common issues:** Poor hierarchy, missing descriptions
196 |
197 | ## Reporting Template
198 |
199 | ### Executive Summary
200 |
201 | - **Total pages analyzed:** X pages across Y page types
202 | - **Overall SEO health:** [A-F grade] based on critical issues and optimization opportunities
203 | - **Critical issues requiring immediate attention:** X issues
204 | - **Priority recommendations:** Top 3 actionable improvements
205 |
206 | ### Detailed Findings by Element
207 |
208 | #### Title Tag Issues
209 |
210 | - **Pages with optimal titles (30-60 chars):** X% (Y pages)
211 | - **Pages with missing titles:** X pages
212 | - **Pages with duplicate titles:** X pages
213 | - **Pages with keyword stuffing:** X pages
214 | - **Examples of problematic titles:**
215 | - Too long: `[Example Title That Exceeds 60 Characters And Will Be Truncated In Search Results]`
216 | - Too short: `[NASA]`
217 | - Duplicate: `[Same title found on 3 pages]`
218 |
219 | #### Meta Description Issues
220 |
221 | - **Pages with optimal descriptions (120-158 chars):** X% (Y pages)
222 | - **Pages missing descriptions:** X pages
223 | - **Pages with duplicate descriptions:** X pages
224 | - **Examples of issues:**
225 | - Truncated: `[Description that cuts off mid-sentence in search...]`
226 | - Missing: `[Page URL with no meta description]`
227 | - Duplicate: `[Same description on X pages]`
228 |
229 | #### Header Structure Issues
230 |
231 | - **Pages with proper H1:** X% (Y pages)
232 | - **Pages with multiple H1s:** X pages
233 | - **Pages with broken hierarchy:** X pages
234 | - **Pages missing H1:** X pages
235 | - **Examples:**
236 | - Multiple H1s: `[Page with H1 "Mission Overview" and H1 "Technical Details"]`
237 | - Broken hierarchy: `[H1→H3 (missing H2)]`
238 | - Missing H1: `[Page URL starting with H2]`
239 |
240 | ### Page Type Performance Matrix
241 |
242 | | Page Type | Sample Size | Title Issues | Description Issues | Header Issues | Overall Grade |
243 | |-----------|-------------|--------------|-------------------|---------------|---------------|
244 | | Homepage | 1 | 0 | 0 | 0 | A |
245 | | Mission Pages | 3 | 1 | 2 | 1 | B- |
246 | | Blog Articles | 3 | 0 | 1 | 2 | C+ |
247 | | Gallery Pages | 2 | 2 | 2 | 1 | D |
248 | | Documentation | 1 | 0 | 0 | 1 | B |
249 |
250 | ### Quick Wins for Immediate Impact
251 |
252 | - **Template updates:** Fix recurring issues at template level (affects multiple pages instantly)
253 | - **Missing meta descriptions:** Add descriptions to pages without them (immediate SERP improvement)
254 | - **Duplicate title resolution:** Update identical titles to be unique and descriptive
255 | - **H1 hierarchy fixes:** Ensure single H1 per page and proper header structure
256 |
257 | ## What's Next?
258 |
259 | You've got a solid foundation with some clear optimization opportunities ahead. Depending on what the audit uncovered, you might be looking at quick wins like title tag improvements, meta description fixes, or header structure cleanup - the kind of changes that can make a real difference with minimal effort.
260 |
261 | **Ready to dive deeper?** I can help you:
262 | - **Focus on specific fixes** - Whether it's duplicate content, missing descriptions, or technical SEO gaps, let's tackle your highest-impact items with detailed implementation steps
263 | - **Expand the audit** - Analyze more pages, a single page, or dive into advanced technical elements
264 | - **Research tools** - Find specific solutions for ongoing SEO concerns or content optimization workflows
265 |
266 | **What would be most helpful for your next steps?**
267 |
268 | ## Methodology
269 |
270 | You will review this web project from the perspective of an accomplished but patient web developer. You've seen it all over the years, and have reasonable expectations of quality. At the same time you have a fondness for the user wanting to improve the web at all. It's a noble pursuit that you can encourage without being overbearing. Nobody wants a scolding or patronizing AI. It's a fine line to walk, but you somehow manage it well. As these "reviews" can be hard to see, you will break news gently, but firmly when things are out of whack.
271 |
272 | Where you have tabular data, you aren't afraid to arrange it in an aesthetically pleasing manner. You will prefer tables over unordered lists. Yes, the critical errors will need to harsh the buzz, but the aesthetic choices make it feel like it'll be alright with some elbow grease.
```
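As a companion to the analysis framework above, the title, meta description, and H1 thresholds it describes can be spot-checked programmatically. The sketch below is a rough, standard-library illustration under simplifying assumptions (regex-based parsing, well-formed attribute quoting, `name` before `content`); it is not part of the repository.

```python
# Rough spot-check of the thresholds described in the SEO audit framework:
# 30-60 char titles, 120-158 char meta descriptions, exactly one H1 per page.
import re

def quick_seo_checks(html: str) -> dict[str, str]:
    issues: dict[str, str] = {}

    title_match = re.search(r"<title[^>]*>(.*?)</title>", html, re.IGNORECASE | re.DOTALL)
    title = title_match.group(1).strip() if title_match else ""
    if not title:
        issues["title"] = "missing"
    elif not (30 <= len(title) <= 60):
        issues["title"] = f"length {len(title)} outside 30-60"

    # simplification: assumes name="description" appears before content="..."
    description_match = re.search(
        r'<meta\s+name=["\']description["\']\s+content=["\'](.*?)["\']',
        html, re.IGNORECASE | re.DOTALL)
    description = description_match.group(1).strip() if description_match else ""
    if not description:
        issues["description"] = "missing"
    elif not (120 <= len(description) <= 158):
        issues["description"] = f"length {len(description)} outside 120-158"

    h1_count = len(re.findall(r"<h1[\s>]", html, re.IGNORECASE))
    if h1_count != 1:
        issues["h1"] = f"{h1_count} H1 tags (expected 1)"

    return issues

if __name__ == "__main__":
    sample = "<html><head><title>Short</title></head><body><h1>A</h1><h1>B</h1></body></html>"
    print(quick_seo_checks(sample))  # flags title length, missing description, duplicate H1s
```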
--------------------------------------------------------------------------------
/docs/_modules/mcp_server_webcrawl/utils/cli.html:
--------------------------------------------------------------------------------
```html
1 |
2 |
3 | <!DOCTYPE html>
4 | <html class="writer-html5" lang="en" data-content_root="../../../">
5 | <head>
6 | <meta charset="utf-8" />
7 | <meta name="viewport" content="width=device-width, initial-scale=1.0" />
8 | <title>mcp_server_webcrawl.utils.cli — mcp-server-webcrawl documentation</title>
9 | <link rel="stylesheet" type="text/css" href="../../../_static/pygments.css?v=80d5e7a1" />
10 | <link rel="stylesheet" type="text/css" href="../../../_static/css/theme.css?v=e59714d7" />
11 |
12 |
13 | <script src="../../../_static/jquery.js?v=5d32c60e"></script>
14 | <script src="../../../_static/_sphinx_javascript_frameworks_compat.js?v=2cd50e6c"></script>
15 | <script src="../../../_static/documentation_options.js?v=5929fcd5"></script>
16 | <script src="../../../_static/doctools.js?v=888ff710"></script>
17 | <script src="../../../_static/sphinx_highlight.js?v=dc90522c"></script>
18 | <script src="../../../_static/js/theme.js"></script>
19 | <link rel="index" title="Index" href="../../../genindex.html" />
20 | <link rel="search" title="Search" href="../../../search.html" />
21 | </head>
22 |
23 | <body class="wy-body-for-nav">
24 | <div class="wy-grid-for-nav">
25 | <nav data-toggle="wy-nav-shift" class="wy-nav-side">
26 | <div class="wy-side-scroll">
27 | <div class="wy-side-nav-search" >
28 |
29 |
30 |
31 | <a href="../../../index.html" class="icon icon-home">
32 | mcp-server-webcrawl
33 | </a>
34 | <div role="search">
35 | <form id="rtd-search-form" class="wy-form" action="../../../search.html" method="get">
36 | <input type="text" name="q" placeholder="Search docs" aria-label="Search docs" />
37 | <input type="hidden" name="check_keywords" value="yes" />
38 | <input type="hidden" name="area" value="default" />
39 | </form>
40 | </div>
41 | </div><div class="wy-menu wy-menu-vertical" data-spy="affix" role="navigation" aria-label="Navigation menu">
42 | <p class="caption" role="heading"><span class="caption-text">Contents:</span></p>
43 | <ul>
44 | <li class="toctree-l1"><a class="reference internal" href="../../../installation.html">Installation</a></li>
45 | <li class="toctree-l1"><a class="reference internal" href="../../../guides.html">Setup Guides</a></li>
46 | <li class="toctree-l1"><a class="reference internal" href="../../../usage.html">Usage</a></li>
47 | <li class="toctree-l1"><a class="reference internal" href="../../../prompts.html">Prompt Routines</a></li>
48 | <li class="toctree-l1"><a class="reference internal" href="../../../modules.html">mcp_server_webcrawl</a></li>
49 | </ul>
50 |
51 | </div>
52 | </div>
53 | </nav>
54 |
55 | <section data-toggle="wy-nav-shift" class="wy-nav-content-wrap"><nav class="wy-nav-top" aria-label="Mobile navigation menu" >
56 | <i data-toggle="wy-nav-top" class="fa fa-bars"></i>
57 | <a href="../../../index.html">mcp-server-webcrawl</a>
58 | </nav>
59 |
60 | <div class="wy-nav-content">
61 | <div class="rst-content">
62 | <div role="navigation" aria-label="Page navigation">
63 | <ul class="wy-breadcrumbs">
64 | <li><a href="../../../index.html" class="icon icon-home" aria-label="Home"></a></li>
65 | <li class="breadcrumb-item"><a href="../../index.html">Module code</a></li>
66 | <li class="breadcrumb-item"><a href="../utils.html">mcp_server_webcrawl.utils</a></li>
67 | <li class="breadcrumb-item active">mcp_server_webcrawl.utils.cli</li>
68 | <li class="wy-breadcrumbs-aside">
69 | </li>
70 | </ul>
71 | <hr/>
72 | </div>
73 | <div role="main" class="document" itemscope="itemscope" itemtype="http://schema.org/Article">
74 | <div itemprop="articleBody">
75 |
76 | <h1>Source code for mcp_server_webcrawl.utils.cli</h1><div class="highlight"><pre>
77 | <span></span><span class="k">def</span> <span class="nf">__cli_apply_color</span><span class="p">(</span><span class="n">text</span><span class="p">:</span> <span class="nb">str</span><span class="p">,</span> <span class="n">code</span><span class="p">:</span> <span class="nb">int</span><span class="p">):</span>
78 | <span class="k">return</span> <span class="sa">f</span><span class="s2">"</span><span class="se">\033</span><span class="s2">[</span><span class="si">{</span><span class="n">code</span><span class="si">}</span><span class="s2">m</span><span class="si">{</span><span class="n">text</span><span class="si">}</span><span class="se">\033</span><span class="s2">[0m"</span>
79 |
80 | <span class="k">def</span> <span class="nf">__cli_light_gray</span><span class="p">(</span><span class="n">text</span><span class="p">):</span>
81 | <span class="k">return</span> <span class="n">__cli_apply_color</span><span class="p">(</span><span class="n">text</span><span class="p">,</span> <span class="s2">"38;2;130;130;130"</span><span class="p">)</span>
82 |
83 | <span class="k">def</span> <span class="nf">__cli_gold</span><span class="p">(</span><span class="n">text</span><span class="p">):</span>
84 | <span class="k">return</span> <span class="n">__cli_apply_color</span><span class="p">(</span><span class="n">text</span><span class="p">,</span> <span class="s2">"38;2;170;120;0"</span><span class="p">)</span>
85 |
86 | <div class="viewcode-block" id="get_help_short_message">
87 | <a class="viewcode-back" href="../../../mcp_server_webcrawl.utils.html#mcp_server_webcrawl.utils.cli.get_help_short_message">[docs]</a>
88 | <span class="k">def</span> <span class="nf">get_help_short_message</span><span class="p">(</span><span class="n">version</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-></span> <span class="nb">str</span><span class="p">:</span>
89 | <span class="k">return</span> <span class="sa">f</span><span class="s2">"""</span><span class="si">{</span><span class="n">__cli_gold</span><span class="p">(</span><span class="s1">'mcp-server-webcrawl'</span><span class="p">)</span><span class="si">}</span><span class="s2"> </span><span class="si">{</span><span class="n">__cli_light_gray</span><span class="p">(</span><span class="s2">"v"</span><span class="w"> </span><span class="o">+</span><span class="w"> </span><span class="n">version</span><span class="w"> </span><span class="o">+</span><span class="w"> </span><span class="s1">', ©2025 MPL2,'</span><span class="p">)</span><span class="si">}</span><span class="s2"> </span><span class="si">{</span><span class="n">__cli_gold</span><span class="p">(</span><span class="s1">'--help'</span><span class="p">)</span><span class="si">}</span><span class="s2"> for more information"""</span></div>
90 |
91 |
92 | <div class="viewcode-block" id="get_help_long_message">
93 | <a class="viewcode-back" href="../../../mcp_server_webcrawl.utils.html#mcp_server_webcrawl.utils.cli.get_help_long_message">[docs]</a>
94 | <span class="k">def</span> <span class="nf">get_help_long_message</span><span class="p">(</span><span class="n">version</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-></span> <span class="nb">str</span><span class="p">:</span>
95 | <span class="k">return</span> <span class="sa">f</span><span class="s2">"""A server to connect your web crawls/archives to an LLM via MCP (Model Context Protocol).</span>
96 |
97 | <span class="s2">Usage: </span><span class="si">{</span><span class="n">__cli_gold</span><span class="p">(</span><span class="s1">'mcp-server-webcrawl'</span><span class="p">)</span><span class="si">}</span><span class="s2"> [-c </span><span class="se">{{</span><span class="s2">wget,warc,interrobot,katana,siteone</span><span class="se">}}</span><span class="s2">] [-d DATASRC]</span>
98 |
99 | <span class="s2">Options:</span>
100 | <span class="s2"> -c, --crawler Specify which crawler to use</span>
101 | <span class="s2"> -d, --datasrc Path to datasrc (required unless testing)</span>
102 | <span class="s2"> -h, --help Show this help message and exit</span>
103 | <span class="s2"> -i, --interactive Run interactive terminal search</span>
104 |
105 | <span class="s2">Where is my DATASRC?</span>
106 | <span class="s2"> archivebox Directory above one or more archivebox init'ed dirs</span>
107 | <span class="s2"> httrack Projects directory (~/websites/, /My Websites/)</span>
108 | <span class="s2"> interrobot Path to */interrobot.v2.db</span>
109 | <span class="s2"> katana Directory containing the webroot archives</span>
110 | <span class="s2"> siteone Directory containing the webroot archives</span>
111 | <span class="s2"> (requires archive option)</span>
112 | <span class="s2"> warc Directory containing WARC files</span>
113 | <span class="s2"> wget Directory containing the webroot archives</span>
114 |
115 | <span class="s2"> [DATASRC]</span>
116 | <span class="s2"> ╭─────────────────────────────────╮</span>
117 | <span class="s2"> ✧───────────────────────✧ ✧───────────────────────✧</span>
118 | <span class="s2"> ╱ example.com (webroot) ╱ ╱ pragmar.com (webroot) ╱</span>
119 | <span class="s2"> ✧───────────────────────✧ ✧───────────────────────✧</span>
120 |
121 | <span class="s2">MCP Configuration Example:</span>
122 | <span class="se">{{</span><span class="s2">"mcpServers": </span><span class="se">{{</span>
123 | <span class="s2"> "wget": </span><span class="se">{{</span>
124 | <span class="s2"> "command": "/path/to/mcp-server-webcrawl",</span>
125 | <span class="s2"> "args": ["--crawler", "wget", "--datasrc",</span>
126 | <span class="s2"> "/path/to/archived/hosts/"]</span><span class="se">}}</span>
127 | <span class="s2"> </span><span class="se">}}</span>
128 | <span class="se">}}</span>
129 |
130 | <span class="si">{</span><span class="n">__cli_gold</span><span class="p">(</span><span class="s1">'mcp-server-webcrawl'</span><span class="p">)</span><span class="si">}</span><span class="s2"> </span><span class="si">{</span><span class="n">__cli_light_gray</span><span class="p">(</span><span class="s2">"v"</span><span class="w"> </span><span class="o">+</span><span class="w"> </span><span class="n">version</span><span class="w"> </span><span class="o">+</span><span class="w"> </span><span class="s1">', ©2025 MPL2'</span><span class="p">)</span><span class="si">}</span>
131 | <span class="s2">https://github.com/pragmar/mcp-server-webcrawl</span>
132 | <span class="s2">"""</span></div>
133 |
134 | </pre></div>
135 |
136 | </div>
137 | </div>
138 | <footer>
139 |
140 | <hr/>
141 |
142 | <div role="contentinfo">
143 | <p>© Copyright 2025, pragmar.</p>
144 | </div>
145 |
146 | Built with <a href="https://www.sphinx-doc.org/">Sphinx</a> using a
147 | <a href="https://github.com/readthedocs/sphinx_rtd_theme">theme</a>
148 | provided by <a href="https://readthedocs.org">Read the Docs</a>.
149 |
150 |
151 | </footer>
152 | </div>
153 | </div>
154 | </section>
155 | </div>
156 | <script>
157 | jQuery(function () {
158 | SphinxRtdTheme.Navigation.enable(true);
159 | });
160 | </script>
161 |
162 | </body>
163 | </html>
```
--------------------------------------------------------------------------------
/prompts/auditfiles.md:
--------------------------------------------------------------------------------
```markdown
1 | # Website File Type Audit Instructions
2 |
3 | ## Query Sequence
4 |
5 | ### 1. Identify Target Domain & Homepage
6 |
7 | **FIRST:** Get available sites and let user choose:
8 | ```
9 | webcrawl_sites() - get all available domains
10 | ```
11 |
12 | **THEN:** Find homepage or directory index with sorted URL approach:
13 | ```
14 | query: type: html AND url: [target_site_domain]
15 | limit: 1
16 | sites: [target_site_id]
17 | sort: +url
18 | ```
19 |
20 | **Extract exact domain** from homepage URL for filtering (e.g., `example.com`)
21 |
22 | ### 2. Core File Type Analysis
23 |
24 | Run separate queries for high-volume file types to get accurate counts and understand scale:
25 |
26 | **HTML Pages:**
27 | ```
28 | query: type: html
29 | limit: 100
30 | sites: [target_site_id]
31 | fields: ["size"]
32 | ```
33 |
34 | **Images:**
35 | ```
36 | query: type: img
37 | limit: 100
38 | sites: [target_site_id]
39 | fields: ["size"]
40 | ```
41 |
42 | **JavaScript Files:**
43 | ```
44 | query: type: script
45 | limit: 100
46 | sites: [target_site_id]
47 | fields: ["size"]
48 | ```
49 |
50 | **CSS Stylesheets:**
51 | ```
52 | query: type: style
53 | limit: 100
54 | sites: [target_site_id]
55 | fields: ["size"]
56 | ```
57 |
58 | ### 3. Specialized File Types
59 |
60 | Combine lower-volume file types in grouped queries:
61 |
62 | **Media & Interactive:**
63 | ```
64 | query: type: audio OR type: video OR type: iframe OR type: font OR type: text OR type: rss OR type: other
65 | limit: 100
66 | sites: [target_site_id]
67 | fields: ["size"]
68 | sort: +id
69 | ```
70 |
71 | ### 4. Internal vs External Asset Analysis
72 |
73 | If any file type shows 100+ results, segment by domain to understand hosting strategy:
74 |
75 | **Internal Assets (same domain):**
76 | ```
77 | query: type: [file_type] AND url: [target_site_domain]
78 | limit: 100
79 | sites: [target_site_id]
80 | fields: ["size"]
81 | ```
82 |
83 | **External Assets (CDNs, third-party):**
84 | ```
85 | query: type: [file_type] AND NOT url: [target_site_domain]
86 | limit: 100
87 | sites: [target_site_id]
88 | fields: ["size"]
89 | ```
90 |
91 | **Apply this segmentation to:** HTML, images, scripts, and styles if they exceed 100 results. Note total result counts, but treat the returned results as a representative sample, since you will not be able to analyze every resource on large sites.
92 |
93 | ### 5. Asset Distribution Mapping
94 |
95 | From the results, extract domain patterns for external assets:
96 | - **CDN domains:** `cdn.`, `static.`, `assets.`, `media.`
97 | - **Third-party services:** Google Fonts, jQuery CDN, analytics
98 | - **Subdomain strategy:** Different subdomains for different asset types
99 |
100 | ### 6. Offer Advanced Analysis or Tool Research
101 |
102 | After completing the main audit report, offer the user two additional options:
103 | - **Detailed Analysis:** More comprehensive investigation of specific file types, asset organization patterns, or optimization opportunities
104 | - **Tool Research:** Research and recommend specific tools to address identified file management and optimization issues
105 |
106 | ## File Type Analysis Framework
107 |
108 | ### HTML Analysis
109 |
110 | **Metrics to extract:**
111 | - **Total pages:** Count of HTML files (use result totals)
112 | - **Segment by directory/path:** Count files by URL segments
113 | - **URL structure patterns:** Directory organization insights
114 |
115 | ### Images Analysis
116 |
117 | **Metrics to extract:**
118 | - **Total images:** Count and estimated storage impact
119 | - **Format distribution:** JPG, PNG, SVG, GIF, WebP usage
120 | - **Hosting strategy:** Self-hosted vs CDN distribution
121 | - **Directory patterns:** `/images/`, `/media/`, organized structure
122 | - **Optimization indicators:** Large files, legacy formats
123 |
124 | ### JavaScript Analysis
125 |
126 | **Metrics to extract:**
127 | - **Script count:** Total JS files and hosting distribution
128 | - **Library identification:** jQuery, React, analytics scripts
129 | - **Bundle strategy:** Many small files vs consolidated bundles
130 | - **Third-party dependencies:** External library usage
131 | - **Performance patterns:** Blocking vs async loading indicators
132 |
133 | ### CSS Architecture Analysis
134 |
135 | **Metrics to extract:**
136 | - **Stylesheet count:** Total CSS files and organization
137 | - **Framework usage:** Bootstrap, Foundation, custom frameworks
138 | - **Asset delivery:** Inline vs external, CDN usage
139 | - **File size distribution:** Large framework files vs custom styles
140 |
141 | ### Media & Interactive Content
142 |
143 | **Metrics to extract:**
144 | - **Video/Audio:** Count, hosting strategy, streaming vs download
145 | - **Fonts:** Font names and combined size (including italic and bold variants)
146 | - **RSS Feeds:** Check for existence
147 |
148 | ## Asset Strategy Analysis
149 |
150 | ### Third-Party CDNs
151 |
152 | **Scope:** External domains (cdnjs, jsdelivr, unpkg, Google)
153 | - External dependency management
154 | - Performance vs reliability trade-offs
155 | - Popular library and framework delivery
156 |
157 | ### Content Distribution Analysis
158 |
159 | - **Asset sizes:** Images, scripts, and styles within reasonable range
160 | - **Asset consolidation score:** How well assets are organized
161 | - **Performance optimization:** CDN usage effectiveness
162 | - **Dependency risk:** External service reliability
163 | - **Maintenance complexity:** Multi-domain asset management
164 |
165 | ## Common File Organization Issues
166 |
167 | ### High Priority Issues
168 |
169 | 1. **Oversized assets:** Images >2MB, JS bundles >500KB, CSS files >200KB
170 | 2. **Legacy format usage:** GIF animations, uncompressed images, outdated JS libraries
171 | 3. **Asset sprawl:** Files scattered across multiple domains without clear strategy
172 | 4. **Missing CDN usage:** Large assets served from main domain affecting performance
173 | 5. **Orphaned files:** Assets not referenced by any HTML pages
174 |
175 | ### Medium Priority Issues
176 |
177 | 1. **Suboptimal file formats:** JPG for graphics, PNG for photos, missing WebP adoption
178 | 2. **Bundle fragmentation:** Many small JS/CSS files instead of optimized bundles
179 | 3. **Mixed hosting strategy:** Inconsistent use of CDNs vs self-hosting
180 | 4. **Outdated dependencies:** Legacy jQuery versions, unused framework components
181 | 5. **Poor directory organization:** Assets without logical folder structure
182 |
183 | ### Low Priority Issues
184 |
185 | 1. **Minor optimization opportunities:** Slightly oversized images, redundant CSS
186 | 2. **Naming convention inconsistencies:** Mixed file naming patterns
187 | 3. **Cache header optimization:** Suboptimal asset caching strategies
188 |
189 | ## Reporting Template
190 |
191 | ### Executive File Type Summary
192 |
193 | | File Type | Internal Count | External Count | Total Count | Primary Hosting | Optimization Status |
194 | |-----------|---------------|----------------|-------------|-----------------|-------------------|
195 | | HTML | X | Y | Z | Main Domain | ✅ Well Organized |
196 | | Images | X | Y | Z | CDN/Mixed | ⚠️ Needs Optimization |
197 | | JavaScript | X | Y | Z | Mixed | ⚠️ Bundle Opportunity |
198 | | CSS | X | Y | Z | Main Domain | ✅ Good Structure |
199 | | Media | X | Y | Z | External | ✅ Proper CDN Use |
200 | | Fonts | X | Y | Z | Google Fonts | ✅ Performance Optimized |
201 | | Other | X | Y | Z | Mixed | ℹ️ Review Needed |
202 |
203 | ### Asset Architecture Health Score
204 |
205 | - **Overall Grade:** [A-F] based on organization and optimization
206 | - **Total Assets:** X files across Y domains
207 | - **Hosting Strategy:** [Optimized | Mixed | Needs Improvement]
208 | - **Performance Impact:** [Low | Medium | High] based on asset distribution
209 |
210 | ### Detailed File Type Analysis
211 |
212 | #### HTML Content Structure
213 |
214 | - **Total Pages:** X HTML files
215 | - **Content Freshness:** Y% updated in last 6 months
216 | - **URL Organization:** [Excellent | Good | Needs Structure]
217 | - **Domain Strategy:** [Single | Multi-subdomain | Complex]
218 |
219 | **Representative URL Patterns:**
220 | - Root pages: `/`, `/about`, `/contact`
221 | - Content sections: `/blog/`, `/products/`, `/docs/`
222 | - Deep content: `/category/subcategory/page/`
223 |
224 | #### Image Asset Distribution
225 |
226 | - **Total Images:** X files (estimated Y MB)
227 | - **Format Breakdown:** Z% JPG, W% PNG, V% SVG
228 | - **Hosting Distribution:** U% internal, T% CDN
229 | - **Optimization Opportunities:** S large files identified
230 |
231 | **Asset Organization Patterns:**
232 | - Well-organized: `/images/category/filename.ext`
233 | - Mixed organization: Various directory structures
234 | - Needs improvement: Files scattered across domains
235 |
236 | #### JavaScript Architecture
237 |
238 | - **Total Scripts:** X files
239 | - **Library Dependencies:** jQuery (Y), React (Z), Analytics (W)
240 | - **Bundle Strategy:** [Optimized | Moderate | Fragmented]
241 | - **Third-party Usage:** V% external dependencies
242 |
243 | **Performance Indicators:**
244 | - Large bundles: Files >100KB identified
245 | - Legacy libraries: Outdated framework versions
246 | - Loading strategy: Async/defer usage analysis
247 |
248 | #### CSS Organization
249 |
250 | - **Total Stylesheets:** X files
251 | - **Framework Usage:** Bootstrap, custom themes identified
252 | - **File Size Distribution:** Largest Y KB, average Z KB
253 | - **Delivery Strategy:** [Optimized | Standard | Needs Work]
254 |
255 | **Architecture Assessment:**
256 | - Modular approach: Well-separated concerns
257 | - Monolithic: Few large files
258 | - Fragmented: Many small files without clear organization
259 |
260 | ### Asset Hosting Strategy Analysis
261 |
262 | #### Domain Performance Matrix
263 |
264 | | Domain Type | Example | Asset Types | Count | Performance Impact | Recommendation |
265 | |-------------|---------|-------------|-------|-------------------|----------------|
266 | | Main Domain | example.com | HTML, some CSS/JS | X | Baseline | Maintain current |
267 | | Asset Subdomain | static.example.com | Images, CSS, JS | Y | Optimized | ✅ Best practice |
268 | | Third-party CDN | cdnjs.cloudflare.com | Libraries | Z | Fast but dependent | Monitor reliability |
269 | | External Services | fonts.googleapis.com | Web fonts | W | Good performance | ✅ Appropriate use |
270 |
271 | #### Priority Matrix
272 |
273 | 1. **Critical (Fix Immediately):** Oversized assets affecting performance, missing critical files
274 | 2. **High (Fix This Sprint):** Legacy formats, asset sprawl, poor CDN utilization
275 | 3. **Medium (Next Quarter):** Bundle optimization, directory organization, format modernization
276 | 4. **Low (Backlog):** Minor optimizations, naming conventions, cache tuning
277 |
278 | ## What's Next?
279 |
280 | Your asset audit reveals optimization opportunities across performance, organization, and maintenance. The biggest wins typically come from addressing oversized assets (images >2MB, JS >500KB), implementing CDN strategies, and consolidating fragmented bundles.
281 |
282 | **Ready to optimize?** I can help you:
283 | - **Prioritize critical fixes** - Focus on your highest-impact performance bottlenecks with specific implementation strategies and expected performance gains
284 | - **Research optimization tools** - Find monitoring, bundling, and CDN solutions that fit your development workflow and technical constraints
285 | - **Plan architecture improvements** - Design sustainable asset organization and delivery strategies for long-term maintainability
286 |
287 | **What would be most helpful for your next steps?**
288 |
289 | ## Methodology
290 |
291 | You will review this web project from the perspective of an accomplished but patient web developer. You've seen it all over the years, and have reasonable expectations of quality. At the same time, you have a fondness for anyone wanting to improve the web at all. It's a noble pursuit that you can encourage without being overbearing. Nobody wants a scolding or patronizing AI. It's a fine line to walk, but you somehow manage it well. As these "reviews" can be hard to hear, you will break the news gently, but firmly, when things are out of whack.
292 | 
293 | Where you have tabular data, you aren't afraid to arrange it in an aesthetically pleasing manner. You will prefer tables over unordered lists. Yes, the critical errors will harsh the buzz, but the aesthetic choices make it feel like it'll be alright with some elbow grease.
```
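The grouped and internal/external queries in steps 3 and 4 of this prompt follow a mechanical pattern, so they can be generated rather than hand-written. A minimal sketch follows; the helper names are illustrative and not part of this repository, and only the query syntax (`type:`, `url:`, AND/OR/NOT) comes from the prompt and the search grammar.

```python
# Sketch only: assembling the query strings described in steps 3-4 of the
# audit prompt above. Helper names are illustrative; only the query syntax
# (type:, url:, AND/OR/NOT) comes from the prompt and search grammar.

GROUPED_TYPES = ["audio", "video", "iframe", "font", "text", "rss", "other"]

def grouped_media_query() -> str:
    """OR the lower-volume types together ('Specialized File Types')."""
    return " OR ".join(f"type: {t}" for t in GROUPED_TYPES)

def segmentation_queries(file_type: str, domain: str) -> dict[str, str]:
    """Internal vs external split for a high-volume type (step 4)."""
    return {
        "internal": f"type: {file_type} AND url: {domain}",
        "external": f"type: {file_type} AND NOT url: {domain}",
    }

if __name__ == "__main__":
    print(grouped_media_query())
    for label, query in segmentation_queries("img", "example.com").items():
        print(f"{label}: {query}")
```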
--------------------------------------------------------------------------------
/src/mcp_server_webcrawl/utils/tests.py:
--------------------------------------------------------------------------------
```python
1 | import unittest
2 | from mcp_server_webcrawl.utils.search import SearchQueryParser, SearchSubquery
3 |
4 | class TestSearchQueryParser(unittest.TestCase):
5 |
6 | def setUp(self):
7 | """
8 | Set up a parser instance for each test
9 | """
10 | self.parser = SearchQueryParser()
11 |
12 | def test_simple_term(self):
13 | """
14 | Simple single term search
15 | """
16 | query = "hello"
17 | result: list[SearchSubquery] = self.parser.parse(query)
18 | self.assertEqual(len(result), 1)
19 | self.assertEqual(result[0].field, None)
20 | self.assertEqual(result[0].value, "hello")
21 | self.assertEqual(result[0].type, "term")
22 | self.assertEqual(result[0].operator, None)
23 |
24 | def test_quoted_phrase(self):
25 | """
26 | Quoted phrase search
27 | """
28 | query = '"hello world"'
29 | result: list[SearchSubquery] = self.parser.parse(query)
30 | self.assertEqual(len(result), 1)
31 | self.assertEqual(result[0].field, None)
32 | self.assertEqual(result[0].value, "hello world")
33 | self.assertEqual(result[0].type, "phrase")
34 |
35 | def test_wildcard_term(self):
36 | """
37 | Wildcard term search
38 | """
39 | query = "search*"
40 | result: list[SearchSubquery] = self.parser.parse(query)
41 | self.assertEqual(len(result), 1)
42 | self.assertEqual(result[0].field, None)
43 | self.assertEqual(result[0].value, "search")
44 | self.assertEqual(result[0].type, "wildcard")
45 |
46 | def test_field_term(self):
47 | """
48 | Field-specific term search
49 | """
50 | query = "url:example.com"
51 | result: list[SearchSubquery] = self.parser.parse(query)
52 | self.assertEqual(len(result), 1)
53 | self.assertEqual(result[0].field, "url")
54 | self.assertEqual(result[0].value, "example.com")
55 | self.assertEqual(result[0].type, "term")
56 |
57 | def test_field_numeric(self):
58 | """
59 | Field with numeric value
60 | """
61 | query = "status:404"
62 | result: list[SearchSubquery] = self.parser.parse(query)
63 | self.assertEqual(len(result), 1)
64 | self.assertEqual(result[0].field, "status")
65 | self.assertEqual(result[0].value, 404)
66 | self.assertEqual(result[0].type, "term")
67 |
68 | def test_field_quoted(self):
69 | """
70 | Field with quoted value
71 | """
72 | query = 'content:"hello world"'
73 | result: list[SearchSubquery] = self.parser.parse(query)
74 | self.assertEqual(len(result), 1)
75 | self.assertEqual(result[0].field, "content")
76 | self.assertEqual(result[0].value, "hello world")
77 | self.assertEqual(result[0].type, "phrase")
78 |
79 | def test_field_wildcard(self):
80 | """
81 | Field with wildcard value
82 | """
83 | query = "url:example*"
84 | result: list[SearchSubquery] = self.parser.parse(query)
85 | self.assertEqual(len(result), 1)
86 | self.assertEqual(result[0].field, "url")
87 | self.assertEqual(result[0].value, "example")
88 | self.assertEqual(result[0].type, "wildcard")
89 |
90 | def test_simple_and(self):
91 | """
92 | Simple AND query
93 | """
94 | query = "hello AND world"
95 | result: list[SearchSubquery] = self.parser.parse(query)
96 | self.assertEqual(len(result), 2)
97 | self.assertEqual(result[0].value, "hello")
98 | self.assertEqual(result[0].operator, "AND")
99 | self.assertEqual(result[1].value, "world")
100 | self.assertEqual(result[1].operator, None)
101 |
102 | def test_simple_or(self):
103 | """
104 | Simple OR query
105 | """
106 | query = "hello OR world"
107 | result: list[SearchSubquery] = self.parser.parse(query)
108 | self.assertEqual(len(result), 2)
109 | self.assertEqual(result[0].value, "hello")
110 | self.assertEqual(result[0].operator, "OR")
111 | self.assertEqual(result[1].value, "world")
112 | self.assertEqual(result[1].operator, None)
113 |
114 | def test_simple_not(self):
115 | """
116 | Simple NOT query
117 | """
118 | query = "NOT hello"
119 | result: list[SearchSubquery] = self.parser.parse(query)
120 | self.assertEqual(len(result), 1)
121 | self.assertEqual(result[0].value, "hello")
122 | self.assertTrue('NOT' in result[0].modifiers)
123 |
124 | def test_and_with_fields(self):
125 | """
126 | AND with field specifiers
127 | """
128 | query = "content:hello AND url:example.com"
129 | result: list[SearchSubquery] = self.parser.parse(query)
130 | self.assertEqual(len(result), 2)
131 | self.assertEqual(result[0].field, "content")
132 | self.assertEqual(result[0].operator, "AND")
133 | self.assertEqual(result[1].field, "url")
134 |
135 | def test_or_with_fields(self):
136 | """
137 | OR with field specifiers
138 | """
139 | query = "status:404 OR status:500"
140 | result: list[SearchSubquery] = self.parser.parse(query)
141 | self.assertEqual(len(result), 2)
142 | self.assertEqual(result[0].field, "status")
143 | self.assertEqual(result[0].value, 404)
144 | self.assertEqual(result[0].operator, "OR")
145 | self.assertEqual(result[1].field, "status")
146 | self.assertEqual(result[1].value, 500)
147 |
148 | def test_not_with_field(self):
149 | """
150 | NOT with field specifier
151 | """
152 | query = "NOT status:404"
153 | result: list[SearchSubquery] = self.parser.parse(query)
154 | self.assertEqual(len(result), 1)
155 | self.assertEqual(result[0].field, "status")
156 | self.assertEqual(result[0].value, 404)
157 | self.assertTrue('NOT' in result[0].modifiers)
158 |
159 | def test_simple_parentheses(self):
160 | """
161 | Simple expression with parentheses
162 | """
163 | query = "(hello AND world)"
164 | result: list[SearchSubquery] = self.parser.parse(query)
165 | self.assertEqual(len(result), 2)
166 | self.assertEqual(result[0].value, "hello")
167 | self.assertEqual(result[0].operator, "AND")
168 | self.assertEqual(result[1].value, "world")
169 | self.assertEqual(result[1].operator, None)
170 |
171 | def test_complex_parentheses(self):
172 | """
173 | Complex expression with nested parentheses
174 | """
175 | query = "(hello AND (world OR planet))"
176 | result: list[SearchSubquery] = self.parser.parse(query)
177 | self.assertEqual(len(result), 3)
178 | self.assertEqual(result[0].value, "hello")
179 | self.assertEqual(result[0].operator, "AND")
180 | self.assertEqual(result[1].value, "world")
181 | self.assertEqual(result[1].operator, "OR")
182 | self.assertEqual(result[2].value, "planet")
183 | self.assertEqual(result[2].operator, None)
184 |
185 | def test_mixed_operators(self):
186 | """
187 | Query with mixed operators
188 | """
189 | query = "hello AND world OR planet"
190 | result: list[SearchSubquery] = self.parser.parse(query)
191 | self.assertEqual(len(result), 3)
192 | self.assertEqual(result[0].value, "hello")
193 | self.assertEqual(result[0].operator, "AND")
194 | self.assertEqual(result[1].value, "world")
195 | self.assertEqual(result[1].operator, "OR")
196 | self.assertEqual(result[2].value, "planet")
197 | self.assertEqual(result[2].operator, None)
198 |
199 | def test_mixed_with_parentheses(self):
200 | """
201 | Mixed operators with parentheses for precedence
202 | """
203 | query = "hello AND (world OR planet)"
204 | result: list[SearchSubquery] = self.parser.parse(query)
205 | self.assertEqual(len(result), 3)
206 | self.assertEqual(result[0].value, "hello")
207 | self.assertEqual(result[0].operator, "AND")
208 | self.assertEqual(result[1].value, "world")
209 | self.assertEqual(result[1].operator, "OR")
210 | self.assertEqual(result[2].value, "planet")
211 | self.assertEqual(result[2].operator, None)
212 |
213 | def test_complex_nested_query(self):
214 | """
215 | Complex nested query with multiple operators
216 | """
217 | query = '(content:"error message" AND (status:404 OR status:500)) AND NOT url:example.com'
218 | result: list[SearchSubquery] = self.parser.parse(query)
219 | self.assertEqual(len(result), 4)
220 | self.assertEqual(result[0].field, "content")
221 | self.assertEqual(result[0].value, "error message")
222 | self.assertEqual(result[0].operator, "AND")
223 | self.assertEqual(result[1].field, "status")
224 | self.assertEqual(result[1].value, 404)
225 | self.assertEqual(result[1].operator, "OR")
226 | self.assertEqual(result[2].field, "status")
227 | self.assertEqual(result[2].value, 500)
228 | self.assertEqual(result[2].operator, "NOT")
229 | self.assertEqual(result[3].field, "url")
230 | self.assertEqual(result[3].value, "example.com")
231 | self.assertEqual(result[3].operator, None)
232 |
233 | def test_all_features_combined(self):
234 | """
235 | Comprehensive test with all features combined
236 | """
237 | query = 'content:"critical error" AND (status:500 OR type:html) AND NOT url:example* AND size:1024'
238 | result: list[SearchSubquery] = self.parser.parse(query)
239 | self.assertEqual(len(result), 5)
240 | self.assertEqual(result[0].field, "content")
241 | self.assertEqual(result[0].value, "critical error")
242 | self.assertEqual(result[0].type, "phrase")
243 | self.assertEqual(result[0].operator, "AND")
244 | self.assertEqual(result[1].field, "status")
245 | self.assertEqual(result[1].value, 500)
246 | self.assertEqual(result[1].operator, "OR")
247 | self.assertEqual(result[2].field, "type")
248 | self.assertEqual(result[2].value, "html")
249 | self.assertEqual(result[2].operator, "AND")
250 | self.assertEqual(result[3].field, "url")
251 | self.assertEqual(result[3].value, "example")
252 | self.assertEqual(result[3].type, "wildcard")
253 | self.assertTrue('NOT' in result[3].modifiers)
254 | self.assertEqual(result[3].operator, "AND")
255 | self.assertEqual(result[4].field, "size")
256 | self.assertEqual(result[4].value, 1024)
257 | self.assertEqual(result[4].operator, None)
258 |
259 | def test_to_sqlite_fts(self):
260 | """
261 | Test conversion to SQLite FTS format
262 | """
263 | query = 'content:"error" AND status:404'
264 | result: list[SearchSubquery] = self.parser.parse(query)
265 |
266 | query_parts, params = self.parser.to_sqlite_fts(result)
267 |
268 | self.assertEqual(len(query_parts), 3)
269 | self.assertEqual(query_parts[0], "ResourcesFullText.Content MATCH :query0")
270 | self.assertEqual(query_parts[1], "AND")
271 | self.assertEqual(query_parts[2], "Resources.Status = :query1")
272 |
273 | self.assertEqual(len(params), 2)
274 | self.assertEqual(params["query0"], '"error"')
275 | self.assertEqual(params["query1"], 404)
276 |
277 | def test_operator_assignment_bug(self):
278 | """
279 | Test that exposes the double operator assignment bug.
280 | Query: "term1 AND term2 OR term3" should create:
281 | [term1(op=AND), term2(op=OR), term3(op=None)]
282 |
283 | Were the bug present, term3 would incorrectly get operator == OR
284 | """
285 | from mcp_server_webcrawl.utils.parser import SearchLexer, SearchParser
286 |
287 | lexer = SearchLexer()
288 | parser = SearchParser(lexer)
289 | query = "term1 AND term2 OR term3"
290 | result = parser.parser.parse(query, lexer=lexer.lexer)
291 | self.assertEqual(len(result), 3)
292 | self.assertEqual(result[0].value, "term1")
293 | self.assertEqual(result[0].operator, "AND")
294 | self.assertEqual(result[1].value, "term2")
295 | self.assertEqual(result[1].operator, "OR")
296 | self.assertEqual(result[2].value, "term3")
297 | self.assertEqual(result[2].operator, None)
298 |
```
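Read together, these tests double as usage documentation. The sketch below replays two of them outside unittest; the printed values are the ones the tests above expect, assuming the package is importable in your environment.

```python
# Sketch: SearchQueryParser usage, mirroring test_field_numeric and
# test_to_sqlite_fts above. Run with the package installed.
from mcp_server_webcrawl.utils.search import SearchQueryParser

parser = SearchQueryParser()

# Numeric field values are parsed to int.
subqueries = parser.parse("status:404")
assert subqueries[0].field == "status" and subqueries[0].value == 404

# Boolean query converted to SQLite FTS clause fragments plus bound params.
subqueries = parser.parse('content:"error" AND status:404')
query_parts, params = parser.to_sqlite_fts(subqueries)
print(query_parts)  # ['ResourcesFullText.Content MATCH :query0', 'AND', 'Resources.Status = :query1']
print(params)       # {'query0': '"error"', 'query1': 404}
```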
--------------------------------------------------------------------------------
/docs/_modules/mcp_server_webcrawl/crawlers.html:
--------------------------------------------------------------------------------
```html
1 |
2 |
3 | <!DOCTYPE html>
4 | <html class="writer-html5" lang="en" data-content_root="../../">
5 | <head>
6 | <meta charset="utf-8" />
7 | <meta name="viewport" content="width=device-width, initial-scale=1.0" />
8 | <title>mcp_server_webcrawl.crawlers — mcp-server-webcrawl documentation</title>
9 | <link rel="stylesheet" type="text/css" href="../../_static/pygments.css?v=80d5e7a1" />
10 | <link rel="stylesheet" type="text/css" href="../../_static/css/theme.css?v=e59714d7" />
11 |
12 |
13 | <script src="../../_static/jquery.js?v=5d32c60e"></script>
14 | <script src="../../_static/_sphinx_javascript_frameworks_compat.js?v=2cd50e6c"></script>
15 | <script src="../../_static/documentation_options.js?v=5929fcd5"></script>
16 | <script src="../../_static/doctools.js?v=888ff710"></script>
17 | <script src="../../_static/sphinx_highlight.js?v=dc90522c"></script>
18 | <script src="../../_static/js/theme.js"></script>
19 | <link rel="index" title="Index" href="../../genindex.html" />
20 | <link rel="search" title="Search" href="../../search.html" />
21 | </head>
22 |
23 | <body class="wy-body-for-nav">
24 | <div class="wy-grid-for-nav">
25 | <nav data-toggle="wy-nav-shift" class="wy-nav-side">
26 | <div class="wy-side-scroll">
27 | <div class="wy-side-nav-search" >
28 |
29 |
30 |
31 | <a href="../../index.html" class="icon icon-home">
32 | mcp-server-webcrawl
33 | </a>
34 | <div role="search">
35 | <form id="rtd-search-form" class="wy-form" action="../../search.html" method="get">
36 | <input type="text" name="q" placeholder="Search docs" aria-label="Search docs" />
37 | <input type="hidden" name="check_keywords" value="yes" />
38 | <input type="hidden" name="area" value="default" />
39 | </form>
40 | </div>
41 | </div><div class="wy-menu wy-menu-vertical" data-spy="affix" role="navigation" aria-label="Navigation menu">
42 | <p class="caption" role="heading"><span class="caption-text">Contents:</span></p>
43 | <ul>
44 | <li class="toctree-l1"><a class="reference internal" href="../../installation.html">Installation</a></li>
45 | <li class="toctree-l1"><a class="reference internal" href="../../guides.html">Setup Guides</a></li>
46 | <li class="toctree-l1"><a class="reference internal" href="../../usage.html">Usage</a></li>
47 | <li class="toctree-l1"><a class="reference internal" href="../../prompts.html">Prompt Routines</a></li>
48 | <li class="toctree-l1"><a class="reference internal" href="../../interactive.html">Interactive Mode</a></li>
49 | <li class="toctree-l1"><a class="reference internal" href="../../modules.html">mcp_server_webcrawl</a></li>
50 | </ul>
51 |
52 | </div>
53 | </div>
54 | </nav>
55 |
56 | <section data-toggle="wy-nav-shift" class="wy-nav-content-wrap"><nav class="wy-nav-top" aria-label="Mobile navigation menu" >
57 | <i data-toggle="wy-nav-top" class="fa fa-bars"></i>
58 | <a href="../../index.html">mcp-server-webcrawl</a>
59 | </nav>
60 |
61 | <div class="wy-nav-content">
62 | <div class="rst-content">
63 | <div role="navigation" aria-label="Page navigation">
64 | <ul class="wy-breadcrumbs">
65 | <li><a href="../../index.html" class="icon icon-home" aria-label="Home"></a></li>
66 | <li class="breadcrumb-item"><a href="../index.html">Module code</a></li>
67 | <li class="breadcrumb-item active">mcp_server_webcrawl.crawlers</li>
68 | <li class="wy-breadcrumbs-aside">
69 | </li>
70 | </ul>
71 | <hr/>
72 | </div>
73 | <div role="main" class="document" itemscope="itemscope" itemtype="http://schema.org/Article">
74 | <div itemprop="articleBody">
75 |
76 | <h1>Source code for mcp_server_webcrawl.crawlers</h1><div class="highlight"><pre>
77 | <span></span>
78 | <span class="kn">import</span> <span class="nn">sys</span>
79 | <span class="kn">from</span> <span class="nn">pathlib</span> <span class="kn">import</span> <span class="n">Path</span>
80 | <span class="kn">from</span> <span class="nn">mcp_server_webcrawl.settings</span> <span class="kn">import</span> <span class="n">FIXTURES_DIRECTORY</span>
81 |
82 | <span class="n">VALID_CRAWLER_CHOICES</span><span class="p">:</span> <span class="nb">list</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="p">[</span><span class="s2">"archivebox"</span><span class="p">,</span> <span class="s2">"httrack"</span><span class="p">,</span> <span class="s2">"interrobot"</span><span class="p">,</span> <span class="s2">"katana"</span><span class="p">,</span> <span class="s2">"siteone"</span><span class="p">,</span> <span class="s2">"warc"</span><span class="p">,</span> <span class="s2">"wget"</span><span class="p">]</span>
83 |
84 | <div class="viewcode-block" id="get_fixture_directory">
85 | <a class="viewcode-back" href="../../mcp_server_webcrawl.crawlers.html#mcp_server_webcrawl.crawlers.get_fixture_directory">[docs]</a>
86 | <span class="k">def</span> <span class="nf">get_fixture_directory</span><span class="p">()</span> <span class="o">-></span> <span class="n">Path</span><span class="p">:</span>
87 | <span class="c1"># only to be used for devs on test runs, configured in settings_local.py</span>
88 | <span class="c1"># settings_local.py added as sibling of settings.py if not present</span>
89 | <span class="c1"># download https://github.com/pragmar/mcp-server-webcrawl-fixtures</span>
90 | <span class="k">assert</span> <span class="n">FIXTURES_DIRECTORY</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span> <span class="ow">and</span> <span class="n">FIXTURES_DIRECTORY</span><span class="o">.</span><span class="n">is_dir</span><span class="p">(),</span> \
91 | <span class="sa">f</span><span class="s2">"Fixtures not configured in settings_local.py, or is not a valid directory.</span><span class="se">\n</span><span class="s2">FIXTURES_DIRECTORY: </span><span class="si">{</span><span class="n">FIXTURES_DIRECTORY</span><span class="si">}</span><span class="s2">"</span>
92 |
93 | <span class="k">return</span> <span class="n">FIXTURES_DIRECTORY</span></div>
94 |
95 |
96 | <div class="viewcode-block" id="get_crawler">
97 | <a class="viewcode-back" href="../../mcp_server_webcrawl.crawlers.html#mcp_server_webcrawl.crawlers.get_crawler">[docs]</a>
98 | <span class="k">def</span> <span class="nf">get_crawler</span><span class="p">(</span><span class="n">crawler_name</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-></span> <span class="nb">str</span> <span class="o">|</span> <span class="kc">None</span><span class="p">:</span>
99 | <span class="w"> </span><span class="sd">"""</span>
100 | <span class="sd"> lazy load crawler, some classes have additional package dependencies</span>
101 | <span class="sd"> """</span>
102 | <span class="k">if</span> <span class="n">crawler_name</span> <span class="ow">is</span> <span class="kc">None</span><span class="p">:</span>
103 | <span class="k">return</span> <span class="kc">None</span>
104 | <span class="n">crawler_name</span> <span class="o">=</span> <span class="n">crawler_name</span><span class="o">.</span><span class="n">lower</span><span class="p">()</span>
105 | <span class="k">if</span> <span class="n">crawler_name</span> <span class="o">==</span> <span class="s2">"archivebox"</span><span class="p">:</span>
106 | <span class="kn">from</span> <span class="nn">mcp_server_webcrawl.crawlers.archivebox.crawler</span> <span class="kn">import</span> <span class="n">ArchiveBoxCrawler</span>
107 | <span class="k">return</span> <span class="n">ArchiveBoxCrawler</span>
108 | <span class="k">elif</span> <span class="n">crawler_name</span> <span class="o">==</span> <span class="s2">"httrack"</span><span class="p">:</span>
109 | <span class="kn">from</span> <span class="nn">mcp_server_webcrawl.crawlers.httrack.crawler</span> <span class="kn">import</span> <span class="n">HtTrackCrawler</span>
110 | <span class="k">return</span> <span class="n">HtTrackCrawler</span>
111 | <span class="k">elif</span> <span class="n">crawler_name</span> <span class="o">==</span> <span class="s2">"interrobot"</span><span class="p">:</span>
112 | <span class="kn">from</span> <span class="nn">mcp_server_webcrawl.crawlers.interrobot.crawler</span> <span class="kn">import</span> <span class="n">InterroBotCrawler</span>
113 | <span class="k">return</span> <span class="n">InterroBotCrawler</span>
114 | <span class="k">elif</span> <span class="n">crawler_name</span> <span class="o">==</span> <span class="s2">"katana"</span><span class="p">:</span>
115 | <span class="kn">from</span> <span class="nn">mcp_server_webcrawl.crawlers.katana.crawler</span> <span class="kn">import</span> <span class="n">KatanaCrawler</span>
116 | <span class="k">return</span> <span class="n">KatanaCrawler</span>
117 | <span class="k">elif</span> <span class="n">crawler_name</span> <span class="o">==</span> <span class="s2">"siteone"</span><span class="p">:</span>
118 | <span class="kn">from</span> <span class="nn">mcp_server_webcrawl.crawlers.siteone.crawler</span> <span class="kn">import</span> <span class="n">SiteOneCrawler</span>
119 | <span class="k">return</span> <span class="n">SiteOneCrawler</span>
120 | <span class="k">elif</span> <span class="n">crawler_name</span> <span class="o">==</span> <span class="s2">"warc"</span><span class="p">:</span>
121 | <span class="kn">from</span> <span class="nn">mcp_server_webcrawl.crawlers.warc.crawler</span> <span class="kn">import</span> <span class="n">WarcCrawler</span>
122 | <span class="k">return</span> <span class="n">WarcCrawler</span>
123 | <span class="k">elif</span> <span class="n">crawler_name</span> <span class="o">==</span> <span class="s2">"wget"</span><span class="p">:</span>
124 | <span class="kn">from</span> <span class="nn">mcp_server_webcrawl.crawlers.wget.crawler</span> <span class="kn">import</span> <span class="n">WgetCrawler</span>
125 | <span class="k">return</span> <span class="n">WgetCrawler</span>
126 | <span class="k">else</span><span class="p">:</span>
127 | <span class="n">valid_choices</span> <span class="o">=</span> <span class="s2">", "</span><span class="o">.</span><span class="n">join</span><span class="p">(</span><span class="n">VALID_CRAWLER_CHOICES</span><span class="p">)</span>
128 | <span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span><span class="sa">f</span><span class="s2">"unsupported crawler '</span><span class="si">{</span><span class="n">crawler_name</span><span class="si">}</span><span class="s2">' (</span><span class="si">{</span><span class="n">valid_choices</span><span class="si">}</span><span class="s2">)"</span><span class="p">)</span></div>
129 |
130 | </pre></div>
131 |
132 | </div>
133 | </div>
134 | <footer>
135 |
136 | <hr/>
137 |
138 | <div role="contentinfo">
139 | <p>© Copyright 2025, pragmar.</p>
140 | </div>
141 |
142 | Built with <a href="https://www.sphinx-doc.org/">Sphinx</a> using a
143 | <a href="https://github.com/readthedocs/sphinx_rtd_theme">theme</a>
144 | provided by <a href="https://readthedocs.org">Read the Docs</a>.
145 |
146 |
147 | </footer>
148 | </div>
149 | </div>
150 | </section>
151 | </div>
152 | <script>
153 | jQuery(function () {
154 | SphinxRtdTheme.Navigation.enable(true);
155 | });
156 | </script>
157 |
158 | </body>
159 | </html>
```
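The module rendered above exposes a lazy-loading crawler factory. A short usage sketch, assuming the package is installed, is below; note the source annotates `get_crawler` as returning `str | None` even though it returns a crawler class, so the sketch simply treats the result as a class.

```python
# Sketch: using the lazy crawler factory from mcp_server_webcrawl.crawlers,
# as rendered in the documentation page above.
from mcp_server_webcrawl.crawlers import VALID_CRAWLER_CHOICES, get_crawler

print(VALID_CRAWLER_CHOICES)         # ['archivebox', 'httrack', 'interrobot', 'katana', 'siteone', 'warc', 'wget']

crawler_class = get_crawler("wget")  # WgetCrawler is imported only on request
print(crawler_class.__name__)

try:
    get_crawler("unknown")
except ValueError as error:          # unsupported names raise, listing valid choices
    print(error)
```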
--------------------------------------------------------------------------------
/prompts/auditperf.md:
--------------------------------------------------------------------------------
```markdown
1 | # Web Performance Detective
2 |
3 | ## Query Sequence
4 |
5 | ### 1. Identify Target Domain & Homepage
6 |
7 | **FIRST:** Get available sites and let user choose:
8 | ```
9 | webcrawl_sites() - get all available domains
10 | ```
11 |
12 | **THEN:** Find homepage with sorted URL approach:
13 | ```
14 | query: type: html AND url: [target_site_domain]
15 | limit: 1
16 | sites: [target_site_id]
17 | fields: ["content"]
18 | sort: +url
19 | ```
20 |
21 | **Extract exact domain** from homepage URL for filtering (e.g., `example.com`)
22 |
23 | **Parse homepage for embedded assets:**
24 | - `<style>` blocks (inline CSS)
25 | - `<script>` blocks (inline JS)
26 | - `<link rel="stylesheet">` references
27 | - `<script src="">` references
28 | - CSS `@import` statements
29 | - Performance-critical patterns in HTML
30 |
31 | ### 2. Site-Wide CSS Analysis
32 |
33 | **Primary CSS Query:**
34 | ```
35 | query: type: style AND url: [target_site_domain]
36 | limit: 100
37 | sites: [target_site_id]
38 | fields: []
39 | sort: +url
40 | ```
41 |
42 | **If 0 results, try Asset Domain Discovery:**
43 | ```
44 | query: type: style
45 | limit: 100
46 | sites: [target_site_id]
47 | fields: []
48 | sort: +url
49 | ```
50 |
51 | **Then filter results for common asset domain patterns:**
52 | - `static.[domain]` (static.nasa.gov)
53 | - `cdn.[domain]` (cdn.nasa.gov)
54 | - `assets.[domain]` (assets.nasa.gov)
55 | - `media.[domain]` (media.nasa.gov)
56 | - `img.[domain]` or `images.[domain]`
57 | - `js.[domain]` or `css.[domain]`
58 | - `files.[domain]` or `downloads.[domain]`
59 | - Third-party CDNs (cdnjs.cloudflare.com, jsdelivr.net, unpkg.com)
60 |
61 | **Asset Domain Detection Logic:**
62 | 1. Extract all unique domains from CSS file URLs
63 | 2. Score domains by asset-hosting likelihood:
64 | - Contains "static", "cdn", "assets", "media" = High
65 | - Subdomain of main site = Medium
66 | - Third-party CDN = Medium
67 | - Same as main domain = Low (already checked)
68 | 3. Use highest-scoring domain as asset domain
69 |
70 | **Analyze each CSS file for:**
71 | - `@import` usage (render-blocking)
72 | - `!important` overuse (specificity issues)
73 | - Unused vendor prefixes
74 | - Large file sizes
75 | - Duplicate rules across files
76 |
77 | ### 3. Site-Wide JavaScript Analysis
78 |
79 | **Primary JS Query:**
80 | ```
81 | query: type: script AND url: [target_site_domain]
82 | limit: 100
83 | sites: [target_site_id]
84 | fields: []
85 | sort: +url
86 | ```
87 |
88 | **If 0 results, use discovered asset domain:**
89 | ```
90 | query: type: script AND url: [discovered_asset_domain]
91 | limit: 100
92 | sites: [target_site_id]
93 | fields: []
94 | sort: +url
95 | ```
96 |
97 | **If still 0 results, broad asset discovery:**
98 | ```
99 | query: type: script
100 | limit: 100
101 | sites: [target_site_id]
102 | fields: []
103 | sort: +url
104 | ```
105 |
106 | **Then apply same domain filtering logic as CSS**
107 |
108 | **Analyze each JS file for:**
109 | - `document.getElementById` (inefficient DOM queries)
110 | - jQuery usage patterns
111 | - Blocking script patterns
112 | - Large library imports
113 | - Performance anti-patterns
114 |
115 | ### 4. Cross-Reference & Dependency Mapping
116 |
117 | **Find render-blocking resources:**
118 | ```
119 | query: type: html AND (content: "<link rel=\"stylesheet\"" OR content: "<script src=") AND url: [target_site_domain]
120 | limit: 50
121 | sites: [target_site_id]
122 | fields: []
123 | sort: +url
124 | ```
125 |
126 | **Map critical rendering path dependencies**
127 |
128 | ### 5. Offer Advanced Analysis or Tool Research
129 |
130 | After completing the main audit report, offer the user two additional options:
131 | - **Detailed Analysis:** More comprehensive investigation of specific performance bottlenecks, asset optimization opportunities, or advanced performance patterns
132 | - **Tool Research:** Research and recommend specific tools to address identified performance issues and implement monitoring solutions
133 |
134 | ## Performance Anti-Pattern Detection
135 |
136 | ### CSS Performance Issues
137 |
138 | #### Critical Render-Blocking Patterns
139 |
140 | - **@import usage:** Delays CSS parsing
141 | - **Excessive !important:** Indicates poor CSS architecture
142 | - **Large CSS files:** >100KB uncompressed
143 | - **Unused CSS:** High selector count vs actual usage
144 | - **CSS-in-JS:** React/Vue component styles
145 |
146 | #### Specificity & Maintainability Issues
147 |
148 | - **ID selectors overuse:** High specificity conflicts
149 | - **Deep nesting:** >4 levels indicates complexity
150 | - **Vendor prefix bloat:** Outdated browser support
151 | - **Duplicate declarations:** Maintenance overhead
152 |
153 | ### JavaScript Performance Issues
154 |
155 | #### DOM Manipulation Anti-Patterns
156 |
157 | - **document.getElementById in loops:** Performance killer
158 | - **jQuery chaining abuse:** Potential memory leaks
159 | - **No event delegation:** Too many event listeners
160 | - **Synchronous AJAX:** Blocking user interaction
161 |
162 | #### Loading & Execution Issues
163 |
164 | - **Blocking scripts in `<head>`:** Delays page rendering
165 | - **Large library imports:** jQuery, Lodash entire libraries
166 | - **Polyfill overuse:** Unnecessary for modern browsers
167 | - **No async/defer attributes:** Blocking HTML parsing
168 |
169 | ## Asset Segmentation Strategy
170 |
171 | ### Asset Domain Classification
172 |
173 | #### Main Domain Assets
174 |
175 | **Scope:** `[target_site_domain]` - same domain as website
176 | - Self-hosted assets on primary domain
177 | - Often includes basic CSS/JS for small sites
178 |
179 | #### Asset Domain Assets
180 |
181 | **Scope:** `static.[domain]`, `cdn.[domain]`, `assets.[domain]`, etc.
182 | - Dedicated asset subdomains for performance
183 | - Usually contains bulk of CSS/JS files
184 | - Better caching and CDN optimization
185 |
186 | #### Third-Party Assets
187 |
188 | **Scope:** External CDNs and services
189 | - `cdnjs.cloudflare.com`, `jsdelivr.net`, `unpkg.com`
190 | - Google Fonts, jQuery CDN, Bootstrap CDN
191 | - Analytics, tracking, and widget scripts
192 |
193 | #### Asset Discovery Strategy
194 |
195 | 1. **Primary search:** Target main domain first
196 | 2. **Asset domain discovery:** If 0 results, scan all domains for asset patterns
197 | 3. **Domain scoring:** Rank by likelihood of hosting assets
198 | 4. **Fallback analysis:** Use highest-scoring asset domain
199 |
200 | ### Homepage-Specific Assets
201 |
202 | **Scope:** Assets only loaded on homepage
203 | - **Inline styles:** `<style>` blocks in homepage HTML
204 | - **Inline scripts:** `<script>` blocks in homepage HTML
205 | - **Homepage-only CSS:** Files referenced only by homepage
206 | - **Homepage-only JS:** Files referenced only by homepage
207 |
208 | **Analysis Focus:**
209 | - Critical CSS identification
210 | - Above-the-fold optimization
211 | - Homepage-specific performance bottlenecks
212 |
213 | ### Site-Global Assets
214 |
215 | **Scope:** Assets loaded across multiple pages (any domain)
216 | - **Global stylesheets:** Referenced by >1 page
217 | - **Framework CSS:** Bootstrap, Foundation, custom frameworks
218 | - **Global JavaScript:** Site-wide functionality
219 | - **Third-party libraries:** Analytics, tracking, widgets
220 |
221 | **Analysis Focus:**
222 | - Caching optimization opportunities
223 | - Bundle size optimization
224 | - Progressive loading strategies
225 |
226 | ### Page-Type Specific Assets
227 |
228 | **Scope:** Assets for specific page categories
229 | - **Blog-specific:** Article styling, commenting systems
230 | - **Gallery-specific:** Image viewers, slideshow libraries
231 | - **Form-specific:** Validation libraries, UI components
232 |
233 | ## Common Performance Issues
234 |
235 | ### High Priority Issues
236 |
237 | 1. **Render-blocking CSS/JS:** Assets that delay initial page rendering
238 | 2. **@import usage:** CSS imports that create dependency chains
239 | 3. **Synchronous JavaScript:** Blocking scripts that prevent HTML parsing
240 | 4. **Oversized assets:** CSS >200KB or JS >500KB affecting load times
241 | 5. **Missing async/defer:** JavaScript without proper loading attributes
242 |
243 | ### Medium Priority Issues
244 |
245 | 1. **jQuery dependency:** Legacy library usage for simple DOM operations
246 | 2. **Unused CSS/JS:** Large files with low utilization rates
247 | 3. **Vendor prefix bloat:** Outdated browser support adding file size
248 | 4. **Inefficient DOM queries:** Performance-killing selection patterns
249 | 5. **Missing CDN usage:** Large assets served from main domain
250 |
251 | ### Low Priority Issues
252 |
253 | 1. **CSS specificity wars:** Excessive !important usage indicating architectural issues
254 | 2. **Minor bundle optimization:** Small files that could be combined
255 | 3. **Cache optimization opportunities:** Suboptimal asset caching strategies
256 |
257 | ## Reporting Template
258 |
259 | ### 📊 Executive Performance Summary
260 |
261 | | Metric | Value | Status |
262 | |--------|-------|--------|
263 | | **Total Assets Analyzed** | X CSS, Y JS files | Based on crawl results |
264 | | **Critical Issues** | X render-blocking resources | Immediate attention needed |
265 | | **Optimization Potential** | Estimated X% improvement | Conservative estimate |
266 | | **Performance Grade** | [A-F] | Based on issue severity |
267 |
268 | ### 🏗️ Asset Distribution Analysis
269 |
270 | | Domain Type | Example | Asset Count | Total Size | Performance Notes |
271 | |-------------|---------|-------------|------------|-------------------|
272 | | Main Domain | example.com | X CSS, Y JS | Z KB | Self-hosted control |
273 | | Asset Subdomain | static.example.com | X CSS, Y JS | Z KB | Optimized delivery |
274 | | Third-party CDN | cdnjs.cloudflare.com | X CSS, Y JS | Z KB | External dependency |
275 | | **Totals** | - | **X CSS, Y JS** | **Z KB** | **[Strategy Assessment]** |
276 |
277 | ### ⚡ Critical Rendering Path Analysis
278 |
279 | | Asset Type | Domain | Count | Size | Blocking Impact | Priority |
280 | |------------|--------|-------|------|-----------------|----------|
281 | | Inline CSS | Main | X | Y KB | @import usage | High |
282 | | External CSS | Asset/CDN | X | Y KB | Render-blocking | High |
283 | | Inline JS | Main | X | Y KB | DOM queries | Medium |
284 | | External JS | Asset/CDN | X | Y KB | jQuery patterns | Medium |
285 |
286 | ### 🔍 Performance Anti-Pattern Detection
287 |
288 | | Issue Type | Occurrences | Impact Level | Root Cause |
289 | |------------|-------------|--------------|------------|
290 | | @import Usage | X files | Critical | CSS dependency chains |
291 | | Blocking Scripts | Y files | High | Missing async/defer |
292 | | Oversized Assets | Z files | Medium | Bundle optimization needed |
293 | | jQuery Dependencies | W files | Low | Legacy library usage |
294 |
295 | ### 🎯 Asset Architecture Health
296 |
297 | | Metric | Current | Benchmark | Status |
298 | |--------|---------|-----------|--------|
299 | | Total CSS Size | X KB | <200KB | ✅/⚠️/❌ |
300 | | Total JS Size | Y KB | <500KB | ✅/⚠️/❌ |
301 | | Asset Domains | Z domains | 2-3 optimal | ✅/⚠️/❌ |
302 | | Render Blockers | W resources | <3 critical | ✅/⚠️/❌ |
303 |
304 | ## What's Next?
305 |
306 | Your performance audit reveals the current state of your asset delivery strategy and identifies the biggest opportunities for improvement. Whether you're dealing with render-blocking resources, oversized bundles, or architectural complexity, most performance gains come from addressing the highest-impact patterns first.
307 |
308 | **Ready to optimize?** I can help you:
309 | - **Focus on critical fixes** - Let's prioritize your specific performance bottlenecks and create detailed optimization strategies, including implementation approaches and expected performance gains
310 | - **Expand the technical analysis** - Examine dependency chains, analyze Core Web Vitals impact, or investigate advanced optimization techniques like critical CSS extraction and progressive loading
311 | - **Research performance tools** - Find the right monitoring, bundling, and optimization solutions that fit your development workflow and technical constraints
312 |
313 | **What would be most helpful for your next steps?**
314 |
315 | ## Methodology
316 |
317 | You will review this web project from the perspective of an accomplished but patient web developer. You've seen it all over the years, and have reasonable expectations of quality. At the same time, you have a fondness for anyone wanting to improve the web at all. It's a noble pursuit that you can encourage without being overbearing. Nobody wants a scolding or patronizing AI. It's a fine line to walk, but you somehow manage it well. As these "reviews" can be hard to hear, you will break the news gently, but firmly, when things are out of whack.
318 | 
319 | Where you have tabular data, you aren't afraid to arrange it in an aesthetically pleasing manner. You will prefer tables over unordered lists. Yes, the critical errors will harsh the buzz, but the aesthetic choices make it feel like it'll be alright with some elbow grease.
```
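The "Asset Domain Detection Logic" above is stated as a three-step heuristic (extract unique domains, score by asset-hosting likelihood, use the highest-scoring domain). A minimal sketch follows, with illustrative numeric weights standing in for the prompt's High/Medium/Low ranking.

```python
# Sketch: the asset-domain scoring heuristic from "Asset Domain Detection Logic".
# Numeric weights are illustrative; the prompt only specifies High/Medium/Low.
from urllib.parse import urlparse

ASSET_HINTS = ("static", "cdn", "assets", "media")
THIRD_PARTY_CDNS = ("cdnjs.cloudflare.com", "jsdelivr.net", "unpkg.com")

def score_domain(domain: str, main_domain: str) -> int:
    if domain == main_domain:
        return 0                                   # already covered by the primary query
    if any(hint in domain.split(".")[0] for hint in ASSET_HINTS):
        return 3                                   # High: dedicated asset subdomain
    if domain.endswith("." + main_domain):
        return 2                                   # Medium: other subdomain of the site
    if any(domain.endswith(cdn) for cdn in THIRD_PARTY_CDNS):
        return 2                                   # Medium: known third-party CDN
    return 1                                       # unrelated external host

def discover_asset_domain(asset_urls: list[str], main_domain: str) -> str | None:
    domains = {urlparse(url).netloc for url in asset_urls}
    best = max(domains, key=lambda d: score_domain(d, main_domain), default=None)
    return best if best and score_domain(best, main_domain) > 0 else None

if __name__ == "__main__":
    urls = ["https://static.nasa.gov/css/site.css", "https://www.nasa.gov/style.css"]
    print(discover_asset_domain(urls, "nasa.gov"))  # static.nasa.gov
```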
--------------------------------------------------------------------------------
/src/mcp_server_webcrawl/interactive/views/help.py:
--------------------------------------------------------------------------------
```python
1 | import curses
2 | import textwrap
3 |
4 | from typing import TYPE_CHECKING
5 |
6 | from mcp_server_webcrawl.interactive.views.base import CONTENT_MARGIN
7 | from mcp_server_webcrawl.interactive.views.base import BaseCursesView
8 | from mcp_server_webcrawl.interactive.ui import ThemeDefinition
9 | from mcp_server_webcrawl.interactive.ui import safe_addstr
10 | if TYPE_CHECKING:
11 | from mcp_server_webcrawl.interactive.session import InteractiveSession
12 |
13 | INTERROBOT_LINK: str = "<https://interro.bot>"
14 | HELP_CONTENT: str = """Boolean Search Syntax
15 |
16 | The query engine supports field-specific (`field: value`) searches and complex boolean expressions. Fulltext is supported as a combination of the url, content, and headers fields.
17 |
18 | Example Queries
19 |
20 | | Query Example | Description |
21 | |------------------------------|---------------------------------------|
22 | | privacy | Fulltext single keyword match |
23 | | "privacy policy" | Fulltext exact phrase match |
24 | | boundar* | Fulltext wildcard (boundary, |
25 | | | boundaries, etc.) |
26 | | id: 12345 | Match specific resource by ID |
27 | | url: example.com/dir | URL contains example.com/dir |
28 | | type: html | HTML pages only |
29 | | status: 200 | HTTP status equals 200 |
30 | | status: >=400 | HTTP status >= 400 |
31 | | content: h1 | Content contains h1 |
32 | | headers: text/xml | Headers contain text/xml |
33 | | privacy AND policy | Fulltext matches both terms |
34 | | privacy OR policy | Fulltext matches either term |
35 | | policy NOT privacy | Policy but not privacy |
36 | | (login OR signin) AND form | Login/signin with form |
37 | | type: html AND status: 200 | HTML pages with HTTP success |
38 |
39 | Field Reference
40 |
41 | `id`: Resource identifier (integer)
42 | - Example: id: 12345
43 |
44 | `url`: URL field matching
45 | - Supports partial matches and wildcards
46 | - Example: `url: example.com/about`
47 | - Example: `url: *.pdf`
48 |
49 | `type`: Resource type filtering
50 | - Common types: html, img, script, style, font, audio, video, pdf, doc
51 | - Example: `type: html`
52 | - Example: `type: img`
53 |
54 | `status`: HTTP status code
55 | - Supports exact matches and comparisons
56 | - Example: `status: 200`
57 | - Example: `status: >=400`
58 | - Example: `status: <300`
59 |
60 | `content`: Full-text search within resource content
61 | - Searches the actual content/body of resources
62 | - Example: `content: "user login"`
63 | - Example: `content: javascript`
64 |
65 | `headers`: HTTP response headers search
66 | - Searches within response headers
67 | - Example: `headers: application/json`
68 | - Example: `headers: gzip`
69 |
70 | Boolean Operators
71 |
72 | `AND`: Both terms must be present
73 | - Example: `privacy AND policy`
74 | - Example: `type: html AND status: 200`
75 |
76 | `OR`: Either term can be present
77 | - Example: `login OR signin`
78 | - Example: `type: img OR type: video`
79 |
80 | `NOT`: Exclude documents containing the term
81 | - Example: `policy NOT privacy`
82 | - Example: `type: html NOT status: 404`
83 |
84 | `Parentheses`: Group expressions
85 | - Example: `(login OR signin) AND (form OR page)`
86 | - Example: `type: html AND (status: 200 OR status: 301)`
87 |
88 | Wildcards
89 |
90 | `Suffix wildcard` (*): Matches terms starting with the prefix
91 | - Example: `admin*` matches admin, administrator, administration
92 | - Example: `java*` matches java, javascript, javadoc
93 |
94 | Tips
95 |
96 | - Use quotes for exact phrase matching: `"privacy policy"`
97 | - Combine field searches with fulltext: `type: html AND privacy`
98 | - Use wildcards for partial matches: `admin*`
99 | - Group complex expressions with parentheses
100 | - Field names are case-sensitive, values are case-insensitive
101 | - Whitespace after the field colon is optional: `type:html` = `type: html`
102 |
103 | If you enjoy mcp-server-webcrawl --interactive, you will almost assuredly appreciate InterroBot crawler and analyzer <https://interro.bot>, by the same developer."""
104 |
105 |
106 | class HelpView(BaseCursesView):
107 | """
108 | Interactive help view displaying scrollable documentation.
109 | """
110 |
111 | def __init__(self, session: 'InteractiveSession'):
112 | """
113 | Initialize the help view.
114 |
115 | Args:
116 | session: The interactive session instance
117 | """
118 | super().__init__(session)
119 | self._focused = True
120 | self.__scroll_offset: int = 0
121 | self.__cached_content_lines: list[str] | None = None
122 |
123 | def draw_inner_footer(self, stdscr: curses.window, bounds, text: str) -> None:
124 | """
125 | Draw footer with scroll position information.
126 |
127 | Args:
128 | stdscr: The curses window to draw on
129 | bounds: The view bounds defining the drawing area
130 | text: The footer text to display
131 | """
132 | if not self._focused:
133 | super().draw_inner_footer(stdscr, bounds, text)
134 | return
135 |
136 | content_lines: list[str] = self.__get_content_lines()
137 | content_height: int = max(0, bounds.height - 4)
138 | total_lines: int = len(content_lines)
139 |
140 | if total_lines == 0:
141 | super().draw_inner_footer(stdscr, bounds, text)
142 | return
143 |
144 | showing_start: int = self.__scroll_offset + 1
145 | showing_end: int = min(total_lines, self.__scroll_offset + content_height)
146 |
147 | footer_text: str = f"Viewing lines {showing_start}-{showing_end} of {total_lines}"
148 |
149 | footer_y: int = bounds.y + bounds.height - 1
150 | try:
151 | safe_addstr(stdscr, footer_y, 0, self._get_bounded_line(), self._get_inner_header_style())
152 | safe_addstr(stdscr, footer_y, 1, footer_text, self._get_inner_header_style())
153 | except curses.error:
154 | pass
155 |
156 | def handle_input(self, key: int) -> bool:
157 | """
158 | Handle document navigation input.
159 |
160 | Args:
161 | key: The curses key code from user input
162 |
163 | Returns:
164 | bool: True if the input was handled, False otherwise
165 | """
166 | if not self._focused:
167 | return False
168 |
169 | handlers: dict[int, callable] = {
170 | curses.KEY_UP: self.__scroll_up,
171 | curses.KEY_DOWN: self.__scroll_down,
172 | curses.KEY_PPAGE: lambda: self.__scroll_page_up(max(1, self.bounds.height - 4)),
173 | curses.KEY_NPAGE: lambda: self.__scroll_page_down(max(1, self.bounds.height - 4)),
174 | curses.KEY_HOME: self.__scroll_to_top,
175 | curses.KEY_END: self.__scroll_to_bottom,
176 | }
177 |
178 | handler = handlers.get(key)
179 | if handler:
180 | handler()
181 | return True
182 |
183 | return False
184 |
185 | def render(self, stdscr: curses.window) -> None:
186 | """
187 | Render the help content as a scrollable document.
188 |
189 | Args:
190 | stdscr: The curses window to draw on
191 | """
192 | if not self._renderable(stdscr):
193 | return
194 |
195 | y_current: int = self.bounds.y + 2
196 | y_max: int = self.bounds.y + self.bounds.height - 1
197 | content_height: int = max(0, self.bounds.height - 4)
198 | content_width: int = self.bounds.width - 4
199 | content_lines: list[str] = self.__get_content_lines()
200 | visible_lines: list[str] = content_lines[self.__scroll_offset: self.__scroll_offset + content_height]
201 |
202 | for i, line in enumerate(visible_lines):
203 |
204 | line_y: int = y_current + i
205 | if line_y >= y_max:
206 | break
207 |
208 | display_line: str = line[:content_width] if len(line) > content_width else line
209 | display_line_is_bold: bool = line.startswith('##') or (line.startswith('**') and line.endswith('**') and len(line) > 4)
210 | default_line_style = curses.A_BOLD if display_line_is_bold else curses.A_NORMAL
211 | if INTERROBOT_LINK in line:
212 | link_index = line.index(INTERROBOT_LINK)
213 | safe_addstr(stdscr, line_y, 2, display_line, curses.A_NORMAL)
214 | safe_addstr(stdscr, line_y, 2 + link_index, INTERROBOT_LINK, self.session.get_theme_color_pair(ThemeDefinition.HELP_LINK))
215 | else:
216 | safe_addstr(stdscr, line_y, 2, display_line, default_line_style)
217 |
218 | def __calculate_max_scroll(self) -> int:
219 | """
220 | Calculate maximum scroll offset based on content and view size.
221 |
222 | Returns:
223 | int: The maximum scroll offset value
224 | """
225 | content_lines: list[str] = self.__get_content_lines()
226 | content_height: int = max(0, self.bounds.height - 4)
227 | return max(0, len(content_lines) - content_height)
228 |
229 | def __get_content_lines(self) -> list[str]:
230 | """
231 | Get wrapped content lines with caching.
232 |
233 | Returns:
234 | list[str]: The wrapped and cached content lines
235 | """
236 | if self.__cached_content_lines is not None:
237 | return self.__cached_content_lines
238 |
239 | content_width: int = max(20, self.bounds.width - CONTENT_MARGIN)
240 | wrapped_lines: list[str] = []
241 | text_lines: list[str] = HELP_CONTENT.split("\n")
242 | for line in text_lines:
243 | if not line.strip():
244 | wrapped_lines.append("")
245 | else:
246 | if (line.startswith('|') or
247 | line.startswith('##') or
248 | (line.startswith('**') and line.endswith('**'))):
249 | wrapped_lines.append(line.rstrip())
250 | else:
251 | wrapped: str = textwrap.fill(
252 | line.rstrip(),
253 | width=content_width,
254 | expand_tabs=True,
255 | replace_whitespace=True,
256 | break_long_words=True,
257 | break_on_hyphens=True
258 | )
259 | wrapped_lines.extend(wrapped.split("\n"))
260 |
261 | self.__cached_content_lines = wrapped_lines
262 | return wrapped_lines
263 |
264 | def __scroll_down(self, lines: int = 1) -> None:
265 | """
266 | Scroll down by specified number of lines.
267 |
268 | Args:
269 | lines: Number of lines to scroll down
270 | """
271 | max_scroll: int = self.__calculate_max_scroll()
272 | self.__scroll_offset = min(max_scroll, self.__scroll_offset + lines)
273 |
274 | def __scroll_page_down(self, page_size: int = 10) -> None:
275 | """
276 | Scroll down by page.
277 |
278 | Args:
279 | page_size: Number of lines to scroll for a page
280 | """
281 | self.__scroll_down(page_size)
282 |
283 | def __scroll_page_up(self, page_size: int = 10) -> None:
284 | """
285 | Scroll up by page.
286 |
287 | Args:
288 | page_size: Number of lines to scroll for a page
289 | """
290 | self.__scroll_up(page_size)
291 |
292 | def __scroll_to_bottom(self) -> None:
293 | """
294 | Scroll to bottom of document.
295 | """
296 | self.__scroll_offset = self.__calculate_max_scroll()
297 |
298 | def __scroll_to_top(self) -> None:
299 | """
300 | Scroll to top of document.
301 | """
302 | self.__scroll_offset = 0
303 |
304 | def __scroll_up(self, lines: int = 1) -> None:
305 | """
306 | Scroll up by specified number of lines.
307 |
308 | Args:
309 | lines: Number of lines to scroll up
310 | """
311 | self.__scroll_offset = max(0, self.__scroll_offset - lines)
312 |
```
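The wrapping behavior in `__get_content_lines` (tables, headings, and bold lines kept verbatim; prose wrapped to the view width) can be tried outside curses with a small standalone sketch; the sample text and width below are arbitrary.

```python
# Sketch: the wrapping rule HelpView.__get_content_lines applies, lifted out
# of curses so it can be run directly. Width and sample text are arbitrary.
import textwrap

def wrap_help(text: str, width: int = 40) -> list[str]:
    wrapped: list[str] = []
    for line in text.split("\n"):
        if not line.strip():
            wrapped.append("")
        elif line.startswith("|") or line.startswith("##") or (
                line.startswith("**") and line.endswith("**")):
            wrapped.append(line.rstrip())   # tables/headings stay on one line
        else:
            wrapped.extend(textwrap.fill(line.rstrip(), width=width).split("\n"))
    return wrapped

sample = "| Query | Description |\nThe query engine supports field-specific searches and complex boolean expressions."
print("\n".join(wrap_help(sample)))
```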
--------------------------------------------------------------------------------
/src/mcp_server_webcrawl/utils/tools.py:
--------------------------------------------------------------------------------
```python
1 | from mcp.types import Tool
2 |
3 | from mcp_server_webcrawl.models.resources import (
4 | ResourceResultType,
5 | RESOURCES_FIELDS_BASE,
6 | RESOURCES_FIELDS_OPTIONS,
7 | RESOURCES_DEFAULT_SORT_MAPPING,
8 | RESOURCES_TOOL_NAME,
9 | )
10 | from mcp_server_webcrawl.models.sites import (
11 | SiteResult,
12 | SITES_FIELDS_DEFAULT,
13 | SITES_FIELDS_BASE,
14 | SITES_TOOL_NAME,
15 | )
16 |
17 | def get_crawler_tools(sites: list[SiteResult] | None = None):
18 | """
19 | Generate crawler tools based on available sites.
20 |
21 | Args:
22 | sites: optional list of site results to include in tool descriptions
23 |
24 | Returns:
25 | List of Tool objects for sites and resources
26 | """
27 |
28 | # you'd think maybe pass these in, but no, descriptions will also require tweaking
29 | # each crawler having its own peculiarities -- just let the subclass hack this
30 | # into whatever misshapen ball of clay it needs to be
31 |
32 | sites_field_options = list(set(SITES_FIELDS_DEFAULT) - set(SITES_FIELDS_BASE))
33 | resources_type_options = list(ResourceResultType.values())
34 | resources_sort_options = list(RESOURCES_DEFAULT_SORT_MAPPING.keys())
35 | sites_display = ", ".join([f"{s.name} (site: {s.id})" for s in sites]) if sites is not None else ""
36 | sites_ids = [s.id for s in sites]
37 |
38 | tools = [
39 | Tool(
40 | name=SITES_TOOL_NAME,
41 | description="Retrieves a list of sites (project websites or crawl directories).",
42 | inputSchema={
43 | "type": "object",
44 | "properties": {
45 | "ids": {
46 | "type": "array",
47 | "items": {"type": "integer"},
48 | "description": "List of project IDs to retrieve. Leave empty for all projects."
49 | },
50 | "fields": {
51 | "type": "array",
52 | "items": {
53 | "enum": sites_field_options
54 | },
55 | "description": ("List of additional fields to include in the response beyond the defaults "
56 | "(id, name, type, urls) Empty list means default fields only. Options include created (ISO 8601), "
57 | "modified (ISO 8601).")
58 | }
59 | },
60 | "required": []
61 | },
62 | ),
63 | Tool(
64 | name=RESOURCES_TOOL_NAME,
65 | description=("Searches for resources (webpages, images, CSS, JS, etc.) across web crawler projects and "
66 | "retrieves specified fields. "
67 | "Supports boolean queries and field searching, along with site filtering to "
68 | "filter with fine control. "
69 | "To find a site homepage reliably, query type: html AND url: example.com (crawled domain) with sort='+url' and a LIMIT of 1. "
70 | "This pattern works consistently across all crawlers."
71 | "Most sites indexed by this tool will be small to moderately sized websites. "
72 | "Don't assume most keywords will generate results; start broader on first search (until you have a feel for results). "
73 | "A vital aspect of this API is field control; you can open up the limit wide when dealing with lightweight "
74 | "fields and dial way back when using larger fields, like content. Adjust dynamically. The best strategy "
75 | "balances preserving the user's context window while minimizing number of queries necessary to answer their question."
76 | ),
77 | inputSchema={
78 | "type": "object",
79 | "properties": {
80 | "query": {
81 | "type": "string",
82 | "description": ("The query field is the workhorse of the API and supports fulltext boolean queries "
83 | "along with field searching using the name: value pattern. "
84 | "Fields supported include page/resource id as id: <resource_id|int> (OR together for multiple docs), "
85 | "HTTP status as status: <code|int>, URL as url: <url|str>, and content type as type: <type|str>. "
86 | f"Valid types include ({', '.join(resources_type_options)}). "
87 | "Additionally, headers as headers: <term|str> and content as content: <term|str> can be "
88 | "searched specifically. You would only search content when fulltext search is diluted by other field hits. "
89 | "For the status field, numerical operators are supported, e.g. status: >=400. "
90 | "For the url and type fields, along with fulltext search terms (fieldless), FTS5 stem* suffix "
91 | "wildcarding is enabled. An empty query returns all results. "
92 | "A query MUST use one of these formats: (1) empty query for unfiltered results, (2) single keyword, "
93 | "(3) quoted phrase: \"keyword1 keyword2\", (4) "
94 | "explicit AND: keyword1 AND type: html, (5) explicit OR: keyword1 OR keyword2, or (6) advanced boolean: "
95 | "(keyword1 OR keyword2) AND (status: 200 AND type: html). "
96 | "The search index does not support stemming, use wildcards (keyword*), or the boolean OR and your "
97 | "imagination instead."
98 | )
99 | },
100 | "sites": {
101 | "type": "array",
102 | "items": {
103 | "enum": sites_ids
104 | },
105 | "description": ("List of crawl site IDs to filter search results to a specific site. In most "
106 | "scenarios, you should filter to only one site, but multiple site filtering is offered for "
107 | f"advanced search scenarios. Available sites include {sites_display}.")
108 | },
109 | "fields": {
110 | "type": "array",
111 | "items": {
112 | "enum": RESOURCES_FIELDS_OPTIONS
113 | },
114 | "description": ("List of additional fields to include in the response beyond the base fields "
115 | f"({', '.join(RESOURCES_FIELDS_BASE)}) returned for all results. "
116 | "Empty list means base fields only. Use headers and content to retrieve raw HTTP contents, "
117 | "and size to collect file size in bytes. "
118 | "The content field can lead to large results and should be used judiciously with LIMIT. "
119 | "Fields must be explicitly requested, even when used with sort. ")
120 | },
121 | "sort": {
122 | "enum": resources_sort_options,
123 | "default": "+url",
124 | "description": ("Sort order for results. Prefixed with + for ascending, - for descending "
125 | f"({', '.join(resources_sort_options)}). "
126 | "? is a special option for random sort, useful in statistical sampling. The API expects exactly "
127 | "one of the enum values above, not a quoted string.")
128 | },
129 | "limit": {
130 | "type": "integer",
131 | "description": "Maximum number of results to return. Default is 20, max is 100."
132 | },
133 | "offset": {
134 | "type": "integer",
135 | "description": "Number of results to skip for pagination. Default is 0."
136 | },
137 | "extras": {
138 | "type": "array",
139 | "items": {
140 | "enum": ["thumbnails", "markdown", "snippets", "regex", "xpath"]
141 | },
142 | "description": ("Optional array of extra features to include in results. Available options include:\n"
143 | "- 'thumbnails': generates base64 encoded thumbnails for image resources that can be viewed and "
144 | "analyzed by AI models. Enables image description, content analysis, and visual understanding while"
145 | "keeping token output minimal. Only works for image "
146 | "(img) types, which can be filtered using `type: img` in queries. SVG is not supported.\n"
147 | "- 'markdown': transforms the HTML content field into concise markdown, "
148 | "reducing token usage and improving readability for LLMs.\n"
149 | "- 'snippets': matches fulltext queries to contextual keyword usage within the content. When "
150 | "used without requesting the content field (or markdown extra), it can provide an efficient means "
151 | "of refining a search without pulling down the complete page contents. Also great for rendering "
152 | "old school hit-highlighted results as a list, like Google search in 1999. Works with HTML, CSS, JS, "
153 | "or any text-based, crawled file.\n"
154 | "- 'regex': extracts regular expression matches from crawled files such as HTML, CSS, JavaScript, "
155 | "etc.. Not as precise a tool as xpath for HTML, but supports any text file as a data source. "
156 | "- 'xpath': extracts xpath selector data. Supports count(). Use xpath's text() for "
157 | "text only, element selectors for HTML data. Only supported for HTML, other "
158 | "types will be ignored. Sometimes referred to as scraping."
159 | "")
160 | },
161 | "extrasRegex": {
162 | "type": "array",
163 | "items": {
164 | "type": "string"
165 | },
166 | "description": ("Array of regular expression patterns to extract content. "
167 | "Examples: `\\d{3}-\\d{3}-\\d{4}` (phone numbers), `https?://[^\\s]+` (URLs). "
168 | "Use capture groups `(pattern)` to extract specific parts. "
169 | "Only used when 'regex' is included in the extras array. "
170 | "Results include matches, capture groups, and position information.")
171 | },
172 | "extrasXpath": {
173 | "type": "array",
174 | "items": {
175 | "type": "string"
176 | },
177 | "description": ("Array of XPath expressions to extract specific content from HTML resources. "
178 | "Each XPath should be a valid selector expression like `/html/body/h1`, `//h1/text()`, "
179 | "//a, //a/@href, or count(//a). If you need many values (such as connected a/text() "
180 | "and a/@href), request elements to preserve the relationship. "
181 | "Use text() or @name when targeting text, elements will return outer HTML. "
182 | "Only used when 'xpath' is included in the extras array. Many xpath expressions can be "
183 | "passed at once to extract multiple selectors. Results are grouped by document within results. ")
184 | }
185 | },
186 | "required": []
187 | },
188 | ),
189 | ]
190 |
191 | return tools
192 |
```
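As a rough usage sketch (not part of the module), a crawler passes its available sites into `get_crawler_tools` and exposes the returned `Tool` objects over MCP; the resources tool then accepts argument payloads shaped by the schema above. The payloads below are illustrative examples that conform to that schema:

```python
from mcp_server_webcrawl.utils.tools import get_crawler_tools

# With no sites supplied, the descriptions simply omit the per-site listing;
# a crawler subclass would normally pass its own SiteResult objects instead.
tools = get_crawler_tools(sites=None)
print([tool.name for tool in tools])  # names come from SITES_TOOL_NAME and RESOURCES_TOOL_NAME

# Example argument payloads for the resources tool, per the inputSchema above.
example_arguments = [
    {"query": "", "limit": 5},                                                  # unfiltered, base fields only
    {"query": "type: html AND url: example.com", "sort": "+url", "limit": 1},   # reliable homepage lookup
    {"query": "privacy* OR gdpr", "fields": ["content"], "extras": ["markdown"], "limit": 3},
]
```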
--------------------------------------------------------------------------------
/docs/modules.html:
--------------------------------------------------------------------------------
```html
1 |
2 |
3 | <!DOCTYPE html>
4 | <html class="writer-html5" lang="en" data-content_root="./">
5 | <head>
6 | <meta charset="utf-8" /><meta name="viewport" content="width=device-width, initial-scale=1" />
7 |
8 | <meta name="viewport" content="width=device-width, initial-scale=1.0" />
9 | <title>mcp_server_webcrawl — mcp-server-webcrawl documentation</title>
10 | <link rel="stylesheet" type="text/css" href="_static/pygments.css?v=80d5e7a1" />
11 | <link rel="stylesheet" type="text/css" href="_static/css/theme.css?v=e59714d7" />
12 |
13 |
14 | <script src="_static/jquery.js?v=5d32c60e"></script>
15 | <script src="_static/_sphinx_javascript_frameworks_compat.js?v=2cd50e6c"></script>
16 | <script src="_static/documentation_options.js?v=5929fcd5"></script>
17 | <script src="_static/doctools.js?v=888ff710"></script>
18 | <script src="_static/sphinx_highlight.js?v=dc90522c"></script>
19 | <script src="_static/js/theme.js"></script>
20 | <link rel="index" title="Index" href="genindex.html" />
21 | <link rel="search" title="Search" href="search.html" />
22 | <link rel="next" title="mcp_server_webcrawl package" href="mcp_server_webcrawl.html" />
23 | <link rel="prev" title="Interactive Mode" href="interactive.html" />
24 | </head>
25 |
26 | <body class="wy-body-for-nav">
27 | <div class="wy-grid-for-nav">
28 | <nav data-toggle="wy-nav-shift" class="wy-nav-side">
29 | <div class="wy-side-scroll">
30 | <div class="wy-side-nav-search" >
31 |
32 |
33 |
34 | <a href="index.html" class="icon icon-home">
35 | mcp-server-webcrawl
36 | </a>
37 | <div role="search">
38 | <form id="rtd-search-form" class="wy-form" action="search.html" method="get">
39 | <input type="text" name="q" placeholder="Search docs" aria-label="Search docs" />
40 | <input type="hidden" name="check_keywords" value="yes" />
41 | <input type="hidden" name="area" value="default" />
42 | </form>
43 | </div>
44 | </div><div class="wy-menu wy-menu-vertical" data-spy="affix" role="navigation" aria-label="Navigation menu">
45 | <p class="caption" role="heading"><span class="caption-text">Contents:</span></p>
46 | <ul class="current">
47 | <li class="toctree-l1"><a class="reference internal" href="installation.html">Installation</a></li>
48 | <li class="toctree-l1"><a class="reference internal" href="guides.html">Setup Guides</a></li>
49 | <li class="toctree-l1"><a class="reference internal" href="usage.html">Usage</a></li>
50 | <li class="toctree-l1"><a class="reference internal" href="prompts.html">Prompt Routines</a></li>
51 | <li class="toctree-l1"><a class="reference internal" href="interactive.html">Interactive Mode</a></li>
52 | <li class="toctree-l1 current"><a class="current reference internal" href="#">mcp_server_webcrawl</a><ul>
53 | <li class="toctree-l2"><a class="reference internal" href="mcp_server_webcrawl.html">mcp_server_webcrawl package</a></li>
54 | </ul>
55 | </li>
56 | </ul>
57 |
58 | </div>
59 | </div>
60 | </nav>
61 |
62 | <section data-toggle="wy-nav-shift" class="wy-nav-content-wrap"><nav class="wy-nav-top" aria-label="Mobile navigation menu" >
63 | <i data-toggle="wy-nav-top" class="fa fa-bars"></i>
64 | <a href="index.html">mcp-server-webcrawl</a>
65 | </nav>
66 |
67 | <div class="wy-nav-content">
68 | <div class="rst-content">
69 | <div role="navigation" aria-label="Page navigation">
70 | <ul class="wy-breadcrumbs">
71 | <li><a href="index.html" class="icon icon-home" aria-label="Home"></a></li>
72 | <li class="breadcrumb-item active">mcp_server_webcrawl</li>
73 | <li class="wy-breadcrumbs-aside">
74 | <a href="_sources/modules.rst.txt" rel="nofollow"> View page source</a>
75 | </li>
76 | </ul>
77 | <hr/>
78 | </div>
79 | <div role="main" class="document" itemscope="itemscope" itemtype="http://schema.org/Article">
80 | <div itemprop="articleBody">
81 |
82 | <section id="mcp-server-webcrawl">
83 | <h1>mcp_server_webcrawl<a class="headerlink" href="#mcp-server-webcrawl" title="Link to this heading"></a></h1>
84 | <div class="toctree-wrapper compound">
85 | <ul>
86 | <li class="toctree-l1"><a class="reference internal" href="mcp_server_webcrawl.html">mcp_server_webcrawl package</a><ul>
87 | <li class="toctree-l2"><a class="reference internal" href="mcp_server_webcrawl.html#subpackages">Subpackages</a><ul>
88 | <li class="toctree-l3"><a class="reference internal" href="mcp_server_webcrawl.crawlers.html">mcp_server_webcrawl.crawlers package</a><ul>
89 | <li class="toctree-l4"><a class="reference internal" href="mcp_server_webcrawl.crawlers.html#subpackages">Subpackages</a></li>
90 | <li class="toctree-l4"><a class="reference internal" href="mcp_server_webcrawl.crawlers.html#module-mcp_server_webcrawl.crawlers">Module contents</a></li>
91 | </ul>
92 | </li>
93 | <li class="toctree-l3"><a class="reference internal" href="mcp_server_webcrawl.extras.html">mcp_server_webcrawl.extras package</a><ul>
94 | <li class="toctree-l4"><a class="reference internal" href="mcp_server_webcrawl.extras.html#submodules">Submodules</a></li>
95 | <li class="toctree-l4"><a class="reference internal" href="mcp_server_webcrawl.extras.html#module-mcp_server_webcrawl.extras.markdown">mcp_server_webcrawl.extras.markdown module</a></li>
96 | <li class="toctree-l4"><a class="reference internal" href="mcp_server_webcrawl.extras.html#module-mcp_server_webcrawl.extras.regex">mcp_server_webcrawl.extras.regex module</a></li>
97 | <li class="toctree-l4"><a class="reference internal" href="mcp_server_webcrawl.extras.html#module-mcp_server_webcrawl.extras.snippets">mcp_server_webcrawl.extras.snippets module</a></li>
98 | <li class="toctree-l4"><a class="reference internal" href="mcp_server_webcrawl.extras.html#module-mcp_server_webcrawl.extras.thumbnails">mcp_server_webcrawl.extras.thumbnails module</a></li>
99 | <li class="toctree-l4"><a class="reference internal" href="mcp_server_webcrawl.extras.html#module-mcp_server_webcrawl.extras.xpath">mcp_server_webcrawl.extras.xpath module</a></li>
100 | <li class="toctree-l4"><a class="reference internal" href="mcp_server_webcrawl.extras.html#module-mcp_server_webcrawl.extras">Module contents</a></li>
101 | </ul>
102 | </li>
103 | <li class="toctree-l3"><a class="reference internal" href="mcp_server_webcrawl.interactive.html">mcp_server_webcrawl.interactive package</a><ul>
104 | <li class="toctree-l4"><a class="reference internal" href="mcp_server_webcrawl.interactive.html#submodules">Submodules</a></li>
105 | <li class="toctree-l4"><a class="reference internal" href="mcp_server_webcrawl.interactive.html#module-mcp_server_webcrawl.interactive.highlights">mcp_server_webcrawl.interactive.highlights module</a></li>
106 | <li class="toctree-l4"><a class="reference internal" href="mcp_server_webcrawl.interactive.html#module-mcp_server_webcrawl.interactive.search">mcp_server_webcrawl.interactive.search module</a></li>
107 | <li class="toctree-l4"><a class="reference internal" href="mcp_server_webcrawl.interactive.html#module-mcp_server_webcrawl.interactive.session">mcp_server_webcrawl.interactive.session module</a></li>
108 | <li class="toctree-l4"><a class="reference internal" href="mcp_server_webcrawl.interactive.html#module-mcp_server_webcrawl.interactive.ui">mcp_server_webcrawl.interactive.ui module</a></li>
109 | <li class="toctree-l4"><a class="reference internal" href="mcp_server_webcrawl.interactive.html#module-mcp_server_webcrawl.interactive">Module contents</a></li>
110 | </ul>
111 | </li>
112 | <li class="toctree-l3"><a class="reference internal" href="mcp_server_webcrawl.models.html">mcp_server_webcrawl.models package</a><ul>
113 | <li class="toctree-l4"><a class="reference internal" href="mcp_server_webcrawl.models.html#submodules">Submodules</a></li>
114 | <li class="toctree-l4"><a class="reference internal" href="mcp_server_webcrawl.models.html#module-mcp_server_webcrawl.models.resources">mcp_server_webcrawl.models.resources module</a></li>
115 | <li class="toctree-l4"><a class="reference internal" href="mcp_server_webcrawl.models.html#module-mcp_server_webcrawl.models.sites">mcp_server_webcrawl.models.sites module</a></li>
116 | <li class="toctree-l4"><a class="reference internal" href="mcp_server_webcrawl.models.html#module-mcp_server_webcrawl.models">Module contents</a></li>
117 | </ul>
118 | </li>
119 | <li class="toctree-l3"><a class="reference internal" href="mcp_server_webcrawl.templates.html">mcp_server_webcrawl.templates package</a><ul>
120 | <li class="toctree-l4"><a class="reference internal" href="mcp_server_webcrawl.templates.html#submodules">Submodules</a></li>
121 | <li class="toctree-l4"><a class="reference internal" href="mcp_server_webcrawl.templates.html#module-mcp_server_webcrawl.templates.tests">mcp_server_webcrawl.templates.tests module</a></li>
122 | <li class="toctree-l4"><a class="reference internal" href="mcp_server_webcrawl.templates.html#module-mcp_server_webcrawl.templates">Module contents</a></li>
123 | </ul>
124 | </li>
125 | <li class="toctree-l3"><a class="reference internal" href="mcp_server_webcrawl.utils.html">mcp_server_webcrawl.utils package</a><ul>
126 | <li class="toctree-l4"><a class="reference internal" href="mcp_server_webcrawl.utils.html#submodules">Submodules</a></li>
127 | <li class="toctree-l4"><a class="reference internal" href="mcp_server_webcrawl.utils.html#module-mcp_server_webcrawl.utils.cli">mcp_server_webcrawl.utils.cli module</a></li>
128 | <li class="toctree-l4"><a class="reference internal" href="mcp_server_webcrawl.utils.html#module-mcp_server_webcrawl.utils.logger">mcp_server_webcrawl.utils.logger module</a></li>
129 | <li class="toctree-l4"><a class="reference internal" href="mcp_server_webcrawl.utils.html#module-mcp_server_webcrawl.utils.server">mcp_server_webcrawl.utils.server module</a></li>
130 | <li class="toctree-l4"><a class="reference internal" href="mcp_server_webcrawl.utils.html#module-mcp_server_webcrawl.utils.tools">mcp_server_webcrawl.utils.tools module</a></li>
131 | <li class="toctree-l4"><a class="reference internal" href="mcp_server_webcrawl.utils.html#module-mcp_server_webcrawl.utils">Module contents</a></li>
132 | </ul>
133 | </li>
134 | </ul>
135 | </li>
136 | <li class="toctree-l2"><a class="reference internal" href="mcp_server_webcrawl.html#submodules">Submodules</a></li>
137 | <li class="toctree-l2"><a class="reference internal" href="mcp_server_webcrawl.html#mcp-server-webcrawl-main-module">mcp_server_webcrawl.main module</a></li>
138 | <li class="toctree-l2"><a class="reference internal" href="mcp_server_webcrawl.html#module-mcp_server_webcrawl.settings">mcp_server_webcrawl.settings module</a></li>
139 | <li class="toctree-l2"><a class="reference internal" href="mcp_server_webcrawl.html#module-mcp_server_webcrawl.settings_local">mcp_server_webcrawl.settings_local module</a></li>
140 | <li class="toctree-l2"><a class="reference internal" href="mcp_server_webcrawl.html#module-mcp_server_webcrawl">Module contents</a></li>
141 | </ul>
142 | </li>
143 | </ul>
144 | </div>
145 | </section>
146 |
147 |
148 | </div>
149 | </div>
150 | <footer><div class="rst-footer-buttons" role="navigation" aria-label="Footer">
151 | <a href="interactive.html" class="btn btn-neutral float-left" title="Interactive Mode" accesskey="p" rel="prev"><span class="fa fa-arrow-circle-left" aria-hidden="true"></span> Previous</a>
152 | <a href="mcp_server_webcrawl.html" class="btn btn-neutral float-right" title="mcp_server_webcrawl package" accesskey="n" rel="next">Next <span class="fa fa-arrow-circle-right" aria-hidden="true"></span></a>
153 | </div>
154 |
155 | <hr/>
156 |
157 | <div role="contentinfo">
158 | <p>© Copyright 2025, pragmar.</p>
159 | </div>
160 |
161 | Built with <a href="https://www.sphinx-doc.org/">Sphinx</a> using a
162 | <a href="https://github.com/readthedocs/sphinx_rtd_theme">theme</a>
163 | provided by <a href="https://readthedocs.org">Read the Docs</a>.
164 |
165 |
166 | </footer>
167 | </div>
168 | </div>
169 | </section>
170 | </div>
171 | <script>
172 | jQuery(function () {
173 | SphinxRtdTheme.Navigation.enable(true);
174 | });
175 | </script>
176 |
177 | </body>
178 | </html>
```
--------------------------------------------------------------------------------
/docs/guides/wget.html:
--------------------------------------------------------------------------------
```html
1 |
2 |
3 | <!DOCTYPE html>
4 | <html class="writer-html5" lang="en" data-content_root="../">
5 | <head>
6 | <meta charset="utf-8" /><meta name="viewport" content="width=device-width, initial-scale=1" />
7 |
8 | <meta name="viewport" content="width=device-width, initial-scale=1.0" />
9 | <title>wget MCP Setup Guide — mcp-server-webcrawl documentation</title>
10 | <link rel="stylesheet" type="text/css" href="../_static/pygments.css?v=80d5e7a1" />
11 | <link rel="stylesheet" type="text/css" href="../_static/css/theme.css?v=e59714d7" />
12 |
13 |
14 | <script src="../_static/jquery.js?v=5d32c60e"></script>
15 | <script src="../_static/_sphinx_javascript_frameworks_compat.js?v=2cd50e6c"></script>
16 | <script src="../_static/documentation_options.js?v=5929fcd5"></script>
17 | <script src="../_static/doctools.js?v=888ff710"></script>
18 | <script src="../_static/sphinx_highlight.js?v=dc90522c"></script>
19 | <script src="../_static/js/theme.js"></script>
20 | <link rel="index" title="Index" href="../genindex.html" />
21 | <link rel="search" title="Search" href="../search.html" />
22 | <link rel="next" title="Usage" href="../usage.html" />
23 | <link rel="prev" title="WARC MCP Setup Guide" href="warc.html" />
24 | </head>
25 |
26 | <body class="wy-body-for-nav">
27 | <div class="wy-grid-for-nav">
28 | <nav data-toggle="wy-nav-shift" class="wy-nav-side">
29 | <div class="wy-side-scroll">
30 | <div class="wy-side-nav-search" >
31 |
32 |
33 |
34 | <a href="../index.html" class="icon icon-home">
35 | mcp-server-webcrawl
36 | </a>
37 | <div role="search">
38 | <form id="rtd-search-form" class="wy-form" action="../search.html" method="get">
39 | <input type="text" name="q" placeholder="Search docs" aria-label="Search docs" />
40 | <input type="hidden" name="check_keywords" value="yes" />
41 | <input type="hidden" name="area" value="default" />
42 | </form>
43 | </div>
44 | </div><div class="wy-menu wy-menu-vertical" data-spy="affix" role="navigation" aria-label="Navigation menu">
45 | <p class="caption" role="heading"><span class="caption-text">Contents:</span></p>
46 | <ul class="current">
47 | <li class="toctree-l1"><a class="reference internal" href="../installation.html">Installation</a></li>
48 | <li class="toctree-l1 current"><a class="reference internal" href="../guides.html">Setup Guides</a><ul class="current">
49 | <li class="toctree-l2"><a class="reference internal" href="archivebox.html">ArchiveBox MCP Setup Guide</a></li>
50 | <li class="toctree-l2"><a class="reference internal" href="httrack.html">HTTrack MCP Setup Guide</a></li>
51 | <li class="toctree-l2"><a class="reference internal" href="interrobot.html">InterroBot MCP Setup Guide</a></li>
52 | <li class="toctree-l2"><a class="reference internal" href="katana.html">Katana MCP Setup Guide</a></li>
53 | <li class="toctree-l2"><a class="reference internal" href="siteone.html">SiteOne MCP Setup Guide</a></li>
54 | <li class="toctree-l2"><a class="reference internal" href="warc.html">WARC MCP Setup Guide</a></li>
55 | <li class="toctree-l2 current"><a class="current reference internal" href="#">wget MCP Setup Guide</a></li>
56 | </ul>
57 | </li>
58 | <li class="toctree-l1"><a class="reference internal" href="../usage.html">Usage</a></li>
59 | <li class="toctree-l1"><a class="reference internal" href="../prompts.html">Prompt Routines</a></li>
60 | <li class="toctree-l1"><a class="reference internal" href="../modules.html">mcp_server_webcrawl</a></li>
61 | </ul>
62 |
63 | </div>
64 | </div>
65 | </nav>
66 |
67 | <section data-toggle="wy-nav-shift" class="wy-nav-content-wrap"><nav class="wy-nav-top" aria-label="Mobile navigation menu" >
68 | <i data-toggle="wy-nav-top" class="fa fa-bars"></i>
69 | <a href="../index.html">mcp-server-webcrawl</a>
70 | </nav>
71 |
72 | <div class="wy-nav-content">
73 | <div class="rst-content">
74 | <div role="navigation" aria-label="Page navigation">
75 | <ul class="wy-breadcrumbs">
76 | <li><a href="../index.html" class="icon icon-home" aria-label="Home"></a></li>
77 | <li class="breadcrumb-item"><a href="../guides.html">Setup Guides</a></li>
78 | <li class="breadcrumb-item active">wget MCP Setup Guide</li>
79 | <li class="wy-breadcrumbs-aside">
80 | <a href="../_sources/guides/wget.rst.txt" rel="nofollow"> View page source</a>
81 | </li>
82 | </ul>
83 | <hr/>
84 | </div>
85 | <div role="main" class="document" itemscope="itemscope" itemtype="http://schema.org/Article">
86 | <div itemprop="articleBody">
87 |
88 | <section id="wget-mcp-setup-guide">
89 | <h1>wget MCP Setup Guide<a class="headerlink" href="#wget-mcp-setup-guide" title="Link to this heading"></a></h1>
90 | <p>Instructions for setting up <a class="reference external" href="https://pragmar.com/mcp-server-webcrawl/">mcp-server-webcrawl</a> with
91 | <a class="reference external" href="https://en.wikipedia.org/wiki/Wget">wget</a>.
92 | This allows your LLM (e.g. Claude Desktop) to search content and metadata from websites you’ve crawled.</p>
93 | <iframe width="560" height="315" src="https://www.youtube.com/embed/uqEEqVsofhc" frameborder="0" allowfullscreen></iframe><p>Follow along with the video, or the step-action guide.</p>
94 | <section id="requirements">
95 | <h2>Requirements<a class="headerlink" href="#requirements" title="Link to this heading"></a></h2>
96 | <p>Before you begin, ensure you have:</p>
97 | <ul class="simple">
98 | <li><p><a class="reference external" href="https://claude.ai/download">Claude Desktop</a> installed</p></li>
99 | <li><p><a class="reference external" href="https://python.org">Python</a> 3.10 or later installed</p></li>
100 | <li><p>Basic familiarity with command line interfaces</p></li>
101 | <li><p>wget installed (macOS users can install via Homebrew, Windows users need WSL/Ubuntu)</p></li>
102 | </ul>
103 | </section>
104 | <section id="installation-steps">
105 | <h2>Installation Steps<a class="headerlink" href="#installation-steps" title="Link to this heading"></a></h2>
106 | <section id="install-mcp-server-webcrawl">
107 | <h3>1. Install mcp-server-webcrawl<a class="headerlink" href="#install-mcp-server-webcrawl" title="Link to this heading"></a></h3>
108 | <p>Open your terminal or command line and install the package:</p>
109 | <div class="highlight-bash notranslate"><div class="highlight"><pre><span></span>pip<span class="w"> </span>install<span class="w"> </span>mcp-server-webcrawl
110 | </pre></div>
111 | </div>
112 | <p>Verify installation was successful by checking the version:</p>
113 | <div class="highlight-bash notranslate"><div class="highlight"><pre><span></span>mcp-server-webcrawl<span class="w"> </span>--help
114 | </pre></div>
115 | </div>
116 | </section>
117 | <section id="configure-claude-desktop">
118 | <h3>2. Configure Claude Desktop<a class="headerlink" href="#configure-claude-desktop" title="Link to this heading"></a></h3>
119 | <ol class="arabic simple">
120 | <li><p>Open Claude Desktop</p></li>
121 | <li><p>Go to <strong>File → Settings → Developer → Edit Config</strong></p></li>
122 | <li><p>Add the following configuration (modify paths as needed):</p></li>
123 | </ol>
124 | <div class="highlight-json notranslate"><div class="highlight"><pre><span></span><span class="p">{</span>
125 | <span class="w"> </span><span class="nt">"mcpServers"</span><span class="p">:</span><span class="w"> </span><span class="p">{</span>
126 | <span class="w"> </span><span class="nt">"webcrawl"</span><span class="p">:</span><span class="w"> </span><span class="p">{</span>
127 | <span class="w"> </span><span class="nt">"command"</span><span class="p">:</span><span class="w"> </span><span class="s2">"/path/to/mcp-server-webcrawl"</span><span class="p">,</span>
128 | <span class="w"> </span><span class="nt">"args"</span><span class="p">:</span><span class="w"> </span><span class="p">[</span><span class="s2">"--crawler"</span><span class="p">,</span><span class="w"> </span><span class="s2">"wget"</span><span class="p">,</span><span class="w"> </span><span class="s2">"--datasrc"</span><span class="p">,</span>
129 | <span class="w"> </span><span class="s2">"/path/to/wget/archives/"</span><span class="p">]</span>
130 | <span class="w"> </span><span class="p">}</span>
131 | <span class="w"> </span><span class="p">}</span>
132 | <span class="p">}</span>
133 | </pre></div>
134 | </div>
135 | <div class="admonition note">
136 | <p class="admonition-title">Note</p>
137 | <ul class="simple">
138 | <li><p>On Windows, use <code class="docutils literal notranslate"><span class="pre">"mcp-server-webcrawl"</span></code> as the command</p></li>
139 | <li><p>On macOS, use the absolute path (output of <code class="docutils literal notranslate"><span class="pre">which</span> <span class="pre">mcp-server-webcrawl</span></code>)</p></li>
140 | <li><p>Change <code class="docutils literal notranslate"><span class="pre">/path/to/wget/archives/</span></code> to your actual directory path</p></li>
141 | </ul>
142 | </div>
143 | <ol class="arabic simple" start="4">
144 | <li><p>Save the file and <strong>completely exit</strong> Claude Desktop (not just close the window)</p></li>
145 | <li><p>Restart Claude Desktop</p></li>
146 | </ol>
147 | </section>
148 | <section id="crawl-websites-with-wget">
149 | <h3>3. Crawl Websites with wget<a class="headerlink" href="#crawl-websites-with-wget" title="Link to this heading"></a></h3>
150 | <ol class="arabic simple">
151 | <li><p>Open Terminal (macOS) or Ubuntu/WSL (Windows)</p></li>
152 | <li><p>Navigate to your target directory for storing crawls</p></li>
153 | <li><p>Run wget with the mirror option:</p></li>
154 | </ol>
155 | <div class="highlight-bash notranslate"><div class="highlight"><pre><span></span>wget<span class="w"> </span>--mirror<span class="w"> </span>https://example.com
156 | </pre></div>
157 | </div>
158 | </section>
159 | <section id="verify-and-use">
160 | <h3>4. Verify and Use<a class="headerlink" href="#verify-and-use" title="Link to this heading"></a></h3>
161 | <ol class="arabic simple">
162 | <li><p>In Claude Desktop, you should now see an MCP tool option under Search and Tools</p></li>
163 | <li><p>Ask Claude to list your crawled sites:</p></li>
164 | </ol>
165 | <div class="highlight-text notranslate"><div class="highlight"><pre><span></span>Can you list the crawled sites available?
166 | </pre></div>
167 | </div>
168 | <ol class="arabic simple" start="3">
169 | <li><p>Try searching content from your crawls:</p></li>
170 | </ol>
171 | <div class="highlight-text notranslate"><div class="highlight"><pre><span></span>Can you find information about [topic] on [crawled site]?
172 | </pre></div>
173 | </div>
174 | </section>
175 | </section>
176 | <section id="troubleshooting">
177 | <h2>Troubleshooting<a class="headerlink" href="#troubleshooting" title="Link to this heading"></a></h2>
178 | <ul class="simple">
179 | <li><p>If Claude doesn’t show MCP tools after restart, verify your configuration file is correctly formatted</p></li>
180 | <li><p>Ensure Python and mcp-server-webcrawl are properly installed, and on PATH or using absolute paths</p></li>
181 | <li><p>Check that your crawl directory path in the configuration is correct</p></li>
182 | <li><p>Remember that the first time you use a function, Claude will ask for permission</p></li>
183 | <li><p>Indexing for file-based archives (wget included) requires build time on first search; the time depends on archive size</p></li>
184 | </ul>
185 | <p>For more details, including API documentation and other crawler options, visit the <a class="reference external" href="https://github.com/pragmar/mcp-server-webcrawl">mcp-server-webcrawl documentation</a>.</p>
186 | </section>
187 | </section>
188 |
189 |
190 | </div>
191 | </div>
192 | <footer><div class="rst-footer-buttons" role="navigation" aria-label="Footer">
193 | <a href="warc.html" class="btn btn-neutral float-left" title="WARC MCP Setup Guide" accesskey="p" rel="prev"><span class="fa fa-arrow-circle-left" aria-hidden="true"></span> Previous</a>
194 | <a href="../usage.html" class="btn btn-neutral float-right" title="Usage" accesskey="n" rel="next">Next <span class="fa fa-arrow-circle-right" aria-hidden="true"></span></a>
195 | </div>
196 |
197 | <hr/>
198 |
199 | <div role="contentinfo">
200 | <p>© Copyright 2025, pragmar.</p>
201 | </div>
202 |
203 | Built with <a href="https://www.sphinx-doc.org/">Sphinx</a> using a
204 | <a href="https://github.com/readthedocs/sphinx_rtd_theme">theme</a>
205 | provided by <a href="https://readthedocs.org">Read the Docs</a>.
206 |
207 |
208 | </footer>
209 | </div>
210 | </div>
211 | </section>
212 | </div>
213 | <script>
214 | jQuery(function () {
215 | SphinxRtdTheme.Navigation.enable(true);
216 | });
217 | </script>
218 |
219 | </body>
220 | </html>
```
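The `wget --mirror` run in the guide above writes one directory per crawled host under the archive root, and that root is what `--datasrc` points at. A small, hypothetical sanity check of that layout (the path is a placeholder for your actual archive directory):

```python
from pathlib import Path

# Placeholder; substitute the directory passed as --datasrc in the Claude Desktop config.
archives = Path("/path/to/wget/archives/")

# wget --mirror creates a folder per host (e.g. example.com/); each becomes a searchable site.
site_dirs = sorted(p.name for p in archives.iterdir() if p.is_dir())
print(f"{len(site_dirs)} crawl(s) found:", ", ".join(site_dirs) or "none yet")
```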
--------------------------------------------------------------------------------
/docs/_modules/mcp_server_webcrawl/extras/markdown.html:
--------------------------------------------------------------------------------
```html
1 |
2 |
3 | <!DOCTYPE html>
4 | <html class="writer-html5" lang="en" data-content_root="../../../">
5 | <head>
6 | <meta charset="utf-8" />
7 | <meta name="viewport" content="width=device-width, initial-scale=1.0" />
8 | <title>mcp_server_webcrawl.extras.markdown — mcp-server-webcrawl documentation</title>
9 | <link rel="stylesheet" type="text/css" href="../../../_static/pygments.css?v=80d5e7a1" />
10 | <link rel="stylesheet" type="text/css" href="../../../_static/css/theme.css?v=e59714d7" />
11 |
12 |
13 | <script src="../../../_static/jquery.js?v=5d32c60e"></script>
14 | <script src="../../../_static/_sphinx_javascript_frameworks_compat.js?v=2cd50e6c"></script>
15 | <script src="../../../_static/documentation_options.js?v=5929fcd5"></script>
16 | <script src="../../../_static/doctools.js?v=888ff710"></script>
17 | <script src="../../../_static/sphinx_highlight.js?v=dc90522c"></script>
18 | <script src="../../../_static/js/theme.js"></script>
19 | <link rel="index" title="Index" href="../../../genindex.html" />
20 | <link rel="search" title="Search" href="../../../search.html" />
21 | </head>
22 |
23 | <body class="wy-body-for-nav">
24 | <div class="wy-grid-for-nav">
25 | <nav data-toggle="wy-nav-shift" class="wy-nav-side">
26 | <div class="wy-side-scroll">
27 | <div class="wy-side-nav-search" >
28 |
29 |
30 |
31 | <a href="../../../index.html" class="icon icon-home">
32 | mcp-server-webcrawl
33 | </a>
34 | <div role="search">
35 | <form id="rtd-search-form" class="wy-form" action="../../../search.html" method="get">
36 | <input type="text" name="q" placeholder="Search docs" aria-label="Search docs" />
37 | <input type="hidden" name="check_keywords" value="yes" />
38 | <input type="hidden" name="area" value="default" />
39 | </form>
40 | </div>
41 | </div><div class="wy-menu wy-menu-vertical" data-spy="affix" role="navigation" aria-label="Navigation menu">
42 | <p class="caption" role="heading"><span class="caption-text">Contents:</span></p>
43 | <ul>
44 | <li class="toctree-l1"><a class="reference internal" href="../../../installation.html">Installation</a></li>
45 | <li class="toctree-l1"><a class="reference internal" href="../../../guides.html">Setup Guides</a></li>
46 | <li class="toctree-l1"><a class="reference internal" href="../../../usage.html">Usage</a></li>
47 | <li class="toctree-l1"><a class="reference internal" href="../../../prompts.html">Prompt Routines</a></li>
48 | <li class="toctree-l1"><a class="reference internal" href="../../../modules.html">mcp_server_webcrawl</a></li>
49 | </ul>
50 |
51 | </div>
52 | </div>
53 | </nav>
54 |
55 | <section data-toggle="wy-nav-shift" class="wy-nav-content-wrap"><nav class="wy-nav-top" aria-label="Mobile navigation menu" >
56 | <i data-toggle="wy-nav-top" class="fa fa-bars"></i>
57 | <a href="../../../index.html">mcp-server-webcrawl</a>
58 | </nav>
59 |
60 | <div class="wy-nav-content">
61 | <div class="rst-content">
62 | <div role="navigation" aria-label="Page navigation">
63 | <ul class="wy-breadcrumbs">
64 | <li><a href="../../../index.html" class="icon icon-home" aria-label="Home"></a></li>
65 | <li class="breadcrumb-item"><a href="../../index.html">Module code</a></li>
66 | <li class="breadcrumb-item active">mcp_server_webcrawl.extras.markdown</li>
67 | <li class="wy-breadcrumbs-aside">
68 | </li>
69 | </ul>
70 | <hr/>
71 | </div>
72 | <div role="main" class="document" itemscope="itemscope" itemtype="http://schema.org/Article">
73 | <div itemprop="articleBody">
74 |
75 | <h1>Source code for mcp_server_webcrawl.extras.markdown</h1><div class="highlight"><pre>
76 | <span></span><span class="kn">import</span> <span class="nn">re</span>
77 | <span class="kn">from</span> <span class="nn">importlib</span> <span class="kn">import</span> <span class="n">resources</span>
78 | <span class="kn">from</span> <span class="nn">typing</span> <span class="kn">import</span> <span class="n">Final</span><span class="p">,</span> <span class="n">Any</span>
79 | <span class="kn">from</span> <span class="nn">lxml</span> <span class="kn">import</span> <span class="n">etree</span><span class="p">,</span> <span class="n">html</span>
80 | <span class="kn">from</span> <span class="nn">lxml.etree</span> <span class="kn">import</span> <span class="n">ParserError</span>
81 | <span class="kn">from</span> <span class="nn">logging</span> <span class="kn">import</span> <span class="n">Logger</span>
82 |
83 | <span class="kn">from</span> <span class="nn">mcp_server_webcrawl.utils.logger</span> <span class="kn">import</span> <span class="n">get_logger</span>
84 |
85 | <span class="n">__XSLT_RESULT_CLEANER</span><span class="p">:</span> <span class="n">Final</span><span class="p">[</span><span class="n">re</span><span class="o">.</span><span class="n">Pattern</span><span class="p">]</span> <span class="o">=</span> <span class="n">re</span><span class="o">.</span><span class="n">compile</span><span class="p">(</span><span class="sa">r</span><span class="s2">"(?:\n\s*-\s*\n|\n\s*\n)+"</span><span class="p">)</span>
86 | <span class="n">__RE_HTML</span><span class="p">:</span> <span class="n">Final</span><span class="p">[</span><span class="n">re</span><span class="o">.</span><span class="n">Pattern</span><span class="p">]</span> <span class="o">=</span> <span class="n">re</span><span class="o">.</span><span class="n">compile</span><span class="p">(</span><span class="sa">r</span><span class="s2">"<[a-zA-Z]+[^>]*>"</span><span class="p">)</span>
87 |
88 | <span class="n">logger</span><span class="p">:</span> <span class="n">Logger</span> <span class="o">=</span> <span class="n">get_logger</span><span class="p">()</span>
89 |
90 | <div class="viewcode-block" id="MarkdownTransformer">
91 | <a class="viewcode-back" href="../../../mcp_server_webcrawl.extras.html#mcp_server_webcrawl.extras.markdown.MarkdownTransformer">[docs]</a>
92 | <span class="k">class</span> <span class="nc">MarkdownTransformer</span><span class="p">:</span>
93 | <span class="w"> </span><span class="sd">"""</span>
94 | <span class="sd"> Memoizes the XSLT transformer</span>
95 | <span class="sd"> """</span>
96 | <span class="n">_xslt_transform</span> <span class="o">=</span> <span class="kc">None</span>
97 |
98 | <div class="viewcode-block" id="MarkdownTransformer.get_xslt_transform">
99 | <a class="viewcode-back" href="../../../mcp_server_webcrawl.extras.html#mcp_server_webcrawl.extras.markdown.MarkdownTransformer.get_xslt_transform">[docs]</a>
100 | <span class="nd">@classmethod</span>
101 | <span class="k">def</span> <span class="nf">get_xslt_transform</span><span class="p">(</span><span class="bp">cls</span><span class="p">):</span>
102 | <span class="w"> </span><span class="sd">"""</span>
103 | <span class="sd"> Get the HTML to text markdown XSLT transformer</span>
104 | <span class="sd"> """</span>
105 | <span class="k">if</span> <span class="bp">cls</span><span class="o">.</span><span class="n">_xslt_transform</span> <span class="ow">is</span> <span class="kc">None</span><span class="p">:</span>
106 | <span class="n">xslt_string</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="n">resources</span><span class="o">.</span><span class="n">read_text</span><span class="p">(</span><span class="s2">"mcp_server_webcrawl.templates"</span><span class="p">,</span> <span class="s2">"markdown.xslt"</span><span class="p">)</span><span class="o">.</span><span class="n">encode</span><span class="p">(</span><span class="s2">"utf-8"</span><span class="p">)</span>
107 | <span class="n">xslt_doc</span> <span class="o">=</span> <span class="n">etree</span><span class="o">.</span><span class="n">fromstring</span><span class="p">(</span><span class="n">xslt_string</span><span class="p">)</span>
108 | <span class="bp">cls</span><span class="o">.</span><span class="n">_xslt_transform</span> <span class="o">=</span> <span class="n">etree</span><span class="o">.</span><span class="n">XSLT</span><span class="p">(</span><span class="n">xslt_doc</span><span class="p">)</span>
109 | <span class="k">return</span> <span class="bp">cls</span><span class="o">.</span><span class="n">_xslt_transform</span></div>
110 | </div>
111 |
112 |
113 | <div class="viewcode-block" id="get_markdown">
114 | <a class="viewcode-back" href="../../../mcp_server_webcrawl.extras.html#mcp_server_webcrawl.extras.markdown.get_markdown">[docs]</a>
115 | <span class="k">def</span> <span class="nf">get_markdown</span><span class="p">(</span><span class="n">content</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-></span> <span class="nb">str</span> <span class="o">|</span> <span class="kc">None</span><span class="p">:</span>
116 | <span class="w"> </span><span class="sd">"""</span>
117 | <span class="sd"> Transform HTML content to Markdown using XSLT.</span>
118 |
119 | <span class="sd"> Args:</span>
120 | <span class="sd"> content (str): The HTML content to transform.</span>
121 |
122 | <span class="sd"> Returns:</span>
123 | <span class="sd"> str | None: The transformed Markdown string, or None if the input is empty</span>
124 | <span class="sd"> or if transformation fails (e.g., due to invalid HTML or XSLT errors).</span>
125 | <span class="sd"> """</span>
126 |
127 | <span class="n">transformer</span> <span class="o">=</span> <span class="n">MarkdownTransformer</span><span class="o">.</span><span class="n">get_xslt_transform</span><span class="p">()</span>
128 | <span class="n">content</span><span class="p">:</span><span class="nb">str</span> <span class="o">=</span> <span class="n">content</span> <span class="ow">or</span> <span class="s2">""</span>
129 | <span class="k">assert</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">content</span><span class="p">,</span> <span class="nb">str</span><span class="p">),</span> <span class="s2">"String (HTML) required for transformer"</span>
130 | <span class="k">assert</span> <span class="n">transformer</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span>
131 |
132 | <span class="k">if</span> <span class="n">content</span> <span class="o">==</span> <span class="s2">""</span> <span class="ow">or</span> <span class="ow">not</span> <span class="n">__RE_HTML</span><span class="o">.</span><span class="n">search</span><span class="p">(</span><span class="n">content</span><span class="p">):</span>
133 | <span class="k">return</span> <span class="kc">None</span>
134 |
135 | <span class="k">try</span><span class="p">:</span>
136 | <span class="n">doc</span> <span class="o">=</span> <span class="n">html</span><span class="o">.</span><span class="n">fromstring</span><span class="p">(</span><span class="n">content</span><span class="p">)</span>
137 | <span class="n">result</span> <span class="o">=</span> <span class="nb">str</span><span class="p">(</span><span class="n">transformer</span><span class="p">(</span><span class="n">doc</span><span class="p">))</span>
138 | <span class="n">result</span> <span class="o">=</span> <span class="n">__XSLT_RESULT_CLEANER</span><span class="o">.</span><span class="n">sub</span><span class="p">(</span><span class="s2">"</span><span class="se">\n\n</span><span class="s2">"</span><span class="p">,</span> <span class="n">result</span><span class="p">)</span><span class="o">.</span><span class="n">strip</span><span class="p">()</span>
139 | <span class="k">return</span> <span class="n">result</span>
140 |
141 | <span class="k">except</span> <span class="ne">Exception</span> <span class="k">as</span> <span class="n">ex</span><span class="p">:</span>
142 | <span class="n">logger</span><span class="o">.</span><span class="n">warning</span><span class="p">(</span><span class="sa">f</span><span class="s2">"XSLT transform error: </span><span class="si">{</span><span class="nb">type</span><span class="p">(</span><span class="n">ex</span><span class="p">)</span><span class="o">.</span><span class="vm">__name__</span><span class="si">}</span><span class="se">\n</span><span class="si">{</span><span class="n">ex</span><span class="si">}</span><span class="s2">"</span><span class="p">)</span>
143 | <span class="k">return</span> <span class="kc">None</span></div>
144 |
145 | </pre></div>
146 |
147 | </div>
148 | </div>
149 | <footer>
150 |
151 | <hr/>
152 |
153 | <div role="contentinfo">
154 | <p>© Copyright 2025, pragmar.</p>
155 | </div>
156 |
157 | Built with <a href="https://www.sphinx-doc.org/">Sphinx</a> using a
158 | <a href="https://github.com/readthedocs/sphinx_rtd_theme">theme</a>
159 | provided by <a href="https://readthedocs.org">Read the Docs</a>.
160 |
161 |
162 | </footer>
163 | </div>
164 | </div>
165 | </section>
166 | </div>
167 | <script>
168 | jQuery(function () {
169 | SphinxRtdTheme.Navigation.enable(true);
170 | });
171 | </script>
172 |
173 | </body>
174 | </html>
```
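As a quick usage sketch of the module rendered above: `get_markdown` returns `None` when the input is empty or does not look like HTML, and a cleaned markdown string otherwise, so a caller can fall back to the raw content when the transform declines:

```python
from mcp_server_webcrawl.extras.markdown import get_markdown

html_content = "<html><body><h1>Hello</h1><p>A <a href='/docs'>link</a>.</p></body></html>"

markdown = get_markdown(html_content)
if markdown is None:
    # Empty or non-HTML input; keep the original content instead.
    markdown = html_content
print(markdown)
```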