This is page 8 of 35. Use http://codebase.md/pragmar/mcp_server_webcrawl?lines=true&page={x} to view the full context.
# Directory Structure
```
├── .gitignore
├── CONTRIBUTING.md
├── docs
│ ├── _images
│ │ ├── interactive.document.webp
│ │ ├── interactive.search.webp
│ │ └── mcpswc.svg
│ ├── _modules
│ │ ├── index.html
│ │ ├── mcp_server_webcrawl
│ │ │ ├── crawlers
│ │ │ │ ├── archivebox
│ │ │ │ │ ├── adapter.html
│ │ │ │ │ ├── crawler.html
│ │ │ │ │ └── tests.html
│ │ │ │ ├── base
│ │ │ │ │ ├── adapter.html
│ │ │ │ │ ├── api.html
│ │ │ │ │ ├── crawler.html
│ │ │ │ │ ├── indexed.html
│ │ │ │ │ └── tests.html
│ │ │ │ ├── httrack
│ │ │ │ │ ├── adapter.html
│ │ │ │ │ ├── crawler.html
│ │ │ │ │ └── tests.html
│ │ │ │ ├── interrobot
│ │ │ │ │ ├── adapter.html
│ │ │ │ │ ├── crawler.html
│ │ │ │ │ └── tests.html
│ │ │ │ ├── katana
│ │ │ │ │ ├── adapter.html
│ │ │ │ │ ├── crawler.html
│ │ │ │ │ └── tests.html
│ │ │ │ ├── siteone
│ │ │ │ │ ├── adapter.html
│ │ │ │ │ ├── crawler.html
│ │ │ │ │ └── tests.html
│ │ │ │ ├── warc
│ │ │ │ │ ├── adapter.html
│ │ │ │ │ ├── crawler.html
│ │ │ │ │ └── tests.html
│ │ │ │ └── wget
│ │ │ │ ├── adapter.html
│ │ │ │ ├── crawler.html
│ │ │ │ └── tests.html
│ │ │ ├── crawlers.html
│ │ │ ├── extras
│ │ │ │ ├── markdown.html
│ │ │ │ ├── regex.html
│ │ │ │ ├── snippets.html
│ │ │ │ ├── thumbnails.html
│ │ │ │ └── xpath.html
│ │ │ ├── interactive
│ │ │ │ ├── highlights.html
│ │ │ │ ├── search.html
│ │ │ │ ├── session.html
│ │ │ │ └── ui.html
│ │ │ ├── main.html
│ │ │ ├── models
│ │ │ │ ├── resources.html
│ │ │ │ └── sites.html
│ │ │ ├── templates
│ │ │ │ └── tests.html
│ │ │ ├── utils
│ │ │ │ ├── blobs.html
│ │ │ │ ├── cli.html
│ │ │ │ ├── logger.html
│ │ │ │ ├── querycache.html
│ │ │ │ ├── server.html
│ │ │ │ └── tools.html
│ │ │ └── utils.html
│ │ └── re.html
│ ├── _sources
│ │ ├── guides
│ │ │ ├── archivebox.rst.txt
│ │ │ ├── httrack.rst.txt
│ │ │ ├── interrobot.rst.txt
│ │ │ ├── katana.rst.txt
│ │ │ ├── siteone.rst.txt
│ │ │ ├── warc.rst.txt
│ │ │ └── wget.rst.txt
│ │ ├── guides.rst.txt
│ │ ├── index.rst.txt
│ │ ├── installation.rst.txt
│ │ ├── interactive.rst.txt
│ │ ├── mcp_server_webcrawl.crawlers.archivebox.rst.txt
│ │ ├── mcp_server_webcrawl.crawlers.base.rst.txt
│ │ ├── mcp_server_webcrawl.crawlers.httrack.rst.txt
│ │ ├── mcp_server_webcrawl.crawlers.interrobot.rst.txt
│ │ ├── mcp_server_webcrawl.crawlers.katana.rst.txt
│ │ ├── mcp_server_webcrawl.crawlers.rst.txt
│ │ ├── mcp_server_webcrawl.crawlers.siteone.rst.txt
│ │ ├── mcp_server_webcrawl.crawlers.warc.rst.txt
│ │ ├── mcp_server_webcrawl.crawlers.wget.rst.txt
│ │ ├── mcp_server_webcrawl.extras.rst.txt
│ │ ├── mcp_server_webcrawl.interactive.rst.txt
│ │ ├── mcp_server_webcrawl.models.rst.txt
│ │ ├── mcp_server_webcrawl.rst.txt
│ │ ├── mcp_server_webcrawl.templates.rst.txt
│ │ ├── mcp_server_webcrawl.utils.rst.txt
│ │ ├── modules.rst.txt
│ │ ├── prompts.rst.txt
│ │ └── usage.rst.txt
│ ├── _static
│ │ ├── _sphinx_javascript_frameworks_compat.js
│ │ ├── basic.css
│ │ ├── css
│ │ │ ├── badge_only.css
│ │ │ ├── fonts
│ │ │ │ ├── fontawesome-webfont.eot
│ │ │ │ ├── fontawesome-webfont.svg
│ │ │ │ ├── fontawesome-webfont.ttf
│ │ │ │ ├── fontawesome-webfont.woff
│ │ │ │ ├── fontawesome-webfont.woff2
│ │ │ │ ├── lato-bold-italic.woff
│ │ │ │ ├── lato-bold-italic.woff2
│ │ │ │ ├── lato-bold.woff
│ │ │ │ ├── lato-bold.woff2
│ │ │ │ ├── lato-normal-italic.woff
│ │ │ │ ├── lato-normal-italic.woff2
│ │ │ │ ├── lato-normal.woff
│ │ │ │ ├── lato-normal.woff2
│ │ │ │ ├── Roboto-Slab-Bold.woff
│ │ │ │ ├── Roboto-Slab-Bold.woff2
│ │ │ │ ├── Roboto-Slab-Regular.woff
│ │ │ │ └── Roboto-Slab-Regular.woff2
│ │ │ └── theme.css
│ │ ├── doctools.js
│ │ ├── documentation_options.js
│ │ ├── file.png
│ │ ├── fonts
│ │ │ ├── Lato
│ │ │ │ ├── lato-bold.eot
│ │ │ │ ├── lato-bold.ttf
│ │ │ │ ├── lato-bold.woff
│ │ │ │ ├── lato-bold.woff2
│ │ │ │ ├── lato-bolditalic.eot
│ │ │ │ ├── lato-bolditalic.ttf
│ │ │ │ ├── lato-bolditalic.woff
│ │ │ │ ├── lato-bolditalic.woff2
│ │ │ │ ├── lato-italic.eot
│ │ │ │ ├── lato-italic.ttf
│ │ │ │ ├── lato-italic.woff
│ │ │ │ ├── lato-italic.woff2
│ │ │ │ ├── lato-regular.eot
│ │ │ │ ├── lato-regular.ttf
│ │ │ │ ├── lato-regular.woff
│ │ │ │ └── lato-regular.woff2
│ │ │ └── RobotoSlab
│ │ │ ├── roboto-slab-v7-bold.eot
│ │ │ ├── roboto-slab-v7-bold.ttf
│ │ │ ├── roboto-slab-v7-bold.woff
│ │ │ ├── roboto-slab-v7-bold.woff2
│ │ │ ├── roboto-slab-v7-regular.eot
│ │ │ ├── roboto-slab-v7-regular.ttf
│ │ │ ├── roboto-slab-v7-regular.woff
│ │ │ └── roboto-slab-v7-regular.woff2
│ │ ├── images
│ │ │ ├── interactive.document.png
│ │ │ ├── interactive.document.webp
│ │ │ ├── interactive.search.png
│ │ │ ├── interactive.search.webp
│ │ │ └── mcpswc.svg
│ │ ├── jquery.js
│ │ ├── js
│ │ │ ├── badge_only.js
│ │ │ ├── theme.js
│ │ │ └── versions.js
│ │ ├── language_data.js
│ │ ├── minus.png
│ │ ├── plus.png
│ │ ├── pygments.css
│ │ ├── searchtools.js
│ │ └── sphinx_highlight.js
│ ├── .buildinfo
│ ├── .nojekyll
│ ├── genindex.html
│ ├── guides
│ │ ├── archivebox.html
│ │ ├── httrack.html
│ │ ├── interrobot.html
│ │ ├── katana.html
│ │ ├── siteone.html
│ │ ├── warc.html
│ │ └── wget.html
│ ├── guides.html
│ ├── index.html
│ ├── installation.html
│ ├── interactive.html
│ ├── mcp_server_webcrawl.crawlers.archivebox.html
│ ├── mcp_server_webcrawl.crawlers.base.html
│ ├── mcp_server_webcrawl.crawlers.html
│ ├── mcp_server_webcrawl.crawlers.httrack.html
│ ├── mcp_server_webcrawl.crawlers.interrobot.html
│ ├── mcp_server_webcrawl.crawlers.katana.html
│ ├── mcp_server_webcrawl.crawlers.siteone.html
│ ├── mcp_server_webcrawl.crawlers.warc.html
│ ├── mcp_server_webcrawl.crawlers.wget.html
│ ├── mcp_server_webcrawl.extras.html
│ ├── mcp_server_webcrawl.html
│ ├── mcp_server_webcrawl.interactive.html
│ ├── mcp_server_webcrawl.models.html
│ ├── mcp_server_webcrawl.templates.html
│ ├── mcp_server_webcrawl.utils.html
│ ├── modules.html
│ ├── objects.inv
│ ├── prompts.html
│ ├── py-modindex.html
│ ├── search.html
│ ├── searchindex.js
│ └── usage.html
├── LICENSE
├── MANIFEST.in
├── prompts
│ ├── audit404.md
│ ├── auditfiles.md
│ ├── auditperf.md
│ ├── auditseo.md
│ ├── gopher.md
│ ├── README.md
│ └── testsearch.md
├── pyproject.toml
├── README.md
├── setup.py
├── sphinx
│ ├── _static
│ │ └── images
│ │ ├── interactive.document.png
│ │ ├── interactive.document.webp
│ │ ├── interactive.search.png
│ │ ├── interactive.search.webp
│ │ └── mcpswc.svg
│ ├── _templates
│ │ └── layout.html
│ ├── conf.py
│ ├── guides
│ │ ├── archivebox.rst
│ │ ├── httrack.rst
│ │ ├── interrobot.rst
│ │ ├── katana.rst
│ │ ├── siteone.rst
│ │ ├── warc.rst
│ │ └── wget.rst
│ ├── guides.rst
│ ├── index.rst
│ ├── installation.rst
│ ├── interactive.rst
│ ├── make.bat
│ ├── Makefile
│ ├── mcp_server_webcrawl.crawlers.archivebox.rst
│ ├── mcp_server_webcrawl.crawlers.base.rst
│ ├── mcp_server_webcrawl.crawlers.httrack.rst
│ ├── mcp_server_webcrawl.crawlers.interrobot.rst
│ ├── mcp_server_webcrawl.crawlers.katana.rst
│ ├── mcp_server_webcrawl.crawlers.rst
│ ├── mcp_server_webcrawl.crawlers.siteone.rst
│ ├── mcp_server_webcrawl.crawlers.warc.rst
│ ├── mcp_server_webcrawl.crawlers.wget.rst
│ ├── mcp_server_webcrawl.extras.rst
│ ├── mcp_server_webcrawl.interactive.rst
│ ├── mcp_server_webcrawl.models.rst
│ ├── mcp_server_webcrawl.rst
│ ├── mcp_server_webcrawl.templates.rst
│ ├── mcp_server_webcrawl.utils.rst
│ ├── modules.rst
│ ├── prompts.rst
│ ├── readme.txt
│ └── usage.rst
└── src
└── mcp_server_webcrawl
├── __init__.py
├── crawlers
│ ├── __init__.py
│ ├── archivebox
│ │ ├── __init__.py
│ │ ├── adapter.py
│ │ ├── crawler.py
│ │ └── tests.py
│ ├── base
│ │ ├── __init__.py
│ │ ├── adapter.py
│ │ ├── api.py
│ │ ├── crawler.py
│ │ ├── indexed.py
│ │ └── tests.py
│ ├── httrack
│ │ ├── __init__.py
│ │ ├── adapter.py
│ │ ├── crawler.py
│ │ └── tests.py
│ ├── interrobot
│ │ ├── __init__.py
│ │ ├── adapter.py
│ │ ├── crawler.py
│ │ └── tests.py
│ ├── katana
│ │ ├── __init__.py
│ │ ├── adapter.py
│ │ ├── crawler.py
│ │ └── tests.py
│ ├── siteone
│ │ ├── __init__.py
│ │ ├── adapter.py
│ │ ├── crawler.py
│ │ └── tests.py
│ ├── warc
│ │ ├── __init__.py
│ │ ├── adapter.py
│ │ ├── crawler.py
│ │ └── tests.py
│ └── wget
│ ├── __init__.py
│ ├── adapter.py
│ ├── crawler.py
│ └── tests.py
├── extras
│ ├── __init__.py
│ ├── markdown.py
│ ├── regex.py
│ ├── snippets.py
│ ├── thumbnails.py
│ └── xpath.py
├── interactive
│ ├── __init__.py
│ ├── highlights.py
│ ├── search.py
│ ├── session.py
│ ├── ui.py
│ └── views
│ ├── base.py
│ ├── document.py
│ ├── help.py
│ ├── requirements.py
│ ├── results.py
│ └── searchform.py
├── main.py
├── models
│ ├── __init__.py
│ ├── base.py
│ ├── resources.py
│ └── sites.py
├── settings.py
├── templates
│ ├── __init__.py
│ ├── markdown.xslt
│ ├── tests_core.html
│ └── tests.py
└── utils
├── __init__.py
├── cli.py
├── logger.py
├── parser.py
├── parsetab.py
├── search.py
├── server.py
├── tests.py
└── tools.py
```
# Files
--------------------------------------------------------------------------------
/docs/_modules/mcp_server_webcrawl/crawlers/katana/tests.html:
--------------------------------------------------------------------------------
```html
1 |
2 |
3 | <!DOCTYPE html>
4 | <html class="writer-html5" lang="en" data-content_root="../../../../">
5 | <head>
6 | <meta charset="utf-8" />
7 | <meta name="viewport" content="width=device-width, initial-scale=1.0" />
8 | <title>mcp_server_webcrawl.crawlers.katana.tests — mcp-server-webcrawl documentation</title>
9 | <link rel="stylesheet" type="text/css" href="../../../../_static/pygments.css?v=80d5e7a1" />
10 | <link rel="stylesheet" type="text/css" href="../../../../_static/css/theme.css?v=e59714d7" />
11 |
12 |
13 | <script src="../../../../_static/jquery.js?v=5d32c60e"></script>
14 | <script src="../../../../_static/_sphinx_javascript_frameworks_compat.js?v=2cd50e6c"></script>
15 | <script src="../../../../_static/documentation_options.js?v=5929fcd5"></script>
16 | <script src="../../../../_static/doctools.js?v=888ff710"></script>
17 | <script src="../../../../_static/sphinx_highlight.js?v=dc90522c"></script>
18 | <script src="../../../../_static/js/theme.js"></script>
19 | <link rel="index" title="Index" href="../../../../genindex.html" />
20 | <link rel="search" title="Search" href="../../../../search.html" />
21 | </head>
22 |
23 | <body class="wy-body-for-nav">
24 | <div class="wy-grid-for-nav">
25 | <nav data-toggle="wy-nav-shift" class="wy-nav-side">
26 | <div class="wy-side-scroll">
27 | <div class="wy-side-nav-search" >
28 |
29 |
30 |
31 | <a href="../../../../index.html" class="icon icon-home">
32 | mcp-server-webcrawl
33 | </a>
34 | <div role="search">
35 | <form id="rtd-search-form" class="wy-form" action="../../../../search.html" method="get">
36 | <input type="text" name="q" placeholder="Search docs" aria-label="Search docs" />
37 | <input type="hidden" name="check_keywords" value="yes" />
38 | <input type="hidden" name="area" value="default" />
39 | </form>
40 | </div>
41 | </div><div class="wy-menu wy-menu-vertical" data-spy="affix" role="navigation" aria-label="Navigation menu">
42 | <p class="caption" role="heading"><span class="caption-text">Contents:</span></p>
43 | <ul>
44 | <li class="toctree-l1"><a class="reference internal" href="../../../../installation.html">Installation</a></li>
45 | <li class="toctree-l1"><a class="reference internal" href="../../../../guides.html">Setup Guides</a></li>
46 | <li class="toctree-l1"><a class="reference internal" href="../../../../usage.html">Usage</a></li>
47 | <li class="toctree-l1"><a class="reference internal" href="../../../../prompts.html">Prompt Routines</a></li>
48 | <li class="toctree-l1"><a class="reference internal" href="../../../../modules.html">mcp_server_webcrawl</a></li>
49 | </ul>
50 |
51 | </div>
52 | </div>
53 | </nav>
54 |
55 | <section data-toggle="wy-nav-shift" class="wy-nav-content-wrap"><nav class="wy-nav-top" aria-label="Mobile navigation menu" >
56 | <i data-toggle="wy-nav-top" class="fa fa-bars"></i>
57 | <a href="../../../../index.html">mcp-server-webcrawl</a>
58 | </nav>
59 |
60 | <div class="wy-nav-content">
61 | <div class="rst-content">
62 | <div role="navigation" aria-label="Page navigation">
63 | <ul class="wy-breadcrumbs">
64 | <li><a href="../../../../index.html" class="icon icon-home" aria-label="Home"></a></li>
65 | <li class="breadcrumb-item"><a href="../../../index.html">Module code</a></li>
66 | <li class="breadcrumb-item"><a href="../../crawlers.html">mcp_server_webcrawl.crawlers</a></li>
67 | <li class="breadcrumb-item active">mcp_server_webcrawl.crawlers.katana.tests</li>
68 | <li class="wy-breadcrumbs-aside">
69 | </li>
70 | </ul>
71 | <hr/>
72 | </div>
73 | <div role="main" class="document" itemscope="itemscope" itemtype="http://schema.org/Article">
74 | <div itemprop="articleBody">
75 |
76 | <h1>Source code for mcp_server_webcrawl.crawlers.katana.tests</h1><div class="highlight"><pre>
77 | <span></span><span class="kn">from</span> <span class="nn">logging</span> <span class="kn">import</span> <span class="n">Logger</span>
78 | <span class="kn">from</span> <span class="nn">mcp_server_webcrawl.crawlers.katana.crawler</span> <span class="kn">import</span> <span class="n">KatanaCrawler</span>
79 | <span class="kn">from</span> <span class="nn">mcp_server_webcrawl.crawlers.katana.adapter</span> <span class="kn">import</span> <span class="n">KatanaManager</span>
80 | <span class="kn">from</span> <span class="nn">mcp_server_webcrawl.crawlers.base.adapter</span> <span class="kn">import</span> <span class="n">SitesGroup</span>
81 | <span class="kn">from</span> <span class="nn">mcp_server_webcrawl.crawlers.base.tests</span> <span class="kn">import</span> <span class="n">BaseCrawlerTests</span>
82 | <span class="kn">from</span> <span class="nn">mcp_server_webcrawl.crawlers</span> <span class="kn">import</span> <span class="n">get_fixture_directory</span>
83 | <span class="kn">from</span> <span class="nn">mcp_server_webcrawl.utils.logger</span> <span class="kn">import</span> <span class="n">get_logger</span>
84 |
85 | <span class="c1"># calculate ids for test directories using the same hash function as adapter</span>
86 | <span class="n">EXAMPLE_SITE_ID</span> <span class="o">=</span> <span class="n">KatanaManager</span><span class="o">.</span><span class="n">string_to_id</span><span class="p">(</span><span class="s2">"example.com"</span><span class="p">)</span>
87 | <span class="n">PRAGMAR_SITE_ID</span> <span class="o">=</span> <span class="n">KatanaManager</span><span class="o">.</span><span class="n">string_to_id</span><span class="p">(</span><span class="s2">"pragmar.com"</span><span class="p">)</span>
88 |
89 | <span class="n">logger</span><span class="p">:</span> <span class="n">Logger</span> <span class="o">=</span> <span class="n">get_logger</span><span class="p">()</span>
90 |
91 | <div class="viewcode-block" id="KatanaTests">
92 | <a class="viewcode-back" href="../../../../mcp_server_webcrawl.crawlers.katana.html#mcp_server_webcrawl.crawlers.katana.tests.KatanaTests">[docs]</a>
93 | <span class="k">class</span> <span class="nc">KatanaTests</span><span class="p">(</span><span class="n">BaseCrawlerTests</span><span class="p">):</span>
94 | <span class="w"> </span><span class="sd">"""</span>
95 | <span class="sd"> test suite for the HTTP text crawler implementation.</span>
96 | <span class="sd"> tests parsing and retrieval of web content from HTTP text files.</span>
97 | <span class="sd"> """</span>
98 |
99 | <div class="viewcode-block" id="KatanaTests.setUp">
100 | <a class="viewcode-back" href="../../../../mcp_server_webcrawl.crawlers.katana.html#mcp_server_webcrawl.crawlers.katana.tests.KatanaTests.setUp">[docs]</a>
101 | <span class="k">def</span> <span class="nf">setUp</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
102 | <span class="w"> </span><span class="sd">"""</span>
103 | <span class="sd"> set up the test environment with fixture data.</span>
104 | <span class="sd"> """</span>
105 | <span class="nb">super</span><span class="p">()</span><span class="o">.</span><span class="n">setUp</span><span class="p">()</span>
106 | <span class="bp">self</span><span class="o">.</span><span class="n">_datasrc</span> <span class="o">=</span> <span class="n">get_fixture_directory</span><span class="p">()</span> <span class="o">/</span> <span class="s2">"katana"</span></div>
107 |
108 |
109 | <div class="viewcode-block" id="KatanaTests.test_katana_pulse">
110 | <a class="viewcode-back" href="../../../../mcp_server_webcrawl.crawlers.katana.html#mcp_server_webcrawl.crawlers.katana.tests.KatanaTests.test_katana_pulse">[docs]</a>
111 | <span class="k">def</span> <span class="nf">test_katana_pulse</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
112 | <span class="w"> </span><span class="sd">"""</span>
113 | <span class="sd"> basic crawler initialization.</span>
114 | <span class="sd"> """</span>
115 | <span class="n">crawler</span> <span class="o">=</span> <span class="n">KatanaCrawler</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_datasrc</span><span class="p">)</span>
116 | <span class="bp">self</span><span class="o">.</span><span class="n">assertIsNotNone</span><span class="p">(</span><span class="n">crawler</span><span class="p">)</span>
117 | <span class="bp">self</span><span class="o">.</span><span class="n">assertTrue</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_datasrc</span><span class="o">.</span><span class="n">is_dir</span><span class="p">())</span></div>
118 |
119 |
120 | <div class="viewcode-block" id="KatanaTests.test_katana_sites">
121 | <a class="viewcode-back" href="../../../../mcp_server_webcrawl.crawlers.katana.html#mcp_server_webcrawl.crawlers.katana.tests.KatanaTests.test_katana_sites">[docs]</a>
122 | <span class="k">def</span> <span class="nf">test_katana_sites</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
123 | <span class="w"> </span><span class="sd">"""</span>
124 | <span class="sd"> site retrieval API functionality.</span>
125 | <span class="sd"> """</span>
126 | <span class="n">crawler</span> <span class="o">=</span> <span class="n">KatanaCrawler</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_datasrc</span><span class="p">)</span>
127 | <span class="bp">self</span><span class="o">.</span><span class="n">run_pragmar_site_tests</span><span class="p">(</span><span class="n">crawler</span><span class="p">,</span> <span class="n">PRAGMAR_SITE_ID</span><span class="p">)</span></div>
128 |
129 |
130 | <div class="viewcode-block" id="KatanaTests.test_katana_search">
131 | <a class="viewcode-back" href="../../../../mcp_server_webcrawl.crawlers.katana.html#mcp_server_webcrawl.crawlers.katana.tests.KatanaTests.test_katana_search">[docs]</a>
132 | <span class="k">def</span> <span class="nf">test_katana_search</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
133 | <span class="w"> </span><span class="sd">"""</span>
134 | <span class="sd"> boolean search tests</span>
135 | <span class="sd"> """</span>
136 | <span class="n">crawler</span> <span class="o">=</span> <span class="n">KatanaCrawler</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_datasrc</span><span class="p">)</span>
137 | <span class="bp">self</span><span class="o">.</span><span class="n">run_pragmar_search_tests</span><span class="p">(</span><span class="n">crawler</span><span class="p">,</span> <span class="n">PRAGMAR_SITE_ID</span><span class="p">)</span></div>
138 |
139 |
140 | <div class="viewcode-block" id="KatanaTests.test_pragmar_tokenizer">
141 | <a class="viewcode-back" href="../../../../mcp_server_webcrawl.crawlers.katana.html#mcp_server_webcrawl.crawlers.katana.tests.KatanaTests.test_pragmar_tokenizer">[docs]</a>
142 | <span class="k">def</span> <span class="nf">test_pragmar_tokenizer</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
143 | <span class="w"> </span><span class="sd">"""</span>
144 | <span class="sd"> tokenizer search tests</span>
145 | <span class="sd"> """</span>
146 | <span class="n">crawler</span> <span class="o">=</span> <span class="n">KatanaCrawler</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_datasrc</span><span class="p">)</span>
147 | <span class="bp">self</span><span class="o">.</span><span class="n">run_pragmar_tokenizer_tests</span><span class="p">(</span><span class="n">crawler</span><span class="p">,</span> <span class="n">PRAGMAR_SITE_ID</span><span class="p">)</span></div>
148 |
149 |
150 |
151 | <div class="viewcode-block" id="KatanaTests.test_katana_resources">
152 | <a class="viewcode-back" href="../../../../mcp_server_webcrawl.crawlers.katana.html#mcp_server_webcrawl.crawlers.katana.tests.KatanaTests.test_katana_resources">[docs]</a>
153 | <span class="k">def</span> <span class="nf">test_katana_resources</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
154 | <span class="w"> </span><span class="sd">"""</span>
155 | <span class="sd"> resource retrieval API functionality with various parameters.</span>
156 | <span class="sd"> """</span>
157 | <span class="n">crawler</span> <span class="o">=</span> <span class="n">KatanaCrawler</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_datasrc</span><span class="p">)</span>
158 | <span class="bp">self</span><span class="o">.</span><span class="n">run_sites_resources_tests</span><span class="p">(</span><span class="n">crawler</span><span class="p">,</span> <span class="n">PRAGMAR_SITE_ID</span><span class="p">,</span> <span class="n">EXAMPLE_SITE_ID</span><span class="p">)</span></div>
159 |
160 |
161 | <div class="viewcode-block" id="KatanaTests.test_interrobot_images">
162 | <a class="viewcode-back" href="../../../../mcp_server_webcrawl.crawlers.katana.html#mcp_server_webcrawl.crawlers.katana.tests.KatanaTests.test_interrobot_images">[docs]</a>
163 | <span class="k">def</span> <span class="nf">test_interrobot_images</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
164 | <span class="w"> </span><span class="sd">"""</span>
165 | <span class="sd"> Test Katana image handling and thumbnails.</span>
166 | <span class="sd"> """</span>
167 | <span class="n">crawler</span> <span class="o">=</span> <span class="n">KatanaCrawler</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_datasrc</span><span class="p">)</span>
168 | <span class="bp">self</span><span class="o">.</span><span class="n">run_pragmar_image_tests</span><span class="p">(</span><span class="n">crawler</span><span class="p">,</span> <span class="n">PRAGMAR_SITE_ID</span><span class="p">)</span></div>
169 |
170 |
171 | <div class="viewcode-block" id="KatanaTests.test_katana_sorts">
172 | <a class="viewcode-back" href="../../../../mcp_server_webcrawl.crawlers.katana.html#mcp_server_webcrawl.crawlers.katana.tests.KatanaTests.test_katana_sorts">[docs]</a>
173 | <span class="k">def</span> <span class="nf">test_katana_sorts</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
174 | <span class="w"> </span><span class="sd">"""</span>
175 | <span class="sd"> random sort functionality using the '?' sort parameter.</span>
176 | <span class="sd"> """</span>
177 | <span class="n">crawler</span> <span class="o">=</span> <span class="n">KatanaCrawler</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_datasrc</span><span class="p">)</span>
178 | <span class="bp">self</span><span class="o">.</span><span class="n">run_pragmar_sort_tests</span><span class="p">(</span><span class="n">crawler</span><span class="p">,</span> <span class="n">PRAGMAR_SITE_ID</span><span class="p">)</span></div>
179 |
180 |
181 | <div class="viewcode-block" id="KatanaTests.test_katana_content_parsing">
182 | <a class="viewcode-back" href="../../../../mcp_server_webcrawl.crawlers.katana.html#mcp_server_webcrawl.crawlers.katana.tests.KatanaTests.test_katana_content_parsing">[docs]</a>
183 | <span class="k">def</span> <span class="nf">test_katana_content_parsing</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
184 | <span class="w"> </span><span class="sd">"""</span>
185 | <span class="sd"> content type detection and parsing for HTTP text files.</span>
186 | <span class="sd"> """</span>
187 | <span class="n">crawler</span> <span class="o">=</span> <span class="n">KatanaCrawler</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_datasrc</span><span class="p">)</span>
188 | <span class="bp">self</span><span class="o">.</span><span class="n">run_pragmar_content_tests</span><span class="p">(</span><span class="n">crawler</span><span class="p">,</span> <span class="n">PRAGMAR_SITE_ID</span><span class="p">,</span> <span class="kc">False</span><span class="p">)</span></div>
189 |
190 |
191 | <div class="viewcode-block" id="KatanaTests.test_report">
192 | <a class="viewcode-back" href="../../../../mcp_server_webcrawl.crawlers.katana.html#mcp_server_webcrawl.crawlers.katana.tests.KatanaTests.test_report">[docs]</a>
193 | <span class="k">def</span> <span class="nf">test_report</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
194 | <span class="w"> </span><span class="sd">"""</span>
195 | <span class="sd"> Run test report, save to data directory.</span>
196 | <span class="sd"> """</span>
197 | <span class="n">crawler</span> <span class="o">=</span> <span class="n">KatanaCrawler</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_datasrc</span><span class="p">)</span>
198 | <span class="n">logger</span><span class="o">.</span><span class="n">info</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">run_pragmar_report</span><span class="p">(</span><span class="n">crawler</span><span class="p">,</span> <span class="n">PRAGMAR_SITE_ID</span><span class="p">,</span> <span class="s2">"Katana"</span><span class="p">))</span></div>
199 | </div>
200 |
201 | </pre></div>
202 |
203 | </div>
204 | </div>
205 | <footer>
206 |
207 | <hr/>
208 |
209 | <div role="contentinfo">
210 | <p>© Copyright 2025, pragmar.</p>
211 | </div>
212 |
213 | Built with <a href="https://www.sphinx-doc.org/">Sphinx</a> using a
214 | <a href="https://github.com/readthedocs/sphinx_rtd_theme">theme</a>
215 | provided by <a href="https://readthedocs.org">Read the Docs</a>.
216 |
217 |
218 | </footer>
219 | </div>
220 | </div>
221 | </section>
222 | </div>
223 | <script>
224 | jQuery(function () {
225 | SphinxRtdTheme.Navigation.enable(true);
226 | });
227 | </script>
228 |
229 | </body>
230 | </html>
```
--------------------------------------------------------------------------------
/docs/_static/searchtools.js:
--------------------------------------------------------------------------------
```javascript
1 | /*
2 | * searchtools.js
3 | * ~~~~~~~~~~~~~~~~
4 | *
5 | * Sphinx JavaScript utilities for the full-text search.
6 | *
7 | * :copyright: Copyright 2007-2023 by the Sphinx team, see AUTHORS.
8 | * :license: BSD, see LICENSE for details.
9 | *
10 | */
11 | "use strict";
12 |
13 | /**
14 | * Default result-scoring weights. Only defined when no `Scorer` global exists yet, so a definition made before this script runs is kept.
15 | */
16 | if (typeof Scorer === "undefined") {
17 | var Scorer = {
18 | // Optional hook (disabled below): implement it to further tweak the score for each result.
19 | // The function takes a result array [docname, title, anchor, descr, score, filename]
20 | // and returns the new score.
21 | /*
22 | score: result => {
23 | const [docname, title, anchor, descr, score, filename] = result
24 | return score
25 | },
26 | */
27 |
28 | // query matches the full name of an object
29 | objNameMatch: 11,
30 | // or matches in the last dotted part of the object name
31 | objPartialMatch: 6,
32 | // Additive scores depending on the priority of the object (keys are priority levels)
33 | objPrio: {
34 | 0: 15, // used to be importantResults
35 | 1: 5, // used to be objectResults
36 | 2: -5, // used to be unimportantResults
37 | },
38 | // Used when the priority is not in the mapping.
39 | objPrioDefault: 0,
40 |
41 | // query found in title (full match, then partial match)
42 | title: 15,
43 | partialTitle: 7,
44 | // query found in terms (full match, then partial match)
45 | term: 5,
46 | partialTerm: 2,
47 | };
48 | }
49 |
50 | const _removeChildren = (element) => {
51 | while (element && element.lastChild) element.removeChild(element.lastChild);
52 | };
53 |
54 | /**
55 | * See https://developer.mozilla.org/en-US/docs/Web/JavaScript/Guide/Regular_Expressions#escaping
56 | */
57 | const _escapeRegExp = (string) =>
58 | string.replace(/[.*+\-?^${}()|[\]\\]/g, "\\$&"); // $& means the whole matched string
59 |
60 | const _displayItem = (item, searchTerms, highlightTerms) => { // render one search result as an <li> and append it to Search.output
61 | const docBuilder = DOCUMENTATION_OPTIONS.BUILDER;
62 | const docFileSuffix = DOCUMENTATION_OPTIONS.FILE_SUFFIX;
63 | const docLinkSuffix = DOCUMENTATION_OPTIONS.LINK_SUFFIX;
64 | const showSearchSummary = DOCUMENTATION_OPTIONS.SHOW_SEARCH_SUMMARY;
65 | const contentRoot = document.documentElement.dataset.content_root; // read from the data-content_root attribute on <html>
66 |
67 | const [docName, title, anchor, descr, score, _filename] = item; // result array; the sixth entry (_filename) is not used here
68 |
69 | let listItem = document.createElement("li");
70 | let requestUrl;
71 | let linkUrl;
72 | if (docBuilder === "dirhtml") {
73 | // dirhtml builder: documents are addressed as directories
74 | let dirname = docName + "/";
75 | if (dirname.match(/\/index\/$/))
76 | dirname = dirname.substring(0, dirname.length - 6); // drop the trailing "index/" (6 characters)
77 | else if (dirname === "index/") dirname = ""; // the top-level index maps to the content root itself
78 | requestUrl = contentRoot + dirname;
79 | linkUrl = requestUrl;
80 | } else {
81 | // normal html builders
82 | requestUrl = contentRoot + docName + docFileSuffix;
83 | linkUrl = docName + docLinkSuffix; // unlike requestUrl, the link is built without contentRoot
84 | }
85 | let linkEl = listItem.appendChild(document.createElement("a"));
86 | linkEl.href = linkUrl + anchor;
87 | linkEl.dataset.score = score; // exposed on the link as data-score
88 | linkEl.innerHTML = title; // NOTE(review): title is inserted as HTML; presumably index titles are trusted markup, confirm
89 | if (descr) {
90 | listItem.appendChild(document.createElement("span")).innerHTML =
91 | " (" + descr + ")";
92 | // highlight search terms in the description
93 | if (SPHINX_HIGHLIGHT_ENABLED) // set in sphinx_highlight.js
94 | highlightTerms.forEach((term) => _highlightText(listItem, term, "highlighted"));
95 | }
96 | else if (showSearchSummary) // no description: fetch the page and build a summary from its text
97 | fetch(requestUrl)
98 | .then((responseData) => responseData.text())
99 | .then((data) => {
100 | if (data)
101 | listItem.appendChild(
102 | Search.makeSearchSummary(data, searchTerms)
103 | );
104 | // highlight search terms in the summary
105 | if (SPHINX_HIGHLIGHT_ENABLED) // set in sphinx_highlight.js
106 | highlightTerms.forEach((term) => _highlightText(listItem, term, "highlighted"));
107 | });
108 | Search.output.appendChild(listItem); // appended right away; a fetched summary is attached later when the request resolves
109 | };
110 | const _finishSearch = (resultCount) => {
111 | Search.stopPulse();
112 | Search.title.innerText = _("Search Results");
113 | if (!resultCount)
114 | Search.status.innerText = Documentation.gettext(
115 | "Your search did not match any documents. Please make sure that all words are spelled correctly and that you've selected enough categories."
116 | );
117 | else
118 | Search.status.innerText = _(
119 | `Search finished, found ${resultCount} page(s) matching the search query.`
120 | );
121 | };
/**
 * Render the queued results one at a time, yielding to the event loop between
 * items. `results` is consumed from its end via pop(); `resultCount` is the
 * original total and is only passed on to the final status message.
 */
const _displayNextItem = (
  results,
  resultCount,
  searchTerms,
  highlightTerms,
) => {
  // queue drained: update title and status message, then stop
  if (!results.length) {
    _finishSearch(resultCount);
    return;
  }

  // the length check above is intentionally dynamic (don't sub resultCount):
  // the array shrinks on every pop()
  const nextResult = results.pop();
  _displayItem(nextResult, searchTerms, highlightTerms);
  setTimeout(
    () => _displayNextItem(results, resultCount, searchTerms, highlightTerms),
    5
  );
};
140 |
/**
 * Fallback query tokenizer, installed only when no ``splitQuery`` has been
 * defined already (``sphinx.search`` may supply a language-specific one).
 *
 * The query is cut on every run of characters that are not Unicode letters,
 * numbers, underscores, or emoji characters -- the same as ``\W+`` in Python,
 * preserving the surrogate pair area. Empty fragments are discarded.
 */
if (typeof splitQuery === "undefined") {
  var splitQuery = (query) => {
    const fragments = query.split(
      /[^\p{Letter}\p{Number}_\p{Emoji_Presentation}]+/gu
    );
    // leading/trailing separators leave empty strings behind; drop them
    return fragments.filter(Boolean);
  };
}
154 |
/**
 * Search Module
 *
 * Client-side search over the generated index: parses the query, scores
 * title / index-entry / object / full-text matches, de-duplicates and sorts
 * them, and hands the list to the result renderer.
 */
const Search = {
  _index: null,
  _queued_query: null,
  _pulse_status: -1,

  /**
   * Extract the plain text of the main content block of an HTML document.
   * Returns "" (and warns) when no '[role="main"]' element exists.
   */
  htmlToText: (htmlString) => {
    const htmlElement = new DOMParser().parseFromString(htmlString, "text/html");
    htmlElement.querySelectorAll(".headerlink").forEach((el) => { el.remove() });
    const docContent = htmlElement.querySelector('[role="main"]');
    // querySelector() yields null (never undefined) when nothing matches
    if (docContent) return docContent.textContent;
    console.warn(
      "Content block not found. Sphinx search tries to obtain it via '[role=main]'. Could you check your theme or template."
    );
    return "";
  },

  /**
   * Copy the "q" URL parameter into every search box and start the search
   * when a query is present.
   */
  init: () => {
    const query = new URLSearchParams(window.location.search).get("q");
    document
      .querySelectorAll('input[name="q"]')
      .forEach((el) => (el.value = query));
    if (query) Search.performSearch(query);
  },

  /** Load the index by appending a <script> element pointing at `url`. */
  loadIndex: (url) =>
    (document.body.appendChild(document.createElement("script")).src = url),

  /** Store the loaded index and run the query that was waiting for it, if any. */
  setIndex: (index) => {
    Search._index = index;
    if (Search._queued_query !== null) {
      const query = Search._queued_query;
      Search._queued_query = null;
      Search.query(query);
    }
  },

  hasIndex: () => Search._index !== null,

  deferQuery: (query) => (Search._queued_query = query),

  stopPulse: () => (Search._pulse_status = -1),

  /**
   * Animate the "Searching..." dots until stopPulse() is called.
   */
  startPulse: () => {
    if (Search._pulse_status >= 0) return;

    const pulse = () => {
      Search._pulse_status = (Search._pulse_status + 1) % 4;
      Search.dots.innerText = ".".repeat(Search._pulse_status);
      // Re-check when the timer fires: stopPulse() resets the status to -1,
      // and testing it right after the increment above could never see that.
      window.setTimeout(() => {
        if (Search._pulse_status >= 0) pulse();
      }, 500);
    };
    pulse();
  },

  /**
   * perform a search for something (or wait until index is loaded)
   */
  performSearch: (query) => {
    // create the required interface elements
    const searchText = document.createElement("h2");
    searchText.textContent = _("Searching");
    const searchSummary = document.createElement("p");
    searchSummary.classList.add("search-summary");
    searchSummary.innerText = "";
    const searchList = document.createElement("ul");
    searchList.classList.add("search");

    const out = document.getElementById("search-results");
    Search.title = out.appendChild(searchText);
    Search.dots = Search.title.appendChild(document.createElement("span"));
    Search.status = out.appendChild(searchSummary);
    Search.output = out.appendChild(searchList);

    const searchProgress = document.getElementById("search-progress");
    // Some themes don't use the search progress node
    if (searchProgress) {
      searchProgress.innerText = _("Preparing search...");
    }
    Search.startPulse();

    // index already loaded, the browser was quick!
    if (Search.hasIndex()) Search.query(query);
    else Search.deferQuery(query);
  },

  /**
   * execute search (requires search index to be loaded)
   */
  query: (query) => {
    const filenames = Search._index.filenames;
    const docNames = Search._index.docnames;
    const titles = Search._index.titles;
    // tolerate an index that lacks these optional tables
    const allTitles = Search._index.alltitles || {};
    const indexEntries = Search._index.indexentries || {};

    // stem the search terms and add them to the correct list
    const stemmer = new Stemmer();
    const searchTerms = new Set();
    const excludedTerms = new Set();
    const highlightTerms = new Set();
    const objectTerms = new Set(splitQuery(query.toLowerCase().trim()));
    splitQuery(query.trim()).forEach((queryTerm) => {
      const queryTermLower = queryTerm.toLowerCase();

      // maybe skip this "word"
      // stopwords array is from language_data.js
      if (
        stopwords.indexOf(queryTermLower) !== -1 ||
        queryTerm.match(/^\d+$/)
      )
        return;

      // stem the word
      const word = stemmer.stemWord(queryTermLower);
      // select the correct list
      if (word[0] === "-") excludedTerms.add(word.slice(1));
      else {
        searchTerms.add(word);
        highlightTerms.add(queryTermLower);
      }
    });

    if (SPHINX_HIGHLIGHT_ENABLED) { // set in sphinx_highlight.js
      localStorage.setItem("sphinx_highlight_terms", [...highlightTerms].join(" "))
    }

    // array of [docname, title, anchor, descr, score, filename]
    let results = [];
    _removeChildren(document.getElementById("search-progress"));

    // titles: the query must cover at least half of the matched title
    const queryLower = query.toLowerCase();
    for (const [title, foundTitles] of Object.entries(allTitles)) {
      if (title.toLowerCase().includes(queryLower) && (queryLower.length >= title.length/2)) {
        for (const [file, id] of foundTitles) {
          const score = Math.round(100 * queryLower.length / title.length);
          results.push([
            docNames[file],
            titles[file] !== title ? `${titles[file]} > ${title}` : title,
            id !== null ? "#" + id : "",
            null,
            score,
            filenames[file],
          ]);
        }
      }
    }

    // search for explicit entries in index directives
    for (const [entry, foundEntries] of Object.entries(indexEntries)) {
      if (entry.includes(queryLower) && (queryLower.length >= entry.length/2)) {
        for (const [file, id] of foundEntries) {
          const score = Math.round(100 * queryLower.length / entry.length);
          results.push([
            docNames[file],
            titles[file],
            id ? "#" + id : "",
            null,
            score,
            filenames[file],
          ]);
        }
      }
    }

    // lookup as object
    objectTerms.forEach((term) =>
      results.push(...Search.performObjectSearch(term, objectTerms))
    );

    // lookup as search terms in fulltext
    results.push(...Search.performTermsSearch(searchTerms, excludedTerms));

    // let the scorer override scores with a custom scoring function
    if (Scorer.score) results.forEach((item) => (item[4] = Scorer.score(item)));

    // now sort the results by score (in opposite order of appearance, since the
    // display function below uses pop() to retrieve items) and then
    // alphabetically
    results.sort((a, b) => {
      const leftScore = a[4];
      const rightScore = b[4];
      if (leftScore === rightScore) {
        // same score: sort alphabetically
        const leftTitle = a[1].toLowerCase();
        const rightTitle = b[1].toLowerCase();
        if (leftTitle === rightTitle) return 0;
        return leftTitle > rightTitle ? -1 : 1; // inverted is intentional
      }
      return leftScore > rightScore ? 1 : -1;
    });

    // remove duplicate search results
    // note the reversing of results, so that in the case of duplicates, the highest-scoring entry is kept
    const seen = new Set();
    results = results.reverse().reduce((acc, result) => {
      const resultStr = result.slice(0, 4).concat([result[5]]).map(v => String(v)).join(',');
      if (!seen.has(resultStr)) {
        acc.push(result);
        seen.add(resultStr);
      }
      return acc;
    }, []);

    results = results.reverse();

    // print the results
    _displayNextItem(results, results.length, searchTerms, highlightTerms);
  },

  /**
   * search for object names
   *
   * `object` is one lower-cased query term, `objectTerms` the full set of
   * terms; returns an array of result tuples.
   */
  performObjectSearch: (object, objectTerms) => {
    const filenames = Search._index.filenames;
    const docNames = Search._index.docnames;
    const objects = Search._index.objects;
    const objNames = Search._index.objnames;
    const titles = Search._index.titles;

    const results = [];

    const objectSearchCallback = (prefix, match) => {
      const name = match[4]
      const fullname = (prefix ? prefix + "." : "") + name;
      const fullnameLower = fullname.toLowerCase();
      if (!fullnameLower.includes(object)) return;

      let score = 0;
      const lastPart = fullnameLower.split(".").slice(-1)[0];

      // check for different match types: exact matches of full name or
      // "last name" (i.e. last dotted part)
      if (fullnameLower === object || lastPart === object)
        score += Scorer.objNameMatch;
      else if (lastPart.includes(object))
        score += Scorer.objPartialMatch; // matches in last name

      const objName = objNames[match[1]][2];
      const title = titles[match[0]];

      // If more than one term searched for, we require other words to be
      // found in the name/title/description
      const otherTerms = new Set(objectTerms);
      otherTerms.delete(object);
      if (otherTerms.size > 0) {
        const haystack = `${prefix} ${name} ${objName} ${title}`.toLowerCase();
        if (
          [...otherTerms].some((otherTerm) => !haystack.includes(otherTerm))
        )
          return;
      }

      let anchor = match[3];
      if (anchor === "") anchor = fullname;
      else if (anchor === "-") anchor = objNames[match[1]][1] + "-" + fullname;

      const descr = objName + _(", in ") + title;

      // add custom score for some objects according to scorer
      if (Scorer.objPrio.hasOwnProperty(match[2]))
        score += Scorer.objPrio[match[2]];
      else score += Scorer.objPrioDefault;

      results.push([
        docNames[match[0]],
        fullname,
        "#" + anchor,
        descr,
        score,
        filenames[match[0]],
      ]);
    };
    Object.keys(objects).forEach((prefix) =>
      objects[prefix].forEach((array) =>
        objectSearchCallback(prefix, array)
      )
    );
    return results;
  },

  /**
   * search for full-text terms in the index
   *
   * Every term in `searchTerms` must be found in a file (exactly, or as a
   * substring of an indexed term when longer than two characters) and no term
   * in `excludedTerms` may occur in it. Returns an array of result tuples.
   */
  performTermsSearch: (searchTerms, excludedTerms) => {
    // prepare search
    const terms = Search._index.terms;
    const titleTerms = Search._index.titleterms;
    const filenames = Search._index.filenames;
    const docNames = Search._index.docnames;
    const titles = Search._index.titles;

    // Only own keys count: a query word such as "constructor" must not pick up
    // members inherited from Object.prototype.
    const hasOwn = (table, key) =>
      Object.prototype.hasOwnProperty.call(table, key);
    const lookup = (table, key) => (hasOwn(table, key) ? table[key] : undefined);
    // an index entry is either one file id or an array of file ids
    const toFileList = (entry) => {
      if (entry === undefined) return [];
      return Array.isArray(entry) ? entry : [entry];
    };

    const scoreMap = new Map(); // file -> Map(word -> score)
    const fileMap = new Map(); // file -> [matched words]

    // perform the search on the required terms
    searchTerms.forEach((word) => {
      const files = [];
      const arr = [
        { files: lookup(terms, word), score: Scorer.term },
        { files: lookup(titleTerms, word), score: Scorer.title },
      ];
      // add support for partial matches (only where there is no exact entry;
      // an exact entry may legitimately be file id 0, so test key presence)
      if (word.length > 2) {
        const wordPattern = new RegExp(_escapeRegExp(word));
        if (!hasOwn(terms, word)) {
          Object.keys(terms).forEach((term) => {
            if (wordPattern.test(term))
              arr.push({ files: terms[term], score: Scorer.partialTerm });
          });
        }
        if (!hasOwn(titleTerms, word)) {
          Object.keys(titleTerms).forEach((term) => {
            // use the files of the *matching* title term
            if (wordPattern.test(term))
              arr.push({ files: titleTerms[term], score: Scorer.partialTitle });
          });
        }
      }

      // no match but word was a required one
      if (arr.every((record) => record.files === undefined)) return;

      // found search word in contents
      arr.forEach((record) => {
        if (record.files === undefined) return;

        const recordFiles = toFileList(record.files);
        files.push(...recordFiles);

        // set score for the word in each file
        recordFiles.forEach((file) => {
          if (!scoreMap.has(file)) scoreMap.set(file, new Map());
          scoreMap.get(file).set(word, record.score);
        });
      });

      // create the mapping; a file reached twice by the same word keeps the
      // words it already collected
      files.forEach((file) => {
        if (!fileMap.has(file)) fileMap.set(file, [word]);
        else if (!fileMap.get(file).includes(word)) fileMap.get(file).push(word);
      });
    });

    // as search terms with length < 3 are discarded
    const filteredTermCount = [...searchTerms].filter(
      (term) => term.length > 2
    ).length;
    const exclusions = [...excludedTerms];

    // now check if the files don't contain excluded terms
    const results = [];
    for (const [file, wordList] of fileMap) {
      // check if all requirements are matched
      if (
        wordList.length !== searchTerms.size &&
        wordList.length !== filteredTermCount
      )
        continue;

      // ensure that none of the excluded terms is in the search result;
      // skip only this file and keep examining the remaining ones
      if (
        exclusions.some(
          (term) =>
            toFileList(lookup(terms, term)).includes(file) ||
            toFileList(lookup(titleTerms, term)).includes(file)
        )
      )
        continue;

      // select one (max) score for the file.
      const fileScores = scoreMap.get(file);
      const score = Math.max(...wordList.map((w) => fileScores.get(w)));
      // add result to the result list
      results.push([
        docNames[file],
        titles[file],
        "",
        null,
        score,
        filenames[file],
      ]);
    }
    return results;
  },

  /**
   * helper function to return a node containing the
   * search summary for a given text. keywords is a list
   * of stemmed words.
   *
   * Returns null when the document has no extractable text.
   */
  makeSearchSummary: (htmlText, keywords) => {
    const text = Search.htmlToText(htmlText);
    if (text === "") return null;

    const textLower = text.toLowerCase();
    const hitPositions = [...keywords]
      .map((k) => textLower.indexOf(k.toLowerCase()))
      .filter((i) => i > -1);
    // centre on the last keyword that occurs; with no occurrence at all,
    // summarise from the beginning instead of computing NaN offsets
    const actualStartPosition = hitPositions.length
      ? hitPositions[hitPositions.length - 1]
      : 0;
    const startWithContext = Math.max(actualStartPosition - 120, 0);

    const top = startWithContext === 0 ? "" : "...";
    const tail = startWithContext + 240 < text.length ? "..." : "";

    const summary = document.createElement("p");
    summary.classList.add("context");
    summary.textContent =
      top + text.slice(startWithContext, startWithContext + 240).trim() + tail;

    return summary;
  },
};
573 |
574 | _ready(Search.init);
575 |
```
--------------------------------------------------------------------------------
/src/mcp_server_webcrawl/interactive/session.py:
--------------------------------------------------------------------------------
```python
1 | import curses
2 | import sys
3 | import threading
4 | import traceback
5 |
6 | from pathlib import Path
7 | from typing import Optional
8 |
9 | from mcp_server_webcrawl.crawlers import get_crawler
10 | from mcp_server_webcrawl.crawlers.base.crawler import BaseCrawler, BaseJsonApi
11 | from mcp_server_webcrawl.interactive.search import SearchManager
12 | from mcp_server_webcrawl.interactive.ui import ThemeDefinition, UiState, DocumentMode, UiFocusable, ViewBounds, safe_addstr
13 | from mcp_server_webcrawl.interactive.views.base import BaseCursesView, OUTER_WIDTH_RIGHT_MARGIN
14 | from mcp_server_webcrawl.interactive.views.document import SearchDocumentView
15 | from mcp_server_webcrawl.interactive.views.requirements import RequirementsView
16 | from mcp_server_webcrawl.interactive.views.results import SearchResultsView
17 | from mcp_server_webcrawl.interactive.views.searchform import SearchFormView
18 | from mcp_server_webcrawl.interactive.views.help import HelpView
19 | from mcp_server_webcrawl.models.sites import SiteResult
20 |
# --- input polling ---------------------------------------------------------
# passed to stdscr.timeout(); can be as low as 1, 50 feels a little laggy
CURSES_TIMEOUT_MS: int = 25

# --- layout ------------------------------------------------------------------
# rows reserved above / below the content area
LAYOUT_CONTENT_START_Y_OFFSET: int = 1
LAYOUT_CONTENT_END_Y_OFFSET: int = 1
# upper bound on the height of the top pane in the split (form + results) view
LAYOUT_SPLIT_PANE_MAX_HEIGHT: int = 10
# the outer header/footer are only drawn on terminals taller than this
LAYOUT_MIN_HEIGHT_FOR_HELP: int = 2

# --- debug overlay -----------------------------------------------------------
# only the most recent DEBUG_MAX_LINES messages are rendered
DEBUG_MAX_LINES: int = 8
# compact overlay width: a ratio of the terminal width, with a floor
DEBUG_COMPACT_WIDTH_RATIO: float = 0.4
DEBUG_MIN_COMPACT_WIDTH: int = 30
# a line within this many columns of the compact width switches to expanded
DEBUG_COMPACT_THRESHOLD: int = 5
DEBUG_EXPANDED_MARGIN: int = 6
DEBUG_EXPANDED_START_X: int = 3
DEBUG_EXPANDED_BOTTOM_MARGIN: int = 3
DEBUG_COMPACT_BOTTOM_MARGIN: int = 2
DEBUG_MIN_START_Y: int = 1
DEBUG_MIN_START_Y_EXPANDED: int = 2

# --- document view -----------------------------------------------------------
# maps each document mode to its successor: MARKDOWN -> RAW -> HEADERS -> MARKDOWN
SEARCH_DOCUMENT_NEXT_MODE: dict[DocumentMode, DocumentMode] = {
    DocumentMode.MARKDOWN: DocumentMode.RAW,
    DocumentMode.RAW: DocumentMode.HEADERS,
    DocumentMode.HEADERS: DocumentMode.MARKDOWN,
}

# --- limits (not referenced in this module's class; presumably read by the
# views/search manager -- verify against callers) ----------------------------
SEARCH_RESULT_LIMIT: int = 10
TERMINAL_MIN_HEIGHT: int = 8
TERMINAL_MIN_WIDTH: int = 40
49 |
class InteractiveSession:
    """
    Main session coordinator that manages the interactive terminal application.

    Owns the views (requirements, search form, results, document, help), the
    SearchManager and the UI state; run() starts the curses main loop, which
    redraws the screen and routes key presses to the active view.
    """

    def __init__(self, crawler: str, datasrc: str):
        """
        Initialize the interactive session with crawler and data source.

        Args:
            crawler: crawler name, resolved through get_crawler()
            datasrc: data source path handed to the crawler
        """
        self.__input_crawler: str = crawler
        self.__input_datasrc: str = datasrc
        # theme name -> ThemeDefinition value (the curses.init_pair arguments)
        self.__theme_map: dict[str, tuple] = {}
        # debug buffer and the single lock guarding it; created before the
        # collaborators below so they may log as soon as they hold the session
        self.__debug: list[str] = []
        self.__debug_lock: threading.Lock = threading.Lock()
        self.__searchman: SearchManager = SearchManager(self)
        self.__ui_state: UiState = UiState.SEARCH_INIT
        self.__ui_focused: UiFocusable = UiFocusable.SEARCH_FORM

        self.__view__requirements = RequirementsView(self, crawler, datasrc)

        # without validated requirements (or a known crawler) the session
        # starts with no crawler and no sites
        self.__crawler: BaseCrawler | None = None
        self.__sites: list[SiteResult] = []
        if self.__view__requirements.validated:
            crawl_model = get_crawler(crawler)
            if crawl_model is not None:
                self.__crawler = crawl_model(Path(datasrc))
                sites_api: BaseJsonApi = self.__crawler.get_sites_api()
                self.__sites = sites_api.get_results()

        self.__view__results = SearchResultsView(self)
        self.__view__document = SearchDocumentView(self)
        self.__view__searchform = SearchFormView(self, self.__sites)
        self.__view__help = HelpView(self)

        self.set_ui_state(UiState.SEARCH_INIT, UiFocusable.SEARCH_FORM)

    @property
    def ui_state(self) -> UiState:
        return self.__ui_state

    @property
    def ui_focused(self) -> UiFocusable:
        return self.__ui_focused

    @property
    def crawler(self) -> BaseCrawler:
        return self.__crawler

    @property
    def document(self) -> SearchDocumentView:
        return self.__view__document

    @property
    def results(self) -> SearchResultsView:
        return self.__view__results

    @property
    def searchform(self) -> SearchFormView:
        return self.__view__searchform

    @property
    def searchman(self) -> SearchManager:
        return self.__searchman

    @property
    def sites(self) -> list[SiteResult]:
        # shallow copy, callers cannot mutate the session's own list
        return self.__sites.copy()

    def debug_add(self, msg: str) -> None:
        """
        Add line of debug.
        """
        with self.__debug_lock:
            self.__debug.append(msg)

    def debug_clear(self) -> None:
        """
        Clear debug statements.
        """
        with self.__debug_lock:
            self.__debug.clear()

    def run(self) -> None:
        """
        Public interface to launch the interactive terminal application.

        Failures inside the curses loop are reported on stderr once
        curses.wrapper() has restored the terminal; the search manager is
        always cleaned up.
        """
        try:
            curses.wrapper(self.__curses_main)
        except KeyboardInterrupt:
            pass  # clean exit, ctrl+c
        except Exception as ex:
            print(f"--interactive failure: {ex}\n{traceback.format_exc()}", file=sys.stderr)
        finally:
            self.searchman.cleanup()

    def set_ui_state(self, state: UiState, focus: Optional[UiFocusable] = None) -> None:
        """
        Transition between UI states cleanly.

        Args:
            state: the new UI state
            focus: the new focus target; None leaves the current focus as is
        """
        self.__ui_state = state
        if focus is not None:
            self.__ui_focused = focus

        # reset both, then focus at most one of form/results
        self.__view__results.set_focused(False)
        self.__view__searchform.set_focused(False)
        if state == UiState.SEARCH_INIT or (state == UiState.SEARCH_RESULTS and focus == UiFocusable.SEARCH_FORM):
            self.__view__searchform.set_focused(True)
        elif state == UiState.SEARCH_RESULTS:
            self.__view__results.set_focused(True)

    # used in requirements view to reset with user inputs over cmd args
    def set_init_input_args(self, crawler: str, datasrc: str) -> None:
        self.__input_crawler = crawler
        self.__input_datasrc = datasrc

    def set_init_crawler(self, crawler: BaseCrawler) -> None:
        self.__crawler = crawler

    def set_init_sites(self, sites: list[SiteResult]) -> None:
        self.__sites = sites

    # used in requirements to reset app
    def set_init_searchform(self, searchform: BaseCursesView) -> None:
        self.__view__searchform = searchform

    def __get_outer_screen(self, width: int, height: int) -> ViewBounds:
        """
        Get the outer screen bounds for the full terminal.
        """
        return ViewBounds(
            x=0,
            y=0,
            width=width - OUTER_WIDTH_RIGHT_MARGIN,
            height=height
        )

    def __get_inner_screen(self, width: int, height: int) -> ViewBounds:
        """
        Get the inner screen bounds for content area.
        """
        content_start_y = LAYOUT_CONTENT_START_Y_OFFSET
        content_end_y = height - LAYOUT_CONTENT_END_Y_OFFSET

        return ViewBounds(
            x=0,
            y=content_start_y,  # after outer header
            width=width - OUTER_WIDTH_RIGHT_MARGIN,
            height=content_end_y - content_start_y
        )

    @staticmethod
    def __split_heights(height: int) -> tuple[int, int]:
        """
        Return (top, bottom) pane heights for the dual-pane layout.
        """
        content_height = height - LAYOUT_CONTENT_START_Y_OFFSET - LAYOUT_CONTENT_END_Y_OFFSET
        top_height = min(LAYOUT_SPLIT_PANE_MAX_HEIGHT, content_height // 2)
        return top_height, content_height - top_height

    def __get_split_top(self, width: int, height: int) -> ViewBounds:
        """
        Get the top split screen bounds for dual-pane layout.
        """
        top_height, _ = self.__split_heights(height)
        return ViewBounds(
            x=0,
            y=LAYOUT_CONTENT_START_Y_OFFSET,
            width=width - OUTER_WIDTH_RIGHT_MARGIN,
            height=top_height
        )

    def __get_split_bottom(self, width: int, height: int) -> ViewBounds:
        """
        Get the bottom split screen bounds for dual-pane layout.
        """
        top_height, bottom_height = self.__split_heights(height)
        return ViewBounds(
            x=0,
            y=LAYOUT_CONTENT_START_Y_OFFSET + top_height,
            width=width - OUTER_WIDTH_RIGHT_MARGIN,
            height=bottom_height
        )

    def __curses_main(self, stdscr: curses.window) -> None:
        """
        Initialize curses environment and start main loop.
        """

        if curses.COLORS < 256:
            # display error in curses, dependable
            stdscr.addstr(0, 0, "--interactive mode requires a 256-color (or better) terminal")
            stdscr.refresh()
            stdscr.getch()  # wait for keypress
            sys.exit(1)

        # initialize curses style pairs
        curses.start_color()
        for theme in ThemeDefinition:
            self.__theme_map[theme.name] = theme.value
            curses.init_pair(*theme.value)

        # hide cursor, otherwise blinks at edge of last write
        curses.curs_set(0)

        # start main loop
        self.__interactive_loop(stdscr)

    def get_theme_color_pair(self, theme: ThemeDefinition) -> int | None:
        """
        Return the curses color pair attribute for a theme, or None when the
        theme has not been registered (i.e. before curses initialization).
        """
        registered = self.__theme_map.get(theme.name)
        if registered is None:
            return None
        return curses.color_pair(registered[0])

    def __get_help_text(self) -> str:
        """
        Get context-sensitive help text.
        """
        page_results: str = " | ←→ Page Results" if self.ui_focused == UiFocusable.SEARCH_RESULTS else ""
        search_results_enter: str = "Search" if self.__view__searchform.focused else "View Document"
        search_results_tab: str = "Results" if self.__view__searchform.focused else "Search Form"
        footers: dict[UiState, str] = {
            UiState.DOCUMENT: "↑↓: Scroll | PgUp/PgDn: Page | Home/End: Top/Bot | TAB: Mode | ESC: Back",
            UiState.HELP: "↑↓: Scroll | PgUp/PgDn: Page | Home/End: Top/Bot | ESC: Back",
            UiState.REQUIREMENTS: "ENTER: Load Interface | ↑↓: Navigate| ESC: Exit",
            UiState.SEARCH_INIT: "ENTER: Search | ↑↓: Navigate | F1: Search Help | ESC: Exit",
            UiState.SEARCH_RESULTS: f"ENTER: {search_results_enter} | ↑↓: Navigate{page_results} | TAB: {search_results_tab} | ESC: New Search",
        }
        return footers.get(self.__ui_state, "↑↓: Navigate | ESC: Exit")

    def __handle_F1(self) -> None:
        """
        Handle F1 key
        """
        self.set_ui_state(UiState.HELP)

    def __handle_ESC(self) -> None:
        """
        Handle ESC key
        """
        if self.__ui_state == UiState.DOCUMENT:
            self.set_ui_state(UiState.SEARCH_RESULTS, UiFocusable.SEARCH_RESULTS)
        elif self.__ui_state in (UiState.SEARCH_RESULTS, UiState.HELP):
            self.set_ui_state(UiState.SEARCH_INIT, UiFocusable.SEARCH_FORM)
            self.searchform.clear_query()
        elif self.__ui_state in (UiState.SEARCH_INIT, UiState.REQUIREMENTS):
            sys.exit(0)

    def __handle_TAB(self) -> None:
        """
        Handle TAB key
        """
        if self.__ui_state != UiState.SEARCH_RESULTS:
            return
        # toggle focus between the search form and the results list
        if self.__ui_focused == UiFocusable.SEARCH_FORM:
            self.set_ui_state(UiState.SEARCH_RESULTS, UiFocusable.SEARCH_RESULTS)
        else:
            self.set_ui_state(UiState.SEARCH_RESULTS, UiFocusable.SEARCH_FORM)

    def __requirements_active(self) -> bool:
        """
        True while the requirements screen owns both rendering and input.
        """
        return self.__ui_state == UiState.REQUIREMENTS or not self.__view__requirements.validated

    @staticmethod
    def __first_site_display_url(sites: list[SiteResult]) -> str:
        """
        Display form of the first URL of the first site ("" when unavailable).
        """
        first_hit = sites[0] if sites else None
        url: str = first_hit.urls[0] if first_hit is not None and first_hit.urls else ""
        return BaseCursesView.url_for_display(url)

    def __draw_frame(self, stdscr: curses.window) -> None:
        """
        Clear the screen and render the view(s) belonging to the current state.
        """
        stdscr.clear()
        height, width = stdscr.getmaxyx()
        selected_sites = self.__view__searchform.get_selected_sites()

        if self.__requirements_active():

            if self.__ui_state != UiState.REQUIREMENTS:
                self.set_ui_state(UiState.REQUIREMENTS)

            inner_screen = self.__get_inner_screen(width, height)
            self.__view__requirements.draw_inner_header(stdscr, inner_screen, "Requirements:")
            self.__view__requirements.set_bounds(inner_screen)
            self.__view__requirements.render(stdscr)
            self.__view__requirements.draw_inner_footer(stdscr, inner_screen, "Waiting on input")

        elif self.__ui_state == UiState.HELP:

            inner_screen = self.__get_inner_screen(width, height)
            self.__view__help.draw_inner_header(stdscr, inner_screen, "Search Help:")
            self.__view__help.set_bounds(inner_screen)
            self.__view__help.render(stdscr)
            self.__view__help.draw_inner_footer(stdscr, inner_screen, "ESC to Exit Help")

        elif self.__ui_state == UiState.SEARCH_RESULTS and selected_sites:

            # split layout: search form on top, results below
            split_top = self.__get_split_top(width, height)
            split_bottom = self.__get_split_bottom(width, height)
            display_url: str = self.__first_site_display_url(selected_sites)
            self.__view__searchform.draw_inner_header(stdscr, split_top, "Search:")
            self.__view__searchform.set_bounds(split_top)
            self.__view__searchform.render(stdscr)
            self.__view__searchform.draw_inner_footer(stdscr, split_top, f"Searching {display_url}")
            self.__view__results.draw_inner_header(stdscr, split_bottom, "")
            self.__view__results.set_bounds(split_bottom)
            self.__view__results.render(stdscr)
            self.__view__results.draw_inner_footer(stdscr, split_bottom, "")

        elif self.__ui_state == UiState.DOCUMENT:

            inner_screen = self.__get_inner_screen(width, height)
            document = self.__view__document
            url: str = document.urls[0] if document is not None and document.urls else ""
            display_url: str = BaseCursesView.url_for_display(url)
            document.set_focused(True)
            document.draw_inner_header(stdscr, inner_screen, f"URL: {display_url}")
            document.set_bounds(inner_screen)
            document.render(stdscr)
            document.draw_inner_footer(stdscr, inner_screen, "")

        else:

            # aka self.__ui_state == UiState.SEARCH_INIT (also SEARCH_RESULTS
            # with no site selected)
            inner_screen = self.__get_inner_screen(width, height)
            self.__view__searchform.draw_inner_header(stdscr, inner_screen, "Search:")
            display_url: str = self.__first_site_display_url(selected_sites)
            self.__view__searchform.set_bounds(inner_screen)
            self.__view__searchform.render(stdscr)
            self.__view__searchform.draw_inner_footer(stdscr, inner_screen, f"Searching {display_url}")

        if height > LAYOUT_MIN_HEIGHT_FOR_HELP:
            help_text = self.__get_help_text()
            self.__view__searchform.draw_outer_header(stdscr)
            self.__view__searchform.draw_outer_footer(stdscr, help_text)

        self.__render_debug(stdscr)
        stdscr.refresh()

    def __dispatch_key(self, key: int) -> None:
        """
        Forward a key press to the view that currently owns input.
        """
        if self.__requirements_active():
            self.__view__requirements.handle_input(key)
        elif self.__ui_state == UiState.SEARCH_INIT or (
            self.__ui_state == UiState.SEARCH_RESULTS
            and self.__ui_focused == UiFocusable.SEARCH_FORM
        ):
            self.__view__searchform.handle_input(key)
        elif self.__ui_state == UiState.SEARCH_RESULTS:
            self.__view__results.handle_input(key)
        elif self.__ui_state == UiState.DOCUMENT:
            self.__view__document.handle_input(key)
        elif self.__ui_state == UiState.HELP:
            self.__view__help.handle_input(key)

    def __interactive_loop(self, stdscr: curses.window) -> None:
        """
        Main input loop.

        Exceptions are deliberately not caught here: anything printed while
        curses owns the screen is lost, so they propagate to run(), which
        reports them after the terminal has been restored.
        """
        try:
            stdscr.timeout(CURSES_TIMEOUT_MS)

            while True:
                self.searchman.check_pending()
                self.__draw_frame(stdscr)

                key: int = stdscr.getch()
                if key == -1:  # timeout
                    continue

                # global keys first; the key is still offered to the view
                # that owns input in the (possibly just changed) state
                if key == ord("\t"):
                    self.__handle_TAB()
                elif key == curses.KEY_F1:
                    self.__handle_F1()
                elif key == 27:  # ESC
                    self.__handle_ESC()

                self.__dispatch_key(key)
        finally:
            # restore blocking getch()
            stdscr.timeout(-1)

    def __render_debug(self, stdscr: curses.window) -> None:
        """
        Render debug info with adaptive sizing - compact for short messages, expanded for errors.
        """
        height, width = stdscr.getmaxyx()

        with self.__debug_lock:
            debug_lines = self.__debug[-DEBUG_MAX_LINES:]

        if not debug_lines:
            return

        max_line_length = max(len(line) for line in debug_lines)
        compact_width = max(int(width * DEBUG_COMPACT_WIDTH_RATIO), DEBUG_MIN_COMPACT_WIDTH)
        use_expanded = max_line_length > compact_width - DEBUG_COMPACT_THRESHOLD

        if use_expanded:
            # nearly full width, anchored left
            debug_width: int = width - DEBUG_EXPANDED_MARGIN
            debug_start_x: int = DEBUG_EXPANDED_START_X
            debug_start_y: int = max(DEBUG_MIN_START_Y_EXPANDED, height - len(debug_lines) - DEBUG_EXPANDED_BOTTOM_MARGIN)
        else:
            # narrow box, anchored bottom-right
            debug_width: int = compact_width
            debug_start_x: int = width - debug_width - DEBUG_EXPANDED_START_X
            debug_start_y: int = height - len(debug_lines) - DEBUG_COMPACT_BOTTOM_MARGIN

        # clamp onto the screen; row 0 is never written
        debug_start_y = max(DEBUG_MIN_START_Y, debug_start_y)
        debug_start_x = max(0, debug_start_x)

        style = self.get_theme_color_pair(ThemeDefinition.HEADER_ACTIVE)
        for offset, debug_line in enumerate(debug_lines):
            y_pos: int = debug_start_y + offset
            if y_pos >= height - 1:
                break  # keep the last row free
            safe_addstr(stdscr, y_pos, debug_start_x, debug_line[:debug_width], style)
```
--------------------------------------------------------------------------------
/docs/_modules/mcp_server_webcrawl/templates/tests.html:
--------------------------------------------------------------------------------
```html
1 |
2 |
3 | <!DOCTYPE html>
4 | <html class="writer-html5" lang="en" data-content_root="../../../">
5 | <head>
6 | <meta charset="utf-8" />
7 | <meta name="viewport" content="width=device-width, initial-scale=1.0" />
8 | <title>mcp_server_webcrawl.templates.tests — mcp-server-webcrawl documentation</title>
9 | <link rel="stylesheet" type="text/css" href="../../../_static/pygments.css?v=80d5e7a1" />
10 | <link rel="stylesheet" type="text/css" href="../../../_static/css/theme.css?v=e59714d7" />
11 |
12 |
13 | <script src="../../../_static/jquery.js?v=5d32c60e"></script>
14 | <script src="../../../_static/_sphinx_javascript_frameworks_compat.js?v=2cd50e6c"></script>
15 | <script src="../../../_static/documentation_options.js?v=5929fcd5"></script>
16 | <script src="../../../_static/doctools.js?v=888ff710"></script>
17 | <script src="../../../_static/sphinx_highlight.js?v=dc90522c"></script>
18 | <script src="../../../_static/js/theme.js"></script>
19 | <link rel="index" title="Index" href="../../../genindex.html" />
20 | <link rel="search" title="Search" href="../../../search.html" />
21 | </head>
22 |
23 | <body class="wy-body-for-nav">
24 | <div class="wy-grid-for-nav">
25 | <nav data-toggle="wy-nav-shift" class="wy-nav-side">
26 | <div class="wy-side-scroll">
27 | <div class="wy-side-nav-search" >
28 |
29 |
30 |
31 | <a href="../../../index.html" class="icon icon-home">
32 | mcp-server-webcrawl
33 | </a>
34 | <div role="search">
35 | <form id="rtd-search-form" class="wy-form" action="../../../search.html" method="get">
36 | <input type="text" name="q" placeholder="Search docs" aria-label="Search docs" />
37 | <input type="hidden" name="check_keywords" value="yes" />
38 | <input type="hidden" name="area" value="default" />
39 | </form>
40 | </div>
41 | </div><div class="wy-menu wy-menu-vertical" data-spy="affix" role="navigation" aria-label="Navigation menu">
42 | <p class="caption" role="heading"><span class="caption-text">Contents:</span></p>
43 | <ul>
44 | <li class="toctree-l1"><a class="reference internal" href="../../../installation.html">Installation</a></li>
45 | <li class="toctree-l1"><a class="reference internal" href="../../../guides.html">Setup Guides</a></li>
46 | <li class="toctree-l1"><a class="reference internal" href="../../../usage.html">Usage</a></li>
47 | <li class="toctree-l1"><a class="reference internal" href="../../../prompts.html">Prompt Routines</a></li>
48 | <li class="toctree-l1"><a class="reference internal" href="../../../modules.html">mcp_server_webcrawl</a></li>
49 | </ul>
50 |
51 | </div>
52 | </div>
53 | </nav>
54 |
55 | <section data-toggle="wy-nav-shift" class="wy-nav-content-wrap"><nav class="wy-nav-top" aria-label="Mobile navigation menu" >
56 | <i data-toggle="wy-nav-top" class="fa fa-bars"></i>
57 | <a href="../../../index.html">mcp-server-webcrawl</a>
58 | </nav>
59 |
60 | <div class="wy-nav-content">
61 | <div class="rst-content">
62 | <div role="navigation" aria-label="Page navigation">
63 | <ul class="wy-breadcrumbs">
64 | <li><a href="../../../index.html" class="icon icon-home" aria-label="Home"></a></li>
65 | <li class="breadcrumb-item"><a href="../../index.html">Module code</a></li>
66 | <li class="breadcrumb-item active">mcp_server_webcrawl.templates.tests</li>
67 | <li class="wy-breadcrumbs-aside">
68 | </li>
69 | </ul>
70 | <hr/>
71 | </div>
72 | <div role="main" class="document" itemscope="itemscope" itemtype="http://schema.org/Article">
73 | <div itemprop="articleBody">
74 |
75 | <h1>Source code for mcp_server_webcrawl.templates.tests</h1><div class="highlight"><pre>
76 | <span></span><span class="kn">import</span> <span class="nn">re</span>
77 | <span class="kn">import</span> <span class="nn">unittest</span>
78 |
79 | <span class="kn">from</span> <span class="nn">importlib</span> <span class="kn">import</span> <span class="n">resources</span>
80 | <span class="kn">from</span> <span class="nn">urllib.request</span> <span class="kn">import</span> <span class="n">urlopen</span>
81 | <span class="kn">from</span> <span class="nn">mcp_server_webcrawl.utils.logger</span> <span class="kn">import</span> <span class="n">get_logger</span>
82 | <span class="kn">from</span> <span class="nn">mcp_server_webcrawl.extras.markdown</span> <span class="kn">import</span> <span class="n">get_markdown</span>
83 |
84 | <span class="n">logger</span> <span class="o">=</span> <span class="n">get_logger</span><span class="p">()</span>
85 |
86 | <div class="viewcode-block" id="TemplateTests">
87 | <a class="viewcode-back" href="../../../mcp_server_webcrawl.templates.html#mcp_server_webcrawl.templates.tests.TemplateTests">[docs]</a>
88 | <span class="k">class</span> <span class="nc">TemplateTests</span><span class="p">(</span><span class="n">unittest</span><span class="o">.</span><span class="n">TestCase</span><span class="p">):</span>
89 | <span class="w"> </span><span class="sd">"""</span>
90 | <span class="sd"> Test suite for the custom HTML to markdown converter.</span>
91 | <span class="sd"> Why custom? It's a bit faster, that is the only reason.</span>
92 | <span class="sd"> Maximum load is 100 transforms (1 per result for a max result </span>
93 | <span class="sd"> of 100), so speed matters. A default set is 20.</span>
94 | <span class="sd"> This converter does a few things differently to tailor to LLM</span>
95 | <span class="sd"> interaction.</span>
96 | <span class="sd"> * aggressively removes images (html2text selectively renders)</span>
97 | <span class="sd"> * links with block descendants will render like a <p> </span>
98 | <span class="sd"> (html2text treats as <a><br>) </span>
99 | <span class="sd"> """</span>
100 |
101 | <div class="viewcode-block" id="TemplateTests.setUp">
102 | <a class="viewcode-back" href="../../../mcp_server_webcrawl.templates.html#mcp_server_webcrawl.templates.tests.TemplateTests.setUp">[docs]</a>
103 | <span class="k">def</span> <span class="nf">setUp</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
104 | <span class="w"> </span><span class="sd">"""</span>
105 | <span class="sd"> Set up the test environment with fixture data.</span>
106 | <span class="sd"> """</span>
107 | <span class="nb">super</span><span class="p">()</span><span class="o">.</span><span class="n">setUp</span><span class="p">()</span></div>
108 |
109 |
110 | <div class="viewcode-block" id="TemplateTests.test_core_html">
111 | <a class="viewcode-back" href="../../../mcp_server_webcrawl.templates.html#mcp_server_webcrawl.templates.tests.TemplateTests.test_core_html">[docs]</a>
112 | <span class="k">def</span> <span class="nf">test_core_html</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
113 | <span class="n">core_html</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="n">resources</span><span class="o">.</span><span class="n">read_text</span><span class="p">(</span><span class="s2">"mcp_server_webcrawl.templates"</span><span class="p">,</span> <span class="s2">"tests_core.html"</span><span class="p">)</span>
114 | <span class="n">markdown</span> <span class="o">=</span> <span class="n">get_markdown</span><span class="p">(</span><span class="n">core_html</span><span class="p">)</span>
115 |
116 | <span class="c1"># h1-6</span>
117 | <span class="bp">self</span><span class="o">.</span><span class="n">assertIn</span><span class="p">(</span><span class="s2">"# Lorem Ipsum Dolor Sit Amet"</span><span class="p">,</span> <span class="n">markdown</span><span class="p">)</span>
118 | <span class="bp">self</span><span class="o">.</span><span class="n">assertIn</span><span class="p">(</span><span class="s2">"## Consectetur Adipiscing Elit"</span><span class="p">,</span> <span class="n">markdown</span><span class="p">)</span>
119 | <span class="bp">self</span><span class="o">.</span><span class="n">assertIn</span><span class="p">(</span><span class="s2">"### Nemo Enim Ipsam Voluptatem"</span><span class="p">,</span> <span class="n">markdown</span><span class="p">)</span>
120 | <span class="bp">self</span><span class="o">.</span><span class="n">assertIn</span><span class="p">(</span><span class="s2">"#### Sed Quia Non Numquam"</span><span class="p">,</span> <span class="n">markdown</span><span class="p">)</span>
121 | <span class="bp">self</span><span class="o">.</span><span class="n">assertIn</span><span class="p">(</span><span class="s2">"##### Nisi Ut Aliquid Ex Ea"</span><span class="p">,</span> <span class="n">markdown</span><span class="p">)</span>
122 | <span class="bp">self</span><span class="o">.</span><span class="n">assertIn</span><span class="p">(</span><span class="s2">"###### At Vero Eos Et Accusamus"</span><span class="p">,</span> <span class="n">markdown</span><span class="p">)</span>
123 |
124 | <span class="c1"># no content loss - key phrases should be preserved</span>
125 | <span class="bp">self</span><span class="o">.</span><span class="n">assertIn</span><span class="p">(</span><span class="s2">"Lorem ipsum dolor sit amet"</span><span class="p">,</span> <span class="n">markdown</span><span class="p">)</span>
126 | <span class="bp">self</span><span class="o">.</span><span class="n">assertIn</span><span class="p">(</span><span class="s2">"Definition List Example"</span><span class="p">,</span> <span class="n">markdown</span><span class="p">)</span>
127 | <span class="bp">self</span><span class="o">.</span><span class="n">assertIn</span><span class="p">(</span><span class="s2">"More Text Elements"</span><span class="p">,</span> <span class="n">markdown</span><span class="p">)</span>
128 |
129 | <span class="c1"># inline formatting (proper spacing)</span>
130 | <span class="bp">self</span><span class="o">.</span><span class="n">assertIn</span><span class="p">(</span><span class="s2">"amet, **consectetur adipiscing elit**. Sed"</span><span class="p">,</span> <span class="n">markdown</span><span class="p">)</span>
131 | <span class="bp">self</span><span class="o">.</span><span class="n">assertIn</span><span class="p">(</span><span class="s2">"laborum. **Sed ut perspiciatis** unde"</span><span class="p">,</span> <span class="n">markdown</span><span class="p">)</span>
132 | <span class="bp">self</span><span class="o">.</span><span class="n">assertIn</span><span class="p">(</span><span class="s2">"consequat. *Duis aute irure dolor* in"</span><span class="p">,</span> <span class="n">markdown</span><span class="p">)</span>
133 | <span class="bp">self</span><span class="o">.</span><span class="n">assertIn</span><span class="p">(</span><span class="s2">"laudantium. *Totam rem aperiam*, eaque"</span><span class="p">,</span> <span class="n">markdown</span><span class="p">)</span>
134 |
135 | <span class="c1"># link formatting (proper spacing)</span>
136 | <span class="bp">self</span><span class="o">.</span><span class="n">assertIn</span><span class="p">(</span><span class="s2">"veniam, quis nostrud exercitation ullamco"</span><span class="p">,</span> <span class="n">markdown</span><span class="p">)</span> <span class="c1"># Fragment links as plain text</span>
137 | <span class="bp">self</span><span class="o">.</span><span class="n">assertIn</span><span class="p">(</span><span class="s2">"and a link back to top. Nam"</span><span class="p">,</span> <span class="n">markdown</span><span class="p">)</span>
138 |
139 | <span class="c1"># list formatting</span>
140 | <span class="bp">self</span><span class="o">.</span><span class="n">assertIn</span><span class="p">(</span><span class="s2">"* Similique sunt in culpa"</span><span class="p">,</span> <span class="n">markdown</span><span class="p">)</span>
141 | <span class="bp">self</span><span class="o">.</span><span class="n">assertIn</span><span class="p">(</span><span class="s2">"1. Temporibus autem quibusdam"</span><span class="p">,</span> <span class="n">markdown</span><span class="p">)</span>
142 |
143 | <span class="c1"># dl/dt</span>
144 | <span class="bp">self</span><span class="o">.</span><span class="n">assertIn</span><span class="p">(</span><span class="s2">"**Lorem Ipsum**"</span><span class="p">,</span> <span class="n">markdown</span><span class="p">)</span>
145 | <span class="bp">self</span><span class="o">.</span><span class="n">assertIn</span><span class="p">(</span><span class="s2">" Dolor sit amet, consectetur adipiscing elit"</span><span class="p">,</span> <span class="n">markdown</span><span class="p">)</span>
146 | <span class="bp">self</span><span class="o">.</span><span class="n">assertIn</span><span class="p">(</span><span class="s2">"**Ut Enim**"</span><span class="p">,</span> <span class="n">markdown</span><span class="p">)</span>
147 | <span class="bp">self</span><span class="o">.</span><span class="n">assertIn</span><span class="p">(</span><span class="s2">" Ad minim veniam, quis nostrud exercitation"</span><span class="p">,</span> <span class="n">markdown</span><span class="p">)</span>
148 | <span class="bp">self</span><span class="o">.</span><span class="n">assertIn</span><span class="p">(</span><span class="s2">"**Duis Aute**"</span><span class="p">,</span> <span class="n">markdown</span><span class="p">)</span>
149 | <span class="bp">self</span><span class="o">.</span><span class="n">assertIn</span><span class="p">(</span><span class="s2">" Irure dolor in reprehenderit in voluptate"</span><span class="p">,</span> <span class="n">markdown</span><span class="p">)</span>
150 |
151 | <span class="c1"># table structure</span>
152 | <span class="bp">self</span><span class="o">.</span><span class="n">assertIn</span><span class="p">(</span><span class="s2">"| Lorem | Ipsum | Dolor | Sit |"</span><span class="p">,</span> <span class="n">markdown</span><span class="p">)</span>
153 | <span class="bp">self</span><span class="o">.</span><span class="n">assertIn</span><span class="p">(</span><span class="s2">"|---|---|---|---|"</span><span class="p">,</span> <span class="n">markdown</span><span class="p">)</span>
154 | <span class="bp">self</span><span class="o">.</span><span class="n">assertIn</span><span class="p">(</span><span class="s2">"| Consectetur | Adipiscing | Elit | Sed |"</span><span class="p">,</span> <span class="n">markdown</span><span class="p">)</span>
155 |
156 | <span class="c1"># code formatting</span>
157 | <span class="bp">self</span><span class="o">.</span><span class="n">assertIn</span><span class="p">(</span><span class="s2">"Here we have some `inline code` and"</span><span class="p">,</span> <span class="n">markdown</span><span class="p">)</span>
158 | <span class="bp">self</span><span class="o">.</span><span class="n">assertIn</span><span class="p">(</span><span class="s2">"```</span><span class="se">\n</span><span class="s2">function lorem() {</span><span class="se">\n</span><span class="s2"> return </span><span class="se">\"</span><span class="s2">ipsum dolor sit amet</span><span class="se">\"</span><span class="s2">;</span><span class="se">\n</span><span class="s2">}</span><span class="se">\n</span><span class="s2">```"</span><span class="p">,</span> <span class="n">markdown</span><span class="p">)</span>
159 |
160 | <span class="c1"># blockquotes</span>
161 | <span class="bp">self</span><span class="o">.</span><span class="n">assertIn</span><span class="p">(</span><span class="s2">"> </span><span class="se">\"</span><span class="s2">Sed ut perspiciatis unde omnis iste natus"</span><span class="p">,</span> <span class="n">markdown</span><span class="p">)</span>
162 |
163 | <span class="c1"># horizontal rule</span>
164 | <span class="bp">self</span><span class="o">.</span><span class="n">assertIn</span><span class="p">(</span><span class="s2">"---"</span><span class="p">,</span> <span class="n">markdown</span><span class="p">)</span>
165 |
166 | <span class="c1"># no double spacing for inline elements</span>
167 | <span class="bp">self</span><span class="o">.</span><span class="n">assertNotIn</span><span class="p">(</span><span class="s2">"** "</span><span class="p">,</span> <span class="n">markdown</span><span class="p">)</span> <span class="c1"># No double spaces after bold</span>
168 | <span class="bp">self</span><span class="o">.</span><span class="n">assertNotIn</span><span class="p">(</span><span class="s2">" **"</span><span class="p">,</span> <span class="n">markdown</span><span class="p">)</span> <span class="c1"># No double spaces before bold</span>
169 | <span class="bp">self</span><span class="o">.</span><span class="n">assertNotIn</span><span class="p">(</span><span class="s2">"* "</span><span class="p">,</span> <span class="n">markdown</span><span class="p">)</span> <span class="c1"># No double spaces after emphasis</span>
170 | <span class="bp">self</span><span class="o">.</span><span class="n">assertNotIn</span><span class="p">(</span><span class="s2">" *"</span><span class="p">,</span> <span class="n">markdown</span><span class="p">)</span> <span class="c1"># No double spaces before emphasis</span>
171 |
172 | <span class="c1"># structural integrity - count major elements</span>
173 | <span class="n">heading_count</span> <span class="o">=</span> <span class="nb">len</span><span class="p">(</span><span class="n">re</span><span class="o">.</span><span class="n">findall</span><span class="p">(</span><span class="sa">r</span><span class="s2">"^#{1,6} "</span><span class="p">,</span> <span class="n">markdown</span><span class="p">,</span> <span class="n">re</span><span class="o">.</span><span class="n">MULTILINE</span><span class="p">))</span>
174 | <span class="bp">self</span><span class="o">.</span><span class="n">assertEqual</span><span class="p">(</span><span class="n">heading_count</span><span class="p">,</span> <span class="mi">11</span><span class="p">,</span> <span class="s2">"Should have exactly 11 headings"</span><span class="p">)</span>
175 | <span class="n">table_count</span> <span class="o">=</span> <span class="nb">len</span><span class="p">(</span><span class="n">re</span><span class="o">.</span><span class="n">findall</span><span class="p">(</span><span class="sa">r</span><span class="s2">"^\|.*\|$"</span><span class="p">,</span> <span class="n">markdown</span><span class="p">,</span> <span class="n">re</span><span class="o">.</span><span class="n">MULTILINE</span><span class="p">))</span>
176 | <span class="bp">self</span><span class="o">.</span><span class="n">assertGreater</span><span class="p">(</span><span class="n">table_count</span><span class="p">,</span> <span class="mi">5</span><span class="p">,</span> <span class="s2">"Should have multiple table rows"</span><span class="p">)</span></div>
177 | </div>
178 |
179 |
180 | </pre></div>
181 |
182 | </div>
183 | </div>
184 | <footer>
185 |
186 | <hr/>
187 |
188 | <div role="contentinfo">
189 | <p>© Copyright 2025, pragmar.</p>
190 | </div>
191 |
192 | Built with <a href="https://www.sphinx-doc.org/">Sphinx</a> using a
193 | <a href="https://github.com/readthedocs/sphinx_rtd_theme">theme</a>
194 | provided by <a href="https://readthedocs.org">Read the Docs</a>.
195 |
196 |
197 | </footer>
198 | </div>
199 | </div>
200 | </section>
201 | </div>
202 | <script>
203 | jQuery(function () {
204 | SphinxRtdTheme.Navigation.enable(true);
205 | });
206 | </script>
207 |
208 | </body>
209 | </html>
```
--------------------------------------------------------------------------------
/src/mcp_server_webcrawl/interactive/views/searchform.py:
--------------------------------------------------------------------------------
```python
1 | import curses
2 |
3 | from typing import TYPE_CHECKING
4 |
5 | from mcp_server_webcrawl.interactive.ui import (
6 | UiState, InputRadio, InputRadioGroup, InputText,
7 | ThemeDefinition, NavigationDirection, safe_addstr
8 | )
9 | from mcp_server_webcrawl.interactive.views.base import BaseCursesView
10 | from mcp_server_webcrawl.models.sites import SiteResult
11 | from mcp_server_webcrawl.interactive.ui import safe_addstr
12 |
13 | if TYPE_CHECKING:
14 | from mcp_server_webcrawl.interactive.session import InteractiveSession
15 |
16 | LAYOUT_QUERY_MAX_WIDTH = 50 # upper bound on the query input box width
17 | LAYOUT_QUERY_MARGIN = 11 # subtracted from the view width when sizing the query box
18 | LAYOUT_QUERY_OFFSET = 9 # x offset of the query input from the view's left edge
19 | LAYOUT_FILTER_COLUMN_PADDING = 8 # added to the filter group's calculated width
20 | LAYOUT_SORT_COLUMN_PADDING = 6 # added to the sort group's calculated width
21 | LAYOUT_FILTER_TO_SORT_SPACING = 8 # gap between the filter column and the sort column
22 | LAYOUT_SORT_TO_SITES_SPACING = 6 # gap between the sort column and the first sites column
23 | LAYOUT_SITE_COLUMN_WIDTH = 22 # width of one sites column
24 | LAYOUT_SITE_COLUMN_SPACING = 2 # gap between adjacent sites columns
25 | LAYOUT_SITES_VERTICAL_OFFSET = 6 # rows subtracted from the view height when computing sites per column
26 | LAYOUT_SITES_MIN_WIDTH_REQUIREMENT = 16 # sites are drawn only when this much width remains past the sites start x
27 | LAYOUT_CONSTRAINED_SITES_PER_COLUMN = 3 # sites per column while the UI state is SEARCH_RESULTS
28 | LAYOUT_TRUNCATED_LABEL_MAX_LENGTH = 18 # passed to each site radio's render() together with the truncated label
29 | LAYOUT_OVERFLOW_INDICATOR_MARGIN = 2 # right-hand margin for the "+N more" overflow text
30 |
31 | class SearchFormNavigationGrid: # maps (row, col) grid cells to flat form-field indices for arrow-key navigation
32 | def __init__(self, ui_state: UiState, filter_group: InputRadioGroup, sort_group: InputRadioGroup,
33 | sites_group: InputRadioGroup, sites_per_column: int):
34 | """
35 | Create virtual grid for navigation:
36 | query(0)
37 | filter0 sort0 site0 site3 site6
38 | filter1 sort1 site1 site4 site7
39 | sort2 site2 site5 site8+
40 | """
41 | self.__grid: dict[tuple[int, int], int] = {} # (row, col) -> field index
42 | self.__reverse_grid: dict[int, tuple[int, int]] = {} # field index -> (row, col)
43 |
44 | # query spans columns 0-2, row 0
45 | for col in range(3):
46 | self.__grid[(0, col)] = 0
47 | self.__reverse_grid[0] = (0, 0) # the query's canonical cell is column 0
48 |
49 | for i, _ in enumerate(filter_group.radios):
50 | row = 1 + i
51 | index = 1 + i # filter indices start at 1
52 | self.__grid[(row, 0)] = index
53 | self.__reverse_grid[index] = (row, 0)
54 |
55 | sort_start_index = 1 + len(filter_group.radios) # sort indices follow the filter indices
56 | for i, _ in enumerate(sort_group.radios):
57 | row = 1 + i
58 | index = sort_start_index + i
59 | self.__grid[(row, 1)] = index
60 | self.__reverse_grid[index] = (row, 1)
61 |
62 | sites_start_index = 1 + len(filter_group.radios) + len(sort_group.radios) # site indices follow the sort indices
63 | self.__ui_state = ui_state # consulted by down()
64 |
65 | for i, _ in enumerate(sites_group.radios):
66 | row = 1 + (i % sites_per_column) # NOTE(review): sites_per_column == 0 would raise ZeroDivisionError here - confirm callers never pass 0
67 | col = 2 + (i // sites_per_column) # sites fill column by column, starting at grid column 2
68 | index = sites_start_index + i
69 | self.__grid[(row, col)] = index
70 | self.__reverse_grid[index] = (row, col)
71 |
72 | def __rightmost_column(self, row: int) -> int:
73 | """
74 | Get the rightmost column that has content in the given row.
75 | """
76 | max_col = -1 # -1 is returned when the row has no cells
77 | for (r, c) in self.__grid.keys():
78 | if r == row:
79 | max_col = max(max_col, c)
80 | return max_col
81 |
82 | def __leftmost_column(self, row: int) -> int:
83 | """
84 | Get the leftmost column that has content in the given row.
85 | """
86 | min_col = float('inf') # sentinel meaning "no cell found in this row"
87 | for (r, c) in self.__grid.keys():
88 | if r == row:
89 | min_col = min(min_col, c)
90 | return min_col if min_col != float('inf') else -1 # -1 mirrors __rightmost_column's empty-row result
91 |
92 | def left(self, current_index: int) -> int | None:
93 | """
94 | Navigate left from current index. Wraps to rightmost element if at left edge.
95 | """
96 | if current_index not in self.__reverse_grid:
97 | return None
98 |
99 | row, col = self.__reverse_grid[current_index]
100 |
101 | # move normally if destination exists
102 | if col > 0:
103 | new_pos = (row, col - 1)
104 | if new_pos in self.__grid:
105 | return self.__grid[new_pos]
106 |
107 | # wrap on edge
108 | rightmost_col = self.__rightmost_column(row)
109 | if rightmost_col >= 0 and rightmost_col != col: # no wrap target when already in the rightmost column
110 | wrap_pos = (row, rightmost_col)
111 | return self.__grid.get(wrap_pos)
112 |
113 | return None
114 |
115 | def right(self, current_index: int) -> int | None:
116 | """
117 | Navigate right from current index. Wraps to leftmost element if at right edge.
118 | """
119 | if current_index not in self.__reverse_grid:
120 | return None
121 |
122 | row, col = self.__reverse_grid[current_index]
123 |
124 | # move normally if destination exists
125 | new_pos = (row, col + 1)
126 | if new_pos in self.__grid: # for the query (row 0) column 1 also maps to index 0, so this returns 0
127 | return self.__grid[new_pos]
128 |
129 | # wrap on edge
130 | leftmost_col = self.__leftmost_column(row)
131 | if leftmost_col >= 0 and leftmost_col != col: # no wrap target when already in the leftmost column
132 | wrap_pos = (row, leftmost_col)
133 | return self.__grid.get(wrap_pos)
134 |
135 | return None
136 |
137 | def up(self, current_index: int) -> int | None:
138 | """
139 | Navigate up from current index. From any radio column goes to query(0).
140 | """
141 | if current_index not in self.__reverse_grid:
142 | return None
143 |
144 | row, col = self.__reverse_grid[current_index]
145 | if row == 0: # already on the query row, nothing above
146 | return None
147 | if row == 1: # first radio row always jumps to the query
148 | return 0
149 |
150 | # otherwise, move up normally
151 | if row > 1: # always true here: rows 0 and 1 returned above
152 | new_pos = (row - 1, col)
153 | return self.__grid.get(new_pos)
154 |
155 | return None
156 |
157 | def down(self, current_index: int) -> int | None:
158 | """
159 | Navigate down from current index.
160 | """
161 | if current_index not in self.__reverse_grid:
162 | return None
163 |
164 | # In SEARCH_INIT mode, advance by one
165 | if self.__ui_state == UiState.SEARCH_INIT:
166 | return current_index + 1 if current_index + 1 in self.__reverse_grid else None # follows the flat index order, not the grid rows
167 |
168 | row, col = self.__reverse_grid[current_index]
169 | new_pos = (row + 1, col)
170 | return self.__grid.get(new_pos) # None when there is no cell directly below
171 |
172 |
173 | class SearchFormView(BaseCursesView):
174 | """
175 | Handles search form state and rendering.
176 | Takes over all the form_* properties and methods from session.
177 | """
178 |
179 | def __init__(self, session: 'InteractiveSession', sites: list[SiteResult]):
180 | """
181 | Initialize the search form view.
182 |
183 | Args:
184 | session: The interactive session instance
185 | sites: List of available sites for selection
186 | """
187 | super().__init__(session)
188 | self.__search_attempted: bool = False
189 | self.__sites: list[SiteResult] = sites
190 | self.__sites_selected: list[SiteResult] = []
191 | self.__query_input = InputText(initial_value="", label="Query")
192 | self.__limit = 10 # page size; page_next/page_previous move the offset in steps of this
193 | self.__offset = 0 # start of the current results page
194 |
195 | if sites: # default selection is the first site, when any exist
196 | self.__sites_selected.append(self.__sites[0])
197 |
198 | self.__filter_group: InputRadioGroup = InputRadioGroup("filter")
199 | self.__sort_group: InputRadioGroup = InputRadioGroup("sort")
200 | self.__sites_group: InputRadioGroup = InputRadioGroup("site", sites=self.__sites) # the site group is built from the available sites
201 |
202 | @property
203 | def filter(self) -> str: # current value of the "filter" radio group
204 | return self.__filter_group.value
205 |
206 | @property
207 | def limit(self) -> int: # page size; __limit is initialised to the int 10 in __init__
208 | return self.__limit
209 |
210 | @property
211 | def offset(self) -> int: # start of the current page; __offset is an int moved by page_next/page_previous
212 | return self.__offset
213 |
214 | @property
215 | def query(self) -> str: # current value of the query text input
216 | return self.__query_input.value
217 |
218 | @property
219 | def sort(self) -> str: # lower-cased value of the sort radio group
220 | return self.__sort_group.value.lower() if self.__sort_group.value is not None else "+url" # falls back to "+url" when the group has no value
221 |
222 | def clear_query(self) -> None:
223 | """
224 | Clear only the query, preserve selections (was session method).
225 | """
226 | self.__search_attempted = False
227 | self.__query_input.clear()
228 | self._selected_index = 0 # index 0 is the query input (see handle_input and render)
229 | self.__offset = 0 # back to the first page of results
230 |
231 | def focus(self):
232 | """
233 | Set focus on this view.
234 | """
235 | self._focused = True # render() reads this flag when deciding whether the query input is drawn as focused
236 |
237 | def get_selected_sites(self) -> list[SiteResult]: # returns a shallow copy, so callers cannot mutate the view's own list
238 | return self.__sites_selected.copy()
239 |
240 | def handle_input(self, key: int) -> bool:
241 | """
242 | Handle keyboard input and trigger search when state changes.
243 |
244 | Args:
245 | key: The curses key code from user input
246 |
247 | Returns:
248 | bool: True if the input was handled, False otherwise
249 | """
250 |
251 | handlers: dict[int, callable] = { # NOTE(review): `callable` is the builtin predicate, not a type - typing.Callable is the usual annotation
252 | curses.KEY_UP: lambda: self.__navigate_form_selection(NavigationDirection.UP),
253 | curses.KEY_DOWN: lambda: self.__navigate_form_selection(NavigationDirection.DOWN),
254 | curses.KEY_LEFT: lambda: self.__handle_horizontal_arrow(NavigationDirection.LEFT),
255 | curses.KEY_RIGHT: lambda: self.__handle_horizontal_arrow(NavigationDirection.RIGHT),
256 | ord(' '): self.__handle_spacebar,
257 | ord('\n'): self.__handle_enter,
258 | ord('\r'): self.__handle_enter,
259 | }
260 |
261 | handler = handlers.get(key)
262 | if handler: # any mapped key counts as handled, whatever the handler itself returns
263 | handler()
264 | return True
265 |
266 | if self._selected_index == 0: # index 0 is the query input; unmapped keys are offered to it
267 | if self.__query_input.handle_input(key):
268 | self.session.searchman.autosearch() # the query input accepted the key, so autosearch is triggered
269 | return True
270 |
271 | return False
272 |
273 | def page_next(self, total_results: int) -> bool:
274 | """
275 | Navigate to next page.
276 |
277 | Args:
278 | total_results: Total number of search results available
279 |
280 | Returns:
281 | bool: True if page was changed, False otherwise
282 | """
283 | if self.__offset + self.__limit < total_results: # advance only when results exist beyond the current page
284 | self.__offset += self.__limit
285 | return True
286 | return False
287 |
288 | def page_previous(self) -> bool:
289 | """
290 | Navigate to previous page.
291 |
292 | Returns:
293 | bool: True if page was changed, False otherwise
294 | """
295 | if self.__offset >= self.__limit: # guard keeps the offset from going negative
296 | self.__offset -= self.__limit
297 | return True
298 | return False
299 |
300 | def render(self, stdscr: curses.window) -> None:
301 | """
302 | Render the search form with multi-column sites layout.
303 | """
304 | xb: int = self.bounds.x
305 | yb: int = self.bounds.y
306 | y_current: int = yb + 2 # y start
307 | y_max: int = yb + self.bounds.height
308 |
309 | if not self._renderable(stdscr):
310 | return
311 |
312 | safe_addstr(stdscr, y_current, xb + 2, "Query:")
313 |
314 | box_width = min(LAYOUT_QUERY_MAX_WIDTH, self.bounds.width - LAYOUT_QUERY_MARGIN)
315 | is_query_selected = (self._focused and self._selected_index == 0)
316 |
317 | self.__query_input.render(stdscr, y_current, xb + LAYOUT_QUERY_OFFSET, box_width,
318 | focused=is_query_selected, style=self._get_input_style())
319 |
320 | y_current += 2
321 | if y_current >= y_max:
322 | return
323 |
324 | # radio column layout - calculated positions based on content width
325 | filter_column_width = self.__filter_group.calculate_group_width() + LAYOUT_FILTER_COLUMN_PADDING
326 | sort_column_width = self.__sort_group.calculate_group_width() + LAYOUT_SORT_COLUMN_PADDING
327 | sort_start_x = filter_column_width + LAYOUT_FILTER_TO_SORT_SPACING
328 | sites_start_x = sort_start_x + sort_column_width + LAYOUT_SORT_TO_SITES_SPACING
329 |
330 | safe_addstr(stdscr, y_current, xb + 2, self.__filter_group.label)
331 | safe_addstr(stdscr, y_current, xb + sort_start_x, self.__sort_group.label)
332 | if sites_start_x + LAYOUT_SITES_MIN_WIDTH_REQUIREMENT < self.bounds.width:
333 | safe_addstr(stdscr, y_current, xb + sites_start_x, self.__sites_group.label)
334 | if not self.__sites:
335 | error_style = self.session.get_theme_color_pair(ThemeDefinition.UI_ERROR)
336 | safe_addstr(stdscr, y_current + 1, xb + sites_start_x, "No sites available", error_style)
337 |
338 | y_current += 1
339 |
340 | available_width = self.bounds.width - sites_start_x - 4
341 | is_constrained = self.session.ui_state == UiState.SEARCH_RESULTS
342 | sites_per_column = (LAYOUT_CONSTRAINED_SITES_PER_COLUMN if is_constrained
343 | else min(self.bounds.height - LAYOUT_SITES_VERTICAL_OFFSET, len(self.__sites_group.radios)))
344 | max_columns = (max(1, available_width // (LAYOUT_SITE_COLUMN_WIDTH + LAYOUT_SITE_COLUMN_SPACING))
345 | if available_width > LAYOUT_SITE_COLUMN_WIDTH else 1)
346 | total_visible_sites = max_columns * sites_per_column
347 | overflow_count = max(0, len(self.__sites_group.radios) - total_visible_sites)
348 | max_rows = max(len(self.__filter_group.radios), len(self.__sort_group.radios), sites_per_column)
349 |
350 | for i in range(max_rows):
351 |
352 | if y_current >= y_max:
353 | return
354 |
355 | # filter radios
356 | if i < len(self.__filter_group.radios):
357 | filter_radio: InputRadio = self.__filter_group.radios[i]
358 | field_index: int = 1 + i
359 | is_selected: bool = self._selected_index == field_index
360 | filter_radio.render(stdscr, y_current, xb + 2, field_index, 100, is_selected)
361 |
362 | # sorts radios
363 | if i < len(self.__sort_group.radios):
364 | sort_radio: InputRadio = self.__sort_group.radios[i]
365 | field_index: int = 1 + len(self.__filter_group.radios) + i
366 | is_selected: bool = self._selected_index == field_index
367 | sort_radio.render(stdscr, y_current, xb + sort_start_x, field_index, 100, is_selected)
368 |
369 | # sites radios - multiple columns
370 | if sites_start_x + LAYOUT_SITES_MIN_WIDTH_REQUIREMENT < self.bounds.width:
371 | for col in range(max_columns):
372 | site_index = col * sites_per_column + i
373 | if site_index < len(self.__sites_group.radios) and site_index < total_visible_sites:
374 | site_radio: InputRadio = self.__sites_group.radios[site_index]
375 | field_index: int = 1 + len(self.__sort_group.radios) + len(self.__filter_group.radios) + site_index
376 | is_selected: bool = self._selected_index == field_index
377 | col_x = sites_start_x + col * (LAYOUT_SITE_COLUMN_WIDTH + LAYOUT_SITE_COLUMN_SPACING)
378 | original_label = site_radio.label
379 | site_radio.label = self.__truncate_label(original_label)
380 | site_radio.render(stdscr, y_current, xb + col_x, field_index, LAYOUT_TRUNCATED_LABEL_MAX_LENGTH, is_selected)
381 | site_radio.label = original_label # restore original label
382 |
383 | # overflow indicator on last row, right-aligned
384 | if (overflow_count > 0 and i == sites_per_column - 1 and
385 | sites_start_x + LAYOUT_SITES_MIN_WIDTH_REQUIREMENT < self.bounds.width):
386 | overflow_text: str = f"+{overflow_count} more"
387 | overflow_x: int = self.bounds.width - len(overflow_text) - LAYOUT_OVERFLOW_INDICATOR_MARGIN
388 | try:
389 | safe_addstr(stdscr, y_current, overflow_x, overflow_text, curses.A_DIM)
390 | except curses.error:
391 | pass
392 |
393 | y_current += 1
394 |
395 | def set_search_attempted(self) -> None:
396 | """
397 | Record that a search has been attempted by setting the private attempted flag to True.
398 | """
399 | self.__search_attempted = True
400 |
401 | def unfocus(self) -> None:
402 | """
403 | Remove focus from this view by clearing the _focused flag.
404 | """
405 | self._focused = False
406 |
407 | def __get_sites_per_column(self) -> int:
408 | """
409 | Return the number of site radios per column: the constrained constant in SEARCH_RESULTS, otherwise the smaller of (bounds height minus the vertical offset) and the site radio count.
410 | """
411 | is_constrained = self.session.ui_state == UiState.SEARCH_RESULTS
412 | return (LAYOUT_CONSTRAINED_SITES_PER_COLUMN if is_constrained
413 | else min(self.bounds.height - LAYOUT_SITES_VERTICAL_OFFSET, len(self.__sites_group.radios)))
414 |
415 | def __handle_enter(self) -> None:
416 | """
417 | Handle ENTER key - on the query field, calls autosearch(); on a radio, toggles it and, unless in SEARCH_INIT, calls autosearch(immediate=True).
418 | """
419 |
420 | if self._selected_index == 0: # query field
421 | self.session.searchman.autosearch()
422 | else: # radios
423 | self.__handle_radio_toggle()
424 | if self.session.ui_state != UiState.SEARCH_INIT:
425 | self.session.searchman.autosearch(immediate=True)
426 |
427 | def __handle_horizontal_arrow(self, direction: NavigationDirection) -> None:
428 | """
429 | Handle left/right arrow navigation: moves the text cursor on the query field, otherwise uses the directional grid.
430 |
431 | Args:
432 | direction: The navigation direction (LEFT or RIGHT)
433 | """
434 | if self.session.ui_state is None:
435 | return
436 |
437 | # query field handles cursor movement internally
438 | if self._selected_index == 0:
439 | if direction == NavigationDirection.LEFT:
440 | self.__query_input.move_cursor_left()
441 | else: # any direction other than LEFT is treated as RIGHT
442 | self.__query_input.move_cursor_right()
443 | return
444 |
445 | # use grid navigation for all other fields
446 | grid = SearchFormNavigationGrid(self.session.ui_state, self.__filter_group, self.__sort_group,
447 | self.__sites_group, self.__get_sites_per_column())
448 | if direction == NavigationDirection.LEFT:
449 | new_index = grid.left(self._selected_index)
450 | else:
451 | new_index = grid.right(self._selected_index)
452 | # a None result leaves the current selection unchanged
453 | if new_index is not None:
454 | self._selected_index = new_index
455 |
456 | def __handle_radio_toggle(self) -> None:
457 | """
458 | Handle radio button toggles for filters, sorts, and sites, based on the selected field index.
459 | """
460 | filter_index_start: int = 1
461 | sorts_index_start: int = filter_index_start + len(self.__filter_group.radios)
462 | sites_index_start: int = sorts_index_start + len(self.__sort_group.radios)
463 | # index 0 is the query field; filters, sorts, and sites follow in that order
464 | if self._selected_index >= filter_index_start and self._selected_index < sorts_index_start:
465 | filter_index = self._selected_index - filter_index_start
466 | filter_input: InputRadio = self.__filter_group.radios[filter_index]
467 | filter_input.next_state()
468 | elif self._selected_index >= sorts_index_start and self._selected_index < sites_index_start:
469 | sort_index = self._selected_index - sorts_index_start
470 | sort_input: InputRadio = self.__sort_group.radios[sort_index]
471 | sort_input.next_state()
472 | elif self._selected_index >= sites_index_start:
473 | site_index = self._selected_index - sites_index_start
474 | if site_index < len(self.__sites) and site_index < len(self.__sites_group.radios):
475 | site_input: InputRadio = self.__sites_group.radios[site_index]
476 | site_input.next_state()
477 | self.__sites_selected = [self.__sites[site_index]] # selection is replaced by the single toggled site
478 |
479 | def __handle_spacebar(self) -> None:
480 | """
481 | Handle spacebar: inserts a space into the query field, or toggles the selected radio. Field order: Query, Filters, Sorts, Sites.
482 | """
483 | if self._selected_index == 0: # query field
484 | self.__query_input.insert_char(" ")
485 | self.session.searchman.autosearch()
486 | else: # radios
487 | self.__handle_radio_toggle()
488 | if self.session.ui_state != UiState.SEARCH_INIT:
489 | self.session.searchman.autosearch()
490 |
491 | def __navigate_form_selection(self, direction: NavigationDirection) -> None:
492 | """
493 | Navigate between form fields. Updated for new field order: Query, Filters, Sites.
494 |
495 | Args:
496 | direction: The navigation direction (UP or DOWN)
497 | """
498 | # query(0), filters(1-2), sorts(3-5), sites(...)
499 | last_field_index = 5 + len(self.__sites)
500 | if direction == NavigationDirection.UP:
501 | if self._selected_index == 0:
502 | self._selected_index = last_field_index
503 | else:
504 | self._selected_index -= 1
505 | elif direction == NavigationDirection.DOWN:
506 | if self._selected_index == last_field_index:
507 | self._selected_index = 0
508 | else:
509 | self._selected_index += 1
510 |
511 | def __truncate_label(self, label: str, max_length: int = LAYOUT_TRUNCATED_LABEL_MAX_LENGTH) -> str:
512 | """
513 | Truncate label to max_length characters; a label that is cut ends with an ellipsis in its last position.
514 |
515 | Args:
516 | label: The label text to truncate
517 | max_length: Maximum allowed length for the label (values below 1 are not guarded against)
518 |
519 | Returns:
520 | str: The truncated label with ellipsis if needed
521 | """
522 | if len(label) <= max_length:
523 | return label
524 | return label[:max_length - 1] + "…" # keep max_length - 1 characters, then the ellipsis
525 |
```
--------------------------------------------------------------------------------
/docs/_modules/mcp_server_webcrawl/extras/regex.html:
--------------------------------------------------------------------------------
```html
1 |
2 |
3 | <!DOCTYPE html>
4 | <html class="writer-html5" lang="en" data-content_root="../../../">
5 | <head>
6 | <meta charset="utf-8" />
7 | <meta name="viewport" content="width=device-width, initial-scale=1.0" />
8 | <title>mcp_server_webcrawl.extras.regex — mcp-server-webcrawl documentation</title>
9 | <link rel="stylesheet" type="text/css" href="../../../_static/pygments.css?v=80d5e7a1" />
10 | <link rel="stylesheet" type="text/css" href="../../../_static/css/theme.css?v=e59714d7" />
11 |
12 |
13 | <script src="../../../_static/jquery.js?v=5d32c60e"></script>
14 | <script src="../../../_static/_sphinx_javascript_frameworks_compat.js?v=2cd50e6c"></script>
15 | <script src="../../../_static/documentation_options.js?v=5929fcd5"></script>
16 | <script src="../../../_static/doctools.js?v=888ff710"></script>
17 | <script src="../../../_static/sphinx_highlight.js?v=dc90522c"></script>
18 | <script src="../../../_static/js/theme.js"></script>
19 | <link rel="index" title="Index" href="../../../genindex.html" />
20 | <link rel="search" title="Search" href="../../../search.html" />
21 | </head>
22 |
23 | <body class="wy-body-for-nav">
24 | <div class="wy-grid-for-nav">
25 | <nav data-toggle="wy-nav-shift" class="wy-nav-side">
26 | <div class="wy-side-scroll">
27 | <div class="wy-side-nav-search" >
28 |
29 |
30 |
31 | <a href="../../../index.html" class="icon icon-home">
32 | mcp-server-webcrawl
33 | </a>
34 | <div role="search">
35 | <form id="rtd-search-form" class="wy-form" action="../../../search.html" method="get">
36 | <input type="text" name="q" placeholder="Search docs" aria-label="Search docs" />
37 | <input type="hidden" name="check_keywords" value="yes" />
38 | <input type="hidden" name="area" value="default" />
39 | </form>
40 | </div>
41 | </div><div class="wy-menu wy-menu-vertical" data-spy="affix" role="navigation" aria-label="Navigation menu">
42 | <p class="caption" role="heading"><span class="caption-text">Contents:</span></p>
43 | <ul>
44 | <li class="toctree-l1"><a class="reference internal" href="../../../installation.html">Installation</a></li>
45 | <li class="toctree-l1"><a class="reference internal" href="../../../guides.html">Setup Guides</a></li>
46 | <li class="toctree-l1"><a class="reference internal" href="../../../usage.html">Usage</a></li>
47 | <li class="toctree-l1"><a class="reference internal" href="../../../prompts.html">Prompt Routines</a></li>
48 | <li class="toctree-l1"><a class="reference internal" href="../../../modules.html">mcp_server_webcrawl</a></li>
49 | </ul>
50 |
51 | </div>
52 | </div>
53 | </nav>
54 |
55 | <section data-toggle="wy-nav-shift" class="wy-nav-content-wrap"><nav class="wy-nav-top" aria-label="Mobile navigation menu" >
56 | <i data-toggle="wy-nav-top" class="fa fa-bars"></i>
57 | <a href="../../../index.html">mcp-server-webcrawl</a>
58 | </nav>
59 |
60 | <div class="wy-nav-content">
61 | <div class="rst-content">
62 | <div role="navigation" aria-label="Page navigation">
63 | <ul class="wy-breadcrumbs">
64 | <li><a href="../../../index.html" class="icon icon-home" aria-label="Home"></a></li>
65 | <li class="breadcrumb-item"><a href="../../index.html">Module code</a></li>
66 | <li class="breadcrumb-item active">mcp_server_webcrawl.extras.regex</li>
67 | <li class="wy-breadcrumbs-aside">
68 | </li>
69 | </ul>
70 | <hr/>
71 | </div>
72 | <div role="main" class="document" itemscope="itemscope" itemtype="http://schema.org/Article">
73 | <div itemprop="articleBody">
74 |
75 | <h1>Source code for mcp_server_webcrawl.extras.regex</h1><div class="highlight"><pre>
76 | <span></span><span class="kn">import</span> <span class="nn">re</span>
77 |
78 | <span class="kn">from</span> <span class="nn">functools</span> <span class="kn">import</span> <span class="n">lru_cache</span>
79 | <span class="kn">from</span> <span class="nn">typing</span> <span class="kn">import</span> <span class="n">Final</span>
80 | <span class="kn">from</span> <span class="nn">logging</span> <span class="kn">import</span> <span class="n">Logger</span>
81 |
82 | <span class="kn">from</span> <span class="nn">mcp_server_webcrawl.utils.logger</span> <span class="kn">import</span> <span class="n">get_logger</span>
83 |
84 | <span class="n">__REGEX_PATTERNS_REGEX_HAZARDS</span><span class="p">:</span> <span class="n">Final</span><span class="p">[</span><span class="nb">list</span><span class="p">[</span><span class="nb">str</span><span class="p">]]</span> <span class="o">=</span> <span class="p">[</span>
85 | <span class="sa">r</span><span class="s2">"\([^)]*\*[^)]*\+"</span><span class="p">,</span> <span class="c1"># (.*)*+, (.+)*+, etc.</span>
86 | <span class="sa">r</span><span class="s2">"\([^)]*\+[^)]*\*"</span><span class="p">,</span> <span class="c1"># (.+)*., (.*)++, etc.</span>
87 | <span class="sa">r</span><span class="s2">"\([^)]*\+[^)]*\+"</span><span class="p">,</span> <span class="c1"># (.+)+, (.++)+ etc.</span>
88 | <span class="sa">r</span><span class="s2">"\([^)]*\*[^)]*\*"</span><span class="p">,</span> <span class="c1"># (.*)*, (.**) etc.</span>
89 | <span class="sa">r</span><span class="s2">"\.\*.*\.\*"</span><span class="p">,</span> <span class="c1"># .*.* patterns</span>
90 | <span class="sa">r</span><span class="s2">"\.\+.*\.\+"</span><span class="p">,</span> <span class="c1"># .+.+ patterns</span>
91 | <span class="sa">r</span><span class="s2">"\([^)]*\?\)\*"</span><span class="p">,</span> <span class="c1"># (a?)* patterns</span>
92 | <span class="sa">r</span><span class="s2">"\([^)]*\?\)\+"</span><span class="p">,</span> <span class="c1"># (a?)+ patterns</span>
93 | <span class="sa">r</span><span class="s2">"\([^)]*[*+?][^)]*[*+?][^)]*\)[*+]"</span><span class="p">,</span> <span class="c1"># 2+ quantifiers inside, then quantifier outside</span>
94 | <span class="p">]</span>
95 |
96 | <span class="n">logger</span><span class="p">:</span> <span class="n">Logger</span> <span class="o">=</span> <span class="n">get_logger</span><span class="p">()</span>
97 |
98 | <span class="nd">@lru_cache</span><span class="p">(</span><span class="n">maxsize</span><span class="o">=</span><span class="kc">None</span><span class="p">)</span>
99 | <span class="k">def</span> <span class="nf">__get_compiled_hazard_patterns</span><span class="p">():</span>
100 | <span class="w"> </span><span class="sd">"""</span>
101 | <span class="sd"> Lazy load compiled patterns</span>
102 | <span class="sd"> """</span>
103 | <span class="n">compiled_patterns</span> <span class="o">=</span> <span class="p">[]</span>
104 | <span class="k">for</span> <span class="n">hazard</span> <span class="ow">in</span> <span class="n">__REGEX_PATTERNS_REGEX_HAZARDS</span><span class="p">:</span>
105 | <span class="k">try</span><span class="p">:</span>
106 | <span class="n">compiled_patterns</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="n">re</span><span class="o">.</span><span class="n">compile</span><span class="p">(</span><span class="n">hazard</span><span class="p">))</span>
107 | <span class="k">except</span> <span class="n">re</span><span class="o">.</span><span class="n">error</span> <span class="k">as</span> <span class="n">ex</span><span class="p">:</span>
108 | <span class="n">logger</span><span class="o">.</span><span class="n">warning</span><span class="p">(</span><span class="sa">f</span><span class="s2">"Invalid hazard pattern </span><span class="si">{</span><span class="n">hazard</span><span class="si">}</span><span class="s2">: </span><span class="si">{</span><span class="n">ex</span><span class="si">}</span><span class="s2">"</span><span class="p">)</span>
109 | <span class="k">continue</span>
110 | <span class="k">return</span> <span class="n">compiled_patterns</span>
111 |
112 | <span class="k">def</span> <span class="nf">__regex_is_hazardous</span><span class="p">(</span><span class="n">pattern</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-></span> <span class="nb">bool</span><span class="p">:</span>
113 | <span class="w"> </span><span class="sd">"""</span>
114 | <span class="sd"> Check if a regex pattern might cause catastrophic backtracking</span>
115 | <span class="sd"> or otherwise unacceptable performance over up to 100 HTML files</span>
116 | <span class="sd"> """</span>
117 |
118 | <span class="n">compiled_hazards</span> <span class="o">=</span> <span class="n">__get_compiled_hazard_patterns</span><span class="p">()</span>
119 |
120 | <span class="k">for</span> <span class="n">hazard_pattern</span> <span class="ow">in</span> <span class="n">compiled_hazards</span><span class="p">:</span>
121 | <span class="k">try</span><span class="p">:</span>
122 | <span class="k">if</span> <span class="n">hazard_pattern</span><span class="o">.</span><span class="n">search</span><span class="p">(</span><span class="n">pattern</span><span class="p">):</span>
123 | <span class="n">logger</span><span class="o">.</span><span class="n">error</span><span class="p">(</span><span class="sa">f</span><span class="s2">"hazardous regex discarded </span><span class="si">{</span><span class="n">pattern</span><span class="si">}</span><span class="s2"> matched </span><span class="si">{</span><span class="n">hazard_pattern</span><span class="o">.</span><span class="n">pattern</span><span class="si">}</span><span class="s2">"</span><span class="p">)</span>
124 | <span class="k">return</span> <span class="kc">True</span>
125 | <span class="k">except</span> <span class="n">re</span><span class="o">.</span><span class="n">error</span> <span class="k">as</span> <span class="n">ex</span><span class="p">:</span>
126 | <span class="n">logger</span><span class="o">.</span><span class="n">warning</span><span class="p">(</span><span class="sa">f</span><span class="s2">"Error checking hazard pattern </span><span class="si">{</span><span class="n">hazard_pattern</span><span class="o">.</span><span class="n">pattern</span><span class="si">}</span><span class="s2">: </span><span class="si">{</span><span class="n">ex</span><span class="si">}</span><span class="s2">"</span><span class="p">)</span>
127 | <span class="k">continue</span>
128 |
129 | <span class="k">return</span> <span class="kc">False</span>
130 |
131 | <div class="viewcode-block" id="get_regex">
132 | <a class="viewcode-back" href="../../../mcp_server_webcrawl.extras.html#mcp_server_webcrawl.extras.regex.get_regex">[docs]</a>
133 | <span class="k">def</span> <span class="nf">get_regex</span><span class="p">(</span><span class="n">headers</span><span class="p">:</span> <span class="nb">str</span><span class="p">,</span> <span class="n">content</span><span class="p">:</span> <span class="nb">str</span><span class="p">,</span> <span class="n">patterns</span><span class="p">:</span> <span class="nb">list</span><span class="p">[</span><span class="nb">str</span><span class="p">])</span> <span class="o">-></span> <span class="nb">list</span><span class="p">[</span><span class="nb">dict</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="nb">str</span> <span class="o">|</span> <span class="nb">int</span><span class="p">]]:</span>
134 | <span class="w"> </span><span class="sd">"""</span>
135 | <span class="sd"> Takes headers and content and gets regex matches</span>
136 |
137 | <span class="sd"> Arguments:</span>
138 | <span class="sd"> headers: The headers to search</span>
139 | <span class="sd"> content: The content to search</span>
140 | <span class="sd"> patterns: The regex patterns</span>
141 |
142 | <span class="sd"> Returns:</span>
143 | <span class="sd"> A list of dicts, with selector, value, groups, position info, and source</span>
144 | <span class="sd"> """</span>
145 |
146 | <span class="k">if</span> <span class="ow">not</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">content</span><span class="p">,</span> <span class="nb">str</span><span class="p">):</span>
147 | <span class="n">content</span> <span class="o">=</span> <span class="s2">""</span>
148 | <span class="k">if</span> <span class="ow">not</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">headers</span><span class="p">,</span> <span class="nb">str</span><span class="p">):</span>
149 | <span class="n">headers</span> <span class="o">=</span> <span class="s2">""</span>
150 |
151 | <span class="k">if</span> <span class="ow">not</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">patterns</span><span class="p">,</span> <span class="nb">list</span><span class="p">)</span> <span class="ow">or</span> <span class="ow">not</span> <span class="nb">all</span><span class="p">(</span><span class="nb">isinstance</span><span class="p">(</span><span class="n">item</span><span class="p">,</span> <span class="nb">str</span><span class="p">)</span> <span class="k">for</span> <span class="n">item</span> <span class="ow">in</span> <span class="n">patterns</span><span class="p">):</span>
152 | <span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span><span class="s2">"patterns must be a list of strings"</span><span class="p">)</span>
153 |
154 | <span class="n">results</span> <span class="o">=</span> <span class="p">[]</span>
155 |
156 | <span class="k">if</span> <span class="n">content</span> <span class="o">==</span> <span class="s2">""</span> <span class="ow">and</span> <span class="n">headers</span> <span class="o">==</span> <span class="s2">""</span><span class="p">:</span>
157 | <span class="k">return</span> <span class="n">results</span>
158 |
159 | <span class="n">re_patterns</span> <span class="o">=</span> <span class="p">[]</span>
160 | <span class="k">for</span> <span class="n">pattern</span> <span class="ow">in</span> <span class="n">patterns</span><span class="p">:</span>
161 | <span class="k">if</span> <span class="n">__regex_is_hazardous</span><span class="p">(</span><span class="n">pattern</span><span class="p">):</span>
162 | <span class="n">logger</span><span class="o">.</span><span class="n">warning</span><span class="p">(</span><span class="sa">f</span><span class="s2">"Hazardous regex pattern '</span><span class="si">{</span><span class="n">pattern</span><span class="si">}</span><span class="s2">'"</span><span class="p">)</span>
163 | <span class="k">continue</span>
164 |
165 | <span class="k">try</span><span class="p">:</span>
166 | <span class="n">re_pattern</span> <span class="o">=</span> <span class="n">re</span><span class="o">.</span><span class="n">compile</span><span class="p">(</span><span class="n">pattern</span><span class="p">)</span>
167 | <span class="n">re_patterns</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="n">re_pattern</span><span class="p">)</span>
168 | <span class="k">except</span> <span class="n">re</span><span class="o">.</span><span class="n">error</span> <span class="k">as</span> <span class="n">ex</span><span class="p">:</span>
169 | <span class="n">logger</span><span class="o">.</span><span class="n">warning</span><span class="p">(</span><span class="sa">f</span><span class="s2">"Invalid regex pattern '</span><span class="si">{</span><span class="n">pattern</span><span class="si">}</span><span class="s2">': </span><span class="si">{</span><span class="n">ex</span><span class="si">}</span><span class="s2">"</span><span class="p">)</span>
170 | <span class="k">continue</span>
171 |
172 | <span class="c1"># search headers and content</span>
173 | <span class="n">search_targets</span> <span class="o">=</span> <span class="p">[(</span><span class="s2">"headers"</span><span class="p">,</span> <span class="n">headers</span><span class="p">),</span> <span class="p">(</span><span class="s2">"content"</span><span class="p">,</span> <span class="n">content</span><span class="p">)]</span>
174 |
175 | <span class="k">for</span> <span class="n">re_pattern</span> <span class="ow">in</span> <span class="n">re_patterns</span><span class="p">:</span>
176 | <span class="k">for</span> <span class="n">source_name</span><span class="p">,</span> <span class="n">search_text</span> <span class="ow">in</span> <span class="n">search_targets</span><span class="p">:</span>
177 | <span class="k">if</span> <span class="ow">not</span> <span class="n">search_text</span><span class="p">:</span>
178 | <span class="k">continue</span>
179 |
180 | <span class="k">for</span> <span class="n">match</span> <span class="ow">in</span> <span class="n">re_pattern</span><span class="o">.</span><span class="n">finditer</span><span class="p">(</span><span class="n">search_text</span><span class="p">):</span>
181 | <span class="n">regex_hit</span><span class="p">:</span> <span class="nb">dict</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="nb">str</span> <span class="o">|</span> <span class="nb">int</span><span class="p">]</span> <span class="o">=</span> <span class="p">{</span>
182 | <span class="s2">"selector"</span><span class="p">:</span> <span class="n">re_pattern</span><span class="o">.</span><span class="n">pattern</span><span class="p">,</span>
183 | <span class="s2">"value"</span><span class="p">:</span> <span class="n">match</span><span class="o">.</span><span class="n">group</span><span class="p">(</span><span class="mi">0</span><span class="p">),</span>
184 | <span class="s2">"source"</span><span class="p">:</span> <span class="n">source_name</span> <span class="c1"># headers or content</span>
185 | <span class="p">}</span>
186 |
187 | <span class="k">if</span> <span class="n">match</span><span class="o">.</span><span class="n">groups</span><span class="p">():</span>
188 | <span class="k">for</span> <span class="n">i</span><span class="p">,</span> <span class="n">group</span> <span class="ow">in</span> <span class="nb">enumerate</span><span class="p">(</span><span class="n">match</span><span class="o">.</span><span class="n">groups</span><span class="p">(),</span> <span class="mi">1</span><span class="p">):</span>
189 | <span class="k">if</span> <span class="n">group</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span><span class="p">:</span>
190 | <span class="n">regex_hit</span><span class="p">[</span><span class="sa">f</span><span class="s2">"group_</span><span class="si">{</span><span class="n">i</span><span class="si">}</span><span class="s2">"</span><span class="p">]</span> <span class="o">=</span> <span class="n">group</span>
191 |
192 | <span class="n">regex_hit</span><span class="p">[</span><span class="s2">"start"</span><span class="p">]</span> <span class="o">=</span> <span class="n">match</span><span class="o">.</span><span class="n">start</span><span class="p">()</span>
193 | <span class="n">regex_hit</span><span class="p">[</span><span class="s2">"end"</span><span class="p">]</span> <span class="o">=</span> <span class="n">match</span><span class="o">.</span><span class="n">end</span><span class="p">()</span>
194 | <span class="n">results</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="n">regex_hit</span><span class="p">)</span>
195 |
196 | <span class="k">return</span> <span class="n">results</span></div>
197 |
198 | </pre></div>
199 |
200 | </div>
201 | </div>
202 | <footer>
203 |
204 | <hr/>
205 |
206 | <div role="contentinfo">
207 | <p>© Copyright 2025, pragmar.</p>
208 | </div>
209 |
210 | Built with <a href="https://www.sphinx-doc.org/">Sphinx</a> using a
211 | <a href="https://github.com/readthedocs/sphinx_rtd_theme">theme</a>
212 | provided by <a href="https://readthedocs.org">Read the Docs</a>.
213 |
214 |
215 | </footer>
216 | </div>
217 | </div>
218 | </section>
219 | </div>
220 | <script>
221 | jQuery(function () {
222 | SphinxRtdTheme.Navigation.enable(true);
223 | });
224 | </script>
225 |
226 | </body>
227 | </html>
```
--------------------------------------------------------------------------------
/src/mcp_server_webcrawl/crawlers/archivebox/adapter.py:
--------------------------------------------------------------------------------
```python
1 | import json
2 | import os
3 | import sqlite3
4 |
5 | from contextlib import closing
6 | from datetime import datetime, timezone
7 | from pathlib import Path
8 |
9 | from mcp_server_webcrawl.crawlers.base.adapter import (
10 | BaseManager,
11 | IndexState,
12 | IndexStatus,
13 | SitesGroup,
14 | INDEXED_BATCH_SIZE,
15 | INDEXED_TYPE_MAPPING,
16 | INDEXED_IGNORE_DIRECTORIES,
17 | )
18 | from mcp_server_webcrawl.crawlers.base.indexed import IndexedManager
19 | from mcp_server_webcrawl.models.resources import (
20 | ResourceResult,
21 | ResourceResultType,
22 | RESOURCES_LIMIT_DEFAULT,
23 | )
24 | from mcp_server_webcrawl.models.sites import (
25 | SiteResult,
26 | SiteType,
27 | SITES_FIELDS_BASE,
28 | SITES_FIELDS_DEFAULT,
29 | )
30 | from mcp_server_webcrawl.utils.logger import get_logger
31 |
32 | # metadata directory names to skip (presumably while collecting page assets - TODO confirm at use site)
33 | ARCHIVEBOX_SKIP_DIRECTORIES: set[str] = {"media", "mercury"}
34 | ARCHIVEBOX_COLLAPSE_FILENAMES: list[str] = ["/index.html", "/index.htm"] # presumably index-file URL suffixes to collapse - TODO confirm at use site
35 |
36 | logger = get_logger()
37 |
38 | class ArchiveBoxManager(IndexedManager):
39 | """
40 | Manages ArchiveBox in-memory SQLite databases for session-level reuse.
41 | """
42 |
43 | def __init__(self) -> None:
44 | """
45 | Initialize the ArchiveBox manager by delegating to IndexedManager.__init__; no ArchiveBox-specific state is set here.
46 | """
47 | super().__init__()
48 |
49 | def _load_site_data(self, connection: sqlite3.Connection, site_directory: Path,
50 | site_id: int, index_state: IndexState | None = None) -> None:
51 | """
52 | Load ArchiveBox site data into the database: gathers page and asset resources per archive entry, dedupes them, then inserts in batches.
53 |
54 | Args:
55 | connection: SQLite connection
56 | site_directory: path to the ArchiveBox site directory (e.g., "example" or "pragmar")
57 | site_id: ID for the site
58 | index_state: optional IndexState for tracking progress and timeout (may be None)
59 | """
60 | # The site_directory should be something like "example" or "pragmar"
61 | # We need to look for the "archive" subdirectory within it
62 | archive_directory: Path = site_directory / "archive"
63 |
64 | if not archive_directory.exists() or not archive_directory.is_dir():
65 | logger.error(f"Archive directory not found in site: {archive_directory}")
66 | return
67 |
68 | if index_state is not None:
69 | index_state.set_status(IndexStatus.INDEXING)
70 |
71 | # page directories are timestamped (e.g. example/archive/1756357684.13023)
72 | # these contain page data/media
73 | page_directories = self._get_page_directories(archive_directory)
74 | if not page_directories:
75 | logger.warning(f"No timestamped entries found in archive: {archive_directory}")
76 | return # NOTE(review): index_state was set to INDEXING above and is not updated on this path - confirm callers handle it
77 |
78 | all_resources: list[ResourceResult] = []
79 |
80 | # process each timestamped entry
81 | for page_directory in page_directories:
82 |
83 | if index_state is not None and index_state.is_timeout():
84 | index_state.set_status(IndexStatus.PARTIAL)
85 | break
86 |
87 | try:
88 | metadata = self._get_page_metadata(page_directory)
89 | main_url: str = metadata["url"] if "url" in metadata else \
90 | f"archivebox://unknown/{page_directory.name}"
91 |
92 | # primary resource
93 | main_resource = self._create_page_resource(page_directory, site_id, main_url, metadata)
94 | if main_resource:
95 | all_resources.append(main_resource)
96 | if index_state is not None:
97 | index_state.increment_processed()
98 |
99 | # collect assets (external js/css/fonts/whatever)
100 | domain_assets = self._get_page_domain_assets(page_directory, main_url)
101 | for file_path, asset_url in domain_assets:
102 | asset_resource = self._create_asset_resource(file_path, site_id, asset_url, page_directory)
103 | if asset_resource:
104 | all_resources.append(asset_resource)
105 | if index_state is not None:
106 | index_state.increment_processed()
107 |
108 | except Exception as ex:
109 | logger.error(f"Error processing entry {page_directory}: {ex}") # best-effort: log and move on to the next entry
110 |
111 | deduplicated_resources = self._dedupe_resources(all_resources)
112 | with closing(connection.cursor()) as cursor:
113 | for i in range(0, len(deduplicated_resources), INDEXED_BATCH_SIZE):
114 | batch = deduplicated_resources[i:i+INDEXED_BATCH_SIZE]
115 | self._execute_batch_insert(connection, cursor, batch)
116 |
117 | if index_state is not None and index_state.status == IndexStatus.INDEXING: # a PARTIAL status set on timeout is left as-is
118 | index_state.set_status(IndexStatus.COMPLETE)
119 |
def _create_page_resource(self, resource_directory: Path, site_id: int,
        url: str, metadata: dict) -> ResourceResult | None:
    """
    Create ResourceResult for the main captured page.

    Args:
        resource_directory: timestamped ArchiveBox entry directory
        site_id: id of the site the page is filed under
        url: URL recorded for the page
        metadata: entry metadata; the "canonical" and "http_headers"
            keys are consulted when present

    Returns:
        ResourceResult of type PAGE, or None when anything raised
        (the error is logged)
    """
    try:

        # created/modified is directory stat
        resource_stat: os.stat_result = resource_directory.stat()
        created: datetime = datetime.fromtimestamp(resource_stat.st_ctime, tz=timezone.utc)
        modified: datetime = datetime.fromtimestamp(resource_stat.st_mtime, tz=timezone.utc)

        # select best content, with appropriate fallbacks
        html_file: Path | None = None
        canonical = metadata.get("canonical")
        # a null/non-dict "canonical" falls through to index.html instead of
        # raising and dropping the whole page
        if isinstance(canonical, dict):
            # dom first, wget second, ignore singlefile (datauris generate too much storage)
            resolved_root: Path = resource_directory.resolve()
            for path_key in ("dom_path", "wget_path"):
                relative_name = canonical.get(path_key)
                if relative_name is None:
                    continue
                candidate_file = resource_directory / relative_name
                # only accept candidates that stay inside the entry directory
                if candidate_file.resolve().is_relative_to(resolved_root) and candidate_file.exists():
                    html_file = candidate_file
                    break

        # fallback to ArchiveBox index file (metadata file - barely useful, but dependable)
        if html_file is None:
            html_file = resource_directory / "index.html"

        # read content
        content: str | None = None
        file_size: int = 0
        if html_file.exists():
            try:
                with open(html_file, "r", encoding="utf-8", errors="replace") as f:
                    content = f.read()
                file_size = html_file.stat().st_size
            except Exception as ex:
                logger.warning(f"Could not read HTML from {html_file}: {ex}")

        # assemble metadata
        status_code: int = 200
        headers_reconstructed: str = ""
        http_headers = metadata.get("http_headers")
        if isinstance(http_headers, dict):
            # "status" keeps precedence; "Status-Code" is the key that
            # _get_http_headers_string uses for the status line, so consult
            # it too and keep the stored status consistent with the headers
            for status_key in ("status", "Status-Code"):
                if status_key not in http_headers:
                    continue
                try:
                    status_code = int(str(http_headers[status_key]).split()[0])
                    break
                except (ValueError, IndexError):
                    # unparseable value, try the next key / keep the default
                    continue
            headers_reconstructed = self._get_http_headers_string(http_headers)

        if not headers_reconstructed:
            headers_reconstructed = BaseManager.get_basic_headers(
                file_size, ResourceResultType.PAGE)

        return ResourceResult(
            id=BaseManager.string_to_id(url),
            site=site_id,
            created=created,
            modified=modified,
            url=url,
            type=ResourceResultType.PAGE,
            status=status_code,
            headers=headers_reconstructed,
            content=content,
            size=file_size,
            time=0
        )

    except Exception as ex:
        logger.error(f"Error creating main resource for {resource_directory}: {ex}")
        return None
193 |
def _create_asset_resource(self, file_path: Path, site_id: int, url: str, entry_dir: Path) -> ResourceResult | None:
    """
    Build the ResourceResult for a single domain asset file.

    Args:
        file_path: location of the asset on disk
        site_id: id of the site the asset is filed under
        url: reconstructed URL for the asset
        entry_dir: entry directory (not read by this method)

    Returns:
        ResourceResult for the asset, or None when the file is missing
        or anything raised (the error is logged)
    """
    try:
        # nothing to index when the file is gone
        if not file_path.exists():
            return None

        stat_info = file_path.stat()
        created: datetime = datetime.fromtimestamp(stat_info.st_ctime, tz=timezone.utc)
        modified: datetime = datetime.fromtimestamp(stat_info.st_mtime, tz=timezone.utc)
        size_bytes: int = stat_info.st_size

        # ArchiveBox packs URL args into an "@..." tail which can land in
        # the name or in the extension, strip it from both
        stripped_url: str = url.partition("@")[0]
        stripped_suffix: str = file_path.suffix.lower().partition("@")[0]
        asset_type: str = INDEXED_TYPE_MAPPING.get(stripped_suffix, ResourceResultType.OTHER)

        # content is only read for types the base manager treats as text
        asset_content: str | None = BaseManager.read_file_contents(file_path, asset_type)

        return ResourceResult(
            id=BaseManager.string_to_id(stripped_url),
            site=site_id,
            created=created,
            modified=modified,
            url=stripped_url,
            type=asset_type,
            status=200, # assume assets successful
            headers=BaseManager.get_basic_headers(size_bytes, asset_type, file_path),
            content=asset_content,
            size=size_bytes,
            time=0
        )

    except Exception as ex:
        logger.error(f"Error creating asset resource for {file_path}: {ex}")
        return None
236 |
237 | def _get_page_directories(self, archive_directory: Path) -> list[Path]:
238 | """
239 | Get webpage directories within ArchiveBox archive.
240 |
241 | Args:
242 | archive_directory: path to the ArchiveBox archive directory
243 |
244 | Returns:
245 | List of timestamped entry directory paths
246 | """
247 |
248 | # page_directories are the timestamped directories,
249 | # e.g. archive/1756342555.086082
250 | page_directories = []
251 |
252 | if not archive_directory.is_dir():
253 | return page_directories
254 |
255 | for item in archive_directory.iterdir():
256 | # 1756342555.086082.replace(".", "") is numeric
257 | if (item.is_dir() and item.name.replace(".", "").isdigit()):
258 | data_files: list[Path] = [
259 | (item / "index.json"),
260 | (item / "headers.json"),
261 | (item / "index.html"),
262 | ]
263 | for data_file in data_files:
264 | if data_file.exists():
265 | page_directories.append(item)
266 | break
267 |
268 | return sorted(page_directories)
269 |
270 | def _get_page_metadata(self, entry_directory: Path) -> dict:
271 | """
272 | Extract metadata from ArchiveBox entry files.
273 |
274 | Args:
275 | entry_directory: path to the timestamped entry directory
276 |
277 | Returns:
278 | Dictionary containing extracted metadata
279 | """
280 | page_metadata: dict[str, str] = {}
281 |
282 | # read index.json for primary URL and metadata
283 | index_json_path: Path = entry_directory / "index.json"
284 | if index_json_path.exists():
285 | try:
286 | with open(index_json_path, "r", encoding="utf-8", errors="replace") as f:
287 | index_data = json.load(f)
288 | page_metadata.update(index_data)
289 | except (json.JSONDecodeError, UnicodeDecodeError) as ex:
290 | logger.warning(f"Could not parse index.json from {entry_directory}: {ex}")
291 | except Exception as ex:
292 | logger.error(f"Error reading index.json from {entry_directory}: {ex}")
293 |
294 | # read headers.json for HTTP headers
295 | headers_json_path = entry_directory / "headers.json"
296 | if headers_json_path.exists():
297 | try:
298 | with open(headers_json_path, "r", encoding="utf-8", errors="replace") as f:
299 | http_headers = json.load(f)
300 | page_metadata["http_headers"] = http_headers
301 | except (json.JSONDecodeError, UnicodeDecodeError) as ex:
302 | logger.warning(f"Could not parse headers.json from {entry_directory}: {ex}")
303 | except Exception as ex:
304 | logger.error(f"Error reading headers.json from {entry_directory}: {ex}")
305 |
306 | return page_metadata
307 |
308 | def _get_page_domain_assets(self, entry_dir: Path, main_url: str) -> list[tuple[Path, str]]:
309 | """
310 | Collect all domain asset files within an entry.
311 |
312 | Args:
313 | entry_dir: path to the timestamped entry
314 | main_url: the main captured URL
315 |
316 | Returns:
317 | List of (file_path, reconstructed_url) tuples
318 | """
319 | assets: list[tuple] = []
320 |
321 |
322 |
323 | for item in entry_dir.iterdir():
324 | if item.is_dir() and item.name not in ARCHIVEBOX_SKIP_DIRECTORIES:
325 | # this is an archivebox domain directory
326 | domain_name: str = item.name
327 |
328 | # walk domain directories for assets
329 | # (e.g. example/archive/1756357684.13023/example.com)
330 | for root, _, files in os.walk(item):
331 | for filename in files:
332 |
333 | # *orig$ are dupes, not reliably in fileext form
334 | if filename.endswith("orig"):
335 | continue
336 |
337 | file_path = Path(root) / filename
338 |
339 | # clean up ArchiveBox's @timestamp suffixes for URL construction
340 | clean_filename: str = filename.split("@")[0]
341 | clean_file_path: Path = Path(root) / clean_filename
342 | relative_path = clean_file_path.relative_to(item)
343 | url = f"https://{domain_name}/{str(relative_path).replace(os.sep, '/')}"
344 | for collapse_filename in ARCHIVEBOX_COLLAPSE_FILENAMES:
345 | # turn ./index.html and variants into ./ (dir index) to help the indexer
346 | if url.endswith(collapse_filename):
347 | url = url[:-(len(collapse_filename))] + "/"
348 | break
349 |
350 | # Use original file_path for reading, clean url for storage
351 | assets.append((file_path, url))
352 |
353 | return assets
354 |
def _dedupe_resources(self, resources: list[ResourceResult]) -> list[ResourceResult]:
    """
    Deduplicate resources by URL, preferring the most recently modified.

    The first resource seen for a URL is kept unless a later one has a
    strictly newer modified value (both values must be truthy to be
    compared). A replacement is moved to the end of the output, all other
    resources keep their first-seen order.

    Args:
        resources: list of ResourceResult objects

    Returns:
        Deduplicated list of ResourceResult objects
    """
    # dicts preserve insertion order, so one dict serves as both the
    # url lookup and the ordered output; this avoids rebuilding the whole
    # output list on every replacement
    kept: dict[str, ResourceResult] = {}
    resource: ResourceResult
    for resource in resources:
        if resource.url not in kept:
            kept[resource.url] = resource
            continue

        # url collision, prefer newer; otherwise keep existing
        existing = kept[resource.url]
        if resource.modified and existing.modified and resource.modified > existing.modified:
            # delete before re-inserting so the winner moves to the end
            del kept[resource.url]
            kept[resource.url] = resource

    return list(kept.values())
383 |
384 | def _get_http_headers_string(self, http_headers: dict) -> str:
385 | """
386 | Format headers dictionary as HTTP headers string.
387 | """
388 | if not http_headers:
389 | return ""
390 |
391 | headers_lines: list[str] = []
392 | status: int = http_headers.get("Status-Code", 200)
393 | headers_lines.append(f"HTTP/1.0 {status}")
394 |
395 | for key, value in http_headers.items():
396 | if key.lower() not in ["status-code"]:
397 | headers_lines.append(f"{key}: {value}")
398 |
399 | return "\r\n".join(headers_lines) + "\r\n\r\n"
400 |
401 |
# module-level ArchiveBoxManager instance; get_resources() hands its queries to it
manager: ArchiveBoxManager = ArchiveBoxManager()
403 |
def get_sites(
    datasrc: Path,
    ids: list[int] | None = None,
    fields: list[str] | None = None
) -> list[SiteResult]:
    """
    List ArchiveBox instances as separate sites.
    Each subdirectory of datasrc that contains an "archive" folder is treated as a separate ArchiveBox instance.

    Args:
        datasrc: path to the directory containing ArchiveBox instance directories
        ids: optional list of site IDs to filter by
        fields: optional list of fields to include in the response

    Returns:
        List of SiteResult objects, one for each ArchiveBox instance,
        ordered by site id; empty when datasrc is not a directory
    """
    assert datasrc is not None, f"datasrc not provided ({datasrc})"

    # is_dir() rather than exists(): a datasrc pointing at a regular file
    # would pass exists() and then raise from iterdir() below
    if not datasrc.is_dir():
        logger.error(f"Directory not found ({datasrc})")
        return []

    # determine which fields to include
    selected_fields: set[str] = set(SITES_FIELDS_BASE)
    if fields:
        valid_fields: set[str] = set(SITES_FIELDS_DEFAULT)
        selected_fields.update(f for f in fields if f in valid_fields)
    else:
        selected_fields.update(SITES_FIELDS_DEFAULT)

    # get all directories that contain an "archive" subdirectory
    site_directories: list[Path] = [
        datasrc_item for datasrc_item in datasrc.iterdir()
        if (
            datasrc_item.is_dir() and
            not datasrc_item.name.startswith(".") and
            datasrc_item.name not in INDEXED_IGNORE_DIRECTORIES and
            (datasrc_item / "archive").is_dir()
        )
    ]

    # map directory IDs to paths for filtering
    site_directories_map: dict[int, Path] = {BaseManager.string_to_id(d.name): d for d in site_directories}

    if ids:
        # set membership keeps the filter linear in the number of sites
        wanted_ids: set[int] = set(ids)
        site_directories_map = {
            id_val: path for id_val, path in site_directories_map.items()
            if id_val in wanted_ids
        }

    results: list[SiteResult] = []

    # process each ArchiveBox instance directory
    for site_id, site_directory in sorted(site_directories_map.items()):
        site_directory_stat = site_directory.stat()
        # NOTE(review): these are naive local-time datetimes, whereas the
        # resource builders in this file pass tz=timezone.utc; confirm intent
        created_time: datetime = datetime.fromtimestamp(site_directory_stat.st_ctime)
        modified_time: datetime = datetime.fromtimestamp(site_directory_stat.st_mtime)

        site = SiteResult(
            path=site_directory,
            id=site_id,
            name=site_directory.name,
            type=SiteType.CRAWLED_LIST,
            urls=[f"archivebox://{site_directory.name}/"],
            created=created_time if "created" in selected_fields else None,
            modified=modified_time if "modified" in selected_fields else None,
        )

        results.append(site)

    return results
473 |
def get_resources(
    datasrc: Path,
    sites: list[int] | None = None,
    query: str = "",
    fields: list[str] | None = None,
    sort: str | None = None,
    limit: int = RESOURCES_LIMIT_DEFAULT,
    offset: int = 0,
) -> tuple[list[ResourceResult], int, IndexState]:
    """
    Search resources across ArchiveBox instances.

    Resolves the requested sites with get_sites, wraps them in a SitesGroup
    and hands the query to the module-level manager.

    Args:
        datasrc: path to the directory containing ArchiveBox instance directories
        sites: optional list of site IDs to filter by
        query: search query string
        fields: optional list of fields to include in response
        sort: sort order for results
        limit: maximum number of results to return
        offset: number of results to skip for pagination

    Returns:
        Tuple of (list of ResourceResult objects, total count, IndexState)
    """
    matched_sites: list[SiteResult] = get_sites(datasrc=datasrc, ids=sites)
    assert matched_sites, "At least one site is required to search"

    # the group is keyed on the actual site directories (e.g., "example", "pragmar")
    group_paths = [site.path for site in matched_sites]
    # fall back to every discovered site id when no explicit ids were requested
    group_ids = sites if sites else [site.id for site in matched_sites]
    sites_group = SitesGroup(datasrc, group_ids, group_paths)

    return manager.get_resources_for_sites_group(sites_group, query, fields, sort, limit, offset)
506 |
```