#
tokens: 45641/50000 7/216 files (page 8/35)
lines: on (toggle) GitHub
raw markdown copy reset
This is page 8 of 35. Use http://codebase.md/pragmar/mcp_server_webcrawl?lines=true&page={x} to view the full context.

# Directory Structure

```
├── .gitignore
├── CONTRIBUTING.md
├── docs
│   ├── _images
│   │   ├── interactive.document.webp
│   │   ├── interactive.search.webp
│   │   └── mcpswc.svg
│   ├── _modules
│   │   ├── index.html
│   │   ├── mcp_server_webcrawl
│   │   │   ├── crawlers
│   │   │   │   ├── archivebox
│   │   │   │   │   ├── adapter.html
│   │   │   │   │   ├── crawler.html
│   │   │   │   │   └── tests.html
│   │   │   │   ├── base
│   │   │   │   │   ├── adapter.html
│   │   │   │   │   ├── api.html
│   │   │   │   │   ├── crawler.html
│   │   │   │   │   ├── indexed.html
│   │   │   │   │   └── tests.html
│   │   │   │   ├── httrack
│   │   │   │   │   ├── adapter.html
│   │   │   │   │   ├── crawler.html
│   │   │   │   │   └── tests.html
│   │   │   │   ├── interrobot
│   │   │   │   │   ├── adapter.html
│   │   │   │   │   ├── crawler.html
│   │   │   │   │   └── tests.html
│   │   │   │   ├── katana
│   │   │   │   │   ├── adapter.html
│   │   │   │   │   ├── crawler.html
│   │   │   │   │   └── tests.html
│   │   │   │   ├── siteone
│   │   │   │   │   ├── adapter.html
│   │   │   │   │   ├── crawler.html
│   │   │   │   │   └── tests.html
│   │   │   │   ├── warc
│   │   │   │   │   ├── adapter.html
│   │   │   │   │   ├── crawler.html
│   │   │   │   │   └── tests.html
│   │   │   │   └── wget
│   │   │   │       ├── adapter.html
│   │   │   │       ├── crawler.html
│   │   │   │       └── tests.html
│   │   │   ├── crawlers.html
│   │   │   ├── extras
│   │   │   │   ├── markdown.html
│   │   │   │   ├── regex.html
│   │   │   │   ├── snippets.html
│   │   │   │   ├── thumbnails.html
│   │   │   │   └── xpath.html
│   │   │   ├── interactive
│   │   │   │   ├── highlights.html
│   │   │   │   ├── search.html
│   │   │   │   ├── session.html
│   │   │   │   └── ui.html
│   │   │   ├── main.html
│   │   │   ├── models
│   │   │   │   ├── resources.html
│   │   │   │   └── sites.html
│   │   │   ├── templates
│   │   │   │   └── tests.html
│   │   │   ├── utils
│   │   │   │   ├── blobs.html
│   │   │   │   ├── cli.html
│   │   │   │   ├── logger.html
│   │   │   │   ├── querycache.html
│   │   │   │   ├── server.html
│   │   │   │   └── tools.html
│   │   │   └── utils.html
│   │   └── re.html
│   ├── _sources
│   │   ├── guides
│   │   │   ├── archivebox.rst.txt
│   │   │   ├── httrack.rst.txt
│   │   │   ├── interrobot.rst.txt
│   │   │   ├── katana.rst.txt
│   │   │   ├── siteone.rst.txt
│   │   │   ├── warc.rst.txt
│   │   │   └── wget.rst.txt
│   │   ├── guides.rst.txt
│   │   ├── index.rst.txt
│   │   ├── installation.rst.txt
│   │   ├── interactive.rst.txt
│   │   ├── mcp_server_webcrawl.crawlers.archivebox.rst.txt
│   │   ├── mcp_server_webcrawl.crawlers.base.rst.txt
│   │   ├── mcp_server_webcrawl.crawlers.httrack.rst.txt
│   │   ├── mcp_server_webcrawl.crawlers.interrobot.rst.txt
│   │   ├── mcp_server_webcrawl.crawlers.katana.rst.txt
│   │   ├── mcp_server_webcrawl.crawlers.rst.txt
│   │   ├── mcp_server_webcrawl.crawlers.siteone.rst.txt
│   │   ├── mcp_server_webcrawl.crawlers.warc.rst.txt
│   │   ├── mcp_server_webcrawl.crawlers.wget.rst.txt
│   │   ├── mcp_server_webcrawl.extras.rst.txt
│   │   ├── mcp_server_webcrawl.interactive.rst.txt
│   │   ├── mcp_server_webcrawl.models.rst.txt
│   │   ├── mcp_server_webcrawl.rst.txt
│   │   ├── mcp_server_webcrawl.templates.rst.txt
│   │   ├── mcp_server_webcrawl.utils.rst.txt
│   │   ├── modules.rst.txt
│   │   ├── prompts.rst.txt
│   │   └── usage.rst.txt
│   ├── _static
│   │   ├── _sphinx_javascript_frameworks_compat.js
│   │   ├── basic.css
│   │   ├── css
│   │   │   ├── badge_only.css
│   │   │   ├── fonts
│   │   │   │   ├── fontawesome-webfont.eot
│   │   │   │   ├── fontawesome-webfont.svg
│   │   │   │   ├── fontawesome-webfont.ttf
│   │   │   │   ├── fontawesome-webfont.woff
│   │   │   │   ├── fontawesome-webfont.woff2
│   │   │   │   ├── lato-bold-italic.woff
│   │   │   │   ├── lato-bold-italic.woff2
│   │   │   │   ├── lato-bold.woff
│   │   │   │   ├── lato-bold.woff2
│   │   │   │   ├── lato-normal-italic.woff
│   │   │   │   ├── lato-normal-italic.woff2
│   │   │   │   ├── lato-normal.woff
│   │   │   │   ├── lato-normal.woff2
│   │   │   │   ├── Roboto-Slab-Bold.woff
│   │   │   │   ├── Roboto-Slab-Bold.woff2
│   │   │   │   ├── Roboto-Slab-Regular.woff
│   │   │   │   └── Roboto-Slab-Regular.woff2
│   │   │   └── theme.css
│   │   ├── doctools.js
│   │   ├── documentation_options.js
│   │   ├── file.png
│   │   ├── fonts
│   │   │   ├── Lato
│   │   │   │   ├── lato-bold.eot
│   │   │   │   ├── lato-bold.ttf
│   │   │   │   ├── lato-bold.woff
│   │   │   │   ├── lato-bold.woff2
│   │   │   │   ├── lato-bolditalic.eot
│   │   │   │   ├── lato-bolditalic.ttf
│   │   │   │   ├── lato-bolditalic.woff
│   │   │   │   ├── lato-bolditalic.woff2
│   │   │   │   ├── lato-italic.eot
│   │   │   │   ├── lato-italic.ttf
│   │   │   │   ├── lato-italic.woff
│   │   │   │   ├── lato-italic.woff2
│   │   │   │   ├── lato-regular.eot
│   │   │   │   ├── lato-regular.ttf
│   │   │   │   ├── lato-regular.woff
│   │   │   │   └── lato-regular.woff2
│   │   │   └── RobotoSlab
│   │   │       ├── roboto-slab-v7-bold.eot
│   │   │       ├── roboto-slab-v7-bold.ttf
│   │   │       ├── roboto-slab-v7-bold.woff
│   │   │       ├── roboto-slab-v7-bold.woff2
│   │   │       ├── roboto-slab-v7-regular.eot
│   │   │       ├── roboto-slab-v7-regular.ttf
│   │   │       ├── roboto-slab-v7-regular.woff
│   │   │       └── roboto-slab-v7-regular.woff2
│   │   ├── images
│   │   │   ├── interactive.document.png
│   │   │   ├── interactive.document.webp
│   │   │   ├── interactive.search.png
│   │   │   ├── interactive.search.webp
│   │   │   └── mcpswc.svg
│   │   ├── jquery.js
│   │   ├── js
│   │   │   ├── badge_only.js
│   │   │   ├── theme.js
│   │   │   └── versions.js
│   │   ├── language_data.js
│   │   ├── minus.png
│   │   ├── plus.png
│   │   ├── pygments.css
│   │   ├── searchtools.js
│   │   └── sphinx_highlight.js
│   ├── .buildinfo
│   ├── .nojekyll
│   ├── genindex.html
│   ├── guides
│   │   ├── archivebox.html
│   │   ├── httrack.html
│   │   ├── interrobot.html
│   │   ├── katana.html
│   │   ├── siteone.html
│   │   ├── warc.html
│   │   └── wget.html
│   ├── guides.html
│   ├── index.html
│   ├── installation.html
│   ├── interactive.html
│   ├── mcp_server_webcrawl.crawlers.archivebox.html
│   ├── mcp_server_webcrawl.crawlers.base.html
│   ├── mcp_server_webcrawl.crawlers.html
│   ├── mcp_server_webcrawl.crawlers.httrack.html
│   ├── mcp_server_webcrawl.crawlers.interrobot.html
│   ├── mcp_server_webcrawl.crawlers.katana.html
│   ├── mcp_server_webcrawl.crawlers.siteone.html
│   ├── mcp_server_webcrawl.crawlers.warc.html
│   ├── mcp_server_webcrawl.crawlers.wget.html
│   ├── mcp_server_webcrawl.extras.html
│   ├── mcp_server_webcrawl.html
│   ├── mcp_server_webcrawl.interactive.html
│   ├── mcp_server_webcrawl.models.html
│   ├── mcp_server_webcrawl.templates.html
│   ├── mcp_server_webcrawl.utils.html
│   ├── modules.html
│   ├── objects.inv
│   ├── prompts.html
│   ├── py-modindex.html
│   ├── search.html
│   ├── searchindex.js
│   └── usage.html
├── LICENSE
├── MANIFEST.in
├── prompts
│   ├── audit404.md
│   ├── auditfiles.md
│   ├── auditperf.md
│   ├── auditseo.md
│   ├── gopher.md
│   ├── README.md
│   └── testsearch.md
├── pyproject.toml
├── README.md
├── setup.py
├── sphinx
│   ├── _static
│   │   └── images
│   │       ├── interactive.document.png
│   │       ├── interactive.document.webp
│   │       ├── interactive.search.png
│   │       ├── interactive.search.webp
│   │       └── mcpswc.svg
│   ├── _templates
│   │   └── layout.html
│   ├── conf.py
│   ├── guides
│   │   ├── archivebox.rst
│   │   ├── httrack.rst
│   │   ├── interrobot.rst
│   │   ├── katana.rst
│   │   ├── siteone.rst
│   │   ├── warc.rst
│   │   └── wget.rst
│   ├── guides.rst
│   ├── index.rst
│   ├── installation.rst
│   ├── interactive.rst
│   ├── make.bat
│   ├── Makefile
│   ├── mcp_server_webcrawl.crawlers.archivebox.rst
│   ├── mcp_server_webcrawl.crawlers.base.rst
│   ├── mcp_server_webcrawl.crawlers.httrack.rst
│   ├── mcp_server_webcrawl.crawlers.interrobot.rst
│   ├── mcp_server_webcrawl.crawlers.katana.rst
│   ├── mcp_server_webcrawl.crawlers.rst
│   ├── mcp_server_webcrawl.crawlers.siteone.rst
│   ├── mcp_server_webcrawl.crawlers.warc.rst
│   ├── mcp_server_webcrawl.crawlers.wget.rst
│   ├── mcp_server_webcrawl.extras.rst
│   ├── mcp_server_webcrawl.interactive.rst
│   ├── mcp_server_webcrawl.models.rst
│   ├── mcp_server_webcrawl.rst
│   ├── mcp_server_webcrawl.templates.rst
│   ├── mcp_server_webcrawl.utils.rst
│   ├── modules.rst
│   ├── prompts.rst
│   ├── readme.txt
│   └── usage.rst
└── src
    └── mcp_server_webcrawl
        ├── __init__.py
        ├── crawlers
        │   ├── __init__.py
        │   ├── archivebox
        │   │   ├── __init__.py
        │   │   ├── adapter.py
        │   │   ├── crawler.py
        │   │   └── tests.py
        │   ├── base
        │   │   ├── __init__.py
        │   │   ├── adapter.py
        │   │   ├── api.py
        │   │   ├── crawler.py
        │   │   ├── indexed.py
        │   │   └── tests.py
        │   ├── httrack
        │   │   ├── __init__.py
        │   │   ├── adapter.py
        │   │   ├── crawler.py
        │   │   └── tests.py
        │   ├── interrobot
        │   │   ├── __init__.py
        │   │   ├── adapter.py
        │   │   ├── crawler.py
        │   │   └── tests.py
        │   ├── katana
        │   │   ├── __init__.py
        │   │   ├── adapter.py
        │   │   ├── crawler.py
        │   │   └── tests.py
        │   ├── siteone
        │   │   ├── __init__.py
        │   │   ├── adapter.py
        │   │   ├── crawler.py
        │   │   └── tests.py
        │   ├── warc
        │   │   ├── __init__.py
        │   │   ├── adapter.py
        │   │   ├── crawler.py
        │   │   └── tests.py
        │   └── wget
        │       ├── __init__.py
        │       ├── adapter.py
        │       ├── crawler.py
        │       └── tests.py
        ├── extras
        │   ├── __init__.py
        │   ├── markdown.py
        │   ├── regex.py
        │   ├── snippets.py
        │   ├── thumbnails.py
        │   └── xpath.py
        ├── interactive
        │   ├── __init__.py
        │   ├── highlights.py
        │   ├── search.py
        │   ├── session.py
        │   ├── ui.py
        │   └── views
        │       ├── base.py
        │       ├── document.py
        │       ├── help.py
        │       ├── requirements.py
        │       ├── results.py
        │       └── searchform.py
        ├── main.py
        ├── models
        │   ├── __init__.py
        │   ├── base.py
        │   ├── resources.py
        │   └── sites.py
        ├── settings.py
        ├── templates
        │   ├── __init__.py
        │   ├── markdown.xslt
        │   ├── tests_core.html
        │   └── tests.py
        └── utils
            ├── __init__.py
            ├── cli.py
            ├── logger.py
            ├── parser.py
            ├── parsetab.py
            ├── search.py
            ├── server.py
            ├── tests.py
            └── tools.py
```

# Files

--------------------------------------------------------------------------------
/docs/_modules/mcp_server_webcrawl/crawlers/katana/tests.html:
--------------------------------------------------------------------------------

```html
  1 | 
  2 | 
  3 | <!DOCTYPE html>
  4 | <html class="writer-html5" lang="en" data-content_root="../../../../">
  5 | <head>
  6 |   <meta charset="utf-8" />
  7 |   <meta name="viewport" content="width=device-width, initial-scale=1.0" />
  8 |   <title>mcp_server_webcrawl.crawlers.katana.tests &mdash; mcp-server-webcrawl  documentation</title>
  9 |       <link rel="stylesheet" type="text/css" href="../../../../_static/pygments.css?v=80d5e7a1" />
 10 |       <link rel="stylesheet" type="text/css" href="../../../../_static/css/theme.css?v=e59714d7" />
 11 | 
 12 |   
 13 |       <script src="../../../../_static/jquery.js?v=5d32c60e"></script>
 14 |       <script src="../../../../_static/_sphinx_javascript_frameworks_compat.js?v=2cd50e6c"></script>
 15 |       <script src="../../../../_static/documentation_options.js?v=5929fcd5"></script>
 16 |       <script src="../../../../_static/doctools.js?v=888ff710"></script>
 17 |       <script src="../../../../_static/sphinx_highlight.js?v=dc90522c"></script>
 18 |     <script src="../../../../_static/js/theme.js"></script>
 19 |     <link rel="index" title="Index" href="../../../../genindex.html" />
 20 |     <link rel="search" title="Search" href="../../../../search.html" /> 
 21 | </head>
 22 | 
 23 | <body class="wy-body-for-nav"> 
 24 |   <div class="wy-grid-for-nav">
 25 |     <nav data-toggle="wy-nav-shift" class="wy-nav-side">
 26 |       <div class="wy-side-scroll">
 27 |         <div class="wy-side-nav-search" >
 28 | 
 29 |           
 30 |           
 31 |           <a href="../../../../index.html" class="icon icon-home">
 32 |             mcp-server-webcrawl
 33 |           </a>
 34 | <div role="search">
 35 |   <form id="rtd-search-form" class="wy-form" action="../../../../search.html" method="get">
 36 |     <input type="text" name="q" placeholder="Search docs" aria-label="Search docs" />
 37 |     <input type="hidden" name="check_keywords" value="yes" />
 38 |     <input type="hidden" name="area" value="default" />
 39 |   </form>
 40 | </div>
 41 |         </div><div class="wy-menu wy-menu-vertical" data-spy="affix" role="navigation" aria-label="Navigation menu">
 42 |               <p class="caption" role="heading"><span class="caption-text">Contents:</span></p>
 43 | <ul>
 44 | <li class="toctree-l1"><a class="reference internal" href="../../../../installation.html">Installation</a></li>
 45 | <li class="toctree-l1"><a class="reference internal" href="../../../../guides.html">Setup Guides</a></li>
 46 | <li class="toctree-l1"><a class="reference internal" href="../../../../usage.html">Usage</a></li>
 47 | <li class="toctree-l1"><a class="reference internal" href="../../../../prompts.html">Prompt Routines</a></li>
 48 | <li class="toctree-l1"><a class="reference internal" href="../../../../modules.html">mcp_server_webcrawl</a></li>
 49 | </ul>
 50 | 
 51 |         </div>
 52 |       </div>
 53 |     </nav>
 54 | 
 55 |     <section data-toggle="wy-nav-shift" class="wy-nav-content-wrap"><nav class="wy-nav-top" aria-label="Mobile navigation menu" >
 56 |           <i data-toggle="wy-nav-top" class="fa fa-bars"></i>
 57 |           <a href="../../../../index.html">mcp-server-webcrawl</a>
 58 |       </nav>
 59 | 
 60 |       <div class="wy-nav-content">
 61 |         <div class="rst-content">
 62 |           <div role="navigation" aria-label="Page navigation">
 63 |   <ul class="wy-breadcrumbs">
 64 |       <li><a href="../../../../index.html" class="icon icon-home" aria-label="Home"></a></li>
 65 |           <li class="breadcrumb-item"><a href="../../../index.html">Module code</a></li>
 66 |           <li class="breadcrumb-item"><a href="../../crawlers.html">mcp_server_webcrawl.crawlers</a></li>
 67 |       <li class="breadcrumb-item active">mcp_server_webcrawl.crawlers.katana.tests</li>
 68 |       <li class="wy-breadcrumbs-aside">
 69 |       </li>
 70 |   </ul>
 71 |   <hr/>
 72 | </div>
 73 |           <div role="main" class="document" itemscope="itemscope" itemtype="http://schema.org/Article">
 74 |            <div itemprop="articleBody">
 75 |              
 76 |   <h1>Source code for mcp_server_webcrawl.crawlers.katana.tests</h1><div class="highlight"><pre>
 77 | <span></span><span class="kn">from</span> <span class="nn">logging</span> <span class="kn">import</span> <span class="n">Logger</span>
 78 | <span class="kn">from</span> <span class="nn">mcp_server_webcrawl.crawlers.katana.crawler</span> <span class="kn">import</span> <span class="n">KatanaCrawler</span>
 79 | <span class="kn">from</span> <span class="nn">mcp_server_webcrawl.crawlers.katana.adapter</span> <span class="kn">import</span> <span class="n">KatanaManager</span>
 80 | <span class="kn">from</span> <span class="nn">mcp_server_webcrawl.crawlers.base.adapter</span> <span class="kn">import</span> <span class="n">SitesGroup</span>
 81 | <span class="kn">from</span> <span class="nn">mcp_server_webcrawl.crawlers.base.tests</span> <span class="kn">import</span> <span class="n">BaseCrawlerTests</span>
 82 | <span class="kn">from</span> <span class="nn">mcp_server_webcrawl.crawlers</span> <span class="kn">import</span> <span class="n">get_fixture_directory</span>
 83 | <span class="kn">from</span> <span class="nn">mcp_server_webcrawl.utils.logger</span> <span class="kn">import</span> <span class="n">get_logger</span>
 84 | 
 85 | <span class="c1"># calculate ids for test directories using the same hash function as adapter</span>
 86 | <span class="n">EXAMPLE_SITE_ID</span> <span class="o">=</span> <span class="n">KatanaManager</span><span class="o">.</span><span class="n">string_to_id</span><span class="p">(</span><span class="s2">&quot;example.com&quot;</span><span class="p">)</span>
 87 | <span class="n">PRAGMAR_SITE_ID</span> <span class="o">=</span> <span class="n">KatanaManager</span><span class="o">.</span><span class="n">string_to_id</span><span class="p">(</span><span class="s2">&quot;pragmar.com&quot;</span><span class="p">)</span>
 88 | 
 89 | <span class="n">logger</span><span class="p">:</span> <span class="n">Logger</span> <span class="o">=</span> <span class="n">get_logger</span><span class="p">()</span>
 90 | 
 91 | <div class="viewcode-block" id="KatanaTests">
 92 | <a class="viewcode-back" href="../../../../mcp_server_webcrawl.crawlers.katana.html#mcp_server_webcrawl.crawlers.katana.tests.KatanaTests">[docs]</a>
 93 | <span class="k">class</span> <span class="nc">KatanaTests</span><span class="p">(</span><span class="n">BaseCrawlerTests</span><span class="p">):</span>
 94 | <span class="w">    </span><span class="sd">&quot;&quot;&quot;</span>
 95 | <span class="sd">    test suite for the HTTP text crawler implementation.</span>
 96 | <span class="sd">    tests parsing and retrieval of web content from HTTP text files.</span>
 97 | <span class="sd">    &quot;&quot;&quot;</span>
 98 | 
 99 | <div class="viewcode-block" id="KatanaTests.setUp">
100 | <a class="viewcode-back" href="../../../../mcp_server_webcrawl.crawlers.katana.html#mcp_server_webcrawl.crawlers.katana.tests.KatanaTests.setUp">[docs]</a>
101 |     <span class="k">def</span> <span class="nf">setUp</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
102 | <span class="w">        </span><span class="sd">&quot;&quot;&quot;</span>
103 | <span class="sd">        set up the test environment with fixture data.</span>
104 | <span class="sd">        &quot;&quot;&quot;</span>
105 |         <span class="nb">super</span><span class="p">()</span><span class="o">.</span><span class="n">setUp</span><span class="p">()</span>
106 |         <span class="bp">self</span><span class="o">.</span><span class="n">_datasrc</span> <span class="o">=</span> <span class="n">get_fixture_directory</span><span class="p">()</span> <span class="o">/</span> <span class="s2">&quot;katana&quot;</span></div>
107 | 
108 | 
109 | <div class="viewcode-block" id="KatanaTests.test_katana_pulse">
110 | <a class="viewcode-back" href="../../../../mcp_server_webcrawl.crawlers.katana.html#mcp_server_webcrawl.crawlers.katana.tests.KatanaTests.test_katana_pulse">[docs]</a>
111 |     <span class="k">def</span> <span class="nf">test_katana_pulse</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
112 | <span class="w">        </span><span class="sd">&quot;&quot;&quot;</span>
113 | <span class="sd">        basic crawler initialization.</span>
114 | <span class="sd">        &quot;&quot;&quot;</span>
115 |         <span class="n">crawler</span> <span class="o">=</span> <span class="n">KatanaCrawler</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_datasrc</span><span class="p">)</span>
116 |         <span class="bp">self</span><span class="o">.</span><span class="n">assertIsNotNone</span><span class="p">(</span><span class="n">crawler</span><span class="p">)</span>
117 |         <span class="bp">self</span><span class="o">.</span><span class="n">assertTrue</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_datasrc</span><span class="o">.</span><span class="n">is_dir</span><span class="p">())</span></div>
118 | 
119 | 
120 | <div class="viewcode-block" id="KatanaTests.test_katana_sites">
121 | <a class="viewcode-back" href="../../../../mcp_server_webcrawl.crawlers.katana.html#mcp_server_webcrawl.crawlers.katana.tests.KatanaTests.test_katana_sites">[docs]</a>
122 |     <span class="k">def</span> <span class="nf">test_katana_sites</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
123 | <span class="w">        </span><span class="sd">&quot;&quot;&quot;</span>
124 | <span class="sd">        site retrieval API functionality.</span>
125 | <span class="sd">        &quot;&quot;&quot;</span>
126 |         <span class="n">crawler</span> <span class="o">=</span> <span class="n">KatanaCrawler</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_datasrc</span><span class="p">)</span>
127 |         <span class="bp">self</span><span class="o">.</span><span class="n">run_pragmar_site_tests</span><span class="p">(</span><span class="n">crawler</span><span class="p">,</span> <span class="n">PRAGMAR_SITE_ID</span><span class="p">)</span></div>
128 | 
129 | 
130 | <div class="viewcode-block" id="KatanaTests.test_katana_search">
131 | <a class="viewcode-back" href="../../../../mcp_server_webcrawl.crawlers.katana.html#mcp_server_webcrawl.crawlers.katana.tests.KatanaTests.test_katana_search">[docs]</a>
132 |     <span class="k">def</span> <span class="nf">test_katana_search</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
133 | <span class="w">        </span><span class="sd">&quot;&quot;&quot;</span>
134 | <span class="sd">        boolean search tests</span>
135 | <span class="sd">        &quot;&quot;&quot;</span>
136 |         <span class="n">crawler</span> <span class="o">=</span> <span class="n">KatanaCrawler</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_datasrc</span><span class="p">)</span>
137 |         <span class="bp">self</span><span class="o">.</span><span class="n">run_pragmar_search_tests</span><span class="p">(</span><span class="n">crawler</span><span class="p">,</span> <span class="n">PRAGMAR_SITE_ID</span><span class="p">)</span></div>
138 | 
139 | 
140 | <div class="viewcode-block" id="KatanaTests.test_pragmar_tokenizer">
141 | <a class="viewcode-back" href="../../../../mcp_server_webcrawl.crawlers.katana.html#mcp_server_webcrawl.crawlers.katana.tests.KatanaTests.test_pragmar_tokenizer">[docs]</a>
142 |     <span class="k">def</span> <span class="nf">test_pragmar_tokenizer</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
143 | <span class="w">        </span><span class="sd">&quot;&quot;&quot;</span>
144 | <span class="sd">        tokenizer search tests</span>
145 | <span class="sd">        &quot;&quot;&quot;</span>
146 |         <span class="n">crawler</span> <span class="o">=</span> <span class="n">KatanaCrawler</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_datasrc</span><span class="p">)</span>
147 |         <span class="bp">self</span><span class="o">.</span><span class="n">run_pragmar_tokenizer_tests</span><span class="p">(</span><span class="n">crawler</span><span class="p">,</span> <span class="n">PRAGMAR_SITE_ID</span><span class="p">)</span></div>
148 | 
149 | 
150 | 
151 | <div class="viewcode-block" id="KatanaTests.test_katana_resources">
152 | <a class="viewcode-back" href="../../../../mcp_server_webcrawl.crawlers.katana.html#mcp_server_webcrawl.crawlers.katana.tests.KatanaTests.test_katana_resources">[docs]</a>
153 |     <span class="k">def</span> <span class="nf">test_katana_resources</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
154 | <span class="w">        </span><span class="sd">&quot;&quot;&quot;</span>
155 | <span class="sd">        resource retrieval API functionality with various parameters.</span>
156 | <span class="sd">        &quot;&quot;&quot;</span>
157 |         <span class="n">crawler</span> <span class="o">=</span> <span class="n">KatanaCrawler</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_datasrc</span><span class="p">)</span>
158 |         <span class="bp">self</span><span class="o">.</span><span class="n">run_sites_resources_tests</span><span class="p">(</span><span class="n">crawler</span><span class="p">,</span> <span class="n">PRAGMAR_SITE_ID</span><span class="p">,</span> <span class="n">EXAMPLE_SITE_ID</span><span class="p">)</span></div>
159 | 
160 | 
161 | <div class="viewcode-block" id="KatanaTests.test_interrobot_images">
162 | <a class="viewcode-back" href="../../../../mcp_server_webcrawl.crawlers.katana.html#mcp_server_webcrawl.crawlers.katana.tests.KatanaTests.test_interrobot_images">[docs]</a>
163 |     <span class="k">def</span> <span class="nf">test_interrobot_images</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
164 | <span class="w">        </span><span class="sd">&quot;&quot;&quot;</span>
165 | <span class="sd">        Test image handling and thumbnails for the Katana crawler.</span>
166 | <span class="sd">        &quot;&quot;&quot;</span>
167 |         <span class="n">crawler</span> <span class="o">=</span> <span class="n">KatanaCrawler</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_datasrc</span><span class="p">)</span>
168 |         <span class="bp">self</span><span class="o">.</span><span class="n">run_pragmar_image_tests</span><span class="p">(</span><span class="n">crawler</span><span class="p">,</span> <span class="n">PRAGMAR_SITE_ID</span><span class="p">)</span></div>
169 | 
170 | 
171 | <div class="viewcode-block" id="KatanaTests.test_katana_sorts">
172 | <a class="viewcode-back" href="../../../../mcp_server_webcrawl.crawlers.katana.html#mcp_server_webcrawl.crawlers.katana.tests.KatanaTests.test_katana_sorts">[docs]</a>
173 |     <span class="k">def</span> <span class="nf">test_katana_sorts</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
174 | <span class="w">        </span><span class="sd">&quot;&quot;&quot;</span>
175 | <span class="sd">        random sort functionality using the &#39;?&#39; sort parameter.</span>
176 | <span class="sd">        &quot;&quot;&quot;</span>
177 |         <span class="n">crawler</span> <span class="o">=</span> <span class="n">KatanaCrawler</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_datasrc</span><span class="p">)</span>
178 |         <span class="bp">self</span><span class="o">.</span><span class="n">run_pragmar_sort_tests</span><span class="p">(</span><span class="n">crawler</span><span class="p">,</span> <span class="n">PRAGMAR_SITE_ID</span><span class="p">)</span></div>
179 | 
180 | 
181 | <div class="viewcode-block" id="KatanaTests.test_katana_content_parsing">
182 | <a class="viewcode-back" href="../../../../mcp_server_webcrawl.crawlers.katana.html#mcp_server_webcrawl.crawlers.katana.tests.KatanaTests.test_katana_content_parsing">[docs]</a>
183 |     <span class="k">def</span> <span class="nf">test_katana_content_parsing</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
184 | <span class="w">        </span><span class="sd">&quot;&quot;&quot;</span>
185 | <span class="sd">        content type detection and parsing for HTTP text files.</span>
186 | <span class="sd">        &quot;&quot;&quot;</span>
187 |         <span class="n">crawler</span> <span class="o">=</span> <span class="n">KatanaCrawler</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_datasrc</span><span class="p">)</span>
188 |         <span class="bp">self</span><span class="o">.</span><span class="n">run_pragmar_content_tests</span><span class="p">(</span><span class="n">crawler</span><span class="p">,</span> <span class="n">PRAGMAR_SITE_ID</span><span class="p">,</span> <span class="kc">False</span><span class="p">)</span></div>
189 | 
190 | 
191 | <div class="viewcode-block" id="KatanaTests.test_report">
192 | <a class="viewcode-back" href="../../../../mcp_server_webcrawl.crawlers.katana.html#mcp_server_webcrawl.crawlers.katana.tests.KatanaTests.test_report">[docs]</a>
193 |     <span class="k">def</span> <span class="nf">test_report</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
194 | <span class="w">        </span><span class="sd">&quot;&quot;&quot;</span>
195 | <span class="sd">        Run test report, save to data directory.</span>
196 | <span class="sd">        &quot;&quot;&quot;</span>
197 |         <span class="n">crawler</span> <span class="o">=</span> <span class="n">KatanaCrawler</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_datasrc</span><span class="p">)</span>
198 |         <span class="n">logger</span><span class="o">.</span><span class="n">info</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">run_pragmar_report</span><span class="p">(</span><span class="n">crawler</span><span class="p">,</span> <span class="n">PRAGMAR_SITE_ID</span><span class="p">,</span> <span class="s2">&quot;Katana&quot;</span><span class="p">))</span></div>
199 | </div>
200 | 
201 | </pre></div>
202 | 
203 |            </div>
204 |           </div>
205 |           <footer>
206 | 
207 |   <hr/>
208 | 
209 |   <div role="contentinfo">
210 |     <p>&#169; Copyright 2025, pragmar.</p>
211 |   </div>
212 | 
213 |   Built with <a href="https://www.sphinx-doc.org/">Sphinx</a> using a
214 |     <a href="https://github.com/readthedocs/sphinx_rtd_theme">theme</a>
215 |     provided by <a href="https://readthedocs.org">Read the Docs</a>.
216 |    
217 | 
218 | </footer>
219 |         </div>
220 |       </div>
221 |     </section>
222 |   </div>
223 |   <script>
224 |       jQuery(function () {
225 |           SphinxRtdTheme.Navigation.enable(true);
226 |       });
227 |   </script> 
228 | 
229 | </body>
230 | </html>
```

--------------------------------------------------------------------------------
/docs/_static/searchtools.js:
--------------------------------------------------------------------------------

```javascript
  1 | /*
  2 |  * searchtools.js
  3 |  * ~~~~~~~~~~~~~~~~
  4 |  *
  5 |  * Sphinx JavaScript utilities for the full-text search.
  6 |  *
  7 |  * :copyright: Copyright 2007-2023 by the Sphinx team, see AUTHORS.
  8 |  * :license: BSD, see LICENSE for details.
  9 |  *
 10 |  */
 11 | "use strict";
 12 | 
 13 | /**
 14 |  * Simple result scoring code: default score weights, defined only when no Scorer already exists.
 15 |  */
 16 | if (typeof Scorer === "undefined") {
 17 |   var Scorer = {
 18 |     // Implement the following function to further tweak the score for each result
 19 |     // The function takes a result array [docname, title, anchor, descr, score, filename]
 20 |     // and returns the new score.
 21 |     /*
 22 |     score: result => {
 23 |       const [docname, title, anchor, descr, score, filename] = result
 24 |       return score
 25 |     },
 26 |     */
 27 | 
 28 |     // query matches the full name of an object
 29 |     objNameMatch: 11,
 30 |     // or matches in the last dotted part of the object name
 31 |     objPartialMatch: 6,
 32 |     // Additive scores depending on the priority of the object
 33 |     objPrio: {
 34 |       0: 15, // used to be importantResults
 35 |       1: 5, // used to be objectResults
 36 |       2: -5, // used to be unimportantResults
 37 |     },
 38 |     //  Used when the priority is not in the mapping.
 39 |     objPrioDefault: 0,
 40 | 
 41 |     // query found in title
 42 |     title: 15,
 43 |     partialTitle: 7,
 44 |     // query found in terms
 45 |     term: 5,
 46 |     partialTerm: 2,
 47 |   };
 48 | }
 49 | 
 50 | const _removeChildren = (element) => {
 51 |   while (element && element.lastChild) element.removeChild(element.lastChild);
 52 | };
 53 | 
 54 | /**
 55 |  * See https://developer.mozilla.org/en-US/docs/Web/JavaScript/Guide/Regular_Expressions#escaping
 56 |  */
 57 | const _escapeRegExp = (string) =>
 58 |   string.replace(/[.*+\-?^${}()|[\]\\]/g, "\\$&"); // $& means the whole matched string
 59 | 
 60 | const _displayItem = (item, searchTerms, highlightTerms) => {
 61 |   const docBuilder = DOCUMENTATION_OPTIONS.BUILDER;
 62 |   const docFileSuffix = DOCUMENTATION_OPTIONS.FILE_SUFFIX;
 63 |   const docLinkSuffix = DOCUMENTATION_OPTIONS.LINK_SUFFIX;
 64 |   const showSearchSummary = DOCUMENTATION_OPTIONS.SHOW_SEARCH_SUMMARY;
 65 |   const contentRoot = document.documentElement.dataset.content_root;
 66 | 
 67 |   const [docName, title, anchor, descr, score, _filename] = item;
 68 | 
 69 |   let listItem = document.createElement("li");
 70 |   let requestUrl;
 71 |   let linkUrl;
 72 |   if (docBuilder === "dirhtml") {
 73 |     // dirhtml builder
 74 |     let dirname = docName + "/";
 75 |     if (dirname.match(/\/index\/$/))
 76 |       dirname = dirname.substring(0, dirname.length - 6);
 77 |     else if (dirname === "index/") dirname = "";
 78 |     requestUrl = contentRoot + dirname;
 79 |     linkUrl = requestUrl;
 80 |   } else {
 81 |     // normal html builders
 82 |     requestUrl = contentRoot + docName + docFileSuffix;
 83 |     linkUrl = docName + docLinkSuffix;
 84 |   }
 85 |   let linkEl = listItem.appendChild(document.createElement("a"));
 86 |   linkEl.href = linkUrl + anchor;
 87 |   linkEl.dataset.score = score;
 88 |   linkEl.innerHTML = title;
 89 |   if (descr) {
 90 |     listItem.appendChild(document.createElement("span")).innerHTML =
 91 |       " (" + descr + ")";
 92 |     // highlight search terms in the description
 93 |     if (SPHINX_HIGHLIGHT_ENABLED)  // set in sphinx_highlight.js
 94 |       highlightTerms.forEach((term) => _highlightText(listItem, term, "highlighted"));
 95 |   }
 96 |   else if (showSearchSummary)
 97 |     fetch(requestUrl)
 98 |       .then((responseData) => responseData.text())
 99 |       .then((data) => {
100 |         if (data)
101 |           listItem.appendChild(
102 |             Search.makeSearchSummary(data, searchTerms)
103 |           );
104 |         // highlight search terms in the summary
105 |         if (SPHINX_HIGHLIGHT_ENABLED)  // set in sphinx_highlight.js
106 |           highlightTerms.forEach((term) => _highlightText(listItem, term, "highlighted"));
107 |       });
108 |   Search.output.appendChild(listItem);
109 | };
/**
 * Stop the progress pulse and show the final title and status line
 * for a search that produced `resultCount` results.
 */
const _finishSearch = (resultCount) => {
  Search.stopPulse();
  Search.title.innerText = _("Search Results");

  let statusMessage;
  if (resultCount) {
    statusMessage = _(
      `Search finished, found ${resultCount} page(s) matching the search query.`
    );
  } else {
    statusMessage = Documentation.gettext(
      "Your search did not match any documents. Please make sure that all words are spelled correctly and that you've selected enough categories."
    );
  }
  Search.status.innerText = statusMessage;
};
/**
 * Display the pending results one at a time, yielding to the event loop
 * between items, then finish the search once the list is empty.
 *
 * `results` is consumed from its end with pop(); `resultCount` is the
 * total reported in the final status message.
 */
const _displayNextItem = (
  results,
  resultCount,
  searchTerms,
  highlightTerms,
) => {
  // this is intended to be dynamic (don't sub resultsCount)
  if (!results.length) {
    // search finished, update title and status message
    _finishSearch(resultCount);
    return;
  }

  // results left, load the summary and display it
  const nextItem = results.pop();
  _displayItem(nextItem, searchTerms, highlightTerms);
  const showFollowing = () =>
    _displayNextItem(results, resultCount, searchTerms, highlightTerms);
  setTimeout(showFollowing, 5);
};
140 | 
/**
 * Default splitQuery function. Can be overridden in ``sphinx.search`` with a
 * custom function per language.
 *
 * The query is split on runs of characters that are not Unicode letters,
 * numbers, underscores, or emoji characters, and empty pieces are dropped.
 * This is the same as ``\W+`` in Python, preserving the surrogate pair area.
 */
if (typeof splitQuery === "undefined") {
  var splitQuery = (query) => {
    const separators = /[^\p{Letter}\p{Number}_\p{Emoji_Presentation}]+/gu;
    const pieces = query.split(separators);
    // a separator at either end of the query leaves an empty string behind
    return pieces.filter((piece) => piece.length > 0);
  };
}
154 | 
155 | /**
156 |  * Search Module
157 |  */
158 | const Search = {
159 |   _index: null,
160 |   _queued_query: null,
161 |   _pulse_status: -1,
162 | 
163 |   htmlToText: (htmlString) => {
164 |     const htmlElement = new DOMParser().parseFromString(htmlString, 'text/html');
165 |     htmlElement.querySelectorAll(".headerlink").forEach((el) => { el.remove() });
166 |     const docContent = htmlElement.querySelector('[role="main"]');
167 |     if (docContent !== undefined) return docContent.textContent;
168 |     console.warn(
169 |       "Content block not found. Sphinx search tries to obtain it via '[role=main]'. Could you check your theme or template."
170 |     );
171 |     return "";
172 |   },
173 | 
174 |   init: () => {
175 |     const query = new URLSearchParams(window.location.search).get("q");
176 |     document
177 |       .querySelectorAll('input[name="q"]')
178 |       .forEach((el) => (el.value = query));
179 |     if (query) Search.performSearch(query);
180 |   },
181 | 
182 |   loadIndex: (url) =>
183 |     (document.body.appendChild(document.createElement("script")).src = url),
184 | 
185 |   setIndex: (index) => {
186 |     Search._index = index;
187 |     if (Search._queued_query !== null) {
188 |       const query = Search._queued_query;
189 |       Search._queued_query = null;
190 |       Search.query(query);
191 |     }
192 |   },
193 | 
194 |   hasIndex: () => Search._index !== null,
195 | 
196 |   deferQuery: (query) => (Search._queued_query = query),
197 | 
198 |   stopPulse: () => (Search._pulse_status = -1),
199 | 
200 |   startPulse: () => {
201 |     if (Search._pulse_status >= 0) return;
202 | 
203 |     const pulse = () => {
204 |       Search._pulse_status = (Search._pulse_status + 1) % 4;
205 |       Search.dots.innerText = ".".repeat(Search._pulse_status);
206 |       if (Search._pulse_status >= 0) window.setTimeout(pulse, 500);
207 |     };
208 |     pulse();
209 |   },
210 | 
211 |   /**
212 |    * perform a search for something (or wait until index is loaded)
213 |    */
214 |   performSearch: (query) => {
215 |     // create the required interface elements
216 |     const searchText = document.createElement("h2");
217 |     searchText.textContent = _("Searching");
218 |     const searchSummary = document.createElement("p");
219 |     searchSummary.classList.add("search-summary");
220 |     searchSummary.innerText = "";
221 |     const searchList = document.createElement("ul");
222 |     searchList.classList.add("search");
223 | 
224 |     const out = document.getElementById("search-results");
225 |     Search.title = out.appendChild(searchText);
226 |     Search.dots = Search.title.appendChild(document.createElement("span"));
227 |     Search.status = out.appendChild(searchSummary);
228 |     Search.output = out.appendChild(searchList);
229 | 
230 |     const searchProgress = document.getElementById("search-progress");
231 |     // Some themes don't use the search progress node
232 |     if (searchProgress) {
233 |       searchProgress.innerText = _("Preparing search...");
234 |     }
235 |     Search.startPulse();
236 | 
237 |     // index already loaded, the browser was quick!
238 |     if (Search.hasIndex()) Search.query(query);
239 |     else Search.deferQuery(query);
240 |   },
241 | 
242 |   /**
243 |    * execute search (requires search index to be loaded)
244 |    */
245 |   query: (query) => {
246 |     const filenames = Search._index.filenames;
247 |     const docNames = Search._index.docnames;
248 |     const titles = Search._index.titles;
249 |     const allTitles = Search._index.alltitles;
250 |     const indexEntries = Search._index.indexentries;
251 | 
252 |     // stem the search terms and add them to the correct list
253 |     const stemmer = new Stemmer();
254 |     const searchTerms = new Set();
255 |     const excludedTerms = new Set();
256 |     const highlightTerms = new Set();
257 |     const objectTerms = new Set(splitQuery(query.toLowerCase().trim()));
258 |     splitQuery(query.trim()).forEach((queryTerm) => {
259 |       const queryTermLower = queryTerm.toLowerCase();
260 | 
261 |       // maybe skip this "word"
262 |       // stopwords array is from language_data.js
263 |       if (
264 |         stopwords.indexOf(queryTermLower) !== -1 ||
265 |         queryTerm.match(/^\d+$/)
266 |       )
267 |         return;
268 | 
269 |       // stem the word
270 |       let word = stemmer.stemWord(queryTermLower);
271 |       // select the correct list
272 |       if (word[0] === "-") excludedTerms.add(word.substr(1));
273 |       else {
274 |         searchTerms.add(word);
275 |         highlightTerms.add(queryTermLower);
276 |       }
277 |     });
278 | 
279 |     if (SPHINX_HIGHLIGHT_ENABLED) {  // set in sphinx_highlight.js
280 |       localStorage.setItem("sphinx_highlight_terms", [...highlightTerms].join(" "))
281 |     }
282 | 
283 |     // console.debug("SEARCH: searching for:");
284 |     // console.info("required: ", [...searchTerms]);
285 |     // console.info("excluded: ", [...excludedTerms]);
286 | 
287 |     // array of [docname, title, anchor, descr, score, filename]
288 |     let results = [];
289 |     _removeChildren(document.getElementById("search-progress"));
290 | 
291 |     const queryLower = query.toLowerCase();
292 |     for (const [title, foundTitles] of Object.entries(allTitles)) {
293 |       if (title.toLowerCase().includes(queryLower) && (queryLower.length >= title.length/2)) {
294 |         for (const [file, id] of foundTitles) {
295 |           let score = Math.round(100 * queryLower.length / title.length)
296 |           results.push([
297 |             docNames[file],
298 |             titles[file] !== title ? `${titles[file]} > ${title}` : title,
299 |             id !== null ? "#" + id : "",
300 |             null,
301 |             score,
302 |             filenames[file],
303 |           ]);
304 |         }
305 |       }
306 |     }
307 | 
308 |     // search for explicit entries in index directives
309 |     for (const [entry, foundEntries] of Object.entries(indexEntries)) {
310 |       if (entry.includes(queryLower) && (queryLower.length >= entry.length/2)) {
311 |         for (const [file, id] of foundEntries) {
312 |           let score = Math.round(100 * queryLower.length / entry.length)
313 |           results.push([
314 |             docNames[file],
315 |             titles[file],
316 |             id ? "#" + id : "",
317 |             null,
318 |             score,
319 |             filenames[file],
320 |           ]);
321 |         }
322 |       }
323 |     }
324 | 
325 |     // lookup as object
326 |     objectTerms.forEach((term) =>
327 |       results.push(...Search.performObjectSearch(term, objectTerms))
328 |     );
329 | 
330 |     // lookup as search terms in fulltext
331 |     results.push(...Search.performTermsSearch(searchTerms, excludedTerms));
332 | 
333 |     // let the scorer override scores with a custom scoring function
334 |     if (Scorer.score) results.forEach((item) => (item[4] = Scorer.score(item)));
335 | 
336 |     // now sort the results by score (in opposite order of appearance, since the
337 |     // display function below uses pop() to retrieve items) and then
338 |     // alphabetically
339 |     results.sort((a, b) => {
340 |       const leftScore = a[4];
341 |       const rightScore = b[4];
342 |       if (leftScore === rightScore) {
343 |         // same score: sort alphabetically
344 |         const leftTitle = a[1].toLowerCase();
345 |         const rightTitle = b[1].toLowerCase();
346 |         if (leftTitle === rightTitle) return 0;
347 |         return leftTitle > rightTitle ? -1 : 1; // inverted is intentional
348 |       }
349 |       return leftScore > rightScore ? 1 : -1;
350 |     });
351 | 
352 |     // remove duplicate search results
353 |     // note the reversing of results, so that in the case of duplicates, the highest-scoring entry is kept
354 |     let seen = new Set();
355 |     results = results.reverse().reduce((acc, result) => {
356 |       let resultStr = result.slice(0, 4).concat([result[5]]).map(v => String(v)).join(',');
357 |       if (!seen.has(resultStr)) {
358 |         acc.push(result);
359 |         seen.add(resultStr);
360 |       }
361 |       return acc;
362 |     }, []);
363 | 
364 |     results = results.reverse();
365 | 
366 |     // for debugging
367 |     //Search.lastresults = results.slice();  // a copy
368 |     // console.info("search results:", Search.lastresults);
369 | 
370 |     // print the results
371 |     _displayNextItem(results, results.length, searchTerms, highlightTerms);
372 |   },
373 | 
374 |   /**
375 |    * search for object names
376 |    */
377 |   performObjectSearch: (object, objectTerms) => {
378 |     const filenames = Search._index.filenames;
379 |     const docNames = Search._index.docnames;
380 |     const objects = Search._index.objects;
381 |     const objNames = Search._index.objnames;
382 |     const titles = Search._index.titles;
383 | 
384 |     const results = [];
385 | 
386 |     const objectSearchCallback = (prefix, match) => {
387 |       const name = match[4]
388 |       const fullname = (prefix ? prefix + "." : "") + name;
389 |       const fullnameLower = fullname.toLowerCase();
390 |       if (fullnameLower.indexOf(object) < 0) return;
391 | 
392 |       let score = 0;
393 |       const parts = fullnameLower.split(".");
394 | 
395 |       // check for different match types: exact matches of full name or
396 |       // "last name" (i.e. last dotted part)
397 |       if (fullnameLower === object || parts.slice(-1)[0] === object)
398 |         score += Scorer.objNameMatch;
399 |       else if (parts.slice(-1)[0].indexOf(object) > -1)
400 |         score += Scorer.objPartialMatch; // matches in last name
401 | 
402 |       const objName = objNames[match[1]][2];
403 |       const title = titles[match[0]];
404 | 
405 |       // If more than one term searched for, we require other words to be
406 |       // found in the name/title/description
407 |       const otherTerms = new Set(objectTerms);
408 |       otherTerms.delete(object);
409 |       if (otherTerms.size > 0) {
410 |         const haystack = `${prefix} ${name} ${objName} ${title}`.toLowerCase();
411 |         if (
412 |           [...otherTerms].some((otherTerm) => haystack.indexOf(otherTerm) < 0)
413 |         )
414 |           return;
415 |       }
416 | 
417 |       let anchor = match[3];
418 |       if (anchor === "") anchor = fullname;
419 |       else if (anchor === "-") anchor = objNames[match[1]][1] + "-" + fullname;
420 | 
421 |       const descr = objName + _(", in ") + title;
422 | 
423 |       // add custom score for some objects according to scorer
424 |       if (Scorer.objPrio.hasOwnProperty(match[2]))
425 |         score += Scorer.objPrio[match[2]];
426 |       else score += Scorer.objPrioDefault;
427 | 
428 |       results.push([
429 |         docNames[match[0]],
430 |         fullname,
431 |         "#" + anchor,
432 |         descr,
433 |         score,
434 |         filenames[match[0]],
435 |       ]);
436 |     };
437 |     Object.keys(objects).forEach((prefix) =>
438 |       objects[prefix].forEach((array) =>
439 |         objectSearchCallback(prefix, array)
440 |       )
441 |     );
442 |     return results;
443 |   },
444 | 
445 |   /**
446 |    * search for full-text terms in the index
447 |    */
448 |   performTermsSearch: (searchTerms, excludedTerms) => {
449 |     // prepare search
450 |     const terms = Search._index.terms;
451 |     const titleTerms = Search._index.titleterms;
452 |     const filenames = Search._index.filenames;
453 |     const docNames = Search._index.docnames;
454 |     const titles = Search._index.titles;
455 | 
456 |     const scoreMap = new Map();
457 |     const fileMap = new Map();
458 | 
459 |     // perform the search on the required terms
460 |     searchTerms.forEach((word) => {
461 |       const files = [];
462 |       const arr = [
463 |         { files: terms[word], score: Scorer.term },
464 |         { files: titleTerms[word], score: Scorer.title },
465 |       ];
466 |       // add support for partial matches
467 |       if (word.length > 2) {
468 |         const escapedWord = _escapeRegExp(word);
469 |         Object.keys(terms).forEach((term) => {
470 |           if (term.match(escapedWord) && !terms[word])
471 |             arr.push({ files: terms[term], score: Scorer.partialTerm });
472 |         });
473 |         Object.keys(titleTerms).forEach((term) => {
474 |           if (term.match(escapedWord) && !titleTerms[word])
475 |             arr.push({ files: titleTerms[word], score: Scorer.partialTitle });
476 |         });
477 |       }
478 | 
479 |       // no match but word was a required one
480 |       if (arr.every((record) => record.files === undefined)) return;
481 | 
482 |       // found search word in contents
483 |       arr.forEach((record) => {
484 |         if (record.files === undefined) return;
485 | 
486 |         let recordFiles = record.files;
487 |         if (recordFiles.length === undefined) recordFiles = [recordFiles];
488 |         files.push(...recordFiles);
489 | 
490 |         // set score for the word in each file
491 |         recordFiles.forEach((file) => {
492 |           if (!scoreMap.has(file)) scoreMap.set(file, {});
493 |           scoreMap.get(file)[word] = record.score;
494 |         });
495 |       });
496 | 
497 |       // create the mapping
498 |       files.forEach((file) => {
499 |         if (fileMap.has(file) && fileMap.get(file).indexOf(word) === -1)
500 |           fileMap.get(file).push(word);
501 |         else fileMap.set(file, [word]);
502 |       });
503 |     });
504 | 
505 |     // now check if the files don't contain excluded terms
506 |     const results = [];
507 |     for (const [file, wordList] of fileMap) {
508 |       // check if all requirements are matched
509 | 
510 |       // as search terms with length < 3 are discarded
511 |       const filteredTermCount = [...searchTerms].filter(
512 |         (term) => term.length > 2
513 |       ).length;
514 |       if (
515 |         wordList.length !== searchTerms.size &&
516 |         wordList.length !== filteredTermCount
517 |       )
518 |         continue;
519 | 
520 |       // ensure that none of the excluded terms is in the search result
521 |       if (
522 |         [...excludedTerms].some(
523 |           (term) =>
524 |             terms[term] === file ||
525 |             titleTerms[term] === file ||
526 |             (terms[term] || []).includes(file) ||
527 |             (titleTerms[term] || []).includes(file)
528 |         )
529 |       )
530 |         break;
531 | 
532 |       // select one (max) score for the file.
533 |       const score = Math.max(...wordList.map((w) => scoreMap.get(file)[w]));
534 |       // add result to the result list
535 |       results.push([
536 |         docNames[file],
537 |         titles[file],
538 |         "",
539 |         null,
540 |         score,
541 |         filenames[file],
542 |       ]);
543 |     }
544 |     return results;
545 |   },
546 | 
547 |   /**
548 |    * helper function to return a node containing the
549 |    * search summary for a given text. keywords is a list
550 |    * of stemmed words.
551 |    */
552 |   makeSearchSummary: (htmlText, keywords) => {
553 |     const text = Search.htmlToText(htmlText);
554 |     if (text === "") return null;
555 | 
556 |     const textLower = text.toLowerCase();
557 |     const actualStartPosition = [...keywords]
558 |       .map((k) => textLower.indexOf(k.toLowerCase()))
559 |       .filter((i) => i > -1)
560 |       .slice(-1)[0];
561 |     const startWithContext = Math.max(actualStartPosition - 120, 0);
562 | 
563 |     const top = startWithContext === 0 ? "" : "...";
564 |     const tail = startWithContext + 240 < text.length ? "..." : "";
565 | 
566 |     let summary = document.createElement("p");
567 |     summary.classList.add("context");
568 |     summary.textContent = top + text.substr(startWithContext, 240).trim() + tail;
569 | 
570 |     return summary;
571 |   },
572 | };
573 | 
574 | _ready(Search.init);
575 | 
```

--------------------------------------------------------------------------------
/src/mcp_server_webcrawl/interactive/session.py:
--------------------------------------------------------------------------------

```python
  1 | import curses
  2 | import sys
  3 | import threading
  4 | import traceback
  5 | 
  6 | from pathlib import Path
  7 | from typing import Optional
  8 | 
  9 | from mcp_server_webcrawl.crawlers import get_crawler
 10 | from mcp_server_webcrawl.crawlers.base.crawler import BaseCrawler, BaseJsonApi
 11 | from mcp_server_webcrawl.interactive.search import SearchManager
 12 | from mcp_server_webcrawl.interactive.ui import ThemeDefinition, UiState, DocumentMode, UiFocusable, ViewBounds, safe_addstr
 13 | from mcp_server_webcrawl.interactive.views.base import BaseCursesView, OUTER_WIDTH_RIGHT_MARGIN
 14 | from mcp_server_webcrawl.interactive.views.document import SearchDocumentView
 15 | from mcp_server_webcrawl.interactive.views.requirements import RequirementsView
 16 | from mcp_server_webcrawl.interactive.views.results import SearchResultsView
 17 | from mcp_server_webcrawl.interactive.views.searchform import SearchFormView
 18 | from mcp_server_webcrawl.interactive.views.help import HelpView
 19 | from mcp_server_webcrawl.models.sites import SiteResult
 20 | 
# can be as low as 1, 50 feels a little laggy
CURSES_TIMEOUT_MS = 25

# rows kept free above and below the content area, and the cap applied to
# the height of the top pane in the split (dual-pane) layout
LAYOUT_CONTENT_START_Y_OFFSET = 1
LAYOUT_CONTENT_END_Y_OFFSET = 1
LAYOUT_SPLIT_PANE_MAX_HEIGHT = 10
# NOTE(review): not referenced in the part of the module visible here
LAYOUT_MIN_HEIGHT_FOR_HELP = 2

# NOTE(review): presumably geometry for rendering the debug lines collected
# by debug_add(); the code that uses these is outside this excerpt - confirm
DEBUG_MAX_LINES = 8
DEBUG_COMPACT_WIDTH_RATIO = 0.4
DEBUG_MIN_COMPACT_WIDTH = 30
DEBUG_COMPACT_THRESHOLD = 5
DEBUG_EXPANDED_MARGIN = 6
DEBUG_EXPANDED_START_X = 3
DEBUG_EXPANDED_BOTTOM_MARGIN = 3
DEBUG_COMPACT_BOTTOM_MARGIN = 2
DEBUG_MIN_START_Y = 1
DEBUG_MIN_START_Y_EXPANDED = 2

# successor of each document mode: MARKDOWN -> RAW -> HEADERS -> MARKDOWN
SEARCH_DOCUMENT_NEXT_MODE: dict[DocumentMode, DocumentMode] = {
    DocumentMode.MARKDOWN: DocumentMode.RAW,
    DocumentMode.RAW: DocumentMode.HEADERS,
    DocumentMode.HEADERS: DocumentMode.MARKDOWN
}

# NOTE(review): presumably results per page and the smallest usable
# terminal size; usage is outside this excerpt - confirm
SEARCH_RESULT_LIMIT: int = 10
TERMINAL_MIN_HEIGHT: int = 8
TERMINAL_MIN_WIDTH: int = 40
 49 | 
 50 | class InteractiveSession:
 51 |     """
 52 |     Main session coordinator that manages the interactive terminal application.
 53 |     """
 54 | 
 55 |     def __init__(self, crawler: str, datasrc: str):
 56 |         """
 57 |         Initialize the interactive session with crawler and data source.
 58 |         """
 59 |         self.__input_crawler: str = crawler
 60 |         self.__input_datasrc: str = datasrc
 61 |         self.__theme_map: dict[str, int] = {}
 62 |         self.__searchman: SearchManager = SearchManager(self)
 63 |         self.__ui_state: UiState = UiState.SEARCH_INIT
 64 |         self.__ui_focused: UiFocusable = UiFocusable.SEARCH_FORM
 65 |         self.__debug: list[str] = []
 66 | 
 67 |         self.__view__requirements = RequirementsView(self, crawler, datasrc)
 68 |         if self.__view__requirements.validated == True:
 69 |             crawl_model = get_crawler(crawler)
 70 |             if crawl_model is not None:
 71 |                 self.__crawler: BaseCrawler = crawl_model(Path(datasrc))
 72 |                 sites_api: BaseJsonApi = self.__crawler.get_sites_api()
 73 |                 self.__sites: list[SiteResult] = sites_api.get_results()
 74 |             else:
 75 |                 self.__crawler: BaseCrawler = None
 76 |                 sites_api: BaseJsonApi = None
 77 |                 self.__sites: list[SiteResult] = []
 78 |         else:
 79 |             crawl_model = None
 80 |             self.__crawler: BaseCrawler = None
 81 |             sites_api: BaseJsonApi = None
 82 |             self.__sites: list[SiteResult] = []
 83 | 
 84 |         self.__view__results = SearchResultsView(self)
 85 |         self.__view__document = SearchDocumentView(self)
 86 |         self.__view__searchform = SearchFormView(self, self.__sites)
 87 |         self.__view__help = HelpView(self)
 88 | 
 89 |         self.set_ui_state(UiState.SEARCH_INIT, UiFocusable.SEARCH_FORM)
 90 | 
 91 |     @property
 92 |     def ui_state(self) ->  UiState:
 93 |         return self.__ui_state
 94 | 
 95 |     @property
 96 |     def ui_focused(self) ->  UiFocusable:
 97 |         return self.__ui_focused
 98 | 
 99 |     @property
100 |     def crawler(self) ->  BaseCrawler:
101 |         return self.__crawler
102 | 
103 |     @property
104 |     def document(self) -> SearchDocumentView:
105 |         return self.__view__document
106 | 
107 |     @property
108 |     def results(self) -> SearchResultsView:
109 |         return self.__view__results
110 | 
111 |     @property
112 |     def searchform(self) -> SearchFormView:
113 |         return self.__view__searchform
114 | 
115 |     @property
116 |     def searchman(self) -> SearchManager:
117 |         return self.__searchman
118 | 
119 |     @property
120 |     def sites(self) ->  list[SiteResult]:
121 |         return self.__sites.copy()
122 | 
123 |     def debug_add(self, msg: str) -> None:
124 |         """
125 |         Add line of debug.
126 |         """
127 |         with threading.Lock():
128 |             self.__debug.append(msg)
129 | 
130 |     def debug_clear(self) -> None:
131 |         """
132 |         Clear debug statements.
133 |         """
134 |         with threading.Lock():
135 |             self.__debug.clear()
136 | 
137 |     def run(self) -> None:
138 |         """
139 |         Public interface to launch the interactive terminal application.
140 |         """
141 |         try:
142 |             curses.wrapper(self.__curses_main)
143 |         except KeyboardInterrupt:
144 |             pass  # clean exit, ctrl+c
145 |         except Exception as ex:
146 |             print(f"--interactive failure: {ex}\n{traceback.format_exc()}", file=sys.stderr)
147 |         finally:
148 |             self.searchman.cleanup()
149 |             pass
150 | 
151 |     def set_ui_state(self, state: UiState, focus: Optional[UiFocusable] = None) -> None:
152 |         """
153 |         Transition between UI states cleanly.
154 |         """
155 |         self.__ui_state = state
156 |         if focus is not None:
157 |             self.__ui_focused = focus
158 | 
159 |         self.__view__results.set_focused(False)
160 |         self.__view__searchform.set_focused(False)
161 |         if state == UiState.SEARCH_INIT or (state == UiState.SEARCH_RESULTS and focus == UiFocusable.SEARCH_FORM):
162 |             self.__view__searchform.set_focused(True)
163 |         elif state == UiState.SEARCH_RESULTS:
164 |             self.__view__results.set_focused(True)
165 | 
    # used in requirements view to reset with user inputs over cmd args
    def set_init_input_args(self, crawler: str, datasrc: str) -> None:
        """
        Replace the stored crawler and data source input strings.
        """
        self.__input_crawler = crawler
        self.__input_datasrc = datasrc

    def set_init_crawler(self, crawler: BaseCrawler) -> None:
        """
        Replace the session's crawler instance.
        """
        self.__crawler = crawler

    def set_init_sites(self, sites: list[SiteResult]) -> None:
        """
        Replace the session's site list; the list is stored as given, not copied.
        """
        self.__sites = sites

    # used in requirements to reset app
    def set_init_searchform(self, searchform: BaseCursesView) -> None:
        """
        Replace the search form view held by the session.
        """
        self.__view__searchform = searchform
180 | 
181 |     def __get_outer_screen(self, width: int, height: int) -> ViewBounds:
182 |         """
183 |         Get the outer screen bounds for the full terminal.
184 |         """
185 |         return ViewBounds(
186 |             x=0,
187 |             y=0,
188 |             width=width - OUTER_WIDTH_RIGHT_MARGIN,
189 |             height=height
190 |         )
191 | 
192 |     def __get_inner_screen(self, width: int, height: int) -> ViewBounds:
193 |         """
194 |         Get the inner screen bounds for content area.
195 |         """
196 |         content_start_y = LAYOUT_CONTENT_START_Y_OFFSET
197 |         content_end_y = height - LAYOUT_CONTENT_END_Y_OFFSET
198 |         content_height = content_end_y - content_start_y
199 | 
200 |         return ViewBounds(
201 |             x=0,
202 |             y=content_start_y,  # after outer header
203 |             width=width - OUTER_WIDTH_RIGHT_MARGIN,
204 |             height=content_height
205 |         )
206 | 
207 |     def __get_split_top(self, width: int, height: int) -> ViewBounds:
208 |         """
209 |         Get the top split screen bounds for dual-pane layout.
210 |         """
211 |         content_start_y = LAYOUT_CONTENT_START_Y_OFFSET
212 |         content_height = height - 2
213 |         split_top_height = min(LAYOUT_SPLIT_PANE_MAX_HEIGHT, content_height // 2)
214 | 
215 |         return ViewBounds(
216 |             x=0,
217 |             y=content_start_y,
218 |             width=width - OUTER_WIDTH_RIGHT_MARGIN,
219 |             height=split_top_height
220 |         )
221 | 
222 |     def __get_split_bottom(self, width: int, height: int) -> ViewBounds:
223 |         """
224 |         Get the bottom split screen bounds for dual-pane layout.
225 |         """
226 |         content_start_y = LAYOUT_CONTENT_START_Y_OFFSET
227 |         content_height = height - 2
228 |         split_top_height = min(LAYOUT_SPLIT_PANE_MAX_HEIGHT, content_height // 2)
229 |         split_bottom_height = content_height - split_top_height
230 | 
231 |         return ViewBounds(
232 |             x=0,
233 |             y=content_start_y + split_top_height,
234 |             width=width - OUTER_WIDTH_RIGHT_MARGIN,
235 |             height=split_bottom_height
236 |         )
237 | 
238 |     def __curses_main(self, stdscr: curses.window) -> None:
239 |         """
240 |         Initialize curses environment and start main loop.
241 |         """
242 | 
243 |         if curses.COLORS < 256:
244 |             # display error in curses, dependable
245 |             stdscr.addstr(0, 0, "--interactive mode requires a 256-color (or better) terminal")
246 |             stdscr.refresh()
247 |             stdscr.getch()  # wait for keypress
248 |             sys.exit(1)
249 | 
250 |         # initialize curses style pairs
251 |         curses.start_color()
252 |         for theme in ThemeDefinition:
253 |             self.__theme_map[theme.name] = theme.value
254 |             curses.init_pair(*theme.value)
255 | 
256 |         # hide cursor, otherwise blinks at edge of last write
257 |         curses.curs_set(0)
258 | 
259 |         # start main loop
260 |         self.__interactive_loop(stdscr)
261 | 
262 |     def get_theme_color_pair(self, theme: ThemeDefinition) -> int | None:
263 |         if theme.name in self.__theme_map:
264 |             return curses.color_pair(self.__theme_map[theme.name][0])
265 |         else:
266 |             return None
267 | 
268 |     def __get_help_text(self) -> str:
269 |         """
270 |         Get context-sensitive help text.
271 |         """
272 |         page_results: str = " | ←→ Page Results" if self.ui_focused == UiFocusable.SEARCH_RESULTS else ""
273 |         search_results_enter: str = "Search" if self.__view__searchform.focused else "View Document"
274 |         search_results_tab: str = "Results" if self.__view__searchform.focused else "Search Form"
275 |         footers: dict[UiState, str] = {
276 |             UiState.DOCUMENT: "↑↓: Scroll | PgUp/PgDn: Page | Home/End: Top/Bot | TAB: Mode | ESC: Back",
277 |             UiState.HELP: "↑↓: Scroll | PgUp/PgDn: Page | Home/End: Top/Bot | ESC: Back",
278 |             UiState.REQUIREMENTS: "ENTER: Load Interface | ↑↓: Navigate | ESC: Exit",
279 |             UiState.SEARCH_INIT: "ENTER: Search | ↑↓: Navigate | F1: Search Help | ESC: Exit",
280 |             UiState.SEARCH_RESULTS: f"ENTER: {search_results_enter} | ↑↓: Navigate{page_results} | TAB: {search_results_tab} | ESC: New Search",
281 |         }
282 |         return footers.get(self.__ui_state, "↑↓: Navigate | ESC: Exit")
283 | 
284 |     def __handle_F1(self) -> None:
285 |         """
286 |         Handle F1 key
287 |         """
288 |         self.set_ui_state(UiState.HELP)
289 | 
290 |     def __handle_ESC(self) -> None:
291 |         """
292 |         Handle ESC key
293 |         """
294 |         if self.__ui_state == UiState.DOCUMENT:
295 |             self.set_ui_state(UiState.SEARCH_RESULTS, UiFocusable.SEARCH_RESULTS)
296 |         elif self.__ui_state in (UiState.SEARCH_RESULTS, UiState.HELP):
297 |             self.set_ui_state(UiState.SEARCH_INIT, UiFocusable.SEARCH_FORM)
298 |             self.searchform.clear_query()
299 |         elif self.__ui_state in (UiState.SEARCH_INIT, UiState.REQUIREMENTS):
300 |             sys.exit(0)
301 | 
302 |     def __handle_TAB(self) -> None:
303 |         """
304 |         Handle TAB key
305 |         """
306 |         if self.__ui_state == UiState.SEARCH_RESULTS:
307 |             if self.__ui_focused == UiFocusable.SEARCH_FORM:
308 |                 self.set_ui_state(UiState.SEARCH_RESULTS, UiFocusable.SEARCH_RESULTS)
309 |             else:
310 |                 self.set_ui_state(UiState.SEARCH_RESULTS, UiFocusable.SEARCH_FORM)
311 | 
312 |     def __interactive_loop(self, stdscr: curses.window) -> None:
313 |         """
314 |         Main input loop.
315 |         """
316 | 
317 |         try:
318 |             stdscr.timeout(CURSES_TIMEOUT_MS)
319 | 
320 |             while True:
321 |                 self.searchman.check_pending()
322 | 
323 |                 stdscr.clear()
324 |                 height, width = stdscr.getmaxyx()
325 |                 selected_sites = self.__view__searchform.get_selected_sites()
326 | 
327 |                 if self.__ui_state == UiState.REQUIREMENTS or self.__view__requirements.validated == False:
328 | 
329 |                     if not self.__ui_state == UiState.REQUIREMENTS:
330 |                         self.set_ui_state(UiState.REQUIREMENTS)
331 | 
332 |                     inner_screen = self.__get_inner_screen(width, height)
333 |                     self.__view__requirements.draw_inner_header(stdscr, inner_screen, "Requirements:")
334 |                     self.__view__requirements.set_bounds(inner_screen)
335 |                     self.__view__requirements.render(stdscr)
336 |                     self.__view__requirements.draw_inner_footer(stdscr, inner_screen, f"Waiting on input")
337 | 
338 |                 elif self.__ui_state == UiState.HELP:
339 | 
340 |                     inner_screen = self.__get_inner_screen(width, height)
341 |                     self.__view__help.draw_inner_header(stdscr, inner_screen, "Search Help:")
342 |                     self.__view__help.set_bounds(inner_screen)
343 |                     self.__view__help.render(stdscr)
344 |                     self.__view__help.draw_inner_footer(stdscr, inner_screen, f"ESC to Exit Help")
345 | 
346 |                 elif self.__ui_state == UiState.SEARCH_RESULTS and selected_sites:
347 | 
348 |                     inner_screen_split_top = self.__get_split_top(width, height)
349 |                     inner_screen_split_bottom = self.__get_split_bottom(width, height)
350 |                     url: str = selected_sites[0].urls[0] if selected_sites and selected_sites[0].urls else ""
351 |                     display_url: str = BaseCursesView.url_for_display(url)
352 |                     self.__view__searchform.draw_inner_header(stdscr, inner_screen_split_top, "Search:")
353 |                     self.__view__searchform.set_bounds(inner_screen_split_top)
354 |                     self.__view__searchform.render(stdscr)
355 |                     self.__view__searchform.draw_inner_footer(stdscr, inner_screen_split_top, f"Searching {display_url}")
356 |                     self.__view__results.draw_inner_header(stdscr, inner_screen_split_bottom, "")
357 |                     self.__view__results.set_bounds(inner_screen_split_bottom)
358 |                     self.__view__results.render(stdscr)
359 |                     self.__view__results.draw_inner_footer(stdscr, inner_screen_split_bottom, "")
360 | 
361 |                 elif self.__ui_state == UiState.DOCUMENT:
362 | 
363 |                     inner_screen = self.__get_inner_screen(width, height)
364 |                     url: str = self.__view__document.urls[0] if self.__view__document is not None and self.__view__document.urls else ""
365 |                     display_url: str = BaseCursesView.url_for_display(url)
366 |                     self.__view__document.set_focused(True)
367 |                     self.__view__document.draw_inner_header(stdscr, inner_screen, f"URL: {display_url}")
368 |                     self.__view__document.set_bounds(inner_screen)
369 |                     self.__view__document.render(stdscr)
370 |                     self.__view__document.draw_inner_footer(stdscr, inner_screen, f"")
371 | 
372 |                 else:
373 | 
374 |                     # aka self.__ui_state == UiState.SEARCH_INIT
375 |                     inner_screen = self.__get_inner_screen(width, height)
376 |                     self.__view__searchform.draw_inner_header(stdscr, inner_screen, "Search:")
377 |                     selected_sites = self.__view__searchform.get_selected_sites()
378 |                     first_hit = selected_sites[0] if selected_sites else None
379 |                     url: str = first_hit.urls[0] if first_hit is not None and first_hit.urls else ""
380 |                     display_url: str = BaseCursesView.url_for_display(url)
381 |                     self.__view__searchform.set_bounds(inner_screen)
382 |                     self.__view__searchform.render(stdscr)
383 |                     self.__view__searchform.draw_inner_footer(stdscr, inner_screen, f"Searching {display_url}")
384 | 
385 |                 if height > LAYOUT_MIN_HEIGHT_FOR_HELP:
386 |                     help_text = self.__get_help_text()
387 |                     self.__view__searchform.draw_outer_header(stdscr)
388 |                     self.__view__searchform.draw_outer_footer(stdscr, help_text)
389 | 
390 |                 self.__render_debug(stdscr)
391 |                 stdscr.refresh()
392 | 
393 |                 key: int = stdscr.getch()
394 |                 if key == -1:               # timeout
395 |                     continue
396 |                 elif key == ord('\t'):
397 |                     self.__handle_TAB()
398 |                 elif key == curses.KEY_F1:
399 |                     self.__handle_F1()
400 |                 elif key == 27:             # ESC
401 |                     self.__handle_ESC()
402 | 
403 |                 if self.__view__requirements.validated == False or self.__ui_state == UiState.REQUIREMENTS:
404 |                     if self.__view__requirements.handle_input(key):
405 |                         continue
406 |                 elif self.__ui_state == UiState.SEARCH_INIT or (
407 |                         self.__ui_state == UiState.SEARCH_RESULTS
408 |                         and self.__ui_focused == UiFocusable.SEARCH_FORM
409 |                     ):
410 |                     if self.__view__searchform.handle_input(key):
411 |                         continue
412 |                 elif self.__ui_state == UiState.SEARCH_RESULTS:
413 |                     if self.__view__results.handle_input(key):
414 |                         continue
415 |                 elif self.__ui_state == UiState.DOCUMENT:
416 |                     if self.__view__document.handle_input(key):
417 |                         continue
418 |                 elif self.__ui_state == UiState.HELP:
419 |                     if self.__view__help.handle_input(key):
420 |                         continue
421 | 
422 |         except Exception as ex:
423 |             print(f"--interactive failure - {ex}\n{traceback.format_exc()}")
424 |             pass
425 |         finally:
426 |             stdscr.timeout(-1)
427 | 
428 |     def __render_debug(self, stdscr: curses.window) -> None:
429 |         """
430 |         Render debug info with adaptive sizing - compact for short messages, expanded for errors.
431 |         """
432 |         height, width = stdscr.getmaxyx()
433 | 
434 |         with threading.Lock():
435 |             debug_lines = self.__debug[-(DEBUG_MAX_LINES):].copy()
436 | 
437 |         if not debug_lines:
438 |             return
439 | 
440 |         max_line_length = max(len(line) for line in debug_lines) if debug_lines else 0
441 |         compact_width = max(int(width * DEBUG_COMPACT_WIDTH_RATIO), DEBUG_MIN_COMPACT_WIDTH)
442 |         use_expanded = max_line_length > compact_width - DEBUG_COMPACT_THRESHOLD
443 | 
444 |         if use_expanded:
445 |             debug_width: int = width - DEBUG_EXPANDED_MARGIN
446 |             debug_start_x: int = DEBUG_EXPANDED_START_X
447 |             debug_start_y: int = max(DEBUG_MIN_START_Y_EXPANDED, height - len(debug_lines) - DEBUG_EXPANDED_BOTTOM_MARGIN)
448 |         else:
449 |             debug_width: int = compact_width
450 |             debug_start_x: int = width - debug_width - DEBUG_EXPANDED_START_X
451 |             debug_start_y: int = height - len(debug_lines) - DEBUG_COMPACT_BOTTOM_MARGIN
452 | 
453 |         debug_start_y: int = max(DEBUG_MIN_START_Y, debug_start_y)
454 |         debug_start_x: int = max(0, debug_start_x)
455 | 
456 |         for i, debug_line in enumerate(debug_lines):
457 |             y_pos: int = debug_start_y + i
458 |             if y_pos >= height - 1:
459 |                 break
460 |             if debug_start_x >= 0 and y_pos > 0:
461 |                 display_line: str = debug_line[:debug_width]
462 |                 safe_addstr(stdscr, y_pos, debug_start_x, display_line, self.get_theme_color_pair(ThemeDefinition.HEADER_ACTIVE))
463 | 
```

--------------------------------------------------------------------------------
/docs/_modules/mcp_server_webcrawl/templates/tests.html:
--------------------------------------------------------------------------------

```html
  1 | 
  2 | 
  3 | <!DOCTYPE html>
  4 | <html class="writer-html5" lang="en" data-content_root="../../../">
  5 | <head>
  6 |   <meta charset="utf-8" />
  7 |   <meta name="viewport" content="width=device-width, initial-scale=1.0" />
  8 |   <title>mcp_server_webcrawl.templates.tests &mdash; mcp-server-webcrawl  documentation</title>
  9 |       <link rel="stylesheet" type="text/css" href="../../../_static/pygments.css?v=80d5e7a1" />
 10 |       <link rel="stylesheet" type="text/css" href="../../../_static/css/theme.css?v=e59714d7" />
 11 | 
 12 |   
 13 |       <script src="../../../_static/jquery.js?v=5d32c60e"></script>
 14 |       <script src="../../../_static/_sphinx_javascript_frameworks_compat.js?v=2cd50e6c"></script>
 15 |       <script src="../../../_static/documentation_options.js?v=5929fcd5"></script>
 16 |       <script src="../../../_static/doctools.js?v=888ff710"></script>
 17 |       <script src="../../../_static/sphinx_highlight.js?v=dc90522c"></script>
 18 |     <script src="../../../_static/js/theme.js"></script>
 19 |     <link rel="index" title="Index" href="../../../genindex.html" />
 20 |     <link rel="search" title="Search" href="../../../search.html" /> 
 21 | </head>
 22 | 
 23 | <body class="wy-body-for-nav"> 
 24 |   <div class="wy-grid-for-nav">
 25 |     <nav data-toggle="wy-nav-shift" class="wy-nav-side">
 26 |       <div class="wy-side-scroll">
 27 |         <div class="wy-side-nav-search" >
 28 | 
 29 |           
 30 |           
 31 |           <a href="../../../index.html" class="icon icon-home">
 32 |             mcp-server-webcrawl
 33 |           </a>
 34 | <div role="search">
 35 |   <form id="rtd-search-form" class="wy-form" action="../../../search.html" method="get">
 36 |     <input type="text" name="q" placeholder="Search docs" aria-label="Search docs" />
 37 |     <input type="hidden" name="check_keywords" value="yes" />
 38 |     <input type="hidden" name="area" value="default" />
 39 |   </form>
 40 | </div>
 41 |         </div><div class="wy-menu wy-menu-vertical" data-spy="affix" role="navigation" aria-label="Navigation menu">
 42 |               <p class="caption" role="heading"><span class="caption-text">Contents:</span></p>
 43 | <ul>
 44 | <li class="toctree-l1"><a class="reference internal" href="../../../installation.html">Installation</a></li>
 45 | <li class="toctree-l1"><a class="reference internal" href="../../../guides.html">Setup Guides</a></li>
 46 | <li class="toctree-l1"><a class="reference internal" href="../../../usage.html">Usage</a></li>
 47 | <li class="toctree-l1"><a class="reference internal" href="../../../prompts.html">Prompt Routines</a></li>
 48 | <li class="toctree-l1"><a class="reference internal" href="../../../modules.html">mcp_server_webcrawl</a></li>
 49 | </ul>
 50 | 
 51 |         </div>
 52 |       </div>
 53 |     </nav>
 54 | 
 55 |     <section data-toggle="wy-nav-shift" class="wy-nav-content-wrap"><nav class="wy-nav-top" aria-label="Mobile navigation menu" >
 56 |           <i data-toggle="wy-nav-top" class="fa fa-bars"></i>
 57 |           <a href="../../../index.html">mcp-server-webcrawl</a>
 58 |       </nav>
 59 | 
 60 |       <div class="wy-nav-content">
 61 |         <div class="rst-content">
 62 |           <div role="navigation" aria-label="Page navigation">
 63 |   <ul class="wy-breadcrumbs">
 64 |       <li><a href="../../../index.html" class="icon icon-home" aria-label="Home"></a></li>
 65 |           <li class="breadcrumb-item"><a href="../../index.html">Module code</a></li>
 66 |       <li class="breadcrumb-item active">mcp_server_webcrawl.templates.tests</li>
 67 |       <li class="wy-breadcrumbs-aside">
 68 |       </li>
 69 |   </ul>
 70 |   <hr/>
 71 | </div>
 72 |           <div role="main" class="document" itemscope="itemscope" itemtype="http://schema.org/Article">
 73 |            <div itemprop="articleBody">
 74 |              
 75 |   <h1>Source code for mcp_server_webcrawl.templates.tests</h1><div class="highlight"><pre>
 76 | <span></span><span class="kn">import</span> <span class="nn">re</span>
 77 | <span class="kn">import</span> <span class="nn">unittest</span>
 78 | 
 79 | <span class="kn">from</span> <span class="nn">importlib</span> <span class="kn">import</span> <span class="n">resources</span>
 80 | <span class="kn">from</span> <span class="nn">urllib.request</span> <span class="kn">import</span> <span class="n">urlopen</span>
 81 | <span class="kn">from</span> <span class="nn">mcp_server_webcrawl.utils.logger</span> <span class="kn">import</span> <span class="n">get_logger</span>
 82 | <span class="kn">from</span> <span class="nn">mcp_server_webcrawl.extras.markdown</span> <span class="kn">import</span> <span class="n">get_markdown</span>
 83 | 
 84 | <span class="n">logger</span> <span class="o">=</span> <span class="n">get_logger</span><span class="p">()</span>
 85 | 
 86 | <div class="viewcode-block" id="TemplateTests">
 87 | <a class="viewcode-back" href="../../../mcp_server_webcrawl.templates.html#mcp_server_webcrawl.templates.tests.TemplateTests">[docs]</a>
 88 | <span class="k">class</span> <span class="nc">TemplateTests</span><span class="p">(</span><span class="n">unittest</span><span class="o">.</span><span class="n">TestCase</span><span class="p">):</span>
 89 | <span class="w">    </span><span class="sd">&quot;&quot;&quot;</span>
 90 | <span class="sd">    Test suite for the custom HTML to markdown converter.</span>
 91 | <span class="sd">    Why custom? It&#39;s a bit faster, that is the only reason.</span>
 92 | <span class="sd">    Maximum load is 100 transforms (1 per result for a max result </span>
 93 | <span class="sd">    of 100), so speed matters. A default set is 20.</span>
 94 | <span class="sd">    This converter does a few things differently to tailor to LLM</span>
 95 | <span class="sd">    interaction.</span>
 96 | <span class="sd">    * aggressively removes images (html2text selectively renders)</span>
 97 | <span class="sd">    * links with block descendants will render like a &lt;p&gt; </span>
 98 | <span class="sd">        (html2text treats as &lt;a&gt;&lt;br&gt;)    </span>
 99 | <span class="sd">    &quot;&quot;&quot;</span>
100 | 
101 | <div class="viewcode-block" id="TemplateTests.setUp">
102 | <a class="viewcode-back" href="../../../mcp_server_webcrawl.templates.html#mcp_server_webcrawl.templates.tests.TemplateTests.setUp">[docs]</a>
103 |     <span class="k">def</span> <span class="nf">setUp</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
104 | <span class="w">        </span><span class="sd">&quot;&quot;&quot;</span>
105 | <span class="sd">        Set up the test environment with fixture data.</span>
106 | <span class="sd">        &quot;&quot;&quot;</span>
107 |         <span class="nb">super</span><span class="p">()</span><span class="o">.</span><span class="n">setUp</span><span class="p">()</span></div>
108 | 
109 | 
110 | <div class="viewcode-block" id="TemplateTests.test_core_html">
111 | <a class="viewcode-back" href="../../../mcp_server_webcrawl.templates.html#mcp_server_webcrawl.templates.tests.TemplateTests.test_core_html">[docs]</a>
112 |     <span class="k">def</span> <span class="nf">test_core_html</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
113 |         <span class="n">core_html</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="n">resources</span><span class="o">.</span><span class="n">read_text</span><span class="p">(</span><span class="s2">&quot;mcp_server_webcrawl.templates&quot;</span><span class="p">,</span> <span class="s2">&quot;tests_core.html&quot;</span><span class="p">)</span>
114 |         <span class="n">markdown</span> <span class="o">=</span> <span class="n">get_markdown</span><span class="p">(</span><span class="n">core_html</span><span class="p">)</span>
115 | 
116 |         <span class="c1"># h1-6</span>
117 |         <span class="bp">self</span><span class="o">.</span><span class="n">assertIn</span><span class="p">(</span><span class="s2">&quot;# Lorem Ipsum Dolor Sit Amet&quot;</span><span class="p">,</span> <span class="n">markdown</span><span class="p">)</span>
118 |         <span class="bp">self</span><span class="o">.</span><span class="n">assertIn</span><span class="p">(</span><span class="s2">&quot;## Consectetur Adipiscing Elit&quot;</span><span class="p">,</span> <span class="n">markdown</span><span class="p">)</span>
119 |         <span class="bp">self</span><span class="o">.</span><span class="n">assertIn</span><span class="p">(</span><span class="s2">&quot;### Nemo Enim Ipsam Voluptatem&quot;</span><span class="p">,</span> <span class="n">markdown</span><span class="p">)</span>
120 |         <span class="bp">self</span><span class="o">.</span><span class="n">assertIn</span><span class="p">(</span><span class="s2">&quot;#### Sed Quia Non Numquam&quot;</span><span class="p">,</span> <span class="n">markdown</span><span class="p">)</span>
121 |         <span class="bp">self</span><span class="o">.</span><span class="n">assertIn</span><span class="p">(</span><span class="s2">&quot;##### Nisi Ut Aliquid Ex Ea&quot;</span><span class="p">,</span> <span class="n">markdown</span><span class="p">)</span>
122 |         <span class="bp">self</span><span class="o">.</span><span class="n">assertIn</span><span class="p">(</span><span class="s2">&quot;###### At Vero Eos Et Accusamus&quot;</span><span class="p">,</span> <span class="n">markdown</span><span class="p">)</span>
123 | 
124 |         <span class="c1"># no content loss - key phrases should be preserved</span>
125 |         <span class="bp">self</span><span class="o">.</span><span class="n">assertIn</span><span class="p">(</span><span class="s2">&quot;Lorem ipsum dolor sit amet&quot;</span><span class="p">,</span> <span class="n">markdown</span><span class="p">)</span>
126 |         <span class="bp">self</span><span class="o">.</span><span class="n">assertIn</span><span class="p">(</span><span class="s2">&quot;Definition List Example&quot;</span><span class="p">,</span> <span class="n">markdown</span><span class="p">)</span>
127 |         <span class="bp">self</span><span class="o">.</span><span class="n">assertIn</span><span class="p">(</span><span class="s2">&quot;More Text Elements&quot;</span><span class="p">,</span> <span class="n">markdown</span><span class="p">)</span>
128 | 
129 |         <span class="c1"># inline formatting (proper spacing)</span>
130 |         <span class="bp">self</span><span class="o">.</span><span class="n">assertIn</span><span class="p">(</span><span class="s2">&quot;amet, **consectetur adipiscing elit**. Sed&quot;</span><span class="p">,</span> <span class="n">markdown</span><span class="p">)</span>
131 |         <span class="bp">self</span><span class="o">.</span><span class="n">assertIn</span><span class="p">(</span><span class="s2">&quot;laborum. **Sed ut perspiciatis** unde&quot;</span><span class="p">,</span> <span class="n">markdown</span><span class="p">)</span>
132 |         <span class="bp">self</span><span class="o">.</span><span class="n">assertIn</span><span class="p">(</span><span class="s2">&quot;consequat. *Duis aute irure dolor* in&quot;</span><span class="p">,</span> <span class="n">markdown</span><span class="p">)</span>
133 |         <span class="bp">self</span><span class="o">.</span><span class="n">assertIn</span><span class="p">(</span><span class="s2">&quot;laudantium. *Totam rem aperiam*, eaque&quot;</span><span class="p">,</span> <span class="n">markdown</span><span class="p">)</span>
134 | 
135 |         <span class="c1"># link formatting (proper spacing)</span>
136 |         <span class="bp">self</span><span class="o">.</span><span class="n">assertIn</span><span class="p">(</span><span class="s2">&quot;veniam, quis nostrud exercitation ullamco&quot;</span><span class="p">,</span> <span class="n">markdown</span><span class="p">)</span>  <span class="c1"># Fragment links as plain text</span>
137 |         <span class="bp">self</span><span class="o">.</span><span class="n">assertIn</span><span class="p">(</span><span class="s2">&quot;and a link back to top. Nam&quot;</span><span class="p">,</span> <span class="n">markdown</span><span class="p">)</span>
138 | 
139 |         <span class="c1"># list formatting</span>
140 |         <span class="bp">self</span><span class="o">.</span><span class="n">assertIn</span><span class="p">(</span><span class="s2">&quot;* Similique sunt in culpa&quot;</span><span class="p">,</span> <span class="n">markdown</span><span class="p">)</span>
141 |         <span class="bp">self</span><span class="o">.</span><span class="n">assertIn</span><span class="p">(</span><span class="s2">&quot;1. Temporibus autem quibusdam&quot;</span><span class="p">,</span> <span class="n">markdown</span><span class="p">)</span>
142 | 
143 |         <span class="c1"># dl/dt</span>
144 |         <span class="bp">self</span><span class="o">.</span><span class="n">assertIn</span><span class="p">(</span><span class="s2">&quot;**Lorem Ipsum**&quot;</span><span class="p">,</span> <span class="n">markdown</span><span class="p">)</span>
145 |         <span class="bp">self</span><span class="o">.</span><span class="n">assertIn</span><span class="p">(</span><span class="s2">&quot;    Dolor sit amet, consectetur adipiscing elit&quot;</span><span class="p">,</span> <span class="n">markdown</span><span class="p">)</span>
146 |         <span class="bp">self</span><span class="o">.</span><span class="n">assertIn</span><span class="p">(</span><span class="s2">&quot;**Ut Enim**&quot;</span><span class="p">,</span> <span class="n">markdown</span><span class="p">)</span>
147 |         <span class="bp">self</span><span class="o">.</span><span class="n">assertIn</span><span class="p">(</span><span class="s2">&quot;    Ad minim veniam, quis nostrud exercitation&quot;</span><span class="p">,</span> <span class="n">markdown</span><span class="p">)</span>
148 |         <span class="bp">self</span><span class="o">.</span><span class="n">assertIn</span><span class="p">(</span><span class="s2">&quot;**Duis Aute**&quot;</span><span class="p">,</span> <span class="n">markdown</span><span class="p">)</span>
149 |         <span class="bp">self</span><span class="o">.</span><span class="n">assertIn</span><span class="p">(</span><span class="s2">&quot;    Irure dolor in reprehenderit in voluptate&quot;</span><span class="p">,</span> <span class="n">markdown</span><span class="p">)</span>
150 | 
151 |         <span class="c1"># table structure</span>
152 |         <span class="bp">self</span><span class="o">.</span><span class="n">assertIn</span><span class="p">(</span><span class="s2">&quot;| Lorem | Ipsum | Dolor | Sit |&quot;</span><span class="p">,</span> <span class="n">markdown</span><span class="p">)</span>
153 |         <span class="bp">self</span><span class="o">.</span><span class="n">assertIn</span><span class="p">(</span><span class="s2">&quot;|---|---|---|---|&quot;</span><span class="p">,</span> <span class="n">markdown</span><span class="p">)</span>
154 |         <span class="bp">self</span><span class="o">.</span><span class="n">assertIn</span><span class="p">(</span><span class="s2">&quot;| Consectetur | Adipiscing | Elit | Sed |&quot;</span><span class="p">,</span> <span class="n">markdown</span><span class="p">)</span>
155 | 
156 |         <span class="c1"># code formatting</span>
157 |         <span class="bp">self</span><span class="o">.</span><span class="n">assertIn</span><span class="p">(</span><span class="s2">&quot;Here we have some `inline code` and&quot;</span><span class="p">,</span> <span class="n">markdown</span><span class="p">)</span>
158 |         <span class="bp">self</span><span class="o">.</span><span class="n">assertIn</span><span class="p">(</span><span class="s2">&quot;```</span><span class="se">\n</span><span class="s2">function lorem() {</span><span class="se">\n</span><span class="s2">    return </span><span class="se">\&quot;</span><span class="s2">ipsum dolor sit amet</span><span class="se">\&quot;</span><span class="s2">;</span><span class="se">\n</span><span class="s2">}</span><span class="se">\n</span><span class="s2">```&quot;</span><span class="p">,</span> <span class="n">markdown</span><span class="p">)</span>
159 | 
160 |         <span class="c1"># blockquotes</span>
161 |         <span class="bp">self</span><span class="o">.</span><span class="n">assertIn</span><span class="p">(</span><span class="s2">&quot;&gt; </span><span class="se">\&quot;</span><span class="s2">Sed ut perspiciatis unde omnis iste natus&quot;</span><span class="p">,</span> <span class="n">markdown</span><span class="p">)</span>
162 | 
163 |         <span class="c1"># horizontal rule</span>
164 |         <span class="bp">self</span><span class="o">.</span><span class="n">assertIn</span><span class="p">(</span><span class="s2">&quot;---&quot;</span><span class="p">,</span> <span class="n">markdown</span><span class="p">)</span>
165 | 
166 |         <span class="c1"># no double spacing for inline elements</span>
167 |         <span class="bp">self</span><span class="o">.</span><span class="n">assertNotIn</span><span class="p">(</span><span class="s2">&quot;**  &quot;</span><span class="p">,</span> <span class="n">markdown</span><span class="p">)</span>  <span class="c1"># No double spaces after bold</span>
168 |         <span class="bp">self</span><span class="o">.</span><span class="n">assertNotIn</span><span class="p">(</span><span class="s2">&quot;  **&quot;</span><span class="p">,</span> <span class="n">markdown</span><span class="p">)</span>  <span class="c1"># No double spaces before bold</span>
169 |         <span class="bp">self</span><span class="o">.</span><span class="n">assertNotIn</span><span class="p">(</span><span class="s2">&quot;*  &quot;</span><span class="p">,</span> <span class="n">markdown</span><span class="p">)</span>   <span class="c1"># No double spaces after emphasis</span>
170 |         <span class="bp">self</span><span class="o">.</span><span class="n">assertNotIn</span><span class="p">(</span><span class="s2">&quot;  *&quot;</span><span class="p">,</span> <span class="n">markdown</span><span class="p">)</span>   <span class="c1"># No double spaces before emphasis</span>
171 | 
172 |         <span class="c1"># structural integrity - count major elements</span>
173 |         <span class="n">heading_count</span> <span class="o">=</span> <span class="nb">len</span><span class="p">(</span><span class="n">re</span><span class="o">.</span><span class="n">findall</span><span class="p">(</span><span class="sa">r</span><span class="s2">&quot;^#{1,6} &quot;</span><span class="p">,</span> <span class="n">markdown</span><span class="p">,</span> <span class="n">re</span><span class="o">.</span><span class="n">MULTILINE</span><span class="p">))</span>
174 |         <span class="bp">self</span><span class="o">.</span><span class="n">assertEqual</span><span class="p">(</span><span class="n">heading_count</span><span class="p">,</span> <span class="mi">11</span><span class="p">,</span> <span class="s2">&quot;Should have exactly 11 headings&quot;</span><span class="p">)</span>
175 |         <span class="n">table_count</span> <span class="o">=</span> <span class="nb">len</span><span class="p">(</span><span class="n">re</span><span class="o">.</span><span class="n">findall</span><span class="p">(</span><span class="sa">r</span><span class="s2">&quot;^\|.*\|$&quot;</span><span class="p">,</span> <span class="n">markdown</span><span class="p">,</span> <span class="n">re</span><span class="o">.</span><span class="n">MULTILINE</span><span class="p">))</span>
176 |         <span class="bp">self</span><span class="o">.</span><span class="n">assertGreater</span><span class="p">(</span><span class="n">table_count</span><span class="p">,</span> <span class="mi">5</span><span class="p">,</span> <span class="s2">&quot;Should have multiple table rows&quot;</span><span class="p">)</span></div>
177 | </div>
178 | 
179 | 
180 | </pre></div>
181 | 
182 |            </div>
183 |           </div>
184 |           <footer>
185 | 
186 |   <hr/>
187 | 
188 |   <div role="contentinfo">
189 |     <p>&#169; Copyright 2025, pragmar.</p>
190 |   </div>
191 | 
192 |   Built with <a href="https://www.sphinx-doc.org/">Sphinx</a> using a
193 |     <a href="https://github.com/readthedocs/sphinx_rtd_theme">theme</a>
194 |     provided by <a href="https://readthedocs.org">Read the Docs</a>.
195 |    
196 | 
197 | </footer>
198 |         </div>
199 |       </div>
200 |     </section>
201 |   </div>
202 |   <script>
203 |       jQuery(function () {
204 |           SphinxRtdTheme.Navigation.enable(true);
205 |       });
206 |   </script> 
207 | 
208 | </body>
209 | </html>
```

--------------------------------------------------------------------------------
/src/mcp_server_webcrawl/interactive/views/searchform.py:
--------------------------------------------------------------------------------

```python
  1 | import curses
  2 | 
  3 | from typing import TYPE_CHECKING
  4 | 
  5 | from mcp_server_webcrawl.interactive.ui import (
  6 |     UiState, InputRadio, InputRadioGroup, InputText,
  7 |     ThemeDefinition, NavigationDirection, safe_addstr
  8 | )
  9 | from mcp_server_webcrawl.interactive.views.base import BaseCursesView
 10 | from mcp_server_webcrawl.models.sites import SiteResult
 11 | from mcp_server_webcrawl.interactive.ui import safe_addstr
 12 | 
 13 | if TYPE_CHECKING:
 14 |     from mcp_server_webcrawl.interactive.session import InteractiveSession
 15 | 
 16 | LAYOUT_QUERY_MAX_WIDTH = 50
 17 | LAYOUT_QUERY_MARGIN = 11
 18 | LAYOUT_QUERY_OFFSET = 9
 19 | LAYOUT_FILTER_COLUMN_PADDING = 8
 20 | LAYOUT_SORT_COLUMN_PADDING = 6
 21 | LAYOUT_FILTER_TO_SORT_SPACING = 8
 22 | LAYOUT_SORT_TO_SITES_SPACING = 6
 23 | LAYOUT_SITE_COLUMN_WIDTH = 22
 24 | LAYOUT_SITE_COLUMN_SPACING = 2
 25 | LAYOUT_SITES_VERTICAL_OFFSET = 6
 26 | LAYOUT_SITES_MIN_WIDTH_REQUIREMENT = 16
 27 | LAYOUT_CONSTRAINED_SITES_PER_COLUMN = 3
 28 | LAYOUT_TRUNCATED_LABEL_MAX_LENGTH = 18
 29 | LAYOUT_OVERFLOW_INDICATOR_MARGIN = 2
 30 | 
 31 | class SearchFormNavigationGrid:
 32 |     def __init__(self, ui_state: UiState, filter_group: InputRadioGroup, sort_group: InputRadioGroup,
 33 |             sites_group: InputRadioGroup, sites_per_column: int):
 34 |         """
 35 |         Create virtual grid for navigation:
 36 |         query(0) 
 37 |         filter0  sort0    site0    site3    site6
 38 |         filter1  sort1    site1    site4    site7
 39 |                  sort2    site2    site5    site8+            
 40 |         """
 41 |         self.__grid: dict[tuple[int, int], int] = {}
 42 |         self.__reverse_grid: dict[int, tuple[int, int]] = {}
 43 | 
 44 |         # query spans columns 0-2, row 0
 45 |         for col in range(3):
 46 |             self.__grid[(0, col)] = 0
 47 |         self.__reverse_grid[0] = (0, 0)
 48 | 
 49 |         for i, _ in enumerate(filter_group.radios):
 50 |             row = 1 + i
 51 |             index = 1 + i  # filter indices start at 1
 52 |             self.__grid[(row, 0)] = index
 53 |             self.__reverse_grid[index] = (row, 0)
 54 | 
 55 |         sort_start_index = 1 + len(filter_group.radios)
 56 |         for i, _ in enumerate(sort_group.radios):
 57 |             row = 1 + i
 58 |             index = sort_start_index + i
 59 |             self.__grid[(row, 1)] = index
 60 |             self.__reverse_grid[index] = (row, 1)
 61 | 
 62 |         sites_start_index = 1 + len(filter_group.radios) + len(sort_group.radios)
 63 |         self.__ui_state = ui_state
 64 | 
 65 |         for i, _ in enumerate(sites_group.radios):
 66 |             row = 1 + (i % sites_per_column)
 67 |             col = 2 + (i // sites_per_column)
 68 |             index = sites_start_index + i
 69 |             self.__grid[(row, col)] = index
 70 |             self.__reverse_grid[index] = (row, col)
 71 | 
 72 |     def __rightmost_column(self, row: int) -> int:
 73 |         """
 74 |         Get the rightmost column that has content in the given row.
 75 |         """
 76 |         max_col = -1
 77 |         for (r, c) in self.__grid.keys():
 78 |             if r == row:
 79 |                 max_col = max(max_col, c)
 80 |         return max_col
 81 | 
 82 |     def __leftmost_column(self, row: int) -> int:
 83 |         """
 84 |         Get the leftmost column that has content in the given row.
 85 |         """
 86 |         min_col = float('inf')
 87 |         for (r, c) in self.__grid.keys():
 88 |             if r == row:
 89 |                 min_col = min(min_col, c)
 90 |         return min_col if min_col != float('inf') else -1
 91 | 
 92 |     def left(self, current_index: int) -> int | None:
 93 |         """
 94 |         Navigate left from current index. Wraps to rightmost element if at left edge.
 95 |         """
 96 |         if current_index not in self.__reverse_grid:
 97 |             return None
 98 | 
 99 |         row, col = self.__reverse_grid[current_index]
100 | 
101 |         # move normally if destination exists
102 |         if col > 0:
103 |             new_pos = (row, col - 1)
104 |             if new_pos in self.__grid:
105 |                 return self.__grid[new_pos]
106 | 
107 |         # wrap on edge
108 |         rightmost_col = self.__rightmost_column(row)
109 |         if rightmost_col >= 0 and rightmost_col != col:
110 |             wrap_pos = (row, rightmost_col)
111 |             return self.__grid.get(wrap_pos)
112 | 
113 |         return None
114 | 
115 |     def right(self, current_index: int) -> int | None:
116 |         """
117 |         Navigate right from current index. Wraps to leftmost element if at right edge.
118 |         """
119 |         if current_index not in self.__reverse_grid:
120 |             return None
121 | 
122 |         row, col = self.__reverse_grid[current_index]
123 | 
124 |         # move normally if destination exists
125 |         new_pos = (row, col + 1)
126 |         if new_pos in self.__grid:
127 |             return self.__grid[new_pos]
128 | 
129 |         # wrap on edge
130 |         leftmost_col = self.__leftmost_column(row)
131 |         if leftmost_col >= 0 and leftmost_col != col:
132 |             wrap_pos = (row, leftmost_col)
133 |             return self.__grid.get(wrap_pos)
134 | 
135 |         return None
136 | 
137 |     def up(self, current_index: int) -> int | None:
138 |         """
139 |         Navigate up from current index. From any radio column goes to query(0).
140 |         """
141 |         if current_index not in self.__reverse_grid:
142 |             return None
143 | 
144 |         row, col = self.__reverse_grid[current_index]
145 |         if row == 0:
146 |             return None
147 |         if row == 1:
148 |             return 0
149 | 
150 |         # otherwise, move up normally
151 |         if row > 1:
152 |             new_pos = (row - 1, col)
153 |             return self.__grid.get(new_pos)
154 | 
155 |         return None
156 | 
157 |     def down(self, current_index: int) -> int | None:
158 |         """
159 |         Navigate down from current index.
160 |         """
161 |         if current_index not in self.__reverse_grid:
162 |             return None
163 | 
164 |         # In SEARCH_INIT mode, advance by one
165 |         if self.__ui_state == UiState.SEARCH_INIT:
166 |             return current_index + 1 if current_index + 1 in self.__reverse_grid else None
167 | 
168 |         row, col = self.__reverse_grid[current_index]
169 |         new_pos = (row + 1, col)
170 |         return self.__grid.get(new_pos)
171 | 
172 | 
class SearchFormView(BaseCursesView):
    """
    Handles search form state and rendering.
    Takes over all the form_* properties and methods from session.

    Selectable fields are addressed by index: 0 is the query input, followed
    by the filter radios, then the sort radios, then the site radios.
    """

    def __init__(self, session: 'InteractiveSession', sites: list[SiteResult]):
        """
        Initialize the search form view.

        Args:
            session: The interactive session instance
            sites: List of available sites for selection
        """
        super().__init__(session)
        self.__search_attempted: bool = False
        self.__sites: list[SiteResult] = sites
        self.__sites_selected: list[SiteResult] = []
        self.__query_input = InputText(initial_value="", label="Query")
        self.__limit = 10
        self.__offset = 0

        # the first site starts out selected when any sites exist
        if sites:
            self.__sites_selected.append(self.__sites[0])

        self.__filter_group: InputRadioGroup = InputRadioGroup("filter")
        self.__sort_group: InputRadioGroup = InputRadioGroup("sort")
        self.__sites_group: InputRadioGroup = InputRadioGroup("site", sites=self.__sites)

    @property
    def filter(self) -> str:
        """Current value of the filter radio group."""
        return self.__filter_group.value

    @property
    def limit(self) -> int:
        """Page size used by page_next/page_previous."""
        return self.__limit

    @property
    def offset(self) -> int:
        """Offset of the current page; moves in steps of limit."""
        return self.__offset

    @property
    def query(self) -> str:
        """Current text of the query input."""
        return self.__query_input.value

    @property
    def sort(self) -> str:
        """Lowercased value of the sort radio group, or "+url" when it has no value."""
        return self.__sort_group.value.lower() if self.__sort_group.value is not None else "+url"

    def clear_query(self) -> None:
        """
        Clear only the query, preserve selections (was session method).
        Also resets the selection to the query field and the offset to 0.
        """
        self.__search_attempted = False
        self.__query_input.clear()
        self._selected_index = 0
        self.__offset = 0

    def focus(self) -> None:
        """
        Set focus on this view.
        """
        self._focused = True

    def get_selected_sites(self) -> list[SiteResult]:
        """
        Return a copy of the selected sites list, so callers cannot mutate form state.
        """
        return self.__sites_selected.copy()

    def handle_input(self, key: int) -> bool:
        """
        Handle keyboard input and trigger search when state changes.

        Args:
            key: The curses key code from user input

        Returns:
            bool: True if the input was handled, False otherwise
        """

        handlers: dict[int, callable] = {
            curses.KEY_UP: lambda: self.__navigate_form_selection(NavigationDirection.UP),
            curses.KEY_DOWN: lambda: self.__navigate_form_selection(NavigationDirection.DOWN),
            curses.KEY_LEFT: lambda: self.__handle_horizontal_arrow(NavigationDirection.LEFT),
            curses.KEY_RIGHT: lambda: self.__handle_horizontal_arrow(NavigationDirection.RIGHT),
            ord(' '): self.__handle_spacebar,
            ord('\n'): self.__handle_enter,
            ord('\r'): self.__handle_enter,
        }

        handler = handlers.get(key)
        if handler:
            handler()
            return True

        # any other key is offered to the query input, but only while it is selected
        if self._selected_index == 0:
            if self.__query_input.handle_input(key):
                self.session.searchman.autosearch()
                return True

        return False

    def page_next(self, total_results: int) -> bool:
        """
        Navigate to next page.

        Args:
            total_results: Total number of search results available

        Returns:
            bool: True if page was changed, False otherwise
        """
        if self.__offset + self.__limit < total_results:
            self.__offset += self.__limit
            return True
        return False

    def page_previous(self) -> bool:
        """
        Navigate to previous page.

        Returns:
            bool: True if page was changed, False otherwise
        """
        if self.__offset >= self.__limit:
            self.__offset -= self.__limit
            return True
        return False

    def render(self, stdscr: curses.window) -> None:
        """
        Render the search form with multi-column sites layout.

        Draws the query input, then one row per radio with filters, sorts and
        sites side by side. Stops early once the view's bottom edge is reached.

        Args:
            stdscr: The curses window to draw on
        """
        xb: int = self.bounds.x
        yb: int = self.bounds.y
        y_current: int = yb + 2  # y start
        y_max: int = yb + self.bounds.height

        if not self._renderable(stdscr):
            return

        safe_addstr(stdscr, y_current, xb + 2, "Query:")

        box_width = min(LAYOUT_QUERY_MAX_WIDTH, self.bounds.width - LAYOUT_QUERY_MARGIN)
        is_query_selected = (self._focused and self._selected_index == 0)

        self.__query_input.render(stdscr, y_current, xb + LAYOUT_QUERY_OFFSET, box_width,
                focused=is_query_selected, style=self._get_input_style())

        y_current += 2
        if y_current >= y_max:
            return

        filter_radios = self.__filter_group.radios
        sort_radios = self.__sort_group.radios
        site_radios = self.__sites_group.radios

        # radio column layout - calculated positions based on content width
        filter_column_width = self.__filter_group.calculate_group_width() + LAYOUT_FILTER_COLUMN_PADDING
        sort_column_width = self.__sort_group.calculate_group_width() + LAYOUT_SORT_COLUMN_PADDING
        sort_start_x = filter_column_width + LAYOUT_FILTER_TO_SORT_SPACING
        sites_start_x = sort_start_x + sort_column_width + LAYOUT_SORT_TO_SITES_SPACING

        # the sites section is dropped entirely when the view is too narrow for it
        sites_fit = sites_start_x + LAYOUT_SITES_MIN_WIDTH_REQUIREMENT < self.bounds.width

        safe_addstr(stdscr, y_current, xb + 2, self.__filter_group.label)
        safe_addstr(stdscr, y_current, xb + sort_start_x, self.__sort_group.label)
        if sites_fit:
            safe_addstr(stdscr, y_current, xb + sites_start_x, self.__sites_group.label)
            if not self.__sites:
                error_style = self.session.get_theme_color_pair(ThemeDefinition.UI_ERROR)
                safe_addstr(stdscr, y_current + 1, xb + sites_start_x, "No sites available", error_style)

        y_current += 1

        site_column_stride = LAYOUT_SITE_COLUMN_WIDTH + LAYOUT_SITE_COLUMN_SPACING
        available_width = self.bounds.width - sites_start_x - 4
        # shared with keyboard navigation so drawn columns and the grid agree
        sites_per_column = self.__get_sites_per_column()
        max_columns = (max(1, available_width // site_column_stride)
                      if available_width > LAYOUT_SITE_COLUMN_WIDTH else 1)
        total_visible_sites = max_columns * sites_per_column
        overflow_count = max(0, len(site_radios) - total_visible_sites)
        max_rows = max(len(filter_radios), len(sort_radios), sites_per_column)

        sort_index_start = 1 + len(filter_radios)
        sites_index_start = sort_index_start + len(sort_radios)

        for i in range(max_rows):

            if y_current >= y_max:
                return

            # filter radios
            if i < len(filter_radios):
                filter_radio: InputRadio = filter_radios[i]
                field_index: int = 1 + i
                is_selected: bool = self._selected_index == field_index
                filter_radio.render(stdscr, y_current, xb + 2, field_index, 100, is_selected)

            # sorts radios
            if i < len(sort_radios):
                sort_radio: InputRadio = sort_radios[i]
                field_index = sort_index_start + i
                is_selected = self._selected_index == field_index
                sort_radio.render(stdscr, y_current, xb + sort_start_x, field_index, 100, is_selected)

            # sites radios - multiple columns
            if sites_fit:
                for col in range(max_columns):
                    site_index = col * sites_per_column + i
                    if site_index < len(site_radios) and site_index < total_visible_sites:
                        site_radio: InputRadio = site_radios[site_index]
                        field_index = sites_index_start + site_index
                        is_selected = self._selected_index == field_index
                        col_x = sites_start_x + col * site_column_stride
                        # render with a shortened label, then put the full one
                        # back even if rendering raises
                        original_label = site_radio.label
                        site_radio.label = self.__truncate_label(original_label)
                        try:
                            site_radio.render(stdscr, y_current, xb + col_x, field_index,
                                    LAYOUT_TRUNCATED_LABEL_MAX_LENGTH, is_selected)
                        finally:
                            site_radio.label = original_label

            # overflow indicator on last row, right-aligned
            if overflow_count > 0 and i == sites_per_column - 1 and sites_fit:
                overflow_text: str = f"+{overflow_count} more"
                # offset by the view origin like every other draw call here
                overflow_x: int = xb + self.bounds.width - len(overflow_text) - LAYOUT_OVERFLOW_INDICATOR_MARGIN
                try:
                    safe_addstr(stdscr, y_current, overflow_x, overflow_text, curses.A_DIM)
                except curses.error:
                    # best effort: the indicator is decorative
                    pass

            y_current += 1

    def set_search_attempted(self) -> None:
        """
        Set attempted to True.
        """
        self.__search_attempted = True

    def unfocus(self) -> None:
        """
        Remove focus from this view.
        """
        self._focused = False

    def __get_sites_per_column(self) -> int:
        """
        Number of site radios stacked in one column.

        A fixed LAYOUT_CONSTRAINED_SITES_PER_COLUMN while in the SEARCH_RESULTS
        state; otherwise as many as fit in the view height, capped at the
        number of site radios.

        Returns:
            int: Sites per column, never less than 1 so that callers can
            safely divide by it
        """
        if self.session.ui_state == UiState.SEARCH_RESULTS:
            return LAYOUT_CONSTRAINED_SITES_PER_COLUMN
        fitting_rows = min(self.bounds.height - LAYOUT_SITES_VERTICAL_OFFSET, len(self.__sites_group.radios))
        return max(1, fitting_rows)

    def __handle_enter(self) -> None:
        """
        Handle ENTER key. On the query field it triggers a search; on a radio
        it toggles the radio and, outside SEARCH_INIT, triggers an immediate search.
        """

        if self._selected_index == 0:           # query field
            self.session.searchman.autosearch()
        else:                                   # radios
            self.__handle_radio_toggle()
            if self.session.ui_state != UiState.SEARCH_INIT:
                self.session.searchman.autosearch(immediate=True)

    def __handle_horizontal_arrow(self, direction: NavigationDirection) -> None:
        """
        Handle left/right arrow navigation using the directional grid.

        Args:
            direction: The navigation direction (LEFT or RIGHT)
        """
        if self.session.ui_state is None:
            return

        # query field handles cursor movement internally
        if self._selected_index == 0:
            if direction == NavigationDirection.LEFT:
                self.__query_input.move_cursor_left()
            else:
                self.__query_input.move_cursor_right()
            return

        # use grid navigation for all other fields
        grid = SearchFormNavigationGrid(self.session.ui_state, self.__filter_group, self.__sort_group,
                self.__sites_group, self.__get_sites_per_column())
        if direction == NavigationDirection.LEFT:
            new_index = grid.left(self._selected_index)
        else:
            new_index = grid.right(self._selected_index)

        # None means there is nowhere to go; keep the current selection
        if new_index is not None:
            self._selected_index = new_index

    def __handle_radio_toggle(self) -> None:
        """
        Handle radio button toggles for filters, sorts and sites.
        Toggling a site also makes it the single selected site.
        """
        filter_index_start: int = 1
        sorts_index_start: int = filter_index_start + len(self.__filter_group.radios)
        sites_index_start: int = sorts_index_start + len(self.__sort_group.radios)

        if filter_index_start <= self._selected_index < sorts_index_start:
            filter_index = self._selected_index - filter_index_start
            filter_input: InputRadio = self.__filter_group.radios[filter_index]
            filter_input.next_state()
        elif sorts_index_start <= self._selected_index < sites_index_start:
            sort_index = self._selected_index - sorts_index_start
            sort_input: InputRadio = self.__sort_group.radios[sort_index]
            sort_input.next_state()
        elif self._selected_index >= sites_index_start:
            site_index = self._selected_index - sites_index_start
            if site_index < len(self.__sites) and site_index < len(self.__sites_group.radios):
                site_input: InputRadio = self.__sites_group.radios[site_index]
                site_input.next_state()
                self.__sites_selected = [self.__sites[site_index]]

    def __handle_spacebar(self) -> None:
        """
        Handle spacebar. On the query field it inserts a space; on a radio it
        toggles the radio. A search is triggered afterwards, except for radio
        toggles while in SEARCH_INIT.
        """
        if self._selected_index == 0:               # query field
            self.__query_input.insert_char(" ")
            self.session.searchman.autosearch()
        else:                                       # radios
            self.__handle_radio_toggle()
            if self.session.ui_state != UiState.SEARCH_INIT:
                self.session.searchman.autosearch()

    def __navigate_form_selection(self, direction: NavigationDirection) -> None:
        """
        Navigate between form fields in index order, wrapping at both ends.
        Field order: query, filters, sorts, sites.

        Args:
            direction: The navigation direction (UP or DOWN)
        """
        # derived from the groups rather than assuming 2 filters and 3 sorts
        last_field_index = (len(self.__filter_group.radios) + len(self.__sort_group.radios)
                + len(self.__sites_group.radios))
        if direction == NavigationDirection.UP:
            if self._selected_index == 0:
                self._selected_index = last_field_index
            else:
                self._selected_index -= 1
        elif direction == NavigationDirection.DOWN:
            if self._selected_index >= last_field_index:
                self._selected_index = 0
            else:
                self._selected_index += 1

    def __truncate_label(self, label: str, max_length: int = LAYOUT_TRUNCATED_LABEL_MAX_LENGTH) -> str:
        """
        Truncate label to max_length, replacing last char with ellipsis if needed.

        Args:
            label: The label text to truncate
            max_length: Maximum allowed length for the label

        Returns:
            str: The truncated label with ellipsis if needed
        """
        if len(label) <= max_length:
            return label
        return label[:max_length - 1] + "…"
```

--------------------------------------------------------------------------------
/docs/_modules/mcp_server_webcrawl/extras/regex.html:
--------------------------------------------------------------------------------

```html
  1 | 
  2 | 
  3 | <!DOCTYPE html>
  4 | <html class="writer-html5" lang="en" data-content_root="../../../">
  5 | <head>
  6 |   <meta charset="utf-8" />
  7 |   <meta name="viewport" content="width=device-width, initial-scale=1.0" />
  8 |   <title>mcp_server_webcrawl.extras.regex &mdash; mcp-server-webcrawl  documentation</title>
  9 |       <link rel="stylesheet" type="text/css" href="../../../_static/pygments.css?v=80d5e7a1" />
 10 |       <link rel="stylesheet" type="text/css" href="../../../_static/css/theme.css?v=e59714d7" />
 11 | 
 12 |   
 13 |       <script src="../../../_static/jquery.js?v=5d32c60e"></script>
 14 |       <script src="../../../_static/_sphinx_javascript_frameworks_compat.js?v=2cd50e6c"></script>
 15 |       <script src="../../../_static/documentation_options.js?v=5929fcd5"></script>
 16 |       <script src="../../../_static/doctools.js?v=888ff710"></script>
 17 |       <script src="../../../_static/sphinx_highlight.js?v=dc90522c"></script>
 18 |     <script src="../../../_static/js/theme.js"></script>
 19 |     <link rel="index" title="Index" href="../../../genindex.html" />
 20 |     <link rel="search" title="Search" href="../../../search.html" /> 
 21 | </head>
 22 | 
 23 | <body class="wy-body-for-nav"> 
 24 |   <div class="wy-grid-for-nav">
 25 |     <nav data-toggle="wy-nav-shift" class="wy-nav-side">
 26 |       <div class="wy-side-scroll">
 27 |         <div class="wy-side-nav-search" >
 28 | 
 29 |           
 30 |           
 31 |           <a href="../../../index.html" class="icon icon-home">
 32 |             mcp-server-webcrawl
 33 |           </a>
 34 | <div role="search">
 35 |   <form id="rtd-search-form" class="wy-form" action="../../../search.html" method="get">
 36 |     <input type="text" name="q" placeholder="Search docs" aria-label="Search docs" />
 37 |     <input type="hidden" name="check_keywords" value="yes" />
 38 |     <input type="hidden" name="area" value="default" />
 39 |   </form>
 40 | </div>
 41 |         </div><div class="wy-menu wy-menu-vertical" data-spy="affix" role="navigation" aria-label="Navigation menu">
 42 |               <p class="caption" role="heading"><span class="caption-text">Contents:</span></p>
 43 | <ul>
 44 | <li class="toctree-l1"><a class="reference internal" href="../../../installation.html">Installation</a></li>
 45 | <li class="toctree-l1"><a class="reference internal" href="../../../guides.html">Setup Guides</a></li>
 46 | <li class="toctree-l1"><a class="reference internal" href="../../../usage.html">Usage</a></li>
 47 | <li class="toctree-l1"><a class="reference internal" href="../../../prompts.html">Prompt Routines</a></li>
 48 | <li class="toctree-l1"><a class="reference internal" href="../../../modules.html">mcp_server_webcrawl</a></li>
 49 | </ul>
 50 | 
 51 |         </div>
 52 |       </div>
 53 |     </nav>
 54 | 
 55 |     <section data-toggle="wy-nav-shift" class="wy-nav-content-wrap"><nav class="wy-nav-top" aria-label="Mobile navigation menu" >
 56 |           <i data-toggle="wy-nav-top" class="fa fa-bars"></i>
 57 |           <a href="../../../index.html">mcp-server-webcrawl</a>
 58 |       </nav>
 59 | 
 60 |       <div class="wy-nav-content">
 61 |         <div class="rst-content">
 62 |           <div role="navigation" aria-label="Page navigation">
 63 |   <ul class="wy-breadcrumbs">
 64 |       <li><a href="../../../index.html" class="icon icon-home" aria-label="Home"></a></li>
 65 |           <li class="breadcrumb-item"><a href="../../index.html">Module code</a></li>
 66 |       <li class="breadcrumb-item active">mcp_server_webcrawl.extras.regex</li>
 67 |       <li class="wy-breadcrumbs-aside">
 68 |       </li>
 69 |   </ul>
 70 |   <hr/>
 71 | </div>
 72 |           <div role="main" class="document" itemscope="itemscope" itemtype="http://schema.org/Article">
 73 |            <div itemprop="articleBody">
 74 |              
 75 |   <h1>Source code for mcp_server_webcrawl.extras.regex</h1><div class="highlight"><pre>
 76 | <span></span><span class="kn">import</span> <span class="nn">re</span>
 77 | 
 78 | <span class="kn">from</span> <span class="nn">functools</span> <span class="kn">import</span> <span class="n">lru_cache</span>
 79 | <span class="kn">from</span> <span class="nn">typing</span> <span class="kn">import</span> <span class="n">Final</span>
 80 | <span class="kn">from</span> <span class="nn">logging</span> <span class="kn">import</span> <span class="n">Logger</span>
 81 | 
 82 | <span class="kn">from</span> <span class="nn">mcp_server_webcrawl.utils.logger</span> <span class="kn">import</span> <span class="n">get_logger</span>
 83 | 
 84 | <span class="n">__REGEX_PATTERNS_REGEX_HAZARDS</span><span class="p">:</span> <span class="n">Final</span><span class="p">[</span><span class="nb">list</span><span class="p">[</span><span class="nb">str</span><span class="p">]]</span> <span class="o">=</span> <span class="p">[</span>
 85 |     <span class="sa">r</span><span class="s2">&quot;\([^)]*\*[^)]*\+&quot;</span><span class="p">,</span>                   <span class="c1"># (.*)*+, (.+)*+, etc.</span>
 86 |     <span class="sa">r</span><span class="s2">&quot;\([^)]*\+[^)]*\*&quot;</span><span class="p">,</span>                   <span class="c1"># (.+)*., (.*)++, etc.</span>
 87 |     <span class="sa">r</span><span class="s2">&quot;\([^)]*\+[^)]*\+&quot;</span><span class="p">,</span>                   <span class="c1"># (.+)+, (.++)+ etc.</span>
 88 |     <span class="sa">r</span><span class="s2">&quot;\([^)]*\*[^)]*\*&quot;</span><span class="p">,</span>                   <span class="c1"># (.*)*, (.**) etc.</span>
 89 |     <span class="sa">r</span><span class="s2">&quot;\.\*.*\.\*&quot;</span><span class="p">,</span>                         <span class="c1"># .*.* patterns</span>
 90 |     <span class="sa">r</span><span class="s2">&quot;\.\+.*\.\+&quot;</span><span class="p">,</span>                         <span class="c1"># .+.+ patterns</span>
 91 |     <span class="sa">r</span><span class="s2">&quot;\([^)]*\?\)\*&quot;</span><span class="p">,</span>                      <span class="c1"># (a?)* patterns</span>
 92 |     <span class="sa">r</span><span class="s2">&quot;\([^)]*\?\)\+&quot;</span><span class="p">,</span>                      <span class="c1"># (a?)+ patterns</span>
 93 |     <span class="sa">r</span><span class="s2">&quot;\([^)]*[*+?][^)]*[*+?][^)]*\)[*+]&quot;</span><span class="p">,</span>  <span class="c1"># 2+ quantifiers inside, then quantifier outside</span>
 94 | <span class="p">]</span>
 95 | 
 96 | <span class="n">logger</span><span class="p">:</span> <span class="n">Logger</span> <span class="o">=</span> <span class="n">get_logger</span><span class="p">()</span>
 97 | 
 98 | <span class="nd">@lru_cache</span><span class="p">(</span><span class="n">maxsize</span><span class="o">=</span><span class="kc">None</span><span class="p">)</span>
 99 | <span class="k">def</span> <span class="nf">__get_compiled_hazard_patterns</span><span class="p">():</span>
100 | <span class="w">    </span><span class="sd">&quot;&quot;&quot;</span>
101 | <span class="sd">    Lazy load compiled patterns</span>
102 | <span class="sd">    &quot;&quot;&quot;</span>
103 |     <span class="n">compiled_patterns</span> <span class="o">=</span> <span class="p">[]</span>
104 |     <span class="k">for</span> <span class="n">hazard</span> <span class="ow">in</span> <span class="n">__REGEX_PATTERNS_REGEX_HAZARDS</span><span class="p">:</span>
105 |         <span class="k">try</span><span class="p">:</span>
106 |             <span class="n">compiled_patterns</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="n">re</span><span class="o">.</span><span class="n">compile</span><span class="p">(</span><span class="n">hazard</span><span class="p">))</span>
107 |         <span class="k">except</span> <span class="n">re</span><span class="o">.</span><span class="n">error</span> <span class="k">as</span> <span class="n">ex</span><span class="p">:</span>
108 |             <span class="n">logger</span><span class="o">.</span><span class="n">warning</span><span class="p">(</span><span class="sa">f</span><span class="s2">&quot;Invalid hazard pattern </span><span class="si">{</span><span class="n">hazard</span><span class="si">}</span><span class="s2">: </span><span class="si">{</span><span class="n">ex</span><span class="si">}</span><span class="s2">&quot;</span><span class="p">)</span>
109 |             <span class="k">continue</span>
110 |     <span class="k">return</span> <span class="n">compiled_patterns</span>
111 | 
112 | <span class="k">def</span> <span class="nf">__regex_is_hazardous</span><span class="p">(</span><span class="n">pattern</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="nb">bool</span><span class="p">:</span>
113 | <span class="w">    </span><span class="sd">&quot;&quot;&quot;</span>
114 | <span class="sd">    Check if a regex pattern might cause catastrophic backtracking</span>
115 | <span class="sd">    or otherwise unacceptable performance over up to 100 HTML files</span>
116 | <span class="sd">    &quot;&quot;&quot;</span>
117 | 
118 |     <span class="n">compiled_hazards</span> <span class="o">=</span> <span class="n">__get_compiled_hazard_patterns</span><span class="p">()</span>
119 | 
120 |     <span class="k">for</span> <span class="n">hazard_pattern</span> <span class="ow">in</span> <span class="n">compiled_hazards</span><span class="p">:</span>
121 |         <span class="k">try</span><span class="p">:</span>
122 |             <span class="k">if</span> <span class="n">hazard_pattern</span><span class="o">.</span><span class="n">search</span><span class="p">(</span><span class="n">pattern</span><span class="p">):</span>
123 |                 <span class="n">logger</span><span class="o">.</span><span class="n">error</span><span class="p">(</span><span class="sa">f</span><span class="s2">&quot;hazardous regex discarded </span><span class="si">{</span><span class="n">pattern</span><span class="si">}</span><span class="s2"> matched </span><span class="si">{</span><span class="n">hazard_pattern</span><span class="o">.</span><span class="n">pattern</span><span class="si">}</span><span class="s2">&quot;</span><span class="p">)</span>
124 |                 <span class="k">return</span> <span class="kc">True</span>
125 |         <span class="k">except</span> <span class="n">re</span><span class="o">.</span><span class="n">error</span> <span class="k">as</span> <span class="n">ex</span><span class="p">:</span>
126 |             <span class="n">logger</span><span class="o">.</span><span class="n">warning</span><span class="p">(</span><span class="sa">f</span><span class="s2">&quot;Error checking hazard pattern </span><span class="si">{</span><span class="n">hazard_pattern</span><span class="o">.</span><span class="n">pattern</span><span class="si">}</span><span class="s2">: </span><span class="si">{</span><span class="n">ex</span><span class="si">}</span><span class="s2">&quot;</span><span class="p">)</span>
127 |             <span class="k">continue</span>
128 | 
129 |     <span class="k">return</span> <span class="kc">False</span>
130 | 
131 | <div class="viewcode-block" id="get_regex">
132 | <a class="viewcode-back" href="../../../mcp_server_webcrawl.extras.html#mcp_server_webcrawl.extras.regex.get_regex">[docs]</a>
133 | <span class="k">def</span> <span class="nf">get_regex</span><span class="p">(</span><span class="n">headers</span><span class="p">:</span> <span class="nb">str</span><span class="p">,</span> <span class="n">content</span><span class="p">:</span> <span class="nb">str</span><span class="p">,</span> <span class="n">patterns</span><span class="p">:</span> <span class="nb">list</span><span class="p">[</span><span class="nb">str</span><span class="p">])</span> <span class="o">-&gt;</span> <span class="nb">list</span><span class="p">[</span><span class="nb">dict</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="nb">str</span> <span class="o">|</span> <span class="nb">int</span><span class="p">]]:</span>
134 | <span class="w">    </span><span class="sd">&quot;&quot;&quot;</span>
135 | <span class="sd">    Takes headers and content and gets regex matches</span>
136 | 
137 | <span class="sd">    Arguments:</span>
138 | <span class="sd">        headers: The headers to search</span>
139 | <span class="sd">        content: The content to search</span>
140 | <span class="sd">        patterns: The regex patterns</span>
141 | 
142 | <span class="sd">    Returns:</span>
143 | <span class="sd">        A list of dicts, with selector, value, groups, position info, and source</span>
144 | <span class="sd">    &quot;&quot;&quot;</span>
145 | 
146 |     <span class="k">if</span> <span class="ow">not</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">content</span><span class="p">,</span> <span class="nb">str</span><span class="p">):</span>
147 |         <span class="n">content</span> <span class="o">=</span> <span class="s2">&quot;&quot;</span>
148 |     <span class="k">if</span> <span class="ow">not</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">headers</span><span class="p">,</span> <span class="nb">str</span><span class="p">):</span>
149 |         <span class="n">headers</span> <span class="o">=</span> <span class="s2">&quot;&quot;</span>
150 | 
151 |     <span class="k">if</span> <span class="ow">not</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">patterns</span><span class="p">,</span> <span class="nb">list</span><span class="p">)</span> <span class="ow">or</span> <span class="ow">not</span> <span class="nb">all</span><span class="p">(</span><span class="nb">isinstance</span><span class="p">(</span><span class="n">item</span><span class="p">,</span> <span class="nb">str</span><span class="p">)</span> <span class="k">for</span> <span class="n">item</span> <span class="ow">in</span> <span class="n">patterns</span><span class="p">):</span>
152 |         <span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span><span class="s2">&quot;patterns must be a list of strings&quot;</span><span class="p">)</span>
153 | 
154 |     <span class="n">results</span> <span class="o">=</span> <span class="p">[]</span>
155 | 
156 |     <span class="k">if</span> <span class="n">content</span> <span class="o">==</span> <span class="s2">&quot;&quot;</span> <span class="ow">and</span> <span class="n">headers</span> <span class="o">==</span> <span class="s2">&quot;&quot;</span><span class="p">:</span>
157 |         <span class="k">return</span> <span class="n">results</span>
158 | 
159 |     <span class="n">re_patterns</span> <span class="o">=</span> <span class="p">[]</span>
160 |     <span class="k">for</span> <span class="n">pattern</span> <span class="ow">in</span> <span class="n">patterns</span><span class="p">:</span>
161 |         <span class="k">if</span> <span class="n">__regex_is_hazardous</span><span class="p">(</span><span class="n">pattern</span><span class="p">):</span>
162 |             <span class="n">logger</span><span class="o">.</span><span class="n">warning</span><span class="p">(</span><span class="sa">f</span><span class="s2">&quot;Hazardous regex pattern &#39;</span><span class="si">{</span><span class="n">pattern</span><span class="si">}</span><span class="s2">&#39;&quot;</span><span class="p">)</span>
163 |             <span class="k">continue</span>
164 | 
165 |         <span class="k">try</span><span class="p">:</span>
166 |             <span class="n">re_pattern</span> <span class="o">=</span> <span class="n">re</span><span class="o">.</span><span class="n">compile</span><span class="p">(</span><span class="n">pattern</span><span class="p">)</span>
167 |             <span class="n">re_patterns</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="n">re_pattern</span><span class="p">)</span>
168 |         <span class="k">except</span> <span class="n">re</span><span class="o">.</span><span class="n">error</span> <span class="k">as</span> <span class="n">ex</span><span class="p">:</span>
169 |             <span class="n">logger</span><span class="o">.</span><span class="n">warning</span><span class="p">(</span><span class="sa">f</span><span class="s2">&quot;Invalid regex pattern &#39;</span><span class="si">{</span><span class="n">pattern</span><span class="si">}</span><span class="s2">&#39;: </span><span class="si">{</span><span class="n">ex</span><span class="si">}</span><span class="s2">&quot;</span><span class="p">)</span>
170 |             <span class="k">continue</span>
171 | 
172 |     <span class="c1"># search headers and content</span>
173 |     <span class="n">search_targets</span> <span class="o">=</span> <span class="p">[(</span><span class="s2">&quot;headers&quot;</span><span class="p">,</span> <span class="n">headers</span><span class="p">),</span> <span class="p">(</span><span class="s2">&quot;content&quot;</span><span class="p">,</span> <span class="n">content</span><span class="p">)]</span>
174 | 
175 |     <span class="k">for</span> <span class="n">re_pattern</span> <span class="ow">in</span> <span class="n">re_patterns</span><span class="p">:</span>
176 |         <span class="k">for</span> <span class="n">source_name</span><span class="p">,</span> <span class="n">search_text</span> <span class="ow">in</span> <span class="n">search_targets</span><span class="p">:</span>
177 |             <span class="k">if</span> <span class="ow">not</span> <span class="n">search_text</span><span class="p">:</span>
178 |                 <span class="k">continue</span>
179 | 
180 |             <span class="k">for</span> <span class="n">match</span> <span class="ow">in</span> <span class="n">re_pattern</span><span class="o">.</span><span class="n">finditer</span><span class="p">(</span><span class="n">search_text</span><span class="p">):</span>
181 |                 <span class="n">regex_hit</span><span class="p">:</span> <span class="nb">dict</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="nb">str</span> <span class="o">|</span> <span class="nb">int</span><span class="p">]</span> <span class="o">=</span> <span class="p">{</span>
182 |                     <span class="s2">&quot;selector&quot;</span><span class="p">:</span> <span class="n">re_pattern</span><span class="o">.</span><span class="n">pattern</span><span class="p">,</span>
183 |                     <span class="s2">&quot;value&quot;</span><span class="p">:</span> <span class="n">match</span><span class="o">.</span><span class="n">group</span><span class="p">(</span><span class="mi">0</span><span class="p">),</span>
184 |                     <span class="s2">&quot;source&quot;</span><span class="p">:</span> <span class="n">source_name</span>  <span class="c1"># headers or content</span>
185 |                 <span class="p">}</span>
186 | 
187 |                 <span class="k">if</span> <span class="n">match</span><span class="o">.</span><span class="n">groups</span><span class="p">():</span>
188 |                     <span class="k">for</span> <span class="n">i</span><span class="p">,</span> <span class="n">group</span> <span class="ow">in</span> <span class="nb">enumerate</span><span class="p">(</span><span class="n">match</span><span class="o">.</span><span class="n">groups</span><span class="p">(),</span> <span class="mi">1</span><span class="p">):</span>
189 |                         <span class="k">if</span> <span class="n">group</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span><span class="p">:</span>
190 |                             <span class="n">regex_hit</span><span class="p">[</span><span class="sa">f</span><span class="s2">&quot;group_</span><span class="si">{</span><span class="n">i</span><span class="si">}</span><span class="s2">&quot;</span><span class="p">]</span> <span class="o">=</span> <span class="n">group</span>
191 | 
192 |                 <span class="n">regex_hit</span><span class="p">[</span><span class="s2">&quot;start&quot;</span><span class="p">]</span> <span class="o">=</span> <span class="n">match</span><span class="o">.</span><span class="n">start</span><span class="p">()</span>
193 |                 <span class="n">regex_hit</span><span class="p">[</span><span class="s2">&quot;end&quot;</span><span class="p">]</span> <span class="o">=</span> <span class="n">match</span><span class="o">.</span><span class="n">end</span><span class="p">()</span>
194 |                 <span class="n">results</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="n">regex_hit</span><span class="p">)</span>
195 | 
196 |     <span class="k">return</span> <span class="n">results</span></div>
197 | 
198 | </pre></div>
199 | 
200 |            </div>
201 |           </div>
202 |           <footer>
203 | 
204 |   <hr/>
205 | 
206 |   <div role="contentinfo">
207 |     <p>&#169; Copyright 2025, pragmar.</p>
208 |   </div>
209 | 
210 |   Built with <a href="https://www.sphinx-doc.org/">Sphinx</a> using a
211 |     <a href="https://github.com/readthedocs/sphinx_rtd_theme">theme</a>
212 |     provided by <a href="https://readthedocs.org">Read the Docs</a>.
213 |    
214 | 
215 | </footer>
216 |         </div>
217 |       </div>
218 |     </section>
219 |   </div>
220 |   <script>
221 |       jQuery(function () {
222 |           SphinxRtdTheme.Navigation.enable(true);
223 |       });
224 |   </script> 
225 | 
226 | </body>
227 | </html>
```

--------------------------------------------------------------------------------
/src/mcp_server_webcrawl/crawlers/archivebox/adapter.py:
--------------------------------------------------------------------------------

```python
  1 | import json
  2 | import os
  3 | import sqlite3
  4 | 
  5 | from contextlib import closing
  6 | from datetime import datetime, timezone
  7 | from pathlib import Path
  8 | 
  9 | from mcp_server_webcrawl.crawlers.base.adapter import (
 10 |     BaseManager,
 11 |     IndexState,
 12 |     IndexStatus,
 13 |     SitesGroup,
 14 |     INDEXED_BATCH_SIZE,
 15 |     INDEXED_TYPE_MAPPING,
 16 |     INDEXED_IGNORE_DIRECTORIES,
 17 | )
 18 | from mcp_server_webcrawl.crawlers.base.indexed import IndexedManager
 19 | from mcp_server_webcrawl.models.resources import (
 20 |     ResourceResult,
 21 |     ResourceResultType,
 22 |     RESOURCES_LIMIT_DEFAULT,
 23 | )
 24 | from mcp_server_webcrawl.models.sites import (
 25 |     SiteResult,
 26 |     SiteType,
 27 |     SITES_FIELDS_BASE,
 28 |     SITES_FIELDS_DEFAULT,
 29 | )
 30 | from mcp_server_webcrawl.utils.logger import get_logger
 31 | 
 32 | # skip metadata directories
 33 | ARCHIVEBOX_SKIP_DIRECTORIES: set[str] = {"media", "mercury"}
 34 | ARCHIVEBOX_COLLAPSE_FILENAMES: list[str] = ["/index.html", "/index.htm"]
 35 | 
 36 | logger = get_logger()
 37 | 
 38 | class ArchiveBoxManager(IndexedManager):
 39 |     """
 40 |     Manages ArchiveBox in-memory SQLite databases for session-level reuse.
 41 |     """
 42 | 
 43 |     def __init__(self) -> None:
 44 |         """
 45 |         Initialize the ArchiveBox manager with empty cache and statistics.
 46 |         """
 47 |         super().__init__()
 48 | 
 49 |     def _load_site_data(self, connection: sqlite3.Connection, site_directory: Path,
 50 |                        site_id: int, index_state: IndexState = None) -> None:
 51 |         """
 52 |         Load ArchiveBox site data into the database.
 53 |         
 54 |         Args:
 55 |             connection: SQLite connection
 56 |             site_directory: path to the ArchiveBox site directory (e.g., "example" or "pragmar")
 57 |             site_id: ID for the site
 58 |             index_state: IndexState object for tracking progress
 59 |         """
 60 |         # The site_directory should be something like "example" or "pragmar"
 61 |         # We need to look for the "archive" subdirectory within it
 62 |         archive_directory: Path = site_directory / "archive"
 63 | 
 64 |         if not archive_directory.exists() or not archive_directory.is_dir():
 65 |             logger.error(f"Archive directory not found in site: {archive_directory}")
 66 |             return
 67 | 
 68 |         if index_state is not None:
 69 |             index_state.set_status(IndexStatus.INDEXING)
 70 | 
 71 |         # page directories are timestamped (e.g. example/archive/1756357684.13023)
 72 |         # these contain page data/media
 73 |         page_directories = self._get_page_directories(archive_directory)
 74 |         if not page_directories:
 75 |             logger.warning(f"No timestamped entries found in archive: {archive_directory}")
 76 |             return
 77 | 
 78 |         all_resources: list[ResourceResult] = []
 79 | 
 80 |         # process each timestamped entry
 81 |         for page_directory in page_directories:
 82 | 
 83 |             if index_state is not None and index_state.is_timeout():
 84 |                 index_state.set_status(IndexStatus.PARTIAL)
 85 |                 break
 86 | 
 87 |             try:
 88 |                 metadata = self._get_page_metadata(page_directory)
 89 |                 main_url: str = metadata["url"] if "url" in metadata else \
 90 |                     f"archivebox://unknown/{page_directory.name}"
 91 | 
 92 |                 # primary resource
 93 |                 main_resource = self._create_page_resource(page_directory, site_id, main_url, metadata)
 94 |                 if main_resource:
 95 |                     all_resources.append(main_resource)
 96 |                     if index_state is not None:
 97 |                         index_state.increment_processed()
 98 | 
 99 |                 # collect assets (external js/css/fonts/whatever)
100 |                 domain_assets = self._get_page_domain_assets(page_directory, main_url)
101 |                 for file_path, asset_url in domain_assets:
102 |                     asset_resource = self._create_asset_resource(file_path, site_id, asset_url, page_directory)
103 |                     if asset_resource:
104 |                         all_resources.append(asset_resource)
105 |                         if index_state is not None:
106 |                             index_state.increment_processed()
107 | 
108 |             except Exception as ex:
109 |                 logger.error(f"Error processing entry {page_directory}: {ex}")
110 | 
111 |         deduplicated_resources = self._dedupe_resources(all_resources)
112 |         with closing(connection.cursor()) as cursor:
113 |             for i in range(0, len(deduplicated_resources), INDEXED_BATCH_SIZE):
114 |                 batch = deduplicated_resources[i:i+INDEXED_BATCH_SIZE]
115 |                 self._execute_batch_insert(connection, cursor, batch)
116 | 
117 |         if index_state is not None and index_state.status == IndexStatus.INDEXING:
118 |             index_state.set_status(IndexStatus.COMPLETE)
119 | 
120 |     def _create_page_resource(self, resource_directory: Path, site_id: int,
121 |                 url: str, metadata: dict) -> ResourceResult | None:
122 |         """
123 |         Create ResourceResult for the main captured page.
124 |         """
125 |         try:
126 | 
127 |             # created/modified is directory stat
128 |             resource_stat: os.stat_result = resource_directory.stat()
129 |             created: datetime = datetime.fromtimestamp(resource_stat.st_ctime, tz=timezone.utc)
130 |             modified: datetime = datetime.fromtimestamp(resource_stat.st_mtime, tz=timezone.utc)
131 | 
132 |             # select best content, with appropriate fallbacks
133 |             html_file: Path = None
134 |             if "canonical" in metadata:
135 |                 # dom first, wget second, ignore singlefile (datauris generate too much storage)
136 |                 canonical: dict[str, str] = metadata["canonical"]
137 |                 prioritized_paths = ["dom_path", "wget_path"]
138 |                 for path_key in prioritized_paths:
139 |                     if path_key in canonical and canonical[path_key] is not None:
140 |                         candidate_file = resource_directory / canonical[path_key]
141 |                         if candidate_file.resolve().is_relative_to(resource_directory.resolve()) and candidate_file.exists():
142 |                             html_file = candidate_file
143 |                             break
144 | 
145 |             # fallback to ArchiveBox index file (metadata file - barely useful, but dependable)
146 |             if html_file is None:
147 |                 html_file = resource_directory / "index.html"
148 | 
149 |             # read content
150 |             content: str|None = None
151 |             file_size: int = 0
152 |             if html_file.exists():
153 |                 try:
154 |                     with open(html_file, "r", encoding="utf-8", errors="replace") as f:
155 |                         content = f.read()
156 |                     file_size: int = html_file.stat().st_size
157 |                 except Exception as ex:
158 |                     logger.warning(f"Could not read HTML from {html_file}: {ex}")
159 | 
160 |             # assemble metadata
161 |             status_code: int = 200
162 |             headers_reconstructed: str = ""
163 |             if "http_headers" in metadata:
164 |                 http_headers = metadata["http_headers"]
165 |                 if "status" in http_headers:
166 |                     try:
167 |                         status_code = int(str(http_headers["status"]).split()[0])
168 |                     except (ValueError, IndexError):
169 |                         pass
170 |                 headers_reconstructed = self._get_http_headers_string(http_headers)
171 | 
172 |             if not headers_reconstructed:
173 |                 headers_reconstructed = BaseManager.get_basic_headers(
174 |                         file_size, ResourceResultType.PAGE)
175 | 
176 |             return ResourceResult(
177 |                 id=BaseManager.string_to_id(url),
178 |                 site=site_id,
179 |                 created=created,
180 |                 modified=modified,
181 |                 url=url,
182 |                 type=ResourceResultType.PAGE,
183 |                 status=status_code,
184 |                 headers=headers_reconstructed,
185 |                 content=content,
186 |                 size=file_size,
187 |                 time=0
188 |             )
189 | 
190 |         except Exception as ex:
191 |             logger.error(f"Error creating main resource for {resource_directory}: {ex}")
192 |             return None
193 | 
194 |     def _create_asset_resource(self, file_path: Path, site_id: int, url: str, entry_dir: Path) -> ResourceResult | None:
195 |         """
196 |         Create ResourceResult for a domain asset file.
197 |         """
198 |         try:
199 |             # get file info
200 |             if not file_path.exists():
201 |                 return None
202 | 
203 |             file_stat = file_path.stat()
204 |             created: datetime = datetime.fromtimestamp(file_stat.st_ctime, tz=timezone.utc)
205 |             modified: datetime = datetime.fromtimestamp(file_stat.st_mtime, tz=timezone.utc)
206 |             file_size: int = file_stat.st_size
207 |             extension: str = file_path.suffix.lower()
208 | 
209 |             # ArchiveBox will stuff URL args into @... in the filename
210 |             # sometimes it's the filename, sometimes the extension
211 |             # both need cleaning
212 |             clean_url: str = url.split("@")[0]
213 |             clean_extension: str = extension.split("@")[0]
214 |             resource_type: str = INDEXED_TYPE_MAPPING.get(clean_extension, ResourceResultType.OTHER)
215 | 
216 |             # read content for text files
217 |             content: str | None = BaseManager.read_file_contents(file_path, resource_type)
218 | 
219 |             return ResourceResult(
220 |                 id=BaseManager.string_to_id(clean_url),
221 |                 site=site_id,
222 |                 created=created,
223 |                 modified=modified,
224 |                 url=clean_url,
225 |                 type=resource_type,
226 |                 status=200,  # assume assets successful
227 |                 headers=BaseManager.get_basic_headers(file_size, resource_type, file_path),
228 |                 content=content,
229 |                 size=file_size,
230 |                 time=0
231 |             )
232 | 
233 |         except Exception as ex:
234 |             logger.error(f"Error creating asset resource for {file_path}: {ex}")
235 |             return None
236 | 
237 |     def _get_page_directories(self, archive_directory: Path) -> list[Path]:
238 |         """
239 |         Get webpage directories within ArchiveBox archive.
240 | 
241 |         Args:
242 |             archive_directory: path to the ArchiveBox archive directory
243 | 
244 |         Returns:
245 |             List of timestamped entry directory paths
246 |         """
247 | 
248 |         # page_directories are the timestamped directories,
249 |         # e.g. archive/1756342555.086082
250 |         page_directories = []
251 | 
252 |         if not archive_directory.is_dir():
253 |             return page_directories
254 | 
255 |         for item in archive_directory.iterdir():
256 |             # 1756342555.086082.replace(".", "") is numeric
257 |             if (item.is_dir() and item.name.replace(".", "").isdigit()):
258 |                 data_files: list[Path] = [
259 |                     (item / "index.json"),
260 |                     (item / "headers.json"),
261 |                     (item / "index.html"),
262 |                 ]
263 |                 for data_file in data_files:
264 |                     if data_file.exists():
265 |                         page_directories.append(item)
266 |                         break
267 | 
268 |         return sorted(page_directories)
269 | 
270 |     def _get_page_metadata(self, entry_directory: Path) -> dict:
271 |         """
272 |         Extract metadata from ArchiveBox entry files.
273 | 
274 |         Args:
275 |             entry_directory: path to the timestamped entry directory
276 | 
277 |         Returns:
278 |             Dictionary containing extracted metadata
279 |         """
280 |         page_metadata: dict[str, str] = {}
281 | 
282 |         # read index.json for primary URL and metadata
283 |         index_json_path: Path = entry_directory / "index.json"
284 |         if index_json_path.exists():
285 |             try:
286 |                 with open(index_json_path, "r", encoding="utf-8", errors="replace") as f:
287 |                     index_data = json.load(f)
288 |                     page_metadata.update(index_data)
289 |             except (json.JSONDecodeError, UnicodeDecodeError) as ex:
290 |                 logger.warning(f"Could not parse index.json from {entry_directory}: {ex}")
291 |             except Exception as ex:
292 |                 logger.error(f"Error reading index.json from {entry_directory}: {ex}")
293 | 
294 |         # read headers.json for HTTP headers
295 |         headers_json_path = entry_directory / "headers.json"
296 |         if headers_json_path.exists():
297 |             try:
298 |                 with open(headers_json_path, "r", encoding="utf-8", errors="replace") as f:
299 |                     http_headers = json.load(f)
300 |                     page_metadata["http_headers"] = http_headers
301 |             except (json.JSONDecodeError, UnicodeDecodeError) as ex:
302 |                 logger.warning(f"Could not parse headers.json from {entry_directory}: {ex}")
303 |             except Exception as ex:
304 |                 logger.error(f"Error reading headers.json from {entry_directory}: {ex}")
305 | 
306 |         return page_metadata
307 | 
308 |     def _get_page_domain_assets(self, entry_dir: Path, main_url: str) -> list[tuple[Path, str]]:
309 |         """
310 |         Collect all domain asset files within an entry.
311 | 
312 |         Args:
313 |             entry_dir: path to the timestamped entry
314 |             main_url: the main captured URL
315 | 
316 |         Returns:
317 |             List of (file_path, reconstructed_url) tuples
318 |         """
319 |         assets: list[tuple] = []
320 | 
321 | 
322 | 
323 |         for item in entry_dir.iterdir():
324 |             if item.is_dir() and item.name not in ARCHIVEBOX_SKIP_DIRECTORIES:
325 |                 # this is an archivebox domain directory
326 |                 domain_name: str = item.name
327 | 
328 |                 # walk domain directories for assets
329 |                 # (e.g. example/archive/1756357684.13023/example.com)
330 |                 for root, _, files in os.walk(item):
331 |                     for filename in files:
332 | 
333 |                         # files ending in "orig" are dupes; the suffix is not reliably a file extension
334 |                         if filename.endswith("orig"):
335 |                             continue
336 | 
337 |                         file_path = Path(root) / filename
338 | 
339 |                         # clean up ArchiveBox's @timestamp suffixes for URL construction
340 |                         clean_filename: str = filename.split("@")[0]
341 |                         clean_file_path: Path = Path(root) / clean_filename
342 |                         relative_path = clean_file_path.relative_to(item)
343 |                         url = f"https://{domain_name}/{str(relative_path).replace(os.sep, '/')}"
344 |                         for collapse_filename in ARCHIVEBOX_COLLAPSE_FILENAMES:
345 |                             # turn ./index.html and variants into ./ (dir index) to help the indexer
346 |                             if url.endswith(collapse_filename):
347 |                                 url = url[:-(len(collapse_filename))] + "/"
348 |                                 break
349 | 
350 |                         # Use original file_path for reading, clean url for storage
351 |                         assets.append((file_path, url))
352 | 
353 |         return assets
354 | 
355 |     def _dedupe_resources(self, resources: list[ResourceResult]) -> list[ResourceResult]:
356 |         """
357 |         Deduplicate resources by URL, preferring the most recently modified copy
358 | 
359 |         Args:
360 |             resources: list of ResourceResult objects
361 | 
362 |         Returns:
363 |             Deduplicated list of ResourceResult objects
364 |         """
365 |         seen_urls: dict[str, ResourceResult] = {}
366 |         deduplicated: list[ResourceResult] = []
367 |         resource: ResourceResult
368 |         for resource in resources:
369 |             if resource.url in seen_urls:
370 |                 # url collision, keep whichever resource has the newer modified time
371 |                 existing = seen_urls[resource.url]
372 |                 if resource.modified and existing.modified:
373 |                     if resource.modified > existing.modified:
374 |                         deduplicated = [r for r in deduplicated if r.url != resource.url]
375 |                         deduplicated.append(resource)
376 |                         seen_urls[resource.url] = resource
377 |             else:
378 |                 # first time this url is seen, keep it
379 |                 seen_urls[resource.url] = resource
380 |                 deduplicated.append(resource)
381 | 
382 |         return deduplicated
383 | 
384 |     def _get_http_headers_string(self, http_headers: dict) -> str:
385 |         """
386 |         Format headers dictionary as HTTP headers string.
387 |         """
388 |         if not http_headers:
389 |             return ""
390 | 
391 |         headers_lines: list[str] = []
392 |         status: int = http_headers.get("Status-Code", 200)
393 |         headers_lines.append(f"HTTP/1.0 {status}")
394 | 
395 |         for key, value in http_headers.items():
396 |             if key.lower() not in ["status-code"]:
397 |                 headers_lines.append(f"{key}: {value}")
398 | 
399 |         return "\r\n".join(headers_lines) + "\r\n\r\n"
400 | 
401 | 
# Module-level ArchiveBoxManager instance, created once at import time;
# get_resources() in this module delegates its queries to it.
manager: ArchiveBoxManager = ArchiveBoxManager()
403 | 
def get_sites(
    datasrc: Path,
    ids: list[int] | None = None,
    fields: list[str] | None = None
) -> list[SiteResult]:
    """
    List ArchiveBox instances as separate sites.
    Each subdirectory of datasrc that contains an "archive" folder is treated as a separate ArchiveBox instance.
    Hidden directories (leading ".") and names in INDEXED_IGNORE_DIRECTORIES are skipped.

    Args:
        datasrc: path to the directory containing ArchiveBox instance directories
        ids: optional list of site IDs to filter by
        fields: optional list of fields to include in the response

    Returns:
        List of SiteResult objects, one for each ArchiveBox instance, ordered
        by site ID. Empty when datasrc is missing or is not a directory.
    """
    assert datasrc is not None, f"datasrc not provided ({datasrc})"

    # is_dir() is False for missing paths and for regular files alike, so a
    # file path is reported here instead of raising from iterdir() below
    if not datasrc.is_dir():
        logger.error(f"Directory not found ({datasrc})")
        return []

    # determine which fields to include
    selected_fields: set[str] = set(SITES_FIELDS_BASE)
    if fields:
        valid_fields: set[str] = set(SITES_FIELDS_DEFAULT)
        selected_fields.update(f for f in fields if f in valid_fields)
    else:
        selected_fields.update(SITES_FIELDS_DEFAULT)

    # get all directories that contain an "archive" subdirectory
    site_directories: list[Path] = [
        datasrc_item
        for datasrc_item in datasrc.iterdir()
        if (
            datasrc_item.is_dir()
            and not datasrc_item.name.startswith(".")
            and datasrc_item.name not in INDEXED_IGNORE_DIRECTORIES
            and (datasrc_item / "archive").is_dir()
        )
    ]

    # map directory IDs to paths for filtering
    site_directories_map: dict[int, Path] = {
        BaseManager.string_to_id(d.name): d for d in site_directories
    }

    if ids:
        wanted_ids = set(ids)
        site_directories_map = {
            id_val: path
            for id_val, path in site_directories_map.items()
            if id_val in wanted_ids
        }

    results: list[SiteResult] = []

    # process each ArchiveBox instance directory
    for site_id, site_directory in sorted(site_directories_map.items()):
        site_directory_stat = site_directory.stat()
        # note: st_ctime is platform-dependent (creation time on Windows,
        # last metadata change on Unix)
        created_time: datetime = datetime.fromtimestamp(site_directory_stat.st_ctime)
        modified_time: datetime = datetime.fromtimestamp(site_directory_stat.st_mtime)

        site = SiteResult(
            path=site_directory,
            id=site_id,
            name=site_directory.name,
            type=SiteType.CRAWLED_LIST,
            urls=[f"archivebox://{site_directory.name}/"],
            created=created_time if "created" in selected_fields else None,
            modified=modified_time if "modified" in selected_fields else None,
        )

        results.append(site)

    return results
473 | 
def get_resources(
    datasrc: Path,
    sites: list[int] | None = None,
    query: str = "",
    fields: list[str] | None = None,
    sort: str | None = None,
    limit: int = RESOURCES_LIMIT_DEFAULT,
    offset: int = 0,
) -> tuple[list[ResourceResult], int, IndexState]:
    """
    Search resources across ArchiveBox instances.

    Resolves the requested sites through get_sites(), bundles them into a
    SitesGroup, and hands the search off to the module-level manager.

    Args:
        datasrc: path to the directory containing ArchiveBox instance directories
        sites: optional list of site IDs to filter by
        query: search query string
        fields: optional list of fields to include in response
        sort: sort order for results
        limit: maximum number of results to return
        offset: number of results to skip for pagination

    Returns:
        Tuple of (list of ResourceResult objects, total count, IndexState)

    Raises:
        AssertionError: when no site matches datasrc/sites
    """
    matched_sites: list[SiteResult] = get_sites(datasrc=datasrc, ids=sites)
    assert matched_sites, "At least one site is required to search"

    # caller-supplied IDs win; otherwise fall back to every discovered site
    group_ids = sites if sites else [entry.id for entry in matched_sites]
    # the instance directories themselves serve as the group's paths
    group_paths = [entry.path for entry in matched_sites]
    group = SitesGroup(datasrc, group_ids, group_paths)

    return manager.get_resources_for_sites_group(
        group,
        query,
        fields,
        sort,
        limit,
        offset,
    )
506 | 
```
Page 8/35 | First | Prev | Next | Last