This is page 32 of 33. Use http://codebase.md/pragmar/mcp_server_webcrawl?page={x} to view the full context.
# Directory Structure
```
├── .gitignore
├── CONTRIBUTING.md
├── docs
│ ├── _images
│ │ ├── interactive.document.webp
│ │ ├── interactive.search.webp
│ │ └── mcpswc.svg
│ ├── _modules
│ │ ├── index.html
│ │ ├── mcp_server_webcrawl
│ │ │ ├── crawlers
│ │ │ │ ├── archivebox
│ │ │ │ │ ├── adapter.html
│ │ │ │ │ ├── crawler.html
│ │ │ │ │ └── tests.html
│ │ │ │ ├── base
│ │ │ │ │ ├── adapter.html
│ │ │ │ │ ├── api.html
│ │ │ │ │ ├── crawler.html
│ │ │ │ │ ├── indexed.html
│ │ │ │ │ └── tests.html
│ │ │ │ ├── httrack
│ │ │ │ │ ├── adapter.html
│ │ │ │ │ ├── crawler.html
│ │ │ │ │ └── tests.html
│ │ │ │ ├── interrobot
│ │ │ │ │ ├── adapter.html
│ │ │ │ │ ├── crawler.html
│ │ │ │ │ └── tests.html
│ │ │ │ ├── katana
│ │ │ │ │ ├── adapter.html
│ │ │ │ │ ├── crawler.html
│ │ │ │ │ └── tests.html
│ │ │ │ ├── siteone
│ │ │ │ │ ├── adapter.html
│ │ │ │ │ ├── crawler.html
│ │ │ │ │ └── tests.html
│ │ │ │ ├── warc
│ │ │ │ │ ├── adapter.html
│ │ │ │ │ ├── crawler.html
│ │ │ │ │ └── tests.html
│ │ │ │ └── wget
│ │ │ │ ├── adapter.html
│ │ │ │ ├── crawler.html
│ │ │ │ └── tests.html
│ │ │ ├── crawlers.html
│ │ │ ├── extras
│ │ │ │ ├── markdown.html
│ │ │ │ ├── regex.html
│ │ │ │ ├── snippets.html
│ │ │ │ ├── thumbnails.html
│ │ │ │ └── xpath.html
│ │ │ ├── interactive
│ │ │ │ ├── highlights.html
│ │ │ │ ├── search.html
│ │ │ │ ├── session.html
│ │ │ │ └── ui.html
│ │ │ ├── main.html
│ │ │ ├── models
│ │ │ │ ├── resources.html
│ │ │ │ └── sites.html
│ │ │ ├── templates
│ │ │ │ └── tests.html
│ │ │ ├── utils
│ │ │ │ ├── blobs.html
│ │ │ │ ├── cli.html
│ │ │ │ ├── logger.html
│ │ │ │ ├── querycache.html
│ │ │ │ ├── server.html
│ │ │ │ └── tools.html
│ │ │ └── utils.html
│ │ └── re.html
│ ├── _sources
│ │ ├── guides
│ │ │ ├── archivebox.rst.txt
│ │ │ ├── httrack.rst.txt
│ │ │ ├── interrobot.rst.txt
│ │ │ ├── katana.rst.txt
│ │ │ ├── siteone.rst.txt
│ │ │ ├── warc.rst.txt
│ │ │ └── wget.rst.txt
│ │ ├── guides.rst.txt
│ │ ├── index.rst.txt
│ │ ├── installation.rst.txt
│ │ ├── interactive.rst.txt
│ │ ├── mcp_server_webcrawl.crawlers.archivebox.rst.txt
│ │ ├── mcp_server_webcrawl.crawlers.base.rst.txt
│ │ ├── mcp_server_webcrawl.crawlers.httrack.rst.txt
│ │ ├── mcp_server_webcrawl.crawlers.interrobot.rst.txt
│ │ ├── mcp_server_webcrawl.crawlers.katana.rst.txt
│ │ ├── mcp_server_webcrawl.crawlers.rst.txt
│ │ ├── mcp_server_webcrawl.crawlers.siteone.rst.txt
│ │ ├── mcp_server_webcrawl.crawlers.warc.rst.txt
│ │ ├── mcp_server_webcrawl.crawlers.wget.rst.txt
│ │ ├── mcp_server_webcrawl.extras.rst.txt
│ │ ├── mcp_server_webcrawl.interactive.rst.txt
│ │ ├── mcp_server_webcrawl.models.rst.txt
│ │ ├── mcp_server_webcrawl.rst.txt
│ │ ├── mcp_server_webcrawl.templates.rst.txt
│ │ ├── mcp_server_webcrawl.utils.rst.txt
│ │ ├── modules.rst.txt
│ │ ├── prompts.rst.txt
│ │ └── usage.rst.txt
│ ├── _static
│ │ ├── _sphinx_javascript_frameworks_compat.js
│ │ ├── basic.css
│ │ ├── css
│ │ │ ├── badge_only.css
│ │ │ ├── fonts
│ │ │ │ ├── fontawesome-webfont.eot
│ │ │ │ ├── fontawesome-webfont.svg
│ │ │ │ ├── fontawesome-webfont.ttf
│ │ │ │ ├── fontawesome-webfont.woff
│ │ │ │ ├── fontawesome-webfont.woff2
│ │ │ │ ├── lato-bold-italic.woff
│ │ │ │ ├── lato-bold-italic.woff2
│ │ │ │ ├── lato-bold.woff
│ │ │ │ ├── lato-bold.woff2
│ │ │ │ ├── lato-normal-italic.woff
│ │ │ │ ├── lato-normal-italic.woff2
│ │ │ │ ├── lato-normal.woff
│ │ │ │ ├── lato-normal.woff2
│ │ │ │ ├── Roboto-Slab-Bold.woff
│ │ │ │ ├── Roboto-Slab-Bold.woff2
│ │ │ │ ├── Roboto-Slab-Regular.woff
│ │ │ │ └── Roboto-Slab-Regular.woff2
│ │ │ └── theme.css
│ │ ├── doctools.js
│ │ ├── documentation_options.js
│ │ ├── file.png
│ │ ├── fonts
│ │ │ ├── Lato
│ │ │ │ ├── lato-bold.eot
│ │ │ │ ├── lato-bold.ttf
│ │ │ │ ├── lato-bold.woff
│ │ │ │ ├── lato-bold.woff2
│ │ │ │ ├── lato-bolditalic.eot
│ │ │ │ ├── lato-bolditalic.ttf
│ │ │ │ ├── lato-bolditalic.woff
│ │ │ │ ├── lato-bolditalic.woff2
│ │ │ │ ├── lato-italic.eot
│ │ │ │ ├── lato-italic.ttf
│ │ │ │ ├── lato-italic.woff
│ │ │ │ ├── lato-italic.woff2
│ │ │ │ ├── lato-regular.eot
│ │ │ │ ├── lato-regular.ttf
│ │ │ │ ├── lato-regular.woff
│ │ │ │ └── lato-regular.woff2
│ │ │ └── RobotoSlab
│ │ │ ├── roboto-slab-v7-bold.eot
│ │ │ ├── roboto-slab-v7-bold.ttf
│ │ │ ├── roboto-slab-v7-bold.woff
│ │ │ ├── roboto-slab-v7-bold.woff2
│ │ │ ├── roboto-slab-v7-regular.eot
│ │ │ ├── roboto-slab-v7-regular.ttf
│ │ │ ├── roboto-slab-v7-regular.woff
│ │ │ └── roboto-slab-v7-regular.woff2
│ │ ├── images
│ │ │ ├── interactive.document.png
│ │ │ ├── interactive.document.webp
│ │ │ ├── interactive.search.png
│ │ │ ├── interactive.search.webp
│ │ │ └── mcpswc.svg
│ │ ├── jquery.js
│ │ ├── js
│ │ │ ├── badge_only.js
│ │ │ ├── theme.js
│ │ │ └── versions.js
│ │ ├── language_data.js
│ │ ├── minus.png
│ │ ├── plus.png
│ │ ├── pygments.css
│ │ ├── searchtools.js
│ │ └── sphinx_highlight.js
│ ├── .buildinfo
│ ├── .nojekyll
│ ├── genindex.html
│ ├── guides
│ │ ├── archivebox.html
│ │ ├── httrack.html
│ │ ├── interrobot.html
│ │ ├── katana.html
│ │ ├── siteone.html
│ │ ├── warc.html
│ │ └── wget.html
│ ├── guides.html
│ ├── index.html
│ ├── installation.html
│ ├── interactive.html
│ ├── mcp_server_webcrawl.crawlers.archivebox.html
│ ├── mcp_server_webcrawl.crawlers.base.html
│ ├── mcp_server_webcrawl.crawlers.html
│ ├── mcp_server_webcrawl.crawlers.httrack.html
│ ├── mcp_server_webcrawl.crawlers.interrobot.html
│ ├── mcp_server_webcrawl.crawlers.katana.html
│ ├── mcp_server_webcrawl.crawlers.siteone.html
│ ├── mcp_server_webcrawl.crawlers.warc.html
│ ├── mcp_server_webcrawl.crawlers.wget.html
│ ├── mcp_server_webcrawl.extras.html
│ ├── mcp_server_webcrawl.html
│ ├── mcp_server_webcrawl.interactive.html
│ ├── mcp_server_webcrawl.models.html
│ ├── mcp_server_webcrawl.templates.html
│ ├── mcp_server_webcrawl.utils.html
│ ├── modules.html
│ ├── objects.inv
│ ├── prompts.html
│ ├── py-modindex.html
│ ├── search.html
│ ├── searchindex.js
│ └── usage.html
├── LICENSE
├── MANIFEST.in
├── prompts
│ ├── audit404.md
│ ├── auditfiles.md
│ ├── auditperf.md
│ ├── auditseo.md
│ ├── gopher.md
│ ├── README.md
│ └── testsearch.md
├── pyproject.toml
├── README.md
├── setup.py
├── sphinx
│ ├── _static
│ │ └── images
│ │ ├── interactive.document.png
│ │ ├── interactive.document.webp
│ │ ├── interactive.search.png
│ │ ├── interactive.search.webp
│ │ └── mcpswc.svg
│ ├── _templates
│ │ └── layout.html
│ ├── conf.py
│ ├── guides
│ │ ├── archivebox.rst
│ │ ├── httrack.rst
│ │ ├── interrobot.rst
│ │ ├── katana.rst
│ │ ├── siteone.rst
│ │ ├── warc.rst
│ │ └── wget.rst
│ ├── guides.rst
│ ├── index.rst
│ ├── installation.rst
│ ├── interactive.rst
│ ├── make.bat
│ ├── Makefile
│ ├── mcp_server_webcrawl.crawlers.archivebox.rst
│ ├── mcp_server_webcrawl.crawlers.base.rst
│ ├── mcp_server_webcrawl.crawlers.httrack.rst
│ ├── mcp_server_webcrawl.crawlers.interrobot.rst
│ ├── mcp_server_webcrawl.crawlers.katana.rst
│ ├── mcp_server_webcrawl.crawlers.rst
│ ├── mcp_server_webcrawl.crawlers.siteone.rst
│ ├── mcp_server_webcrawl.crawlers.warc.rst
│ ├── mcp_server_webcrawl.crawlers.wget.rst
│ ├── mcp_server_webcrawl.extras.rst
│ ├── mcp_server_webcrawl.interactive.rst
│ ├── mcp_server_webcrawl.models.rst
│ ├── mcp_server_webcrawl.rst
│ ├── mcp_server_webcrawl.templates.rst
│ ├── mcp_server_webcrawl.utils.rst
│ ├── modules.rst
│ ├── prompts.rst
│ ├── readme.txt
│ └── usage.rst
└── src
└── mcp_server_webcrawl
├── __init__.py
├── crawlers
│ ├── __init__.py
│ ├── archivebox
│ │ ├── __init__.py
│ │ ├── adapter.py
│ │ ├── crawler.py
│ │ └── tests.py
│ ├── base
│ │ ├── __init__.py
│ │ ├── adapter.py
│ │ ├── api.py
│ │ ├── crawler.py
│ │ ├── indexed.py
│ │ └── tests.py
│ ├── httrack
│ │ ├── __init__.py
│ │ ├── adapter.py
│ │ ├── crawler.py
│ │ └── tests.py
│ ├── interrobot
│ │ ├── __init__.py
│ │ ├── adapter.py
│ │ ├── crawler.py
│ │ └── tests.py
│ ├── katana
│ │ ├── __init__.py
│ │ ├── adapter.py
│ │ ├── crawler.py
│ │ └── tests.py
│ ├── siteone
│ │ ├── __init__.py
│ │ ├── adapter.py
│ │ ├── crawler.py
│ │ └── tests.py
│ ├── warc
│ │ ├── __init__.py
│ │ ├── adapter.py
│ │ ├── crawler.py
│ │ └── tests.py
│ └── wget
│ ├── __init__.py
│ ├── adapter.py
│ ├── crawler.py
│ └── tests.py
├── extras
│ ├── __init__.py
│ ├── markdown.py
│ ├── regex.py
│ ├── snippets.py
│ ├── thumbnails.py
│ └── xpath.py
├── interactive
│ ├── __init__.py
│ ├── highlights.py
│ ├── search.py
│ ├── session.py
│ ├── ui.py
│ └── views
│ ├── base.py
│ ├── document.py
│ ├── help.py
│ ├── requirements.py
│ ├── results.py
│ └── searchform.py
├── main.py
├── models
│ ├── __init__.py
│ ├── base.py
│ ├── resources.py
│ └── sites.py
├── settings.py
├── templates
│ ├── __init__.py
│ ├── markdown.xslt
│ ├── tests_core.html
│ └── tests.py
└── utils
├── __init__.py
├── cli.py
├── logger.py
├── parser.py
├── parsetab.py
├── search.py
├── server.py
├── tests.py
└── tools.py
```
# Files
--------------------------------------------------------------------------------
/docs/_modules/mcp_server_webcrawl/crawlers/base/tests.html:
--------------------------------------------------------------------------------
```html
<!DOCTYPE html>
<html class="writer-html5" lang="en" data-content_root="../../../../">
<head>
<meta charset="utf-8" />
<meta name="viewport" content="width=device-width, initial-scale=1.0" />
<title>mcp_server_webcrawl.crawlers.base.tests — mcp-server-webcrawl documentation</title>
<link rel="stylesheet" type="text/css" href="../../../../_static/pygments.css?v=80d5e7a1" />
<link rel="stylesheet" type="text/css" href="../../../../_static/css/theme.css?v=e59714d7" />
<script src="../../../../_static/jquery.js?v=5d32c60e"></script>
<script src="../../../../_static/_sphinx_javascript_frameworks_compat.js?v=2cd50e6c"></script>
<script src="../../../../_static/documentation_options.js?v=5929fcd5"></script>
<script src="../../../../_static/doctools.js?v=888ff710"></script>
<script src="../../../../_static/sphinx_highlight.js?v=dc90522c"></script>
<script src="../../../../_static/js/theme.js"></script>
<link rel="index" title="Index" href="../../../../genindex.html" />
<link rel="search" title="Search" href="../../../../search.html" />
</head>
<body class="wy-body-for-nav">
<div class="wy-grid-for-nav">
<nav data-toggle="wy-nav-shift" class="wy-nav-side">
<div class="wy-side-scroll">
<div class="wy-side-nav-search" >
<a href="../../../../index.html" class="icon icon-home">
mcp-server-webcrawl
</a>
<div role="search">
<form id="rtd-search-form" class="wy-form" action="../../../../search.html" method="get">
<input type="text" name="q" placeholder="Search docs" aria-label="Search docs" />
<input type="hidden" name="check_keywords" value="yes" />
<input type="hidden" name="area" value="default" />
</form>
</div>
</div><div class="wy-menu wy-menu-vertical" data-spy="affix" role="navigation" aria-label="Navigation menu">
<p class="caption" role="heading"><span class="caption-text">Contents:</span></p>
<ul>
<li class="toctree-l1"><a class="reference internal" href="../../../../installation.html">Installation</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../../guides.html">Setup Guides</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../../usage.html">Usage</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../../prompts.html">Prompt Routines</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../../interactive.html">Interactive Mode</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../../modules.html">mcp_server_webcrawl</a></li>
</ul>
</div>
</div>
</nav>
<section data-toggle="wy-nav-shift" class="wy-nav-content-wrap"><nav class="wy-nav-top" aria-label="Mobile navigation menu" >
<i data-toggle="wy-nav-top" class="fa fa-bars"></i>
<a href="../../../../index.html">mcp-server-webcrawl</a>
</nav>
<div class="wy-nav-content">
<div class="rst-content">
<div role="navigation" aria-label="Page navigation">
<ul class="wy-breadcrumbs">
<li><a href="../../../../index.html" class="icon icon-home" aria-label="Home"></a></li>
<li class="breadcrumb-item"><a href="../../../index.html">Module code</a></li>
<li class="breadcrumb-item"><a href="../../crawlers.html">mcp_server_webcrawl.crawlers</a></li>
<li class="breadcrumb-item active">mcp_server_webcrawl.crawlers.base.tests</li>
<li class="wy-breadcrumbs-aside">
</li>
</ul>
<hr/>
</div>
<div role="main" class="document" itemscope="itemscope" itemtype="http://schema.org/Article">
<div itemprop="articleBody">
<h1>Source code for mcp_server_webcrawl.crawlers.base.tests</h1><div class="highlight"><pre>
<span></span><span class="kn">import</span> <span class="nn">sys</span>
<span class="kn">import</span> <span class="nn">unittest</span>
<span class="kn">import</span> <span class="nn">asyncio</span>
<span class="kn">from</span> <span class="nn">typing</span> <span class="kn">import</span> <span class="n">Final</span>
<span class="kn">from</span> <span class="nn">datetime</span> <span class="kn">import</span> <span class="n">datetime</span>
<span class="kn">from</span> <span class="nn">logging</span> <span class="kn">import</span> <span class="n">Logger</span>
<span class="kn">from</span> <span class="nn">mcp_server_webcrawl.crawlers.base.crawler</span> <span class="kn">import</span> <span class="n">BaseCrawler</span>
<span class="kn">from</span> <span class="nn">mcp_server_webcrawl.crawlers.wget.crawler</span> <span class="kn">import</span> <span class="n">WgetCrawler</span>
<span class="kn">from</span> <span class="nn">mcp_server_webcrawl.models.resources</span> <span class="kn">import</span> <span class="n">ResourceResultType</span>
<span class="kn">from</span> <span class="nn">mcp_server_webcrawl.crawlers.base.api</span> <span class="kn">import</span> <span class="n">BaseJsonApi</span>
<span class="kn">from</span> <span class="nn">mcp_server_webcrawl.utils.logger</span> <span class="kn">import</span> <span class="n">get_logger</span>
<span class="n">logger</span><span class="p">:</span> <span class="n">Logger</span> <span class="o">=</span> <span class="n">get_logger</span><span class="p">()</span>
<div class="viewcode-block" id="BaseCrawlerTests">
<a class="viewcode-back" href="../../../../mcp_server_webcrawl.crawlers.base.html#mcp_server_webcrawl.crawlers.base.tests.BaseCrawlerTests">[docs]</a>
<span class="k">class</span> <span class="nc">BaseCrawlerTests</span><span class="p">(</span><span class="n">unittest</span><span class="o">.</span><span class="n">TestCase</span><span class="p">):</span>
<span class="n">__PRAGMAR_PRIMARY_KEYWORD</span><span class="p">:</span> <span class="n">Final</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="s2">"crawler"</span>
<span class="n">__PRAGMAR_SECONDARY_KEYWORD</span><span class="p">:</span> <span class="n">Final</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="s2">"privacy"</span>
<span class="n">__PRAGMAR_HYPHENATED_KEYWORD</span><span class="p">:</span> <span class="n">Final</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="s2">"one-click"</span>
<div class="viewcode-block" id="BaseCrawlerTests.setUp">
<a class="viewcode-back" href="../../../../mcp_server_webcrawl.crawlers.base.html#mcp_server_webcrawl.crawlers.base.tests.BaseCrawlerTests.setUp">[docs]</a>
<span class="k">def</span> <span class="nf">setUp</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
<span class="c1"># quiet asyncio error on tests, occurring after successful completion</span>
<span class="k">if</span> <span class="n">sys</span><span class="o">.</span><span class="n">platform</span> <span class="o">==</span> <span class="s2">"win32"</span><span class="p">:</span>
<span class="n">asyncio</span><span class="o">.</span><span class="n">set_event_loop_policy</span><span class="p">(</span><span class="n">asyncio</span><span class="o">.</span><span class="n">WindowsSelectorEventLoopPolicy</span><span class="p">())</span></div>
<div class="viewcode-block" id="BaseCrawlerTests.run_pragmar_search_tests">
<a class="viewcode-back" href="../../../../mcp_server_webcrawl.crawlers.base.html#mcp_server_webcrawl.crawlers.base.tests.BaseCrawlerTests.run_pragmar_search_tests">[docs]</a>
<span class="k">def</span> <span class="nf">run_pragmar_search_tests</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">crawler</span><span class="p">:</span> <span class="n">BaseCrawler</span><span class="p">,</span> <span class="n">site_id</span><span class="p">:</span> <span class="nb">int</span><span class="p">):</span>
<span class="w"> </span><span class="sd">"""</span>
<span class="sd"> Run a battery of database checks on the crawler and Boolean validation</span>
<span class="sd"> """</span>
<span class="n">resources_json</span> <span class="o">=</span> <span class="n">crawler</span><span class="o">.</span><span class="n">get_resources_api</span><span class="p">()</span>
<span class="bp">self</span><span class="o">.</span><span class="n">assertTrue</span><span class="p">(</span><span class="n">resources_json</span><span class="o">.</span><span class="n">total</span> <span class="o">></span> <span class="mi">0</span><span class="p">,</span> <span class="s2">"Should have some resources in database"</span><span class="p">)</span>
<span class="n">site_resources</span> <span class="o">=</span> <span class="n">crawler</span><span class="o">.</span><span class="n">get_resources_api</span><span class="p">(</span><span class="n">sites</span><span class="o">=</span><span class="p">[</span><span class="n">site_id</span><span class="p">])</span>
<span class="bp">self</span><span class="o">.</span><span class="n">assertTrue</span><span class="p">(</span><span class="n">site_resources</span><span class="o">.</span><span class="n">total</span> <span class="o">></span> <span class="mi">0</span><span class="p">,</span> <span class="s2">"Pragmar site should have resources"</span><span class="p">)</span>
<span class="n">primary_resources</span> <span class="o">=</span> <span class="n">crawler</span><span class="o">.</span><span class="n">get_resources_api</span><span class="p">(</span>
<span class="n">sites</span><span class="o">=</span><span class="p">[</span><span class="n">site_id</span><span class="p">],</span>
<span class="n">query</span><span class="o">=</span><span class="bp">self</span><span class="o">.</span><span class="n">__PRAGMAR_PRIMARY_KEYWORD</span><span class="p">,</span>
<span class="n">fields</span><span class="o">=</span><span class="p">[</span><span class="s2">"content"</span><span class="p">,</span> <span class="s2">"headers"</span><span class="p">],</span>
<span class="n">limit</span><span class="o">=</span><span class="mi">1</span><span class="p">,</span>
<span class="p">)</span>
<span class="bp">self</span><span class="o">.</span><span class="n">assertTrue</span><span class="p">(</span><span class="n">primary_resources</span><span class="o">.</span><span class="n">total</span> <span class="o">></span> <span class="mi">0</span><span class="p">,</span> <span class="sa">f</span><span class="s2">"Keyword '</span><span class="si">{</span><span class="bp">self</span><span class="o">.</span><span class="n">__PRAGMAR_PRIMARY_KEYWORD</span><span class="si">}</span><span class="s2">' should return results"</span><span class="p">)</span>
<span class="n">secondary_resources</span> <span class="o">=</span> <span class="n">crawler</span><span class="o">.</span><span class="n">get_resources_api</span><span class="p">(</span>
<span class="n">sites</span><span class="o">=</span><span class="p">[</span><span class="n">site_id</span><span class="p">],</span>
<span class="n">query</span><span class="o">=</span><span class="bp">self</span><span class="o">.</span><span class="n">__PRAGMAR_SECONDARY_KEYWORD</span><span class="p">,</span>
<span class="n">limit</span><span class="o">=</span><span class="mi">1</span><span class="p">,</span>
<span class="p">)</span>
<span class="bp">self</span><span class="o">.</span><span class="n">assertTrue</span><span class="p">(</span><span class="n">secondary_resources</span><span class="o">.</span><span class="n">total</span> <span class="o">></span> <span class="mi">0</span><span class="p">,</span> <span class="sa">f</span><span class="s2">"Keyword '</span><span class="si">{</span><span class="bp">self</span><span class="o">.</span><span class="n">__PRAGMAR_SECONDARY_KEYWORD</span><span class="si">}</span><span class="s2">' should return results"</span><span class="p">)</span>
<span class="bp">self</span><span class="o">.</span><span class="n">__run_pragmar_search_tests_fulltext</span><span class="p">(</span><span class="n">crawler</span><span class="p">,</span> <span class="n">site_id</span><span class="p">,</span> <span class="n">site_resources</span><span class="p">)</span>
<span class="bp">self</span><span class="o">.</span><span class="n">__run_pragmar_search_tests_field_status</span><span class="p">(</span><span class="n">crawler</span><span class="p">,</span> <span class="n">site_id</span><span class="p">)</span>
<span class="bp">self</span><span class="o">.</span><span class="n">__run_pragmar_search_tests_field_headers</span><span class="p">(</span><span class="n">crawler</span><span class="p">,</span> <span class="n">site_id</span><span class="p">)</span>
<span class="bp">self</span><span class="o">.</span><span class="n">__run_pragmar_search_tests_field_content</span><span class="p">(</span><span class="n">crawler</span><span class="p">,</span> <span class="n">site_id</span><span class="p">)</span>
<span class="bp">self</span><span class="o">.</span><span class="n">__run_pragmar_search_tests_field_type</span><span class="p">(</span><span class="n">crawler</span><span class="p">,</span> <span class="n">site_id</span><span class="p">,</span> <span class="n">site_resources</span><span class="p">)</span>
<span class="bp">self</span><span class="o">.</span><span class="n">__run_pragmar_search_tests_extras</span><span class="p">(</span><span class="n">crawler</span><span class="p">,</span> <span class="n">site_id</span><span class="p">,</span> <span class="n">site_resources</span><span class="p">,</span> <span class="n">primary_resources</span><span class="p">,</span> <span class="n">secondary_resources</span><span class="p">)</span></div>
<div class="viewcode-block" id="BaseCrawlerTests.run_pragmar_image_tests">
<a class="viewcode-back" href="../../../../mcp_server_webcrawl.crawlers.base.html#mcp_server_webcrawl.crawlers.base.tests.BaseCrawlerTests.run_pragmar_image_tests">[docs]</a>
<span class="k">def</span> <span class="nf">run_pragmar_image_tests</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">crawler</span><span class="p">:</span> <span class="n">BaseCrawler</span><span class="p">,</span> <span class="n">pragmar_site_id</span><span class="p">:</span> <span class="nb">int</span><span class="p">):</span>
<span class="w"> </span><span class="sd">"""</span>
<span class="sd"> Test InterroBot-specific image handling and thumbnails.</span>
<span class="sd"> """</span>
<span class="n">img_results</span> <span class="o">=</span> <span class="n">crawler</span><span class="o">.</span><span class="n">get_resources_api</span><span class="p">(</span><span class="n">sites</span><span class="o">=</span><span class="p">[</span><span class="n">pragmar_site_id</span><span class="p">],</span> <span class="n">query</span><span class="o">=</span><span class="s2">"type: img"</span><span class="p">,</span> <span class="n">limit</span><span class="o">=</span><span class="mi">5</span><span class="p">)</span>
<span class="bp">self</span><span class="o">.</span><span class="n">assertTrue</span><span class="p">(</span><span class="n">img_results</span><span class="o">.</span><span class="n">total</span> <span class="o">></span> <span class="mi">0</span><span class="p">,</span> <span class="s2">"Image type filter should return results"</span><span class="p">)</span>
<span class="bp">self</span><span class="o">.</span><span class="n">assertTrue</span><span class="p">(</span>
<span class="nb">all</span><span class="p">(</span><span class="n">r</span><span class="o">.</span><span class="n">type</span><span class="o">.</span><span class="n">value</span> <span class="o">==</span> <span class="s2">"img"</span> <span class="k">for</span> <span class="n">r</span> <span class="ow">in</span> <span class="n">img_results</span><span class="o">.</span><span class="n">_results</span><span class="p">),</span>
<span class="s2">"All filtered resources should have type 'img'"</span>
<span class="p">)</span></div>
<div class="viewcode-block" id="BaseCrawlerTests.run_sites_resources_tests">
<a class="viewcode-back" href="../../../../mcp_server_webcrawl.crawlers.base.html#mcp_server_webcrawl.crawlers.base.tests.BaseCrawlerTests.run_sites_resources_tests">[docs]</a>
<span class="k">def</span> <span class="nf">run_sites_resources_tests</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">crawler</span><span class="p">:</span> <span class="n">BaseCrawler</span><span class="p">,</span> <span class="n">pragmar_site_id</span><span class="p">:</span> <span class="nb">int</span><span class="p">,</span> <span class="n">example_site_id</span><span class="p">:</span> <span class="nb">int</span><span class="p">):</span>
<span class="n">resources_json</span> <span class="o">=</span> <span class="n">crawler</span><span class="o">.</span><span class="n">get_resources_api</span><span class="p">()</span>
<span class="bp">self</span><span class="o">.</span><span class="n">assertTrue</span><span class="p">(</span><span class="n">resources_json</span><span class="o">.</span><span class="n">total</span> <span class="o">></span> <span class="mi">0</span><span class="p">,</span> <span class="s2">"Should have some resources in database"</span><span class="p">)</span>
<span class="n">site_resources</span> <span class="o">=</span> <span class="n">crawler</span><span class="o">.</span><span class="n">get_resources_api</span><span class="p">(</span><span class="n">sites</span><span class="o">=</span><span class="p">[</span><span class="n">pragmar_site_id</span><span class="p">])</span>
<span class="bp">self</span><span class="o">.</span><span class="n">assertTrue</span><span class="p">(</span><span class="n">site_resources</span><span class="o">.</span><span class="n">total</span> <span class="o">></span> <span class="mi">0</span><span class="p">,</span> <span class="s2">"Pragmar site should have resources"</span><span class="p">)</span>
<span class="c1"># basic resource retrieval</span>
<span class="n">resources_json</span> <span class="o">=</span> <span class="n">crawler</span><span class="o">.</span><span class="n">get_resources_api</span><span class="p">()</span>
<span class="bp">self</span><span class="o">.</span><span class="n">assertTrue</span><span class="p">(</span><span class="n">resources_json</span><span class="o">.</span><span class="n">total</span> <span class="o">></span> <span class="mi">0</span><span class="p">)</span>
<span class="c1"># fulltext keyword search</span>
<span class="n">query_keyword1</span> <span class="o">=</span> <span class="s2">"privacy"</span>
<span class="n">timestamp_resources</span> <span class="o">=</span> <span class="n">crawler</span><span class="o">.</span><span class="n">get_resources_api</span><span class="p">(</span>
<span class="n">sites</span><span class="o">=</span><span class="p">[</span><span class="n">pragmar_site_id</span><span class="p">],</span>
<span class="n">query</span><span class="o">=</span><span class="n">query_keyword1</span><span class="p">,</span>
<span class="n">fields</span><span class="o">=</span><span class="p">[</span><span class="s2">"created"</span><span class="p">,</span> <span class="s2">"modified"</span><span class="p">,</span> <span class="s2">"time"</span><span class="p">],</span>
<span class="n">limit</span><span class="o">=</span><span class="mi">5</span><span class="p">,</span>
<span class="p">)</span>
<span class="bp">self</span><span class="o">.</span><span class="n">assertTrue</span><span class="p">(</span><span class="n">timestamp_resources</span><span class="o">.</span><span class="n">total</span> <span class="o">></span> <span class="mi">0</span><span class="p">,</span> <span class="s2">"Search query should return results"</span><span class="p">)</span>
<span class="k">for</span> <span class="n">resource</span> <span class="ow">in</span> <span class="n">timestamp_resources</span><span class="o">.</span><span class="n">_results</span><span class="p">:</span>
<span class="n">resource_dict</span> <span class="o">=</span> <span class="n">resource</span><span class="o">.</span><span class="n">to_dict</span><span class="p">()</span>
<span class="bp">self</span><span class="o">.</span><span class="n">assertIsNotNone</span><span class="p">(</span><span class="n">resource_dict</span><span class="p">[</span><span class="s2">"created"</span><span class="p">],</span> <span class="s2">"Created timestamp should not be None"</span><span class="p">)</span>
<span class="bp">self</span><span class="o">.</span><span class="n">assertIsNotNone</span><span class="p">(</span><span class="n">resource_dict</span><span class="p">[</span><span class="s2">"modified"</span><span class="p">],</span> <span class="s2">"Modified timestamp should not be None"</span><span class="p">)</span>
<span class="bp">self</span><span class="o">.</span><span class="n">assertIsNotNone</span><span class="p">(</span><span class="n">resource_dict</span><span class="p">[</span><span class="s2">"time"</span><span class="p">],</span> <span class="s2">"Modified timestamp should not be None"</span><span class="p">)</span>
<span class="c1"># resource ID filtering</span>
<span class="k">if</span> <span class="n">resources_json</span><span class="o">.</span><span class="n">total</span> <span class="o">></span> <span class="mi">0</span><span class="p">:</span>
<span class="n">first_resource</span> <span class="o">=</span> <span class="n">resources_json</span><span class="o">.</span><span class="n">_results</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span>
<span class="n">id_resources</span> <span class="o">=</span> <span class="n">crawler</span><span class="o">.</span><span class="n">get_resources_api</span><span class="p">(</span>
<span class="n">sites</span><span class="o">=</span><span class="p">[</span><span class="n">first_resource</span><span class="o">.</span><span class="n">site</span><span class="p">],</span>
<span class="n">query</span><span class="o">=</span><span class="sa">f</span><span class="s2">"id: </span><span class="si">{</span><span class="n">first_resource</span><span class="o">.</span><span class="n">id</span><span class="si">}</span><span class="s2">"</span><span class="p">,</span>
<span class="n">limit</span><span class="o">=</span><span class="mi">1</span><span class="p">,</span>
<span class="p">)</span>
<span class="bp">self</span><span class="o">.</span><span class="n">assertEqual</span><span class="p">(</span><span class="n">id_resources</span><span class="o">.</span><span class="n">total</span><span class="p">,</span> <span class="mi">1</span><span class="p">)</span>
<span class="bp">self</span><span class="o">.</span><span class="n">assertEqual</span><span class="p">(</span><span class="n">id_resources</span><span class="o">.</span><span class="n">_results</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span><span class="o">.</span><span class="n">id</span><span class="p">,</span> <span class="n">first_resource</span><span class="o">.</span><span class="n">id</span><span class="p">)</span>
<span class="c1"># site filtering</span>
<span class="n">site_resources</span> <span class="o">=</span> <span class="n">crawler</span><span class="o">.</span><span class="n">get_resources_api</span><span class="p">(</span><span class="n">sites</span><span class="o">=</span><span class="p">[</span><span class="n">pragmar_site_id</span><span class="p">])</span>
<span class="bp">self</span><span class="o">.</span><span class="n">assertTrue</span><span class="p">(</span><span class="n">site_resources</span><span class="o">.</span><span class="n">total</span> <span class="o">></span> <span class="mi">0</span><span class="p">,</span> <span class="s2">"Site filtering should return results"</span><span class="p">)</span>
<span class="k">for</span> <span class="n">resource</span> <span class="ow">in</span> <span class="n">site_resources</span><span class="o">.</span><span class="n">_results</span><span class="p">:</span>
<span class="bp">self</span><span class="o">.</span><span class="n">assertEqual</span><span class="p">(</span><span class="n">resource</span><span class="o">.</span><span class="n">site</span><span class="p">,</span> <span class="n">pragmar_site_id</span><span class="p">)</span>
<span class="c1"># type filtering for HTML pages</span>
<span class="n">html_resources</span> <span class="o">=</span> <span class="n">crawler</span><span class="o">.</span><span class="n">get_resources_api</span><span class="p">(</span>
<span class="n">sites</span><span class="o">=</span><span class="p">[</span><span class="n">pragmar_site_id</span><span class="p">],</span>
<span class="n">query</span><span class="o">=</span> <span class="sa">f</span><span class="s2">"type: </span><span class="si">{</span><span class="n">ResourceResultType</span><span class="o">.</span><span class="n">PAGE</span><span class="o">.</span><span class="n">value</span><span class="si">}</span><span class="s2">"</span><span class="p">,</span>
<span class="p">)</span>
<span class="bp">self</span><span class="o">.</span><span class="n">assertTrue</span><span class="p">(</span><span class="n">html_resources</span><span class="o">.</span><span class="n">total</span> <span class="o">></span> <span class="mi">0</span><span class="p">,</span> <span class="s2">"HTML filtering should return results"</span><span class="p">)</span>
<span class="k">for</span> <span class="n">resource</span> <span class="ow">in</span> <span class="n">html_resources</span><span class="o">.</span><span class="n">_results</span><span class="p">:</span>
<span class="bp">self</span><span class="o">.</span><span class="n">assertEqual</span><span class="p">(</span><span class="n">resource</span><span class="o">.</span><span class="n">type</span><span class="p">,</span> <span class="n">ResourceResultType</span><span class="o">.</span><span class="n">PAGE</span><span class="p">)</span>
<span class="c1"># type filtering for multiple resource types</span>
<span class="n">mixed_resources</span> <span class="o">=</span> <span class="n">crawler</span><span class="o">.</span><span class="n">get_resources_api</span><span class="p">(</span>
<span class="n">sites</span><span class="o">=</span><span class="p">[</span><span class="n">pragmar_site_id</span><span class="p">],</span>
<span class="n">query</span><span class="o">=</span> <span class="sa">f</span><span class="s2">"type: </span><span class="si">{</span><span class="n">ResourceResultType</span><span class="o">.</span><span class="n">PAGE</span><span class="o">.</span><span class="n">value</span><span class="si">}</span><span class="s2"> OR type: </span><span class="si">{</span><span class="n">ResourceResultType</span><span class="o">.</span><span class="n">SCRIPT</span><span class="o">.</span><span class="n">value</span><span class="si">}</span><span class="s2">"</span><span class="p">,</span>
<span class="p">)</span>
<span class="k">if</span> <span class="n">mixed_resources</span><span class="o">.</span><span class="n">total</span> <span class="o">></span> <span class="mi">0</span><span class="p">:</span>
<span class="n">types_found</span> <span class="o">=</span> <span class="p">{</span><span class="n">r</span><span class="o">.</span><span class="n">type</span> <span class="k">for</span> <span class="n">r</span> <span class="ow">in</span> <span class="n">mixed_resources</span><span class="o">.</span><span class="n">_results</span><span class="p">}</span>
<span class="bp">self</span><span class="o">.</span><span class="n">assertTrue</span><span class="p">(</span>
<span class="nb">len</span><span class="p">(</span><span class="n">types_found</span><span class="p">)</span> <span class="o">></span> <span class="mi">0</span><span class="p">,</span>
<span class="s2">"Should find at least one of the requested resource types"</span>
<span class="p">)</span>
<span class="k">for</span> <span class="n">resource_type</span> <span class="ow">in</span> <span class="n">types_found</span><span class="p">:</span>
<span class="bp">self</span><span class="o">.</span><span class="n">assertIn</span><span class="p">(</span>
<span class="n">resource_type</span><span class="p">,</span>
<span class="p">[</span><span class="n">ResourceResultType</span><span class="o">.</span><span class="n">PAGE</span><span class="p">,</span> <span class="n">ResourceResultType</span><span class="o">.</span><span class="n">SCRIPT</span><span class="p">]</span>
<span class="p">)</span>
<span class="c1"># custom fields in response</span>
<span class="n">custom_fields</span> <span class="o">=</span> <span class="p">[</span><span class="s2">"content"</span><span class="p">,</span> <span class="s2">"headers"</span><span class="p">,</span> <span class="s2">"time"</span><span class="p">]</span>
<span class="n">field_resources</span> <span class="o">=</span> <span class="n">crawler</span><span class="o">.</span><span class="n">get_resources_api</span><span class="p">(</span>
<span class="n">query</span><span class="o">=</span><span class="s2">"type: html"</span><span class="p">,</span>
<span class="n">sites</span><span class="o">=</span><span class="p">[</span><span class="n">pragmar_site_id</span><span class="p">],</span>
<span class="n">fields</span><span class="o">=</span><span class="n">custom_fields</span><span class="p">,</span>
<span class="n">limit</span><span class="o">=</span><span class="mi">1</span><span class="p">,</span>
<span class="p">)</span>
<span class="bp">self</span><span class="o">.</span><span class="n">assertTrue</span><span class="p">(</span><span class="n">field_resources</span><span class="o">.</span><span class="n">total</span> <span class="o">></span> <span class="mi">0</span><span class="p">)</span>
<span class="n">resource_dict</span> <span class="o">=</span> <span class="n">field_resources</span><span class="o">.</span><span class="n">_results</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span><span class="o">.</span><span class="n">to_dict</span><span class="p">()</span>
<span class="k">for</span> <span class="n">field</span> <span class="ow">in</span> <span class="n">custom_fields</span><span class="p">:</span>
<span class="bp">self</span><span class="o">.</span><span class="n">assertIn</span><span class="p">(</span><span class="n">field</span><span class="p">,</span> <span class="n">resource_dict</span><span class="p">,</span> <span class="sa">f</span><span class="s2">"Field '</span><span class="si">{</span><span class="n">field</span><span class="si">}</span><span class="s2">' should be in response"</span><span class="p">)</span>
<span class="n">asc_resources</span> <span class="o">=</span> <span class="n">crawler</span><span class="o">.</span><span class="n">get_resources_api</span><span class="p">(</span><span class="n">sites</span><span class="o">=</span><span class="p">[</span><span class="n">pragmar_site_id</span><span class="p">],</span> <span class="n">sort</span><span class="o">=</span><span class="s2">"+url"</span><span class="p">)</span>
<span class="k">if</span> <span class="n">asc_resources</span><span class="o">.</span><span class="n">total</span> <span class="o">></span> <span class="mi">1</span><span class="p">:</span>
<span class="bp">self</span><span class="o">.</span><span class="n">assertTrue</span><span class="p">(</span><span class="n">asc_resources</span><span class="o">.</span><span class="n">_results</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span><span class="o">.</span><span class="n">url</span> <span class="o"><=</span> <span class="n">asc_resources</span><span class="o">.</span><span class="n">_results</span><span class="p">[</span><span class="mi">1</span><span class="p">]</span><span class="o">.</span><span class="n">url</span><span class="p">)</span>
<span class="n">desc_resources</span> <span class="o">=</span> <span class="n">crawler</span><span class="o">.</span><span class="n">get_resources_api</span><span class="p">(</span><span class="n">sites</span><span class="o">=</span><span class="p">[</span><span class="n">pragmar_site_id</span><span class="p">],</span> <span class="n">sort</span><span class="o">=</span><span class="s2">"-url"</span><span class="p">)</span>
<span class="k">if</span> <span class="n">desc_resources</span><span class="o">.</span><span class="n">total</span> <span class="o">></span> <span class="mi">1</span><span class="p">:</span>
<span class="bp">self</span><span class="o">.</span><span class="n">assertTrue</span><span class="p">(</span><span class="n">desc_resources</span><span class="o">.</span><span class="n">_results</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span><span class="o">.</span><span class="n">url</span> <span class="o">>=</span> <span class="n">desc_resources</span><span class="o">.</span><span class="n">_results</span><span class="p">[</span><span class="mi">1</span><span class="p">]</span><span class="o">.</span><span class="n">url</span><span class="p">)</span>
<span class="n">limit_resources</span> <span class="o">=</span> <span class="n">crawler</span><span class="o">.</span><span class="n">get_resources_api</span><span class="p">(</span><span class="n">sites</span><span class="o">=</span><span class="p">[</span><span class="n">pragmar_site_id</span><span class="p">],</span> <span class="n">limit</span><span class="o">=</span><span class="mi">3</span><span class="p">)</span>
<span class="bp">self</span><span class="o">.</span><span class="n">assertTrue</span><span class="p">(</span><span class="nb">len</span><span class="p">(</span><span class="n">limit_resources</span><span class="o">.</span><span class="n">_results</span><span class="p">)</span> <span class="o"><=</span> <span class="mi">3</span><span class="p">)</span>
<span class="n">offset_resources</span> <span class="o">=</span> <span class="n">crawler</span><span class="o">.</span><span class="n">get_resources_api</span><span class="p">(</span><span class="n">sites</span><span class="o">=</span><span class="p">[</span><span class="n">pragmar_site_id</span><span class="p">],</span> <span class="n">offset</span><span class="o">=</span><span class="mi">2</span><span class="p">,</span> <span class="n">limit</span><span class="o">=</span><span class="mi">2</span><span class="p">)</span>
<span class="bp">self</span><span class="o">.</span><span class="n">assertTrue</span><span class="p">(</span><span class="nb">len</span><span class="p">(</span><span class="n">offset_resources</span><span class="o">.</span><span class="n">_results</span><span class="p">)</span> <span class="o"><=</span> <span class="mi">2</span><span class="p">)</span>
<span class="k">if</span> <span class="n">resources_json</span><span class="o">.</span><span class="n">total</span> <span class="o">></span> <span class="mi">4</span><span class="p">:</span>
<span class="bp">self</span><span class="o">.</span><span class="n">assertNotEqual</span><span class="p">(</span>
<span class="n">resources_json</span><span class="o">.</span><span class="n">_results</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span><span class="o">.</span><span class="n">id</span><span class="p">,</span>
<span class="n">offset_resources</span><span class="o">.</span><span class="n">_results</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span><span class="o">.</span><span class="n">id</span><span class="p">,</span>
<span class="s2">"Offset results should differ from first page"</span>
<span class="p">)</span>
<span class="c1"># multi-site search, verify we got results from both sites</span>
<span class="c1"># limit 100 sees all the pages, otherwise ArchiveBox needs -url</span>
<span class="c1"># and everything else +url to float unique sites in a small result set</span>
<span class="c1"># limit 100 is slower but more resilient</span>
<span class="n">multisite_resources</span> <span class="o">=</span> <span class="n">crawler</span><span class="o">.</span><span class="n">get_resources_api</span><span class="p">(</span>
<span class="n">sites</span><span class="o">=</span><span class="p">[</span><span class="n">example_site_id</span><span class="p">,</span> <span class="n">pragmar_site_id</span><span class="p">],</span>
<span class="n">query</span><span class="o">=</span> <span class="sa">f</span><span class="s2">"type: </span><span class="si">{</span><span class="n">ResourceResultType</span><span class="o">.</span><span class="n">PAGE</span><span class="o">.</span><span class="n">value</span><span class="si">}</span><span class="s2">"</span><span class="p">,</span>
<span class="n">sort</span><span class="o">=</span><span class="s2">"+url"</span><span class="p">,</span>
<span class="n">limit</span><span class="o">=</span><span class="mi">100</span><span class="p">,</span>
<span class="p">)</span>
<span class="n">found_sites</span> <span class="o">=</span> <span class="nb">set</span><span class="p">()</span>
<span class="k">for</span> <span class="n">resource</span> <span class="ow">in</span> <span class="n">multisite_resources</span><span class="o">.</span><span class="n">_results</span><span class="p">:</span>
<span class="n">found_sites</span><span class="o">.</span><span class="n">add</span><span class="p">(</span><span class="n">resource</span><span class="o">.</span><span class="n">site</span><span class="p">)</span>
<span class="bp">self</span><span class="o">.</span><span class="n">assertEqual</span><span class="p">(</span><span class="nb">len</span><span class="p">(</span><span class="n">found_sites</span><span class="p">),</span> <span class="mi">2</span><span class="p">,</span> <span class="s2">"Should have results from both sites"</span><span class="p">)</span></div>
<div class="viewcode-block" id="BaseCrawlerTests.run_pragmar_tokenizer_tests"><!--
  Highlighted listing of BaseCrawlerTests.run_pragmar_tokenizer_tests(self, crawler, site_id).
  Every query below goes through crawler.get_resources_api with sites=[site_id],
  fields=[] and limit=1; only the .total of each result is inspected.
  Queries exercised: the quoted phrase "mcp-server-webcrawl", the wildcard mcp*,
  and AND / OR / NOT pairs combining mcp-server-webcrawl with one-click in quoted
  and unquoted form, plus one unquoted AND NOT variant.
  Assertions: each total is above zero (the AND NOT total is only compared, not
  checked against zero), each quoted/unquoted pair has equal totals, the NOT
  total equals the AND NOT total, and the single-term total is at least the AND
  total, at most the OR total, and strictly above the NOT total.
  NOTE(review): mcp_resources_keyword and mcp_resources_quoted are built from the
  same quoted query string, so their equality assertion compares two identical
  queries. In the AND / OR / NOT pairs the *_keyword variable holds the quoted
  query and the *_quoted variable holds the unquoted one. Confirm whether that
  is intended.
  NOTE(review): this markup looks like generated syntax-highlighter output; any
  change to the test logic presumably belongs in the Python source it was
  rendered from, followed by a rebuild. TODO confirm.
-->
<a class="viewcode-back" href="../../../../mcp_server_webcrawl.crawlers.base.html#mcp_server_webcrawl.crawlers.base.tests.BaseCrawlerTests.run_pragmar_tokenizer_tests">[docs]</a>
<span class="k">def</span> <span class="nf">run_pragmar_tokenizer_tests</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">crawler</span><span class="p">:</span> <span class="n">BaseCrawler</span><span class="p">,</span> <span class="n">site_id</span><span class="p">:</span><span class="nb">int</span><span class="p">):</span>
<span class="w"> </span><span class="sd">"""</span>
<span class="sd"> fts hyphens and underscores are particularly challenging, thus</span>
<span class="sd"> have a dedicated test. these must be configured in multiple places</span>
<span class="sd"> including CREATE TABLE ... tokenizer, as well as handled by the query</span>
<span class="sd"> parser.</span>
<span class="sd"> """</span>
<span class="n">mcp_resources_keyword</span> <span class="o">=</span> <span class="n">crawler</span><span class="o">.</span><span class="n">get_resources_api</span><span class="p">(</span>
<span class="n">sites</span><span class="o">=</span><span class="p">[</span><span class="n">site_id</span><span class="p">],</span>
<span class="n">query</span><span class="o">=</span><span class="s1">'"mcp-server-webcrawl"'</span><span class="p">,</span>
<span class="n">fields</span><span class="o">=</span><span class="p">[],</span>
<span class="n">limit</span><span class="o">=</span><span class="mi">1</span><span class="p">,</span>
<span class="p">)</span>
<span class="n">mcp_resources_quoted</span> <span class="o">=</span> <span class="n">crawler</span><span class="o">.</span><span class="n">get_resources_api</span><span class="p">(</span>
<span class="n">sites</span><span class="o">=</span><span class="p">[</span><span class="n">site_id</span><span class="p">],</span>
<span class="n">query</span><span class="o">=</span><span class="s1">'"mcp-server-webcrawl"'</span><span class="p">,</span>
<span class="n">fields</span><span class="o">=</span><span class="p">[],</span>
<span class="n">limit</span><span class="o">=</span><span class="mi">1</span><span class="p">,</span>
<span class="p">)</span>
<span class="bp">self</span><span class="o">.</span><span class="n">assertTrue</span><span class="p">(</span><span class="n">mcp_resources_keyword</span><span class="o">.</span><span class="n">total</span> <span class="o">></span> <span class="mi">0</span><span class="p">,</span> <span class="s2">"Should find mcp-server-webcrawl in HTML"</span><span class="p">)</span>
<span class="bp">self</span><span class="o">.</span><span class="n">assertTrue</span><span class="p">(</span><span class="n">mcp_resources_quoted</span><span class="o">.</span><span class="n">total</span> <span class="o">></span> <span class="mi">0</span><span class="p">,</span> <span class="s2">"Should find </span><span class="se">\"</span><span class="s2">mcp-server-webcrawl</span><span class="se">\"</span><span class="s2"> (phrase) in HTML"</span><span class="p">)</span>
<span class="bp">self</span><span class="o">.</span><span class="n">assertTrue</span><span class="p">(</span><span class="n">mcp_resources_quoted</span><span class="o">.</span><span class="n">total</span> <span class="o">==</span> <span class="n">mcp_resources_keyword</span><span class="o">.</span><span class="n">total</span><span class="p">,</span> <span class="s2">"Quoted and unquoted equivalence expected"</span><span class="p">)</span>
<span class="n">mcp_resources_wildcarded</span> <span class="o">=</span> <span class="n">crawler</span><span class="o">.</span><span class="n">get_resources_api</span><span class="p">(</span>
<span class="n">sites</span><span class="o">=</span><span class="p">[</span><span class="n">site_id</span><span class="p">],</span>
<span class="n">query</span><span class="o">=</span><span class="s1">'mcp*'</span><span class="p">,</span>
<span class="n">fields</span><span class="o">=</span><span class="p">[],</span>
<span class="n">limit</span><span class="o">=</span><span class="mi">1</span><span class="p">,</span>
<span class="p">)</span>
<span class="bp">self</span><span class="o">.</span><span class="n">assertTrue</span><span class="p">(</span><span class="n">mcp_resources_wildcarded</span><span class="o">.</span><span class="n">total</span> <span class="o">></span> <span class="mi">0</span><span class="p">,</span> <span class="s2">"Should find mcp-server-* in HTML"</span><span class="p">)</span>
<span class="n">combo_and_resources_keyword</span> <span class="o">=</span> <span class="n">crawler</span><span class="o">.</span><span class="n">get_resources_api</span><span class="p">(</span>
<span class="n">sites</span><span class="o">=</span><span class="p">[</span><span class="n">site_id</span><span class="p">],</span>
<span class="n">query</span><span class="o">=</span><span class="s1">'"mcp-server-webcrawl" AND "one-click"'</span><span class="p">,</span>
<span class="n">fields</span><span class="o">=</span><span class="p">[],</span>
<span class="n">limit</span><span class="o">=</span><span class="mi">1</span><span class="p">,</span>
<span class="p">)</span>
<span class="n">combo_and_resources_quoted</span> <span class="o">=</span> <span class="n">crawler</span><span class="o">.</span><span class="n">get_resources_api</span><span class="p">(</span>
<span class="n">sites</span><span class="o">=</span><span class="p">[</span><span class="n">site_id</span><span class="p">],</span>
<span class="n">query</span><span class="o">=</span><span class="s1">'mcp-server-webcrawl AND one-click'</span><span class="p">,</span>
<span class="n">fields</span><span class="o">=</span><span class="p">[],</span>
<span class="n">limit</span><span class="o">=</span><span class="mi">1</span><span class="p">,</span>
<span class="p">)</span>
<span class="bp">self</span><span class="o">.</span><span class="n">assertTrue</span><span class="p">(</span><span class="n">combo_and_resources_keyword</span><span class="o">.</span><span class="n">total</span> <span class="o">></span> <span class="mi">0</span><span class="p">,</span> <span class="s2">"Should find mcp-server-webcrawl in HTML"</span><span class="p">)</span>
<span class="bp">self</span><span class="o">.</span><span class="n">assertTrue</span><span class="p">(</span><span class="n">combo_and_resources_quoted</span><span class="o">.</span><span class="n">total</span> <span class="o">></span> <span class="mi">0</span><span class="p">,</span> <span class="s2">"Should find </span><span class="se">\"</span><span class="s2">mcp-server-webcrawl</span><span class="se">\"</span><span class="s2"> (phrase) in HTML"</span><span class="p">)</span>
<span class="bp">self</span><span class="o">.</span><span class="n">assertTrue</span><span class="p">(</span><span class="n">combo_and_resources_keyword</span><span class="o">.</span><span class="n">total</span> <span class="o">==</span> <span class="n">combo_and_resources_quoted</span><span class="o">.</span><span class="n">total</span><span class="p">,</span> <span class="s2">"Quoted and unquoted equivalence expected"</span><span class="p">)</span>
<span class="n">combo_or_resources_keyword</span> <span class="o">=</span> <span class="n">crawler</span><span class="o">.</span><span class="n">get_resources_api</span><span class="p">(</span>
<span class="n">sites</span><span class="o">=</span><span class="p">[</span><span class="n">site_id</span><span class="p">],</span>
<span class="n">query</span><span class="o">=</span><span class="s1">'"mcp-server-webcrawl" OR "one-click"'</span><span class="p">,</span>
<span class="n">fields</span><span class="o">=</span><span class="p">[],</span>
<span class="n">limit</span><span class="o">=</span><span class="mi">1</span><span class="p">,</span>
<span class="p">)</span>
<span class="n">combo_or_resources_quoted</span> <span class="o">=</span> <span class="n">crawler</span><span class="o">.</span><span class="n">get_resources_api</span><span class="p">(</span>
<span class="n">sites</span><span class="o">=</span><span class="p">[</span><span class="n">site_id</span><span class="p">],</span>
<span class="n">query</span><span class="o">=</span><span class="s1">'mcp-server-webcrawl OR one-click'</span><span class="p">,</span>
<span class="n">fields</span><span class="o">=</span><span class="p">[],</span>
<span class="n">limit</span><span class="o">=</span><span class="mi">1</span><span class="p">,</span>
<span class="p">)</span>
<span class="bp">self</span><span class="o">.</span><span class="n">assertTrue</span><span class="p">(</span><span class="n">combo_or_resources_keyword</span><span class="o">.</span><span class="n">total</span> <span class="o">></span> <span class="mi">0</span><span class="p">,</span> <span class="s2">"Should find mcp-server-webcrawl in HTML"</span><span class="p">)</span>
<span class="bp">self</span><span class="o">.</span><span class="n">assertTrue</span><span class="p">(</span><span class="n">combo_or_resources_quoted</span><span class="o">.</span><span class="n">total</span> <span class="o">></span> <span class="mi">0</span><span class="p">,</span> <span class="s2">"Should find </span><span class="se">\"</span><span class="s2">mcp-server-webcrawl</span><span class="se">\"</span><span class="s2"> (phrase) in HTML"</span><span class="p">)</span>
<span class="bp">self</span><span class="o">.</span><span class="n">assertTrue</span><span class="p">(</span><span class="n">combo_or_resources_keyword</span><span class="o">.</span><span class="n">total</span> <span class="o">==</span> <span class="n">combo_or_resources_quoted</span><span class="o">.</span><span class="n">total</span><span class="p">,</span> <span class="s2">"Quoted and unquoted equivalence expected"</span><span class="p">)</span>
<span class="n">combo_not_resources_keyword</span> <span class="o">=</span> <span class="n">crawler</span><span class="o">.</span><span class="n">get_resources_api</span><span class="p">(</span>
<span class="n">sites</span><span class="o">=</span><span class="p">[</span><span class="n">site_id</span><span class="p">],</span>
<span class="n">query</span><span class="o">=</span><span class="s1">'"mcp-server-webcrawl" NOT "one-click"'</span><span class="p">,</span>
<span class="n">fields</span><span class="o">=</span><span class="p">[],</span>
<span class="n">limit</span><span class="o">=</span><span class="mi">1</span><span class="p">,</span>
<span class="p">)</span>
<span class="n">combo_not_resources_quoted</span> <span class="o">=</span> <span class="n">crawler</span><span class="o">.</span><span class="n">get_resources_api</span><span class="p">(</span>
<span class="n">sites</span><span class="o">=</span><span class="p">[</span><span class="n">site_id</span><span class="p">],</span>
<span class="n">query</span><span class="o">=</span><span class="s1">'mcp-server-webcrawl NOT one-click'</span><span class="p">,</span>
<span class="n">fields</span><span class="o">=</span><span class="p">[],</span>
<span class="n">limit</span><span class="o">=</span><span class="mi">1</span><span class="p">,</span>
<span class="p">)</span>
<span class="n">combo_and_not_resources_quoted</span> <span class="o">=</span> <span class="n">crawler</span><span class="o">.</span><span class="n">get_resources_api</span><span class="p">(</span>
<span class="n">sites</span><span class="o">=</span><span class="p">[</span><span class="n">site_id</span><span class="p">],</span>
<span class="n">query</span><span class="o">=</span><span class="s1">'mcp-server-webcrawl AND NOT one-click'</span><span class="p">,</span>
<span class="n">fields</span><span class="o">=</span><span class="p">[],</span>
<span class="n">limit</span><span class="o">=</span><span class="mi">1</span><span class="p">,</span>
<span class="p">)</span>
<span class="bp">self</span><span class="o">.</span><span class="n">assertTrue</span><span class="p">(</span><span class="n">combo_not_resources_keyword</span><span class="o">.</span><span class="n">total</span> <span class="o">></span> <span class="mi">0</span><span class="p">,</span> <span class="s2">"Should find mcp-server-webcrawl in HTML"</span><span class="p">)</span>
<span class="bp">self</span><span class="o">.</span><span class="n">assertTrue</span><span class="p">(</span><span class="n">combo_not_resources_quoted</span><span class="o">.</span><span class="n">total</span> <span class="o">></span> <span class="mi">0</span><span class="p">,</span> <span class="s2">"Should find </span><span class="se">\"</span><span class="s2">mcp-server-webcrawl</span><span class="se">\"</span><span class="s2"> (phrase) in HTML"</span><span class="p">)</span>
<span class="bp">self</span><span class="o">.</span><span class="n">assertTrue</span><span class="p">(</span><span class="n">combo_not_resources_keyword</span><span class="o">.</span><span class="n">total</span> <span class="o">==</span> <span class="n">combo_not_resources_quoted</span><span class="o">.</span><span class="n">total</span><span class="p">,</span> <span class="s2">"Quoted and unquoted equivalence expected"</span><span class="p">)</span>
<span class="bp">self</span><span class="o">.</span><span class="n">assertTrue</span><span class="p">(</span><span class="n">combo_not_resources_keyword</span><span class="o">.</span><span class="n">total</span> <span class="o">==</span> <span class="n">combo_and_not_resources_quoted</span><span class="o">.</span><span class="n">total</span><span class="p">,</span> <span class="sa">f</span><span class="s2">"NOT (</span><span class="si">{</span><span class="n">combo_not_resources_keyword</span><span class="o">.</span><span class="n">total</span><span class="si">}</span><span class="s2">) and AND NOT (</span><span class="si">{</span><span class="n">combo_and_not_resources_quoted</span><span class="o">.</span><span class="n">total</span><span class="si">}</span><span class="s2">) equivalence expected"</span><span class="p">)</span>
<span class="bp">self</span><span class="o">.</span><span class="n">assertTrue</span><span class="p">(</span><span class="n">mcp_resources_keyword</span><span class="o">.</span><span class="n">total</span> <span class="o">>=</span> <span class="n">combo_and_resources_keyword</span><span class="o">.</span><span class="n">total</span><span class="p">,</span> <span class="s2">"Total records should be greater or equal to ANDs."</span><span class="p">)</span>
<span class="bp">self</span><span class="o">.</span><span class="n">assertTrue</span><span class="p">(</span><span class="n">mcp_resources_keyword</span><span class="o">.</span><span class="n">total</span> <span class="o"><=</span> <span class="n">combo_or_resources_keyword</span><span class="o">.</span><span class="n">total</span><span class="p">,</span> <span class="s2">"Total records should be less than or equal to ORs."</span><span class="p">)</span>
<span class="bp">self</span><span class="o">.</span><span class="n">assertTrue</span><span class="p">(</span><span class="n">mcp_resources_keyword</span><span class="o">.</span><span class="n">total</span> <span class="o">></span> <span class="n">combo_not_resources_keyword</span><span class="o">.</span><span class="n">total</span><span class="p">,</span> <span class="s2">"Total records should be greater than NOTs."</span><span class="p">)</span></div>
<div class="viewcode-block" id="BaseCrawlerTests.run_pragmar_site_tests"><!--
  Highlighted listing of BaseCrawlerTests.run_pragmar_site_tests(self, crawler, site_id).
  Three crawler.get_sites_api calls are checked:
  1. no arguments: total must be at least 2;
  2. ids=[site_id]: total must be exactly 1;
  3. ids=[site_id] with fields=["created", "modified"]: the first result's
     to_dict() must contain both the "created" and "modified" keys.
  NOTE(review): step 3 indexes _results[0] without first asserting that a result
  came back, so an empty response would presumably raise on the index instead
  of failing an assertion. Confirm whether that is acceptable for this helper.
-->
<a class="viewcode-back" href="../../../../mcp_server_webcrawl.crawlers.base.html#mcp_server_webcrawl.crawlers.base.tests.BaseCrawlerTests.run_pragmar_site_tests">[docs]</a>
<span class="k">def</span> <span class="nf">run_pragmar_site_tests</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">crawler</span><span class="p">:</span> <span class="n">BaseCrawler</span><span class="p">,</span> <span class="n">site_id</span><span class="p">:</span><span class="nb">int</span><span class="p">):</span>
<span class="c1"># all sites</span>
<span class="n">sites_json</span> <span class="o">=</span> <span class="n">crawler</span><span class="o">.</span><span class="n">get_sites_api</span><span class="p">()</span>
<span class="bp">self</span><span class="o">.</span><span class="n">assertTrue</span><span class="p">(</span><span class="n">sites_json</span><span class="o">.</span><span class="n">total</span> <span class="o">>=</span> <span class="mi">2</span><span class="p">)</span>
<span class="c1"># single site</span>
<span class="n">site_json</span> <span class="o">=</span> <span class="n">crawler</span><span class="o">.</span><span class="n">get_sites_api</span><span class="p">(</span><span class="n">ids</span><span class="o">=</span><span class="p">[</span><span class="n">site_id</span><span class="p">])</span>
<span class="bp">self</span><span class="o">.</span><span class="n">assertTrue</span><span class="p">(</span><span class="n">site_json</span><span class="o">.</span><span class="n">total</span> <span class="o">==</span> <span class="mi">1</span><span class="p">)</span>
<span class="c1"># site with fields</span>
<span class="n">site_field_json</span> <span class="o">=</span> <span class="n">crawler</span><span class="o">.</span><span class="n">get_sites_api</span><span class="p">(</span><span class="n">ids</span><span class="o">=</span><span class="p">[</span><span class="n">site_id</span><span class="p">],</span> <span class="n">fields</span><span class="o">=</span><span class="p">[</span><span class="s2">"created"</span><span class="p">,</span> <span class="s2">"modified"</span><span class="p">])</span>
<span class="n">site_field_result</span> <span class="o">=</span> <span class="n">site_field_json</span><span class="o">.</span><span class="n">_results</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span><span class="o">.</span><span class="n">to_dict</span><span class="p">()</span>
<span class="bp">self</span><span class="o">.</span><span class="n">assertTrue</span><span class="p">(</span><span class="s2">"created"</span> <span class="ow">in</span> <span class="n">site_field_result</span><span class="p">)</span>
<span class="bp">self</span><span class="o">.</span><span class="n">assertTrue</span><span class="p">(</span><span class="s2">"modified"</span> <span class="ow">in</span> <span class="n">site_field_result</span><span class="p">)</span></div>
<!--
  Viewcode block for BaseCrawlerTests.run_pragmar_sort_tests: an HTML rendering
  (one span per token, in what looks like Pygments class naming) of a Python
  test helper.
  NOTE(review): this markup appears to be build output; confirm, and make any
  real change in the Python module rather than in this page.

  What the rendered helper does:
    * Calls crawler.get_resources_api for site_id with limit=3 five times: with
      no sort, sort="+url", sort="-url", sort="+size" and sort="-size" (the two
      size queries request fields=["size"]).
    * Asserts that the +url and the -url totals are greater than 0.
    * Asserts that the unsorted URL list equals the +url URL list, i.e. the
      default order is expected to be ascending URL.
    * Walks adjacent result pairs and asserts url / size values never decrease
      for the "+" sorts and never increase for the "-" sorts.
    * Runs sort="?" twice with limit=20 and asserts that the two id lists differ
      when random_1.total is 10 or more; otherwise it logs that the randomness
      check was skipped.

  NOTE(review): leading indentation is not visible in this rendering, so the
  nesting of the statements that follow each "if" is assumed, not shown.
  NOTE(review): two random orderings can coincide by chance, so the
  assertNotEqual check is probabilistic; its guard reads total, not the number
  of rows actually returned.
--><div class="viewcode-block" id="BaseCrawlerTests.run_pragmar_sort_tests">
<a class="viewcode-back" href="../../../../mcp_server_webcrawl.crawlers.base.html#mcp_server_webcrawl.crawlers.base.tests.BaseCrawlerTests.run_pragmar_sort_tests">[docs]</a>
<span class="k">def</span> <span class="nf">run_pragmar_sort_tests</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">crawler</span><span class="p">:</span> <span class="n">BaseCrawler</span><span class="p">,</span> <span class="n">site_id</span><span class="p">:</span> <span class="nb">int</span><span class="p">):</span>
<span class="w"> </span><span class="sd">"""</span>
<span class="sd"> Test sorting functionality with performance optimizations.</span>
<span class="sd"> """</span>
<span class="n">sorted_default</span> <span class="o">=</span> <span class="n">crawler</span><span class="o">.</span><span class="n">get_resources_api</span><span class="p">(</span><span class="n">sites</span><span class="o">=</span><span class="p">[</span><span class="n">site_id</span><span class="p">],</span> <span class="n">limit</span><span class="o">=</span><span class="mi">3</span><span class="p">,</span> <span class="n">fields</span><span class="o">=</span><span class="p">[])</span>
<span class="n">sorted_url_ascending</span> <span class="o">=</span> <span class="n">crawler</span><span class="o">.</span><span class="n">get_resources_api</span><span class="p">(</span><span class="n">sites</span><span class="o">=</span><span class="p">[</span><span class="n">site_id</span><span class="p">],</span> <span class="n">sort</span><span class="o">=</span><span class="s2">"+url"</span><span class="p">,</span> <span class="n">limit</span><span class="o">=</span><span class="mi">3</span><span class="p">,</span> <span class="n">fields</span><span class="o">=</span><span class="p">[])</span>
<span class="n">sorted_url_descending</span> <span class="o">=</span> <span class="n">crawler</span><span class="o">.</span><span class="n">get_resources_api</span><span class="p">(</span><span class="n">sites</span><span class="o">=</span><span class="p">[</span><span class="n">site_id</span><span class="p">],</span> <span class="n">sort</span><span class="o">=</span><span class="s2">"-url"</span><span class="p">,</span> <span class="n">limit</span><span class="o">=</span><span class="mi">3</span><span class="p">,</span> <span class="n">fields</span><span class="o">=</span><span class="p">[])</span>
<span class="bp">self</span><span class="o">.</span><span class="n">assertTrue</span><span class="p">(</span><span class="n">sorted_url_ascending</span><span class="o">.</span><span class="n">total</span> <span class="o">></span> <span class="mi">0</span><span class="p">,</span> <span class="s2">"Database should contain resources"</span><span class="p">)</span>
<span class="bp">self</span><span class="o">.</span><span class="n">assertTrue</span><span class="p">(</span><span class="n">sorted_url_descending</span><span class="o">.</span><span class="n">total</span> <span class="o">></span> <span class="mi">0</span><span class="p">,</span> <span class="s2">"Database should contain resources"</span><span class="p">)</span>
<span class="k">if</span> <span class="nb">len</span><span class="p">(</span><span class="n">sorted_default</span><span class="o">.</span><span class="n">_results</span><span class="p">)</span> <span class="o">></span> <span class="mi">0</span> <span class="ow">and</span> <span class="nb">len</span><span class="p">(</span><span class="n">sorted_url_ascending</span><span class="o">.</span><span class="n">_results</span><span class="p">)</span> <span class="o">></span> <span class="mi">0</span><span class="p">:</span>
<span class="n">default_urls</span> <span class="o">=</span> <span class="p">[</span><span class="n">r</span><span class="o">.</span><span class="n">url</span> <span class="k">for</span> <span class="n">r</span> <span class="ow">in</span> <span class="n">sorted_default</span><span class="o">.</span><span class="n">_results</span><span class="p">]</span>
<span class="n">ascending_urls</span> <span class="o">=</span> <span class="p">[</span><span class="n">r</span><span class="o">.</span><span class="n">url</span> <span class="k">for</span> <span class="n">r</span> <span class="ow">in</span> <span class="n">sorted_url_ascending</span><span class="o">.</span><span class="n">_results</span><span class="p">]</span>
<span class="bp">self</span><span class="o">.</span><span class="n">assertEqual</span><span class="p">(</span><span class="n">default_urls</span><span class="p">,</span> <span class="n">ascending_urls</span><span class="p">,</span> <span class="s2">"Default sort should match +url sort"</span><span class="p">)</span>
<span class="n">sorted_size_ascending</span> <span class="o">=</span> <span class="n">crawler</span><span class="o">.</span><span class="n">get_resources_api</span><span class="p">(</span><span class="n">sites</span><span class="o">=</span><span class="p">[</span><span class="n">site_id</span><span class="p">],</span> <span class="n">sort</span><span class="o">=</span><span class="s2">"+size"</span><span class="p">,</span> <span class="n">limit</span><span class="o">=</span><span class="mi">3</span><span class="p">,</span> <span class="n">fields</span><span class="o">=</span><span class="p">[</span><span class="s2">"size"</span><span class="p">])</span>
<span class="n">sorted_size_descending</span> <span class="o">=</span> <span class="n">crawler</span><span class="o">.</span><span class="n">get_resources_api</span><span class="p">(</span><span class="n">sites</span><span class="o">=</span><span class="p">[</span><span class="n">site_id</span><span class="p">],</span> <span class="n">sort</span><span class="o">=</span><span class="s2">"-size"</span><span class="p">,</span> <span class="n">limit</span><span class="o">=</span><span class="mi">3</span><span class="p">,</span> <span class="n">fields</span><span class="o">=</span><span class="p">[</span><span class="s2">"size"</span><span class="p">])</span>
<span class="k">if</span> <span class="nb">len</span><span class="p">(</span><span class="n">sorted_url_ascending</span><span class="o">.</span><span class="n">_results</span><span class="p">)</span> <span class="o">></span> <span class="mi">1</span><span class="p">:</span>
<span class="k">for</span> <span class="n">i</span> <span class="ow">in</span> <span class="nb">range</span><span class="p">(</span><span class="nb">len</span><span class="p">(</span><span class="n">sorted_url_ascending</span><span class="o">.</span><span class="n">_results</span><span class="p">)</span> <span class="o">-</span> <span class="mi">1</span><span class="p">):</span>
<span class="bp">self</span><span class="o">.</span><span class="n">assertLessEqual</span><span class="p">(</span><span class="n">sorted_url_ascending</span><span class="o">.</span><span class="n">_results</span><span class="p">[</span><span class="n">i</span><span class="p">]</span><span class="o">.</span><span class="n">url</span><span class="p">,</span>
<span class="n">sorted_url_ascending</span><span class="o">.</span><span class="n">_results</span><span class="p">[</span><span class="n">i</span> <span class="o">+</span> <span class="mi">1</span><span class="p">]</span><span class="o">.</span><span class="n">url</span><span class="p">,</span> <span class="s2">"URLs should be ascending"</span><span class="p">)</span>
<span class="k">if</span> <span class="nb">len</span><span class="p">(</span><span class="n">sorted_url_descending</span><span class="o">.</span><span class="n">_results</span><span class="p">)</span> <span class="o">></span> <span class="mi">1</span><span class="p">:</span>
<span class="k">for</span> <span class="n">i</span> <span class="ow">in</span> <span class="nb">range</span><span class="p">(</span><span class="nb">len</span><span class="p">(</span><span class="n">sorted_url_descending</span><span class="o">.</span><span class="n">_results</span><span class="p">)</span> <span class="o">-</span> <span class="mi">1</span><span class="p">):</span>
<span class="bp">self</span><span class="o">.</span><span class="n">assertGreaterEqual</span><span class="p">(</span><span class="n">sorted_url_descending</span><span class="o">.</span><span class="n">_results</span><span class="p">[</span><span class="n">i</span><span class="p">]</span><span class="o">.</span><span class="n">url</span><span class="p">,</span>
<span class="n">sorted_url_descending</span><span class="o">.</span><span class="n">_results</span><span class="p">[</span><span class="n">i</span> <span class="o">+</span> <span class="mi">1</span><span class="p">]</span><span class="o">.</span><span class="n">url</span><span class="p">,</span> <span class="s2">"URLs should be descending"</span><span class="p">)</span>
<span class="k">if</span> <span class="nb">len</span><span class="p">(</span><span class="n">sorted_size_ascending</span><span class="o">.</span><span class="n">_results</span><span class="p">)</span> <span class="o">></span> <span class="mi">1</span><span class="p">:</span>
<span class="k">for</span> <span class="n">i</span> <span class="ow">in</span> <span class="nb">range</span><span class="p">(</span><span class="nb">len</span><span class="p">(</span><span class="n">sorted_size_ascending</span><span class="o">.</span><span class="n">_results</span><span class="p">)</span> <span class="o">-</span> <span class="mi">1</span><span class="p">):</span>
<span class="bp">self</span><span class="o">.</span><span class="n">assertLessEqual</span><span class="p">(</span><span class="n">sorted_size_ascending</span><span class="o">.</span><span class="n">_results</span><span class="p">[</span><span class="n">i</span><span class="p">]</span><span class="o">.</span><span class="n">to_dict</span><span class="p">()[</span><span class="s2">"size"</span><span class="p">],</span>
<span class="n">sorted_size_ascending</span><span class="o">.</span><span class="n">_results</span><span class="p">[</span><span class="n">i</span> <span class="o">+</span> <span class="mi">1</span><span class="p">]</span><span class="o">.</span><span class="n">to_dict</span><span class="p">()[</span><span class="s2">"size"</span><span class="p">],</span> <span class="s2">"Sizes should be ascending"</span><span class="p">)</span>
<span class="k">if</span> <span class="nb">len</span><span class="p">(</span><span class="n">sorted_size_descending</span><span class="o">.</span><span class="n">_results</span><span class="p">)</span> <span class="o">></span> <span class="mi">1</span><span class="p">:</span>
<span class="k">for</span> <span class="n">i</span> <span class="ow">in</span> <span class="nb">range</span><span class="p">(</span><span class="nb">len</span><span class="p">(</span><span class="n">sorted_size_descending</span><span class="o">.</span><span class="n">_results</span><span class="p">)</span> <span class="o">-</span> <span class="mi">1</span><span class="p">):</span>
<span class="bp">self</span><span class="o">.</span><span class="n">assertGreaterEqual</span><span class="p">(</span><span class="n">sorted_size_descending</span><span class="o">.</span><span class="n">_results</span><span class="p">[</span><span class="n">i</span><span class="p">]</span><span class="o">.</span><span class="n">to_dict</span><span class="p">()[</span><span class="s2">"size"</span><span class="p">],</span>
<span class="n">sorted_size_descending</span><span class="o">.</span><span class="n">_results</span><span class="p">[</span><span class="n">i</span> <span class="o">+</span> <span class="mi">1</span><span class="p">]</span><span class="o">.</span><span class="n">to_dict</span><span class="p">()[</span><span class="s2">"size"</span><span class="p">],</span> <span class="s2">"Sizes should be descending"</span><span class="p">)</span>
<span class="n">random_1</span> <span class="o">=</span> <span class="n">crawler</span><span class="o">.</span><span class="n">get_resources_api</span><span class="p">(</span><span class="n">sites</span><span class="o">=</span><span class="p">[</span><span class="n">site_id</span><span class="p">],</span> <span class="n">sort</span><span class="o">=</span><span class="s2">"?"</span><span class="p">,</span> <span class="n">limit</span><span class="o">=</span><span class="mi">20</span><span class="p">,</span> <span class="n">fields</span><span class="o">=</span><span class="p">[])</span>
<span class="n">random_2</span> <span class="o">=</span> <span class="n">crawler</span><span class="o">.</span><span class="n">get_resources_api</span><span class="p">(</span><span class="n">sites</span><span class="o">=</span><span class="p">[</span><span class="n">site_id</span><span class="p">],</span> <span class="n">sort</span><span class="o">=</span><span class="s2">"?"</span><span class="p">,</span> <span class="n">limit</span><span class="o">=</span><span class="mi">20</span><span class="p">,</span> <span class="n">fields</span><span class="o">=</span><span class="p">[])</span>
<span class="bp">self</span><span class="o">.</span><span class="n">assertTrue</span><span class="p">(</span><span class="n">random_1</span><span class="o">.</span><span class="n">total</span> <span class="o">></span> <span class="mi">0</span><span class="p">,</span> <span class="s2">"Random sort should return results"</span><span class="p">)</span>
<span class="k">if</span> <span class="n">random_1</span><span class="o">.</span><span class="n">total</span> <span class="o">>=</span> <span class="mi">10</span><span class="p">:</span>
<span class="bp">self</span><span class="o">.</span><span class="n">assertNotEqual</span><span class="p">([</span><span class="n">r</span><span class="o">.</span><span class="n">id</span> <span class="k">for</span> <span class="n">r</span> <span class="ow">in</span> <span class="n">random_1</span><span class="o">.</span><span class="n">_results</span><span class="p">],</span> <span class="p">[</span><span class="n">r</span><span class="o">.</span><span class="n">id</span> <span class="k">for</span> <span class="n">r</span> <span class="ow">in</span> <span class="n">random_2</span><span class="o">.</span><span class="n">_results</span><span class="p">],</span>
<span class="s2">"Random sort should produce different orders"</span><span class="p">)</span>
<span class="k">else</span><span class="p">:</span>
<span class="n">logger</span><span class="o">.</span><span class="n">info</span><span class="p">(</span><span class="sa">f</span><span class="s2">"Skip randomness verification: Not enough resources (</span><span class="si">{</span><span class="n">random_1</span><span class="o">.</span><span class="n">total</span><span class="si">}</span><span class="s2">)"</span><span class="p">)</span></div>
<!--
  Viewcode block for BaseCrawlerTests.run_pragmar_content_tests: an HTML
  rendering (one span per token) of a Python test helper.
  NOTE(review): this markup appears to be build output; confirm, and make any
  real change in the Python module rather than in this page.

  What the rendered helper does:
    * Queries resources of type ResourceResultType.PAGE.value for site_id,
      requesting the "content" and "headers" fields, and asserts total is
      greater than 0.
    * For each result whose to_dict() has a "content" key, lowercases that value
      and asserts that it contains one of three markup markers, or that
      html_leniency is truthy.
    * When a result has a truthy "headers" entry, asserts that the substring
      "Content-Type:" occurs in it (a case-sensitive membership test).
    * Repeats the query with limit=1 for the SCRIPT and CSS types and asserts
      each returned resource.type equals the matching ResourceResultType member.

  NOTE(review): the content is lowercased before the membership tests, so the
  first alternative, the mixed-case DOCTYPE literal, can never match; only the
  html / meta tag alternatives or html_leniency can satisfy the assertion.
  NOTE(review): the failure message interpolates resource.content while the
  check reads the "content" key of to_dict(); confirm both refer to the same
  text.
  NOTE(review): leading indentation is not visible in this rendering, so the
  nesting of the statements that follow each "for" / "if" is assumed.
--><div class="viewcode-block" id="BaseCrawlerTests.run_pragmar_content_tests">
<a class="viewcode-back" href="../../../../mcp_server_webcrawl.crawlers.base.html#mcp_server_webcrawl.crawlers.base.tests.BaseCrawlerTests.run_pragmar_content_tests">[docs]</a>
<span class="k">def</span> <span class="nf">run_pragmar_content_tests</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">crawler</span><span class="p">:</span> <span class="n">BaseCrawler</span><span class="p">,</span> <span class="n">site_id</span><span class="p">:</span><span class="nb">int</span><span class="p">,</span> <span class="n">html_leniency</span><span class="p">:</span> <span class="nb">bool</span><span class="p">):</span>
<span class="n">html_resources</span> <span class="o">=</span> <span class="n">crawler</span><span class="o">.</span><span class="n">get_resources_api</span><span class="p">(</span>
<span class="n">sites</span><span class="o">=</span><span class="p">[</span><span class="n">site_id</span><span class="p">],</span>
<span class="n">query</span><span class="o">=</span> <span class="sa">f</span><span class="s2">"type: </span><span class="si">{</span><span class="n">ResourceResultType</span><span class="o">.</span><span class="n">PAGE</span><span class="o">.</span><span class="n">value</span><span class="si">}</span><span class="s2">"</span><span class="p">,</span>
<span class="n">fields</span><span class="o">=</span><span class="p">[</span><span class="s2">"content"</span><span class="p">,</span> <span class="s2">"headers"</span><span class="p">]</span>
<span class="p">)</span>
<span class="bp">self</span><span class="o">.</span><span class="n">assertTrue</span><span class="p">(</span><span class="n">html_resources</span><span class="o">.</span><span class="n">total</span> <span class="o">></span> <span class="mi">0</span><span class="p">,</span> <span class="s2">"Should find HTML resources"</span><span class="p">)</span>
<span class="k">for</span> <span class="n">resource</span> <span class="ow">in</span> <span class="n">html_resources</span><span class="o">.</span><span class="n">_results</span><span class="p">:</span>
<span class="n">resource_dict</span> <span class="o">=</span> <span class="n">resource</span><span class="o">.</span><span class="n">to_dict</span><span class="p">()</span>
<span class="k">if</span> <span class="s2">"content"</span> <span class="ow">in</span> <span class="n">resource_dict</span><span class="p">:</span>
<span class="n">content</span> <span class="o">=</span> <span class="n">resource_dict</span><span class="p">[</span><span class="s2">"content"</span><span class="p">]</span><span class="o">.</span><span class="n">lower</span><span class="p">()</span>
<span class="bp">self</span><span class="o">.</span><span class="n">assertTrue</span><span class="p">(</span>
<span class="s2">"<!DOCTYPE html>"</span> <span class="ow">in</span> <span class="n">content</span> <span class="ow">or</span>
<span class="s2">"<html"</span> <span class="ow">in</span> <span class="n">content</span> <span class="ow">or</span>
<span class="s2">"<meta"</span> <span class="ow">in</span> <span class="n">content</span> <span class="ow">or</span>
<span class="n">html_leniency</span><span class="p">,</span>
<span class="sa">f</span><span class="s2">"HTML content should contain HTML markup: </span><span class="si">{</span><span class="n">resource</span><span class="o">.</span><span class="n">url</span><span class="si">}</span><span class="se">\n\n</span><span class="si">{</span><span class="n">resource</span><span class="o">.</span><span class="n">content</span><span class="si">}</span><span class="s2">"</span>
<span class="p">)</span>
<span class="k">if</span> <span class="s2">"headers"</span> <span class="ow">in</span> <span class="n">resource_dict</span> <span class="ow">and</span> <span class="n">resource_dict</span><span class="p">[</span><span class="s2">"headers"</span><span class="p">]:</span>
<span class="bp">self</span><span class="o">.</span><span class="n">assertTrue</span><span class="p">(</span>
<span class="s2">"Content-Type:"</span> <span class="ow">in</span> <span class="n">resource_dict</span><span class="p">[</span><span class="s2">"headers"</span><span class="p">],</span>
<span class="sa">f</span><span class="s2">"Headers should contain Content-Type: </span><span class="si">{</span><span class="n">resource</span><span class="o">.</span><span class="n">url</span><span class="si">}</span><span class="s2">"</span>
<span class="p">)</span>
<span class="c1"># script content detection</span>
<span class="n">script_resources</span> <span class="o">=</span> <span class="n">crawler</span><span class="o">.</span><span class="n">get_resources_api</span><span class="p">(</span>
<span class="n">sites</span><span class="o">=</span><span class="p">[</span><span class="n">site_id</span><span class="p">],</span>
<span class="n">query</span><span class="o">=</span> <span class="sa">f</span><span class="s2">"type: </span><span class="si">{</span><span class="n">ResourceResultType</span><span class="o">.</span><span class="n">SCRIPT</span><span class="o">.</span><span class="n">value</span><span class="si">}</span><span class="s2">"</span><span class="p">,</span>
<span class="n">fields</span><span class="o">=</span><span class="p">[</span><span class="s2">"content"</span><span class="p">,</span> <span class="s2">"headers"</span><span class="p">],</span>
<span class="n">limit</span><span class="o">=</span><span class="mi">1</span><span class="p">,</span>
<span class="p">)</span>
<span class="k">if</span> <span class="n">script_resources</span><span class="o">.</span><span class="n">total</span> <span class="o">></span> <span class="mi">0</span><span class="p">:</span>
<span class="k">for</span> <span class="n">resource</span> <span class="ow">in</span> <span class="n">script_resources</span><span class="o">.</span><span class="n">_results</span><span class="p">:</span>
<span class="bp">self</span><span class="o">.</span><span class="n">assertEqual</span><span class="p">(</span><span class="n">resource</span><span class="o">.</span><span class="n">type</span><span class="p">,</span> <span class="n">ResourceResultType</span><span class="o">.</span><span class="n">SCRIPT</span><span class="p">)</span>
<span class="c1"># css content detection</span>
<span class="n">css_resources</span> <span class="o">=</span> <span class="n">crawler</span><span class="o">.</span><span class="n">get_resources_api</span><span class="p">(</span>
<span class="n">sites</span><span class="o">=</span><span class="p">[</span><span class="n">site_id</span><span class="p">],</span>
<span class="n">query</span><span class="o">=</span> <span class="sa">f</span><span class="s2">"type: </span><span class="si">{</span><span class="n">ResourceResultType</span><span class="o">.</span><span class="n">CSS</span><span class="o">.</span><span class="n">value</span><span class="si">}</span><span class="s2">"</span><span class="p">,</span>
<span class="n">fields</span><span class="o">=</span><span class="p">[</span><span class="s2">"content"</span><span class="p">,</span> <span class="s2">"headers"</span><span class="p">],</span>
<span class="n">limit</span><span class="o">=</span><span class="mi">1</span><span class="p">,</span>
<span class="p">)</span>
<span class="k">if</span> <span class="n">css_resources</span><span class="o">.</span><span class="n">total</span> <span class="o">></span> <span class="mi">0</span><span class="p">:</span>
<span class="k">for</span> <span class="n">resource</span> <span class="ow">in</span> <span class="n">css_resources</span><span class="o">.</span><span class="n">_results</span><span class="p">:</span>
<span class="bp">self</span><span class="o">.</span><span class="n">assertEqual</span><span class="p">(</span><span class="n">resource</span><span class="o">.</span><span class="n">type</span><span class="p">,</span> <span class="n">ResourceResultType</span><span class="o">.</span><span class="n">CSS</span><span class="p">)</span></div>
<!--
  Viewcode block for BaseCrawlerTests.run_pragmar_report: an HTML rendering
  (one span per token) of a Python helper that builds a text report.
  NOTE(review): this markup appears to be build output; confirm, and make any
  real change in the Python module rather than in this page.

  What the rendered helper does:
    * Runs six crawler.get_resources_api queries for site_id, each with
      limit=100: an empty query, one per ResourceResultType value (PAGE, CSS,
      SCRIPT, IMAGE), and the literal query "type: html AND (mcp)".
    * For each (label, result) pair appends "label: total" and then every
      result URL to report_lines, with an empty line between sections but not
      after the last one.
    * Returns a multi-line string: a starred banner holding the heading and
      datetime.now().isoformat(), followed by the joined report lines.

  NOTE(review): the "Total pages" label is attached to the unfiltered query
  (query=""), so it appears to count every resource type, not pages only.
  NOTE(review): the mcp query is written as an f-string without placeholders
  and hard-codes "html" where the sibling queries interpolate
  ResourceResultType.PAGE.value; confirm the two agree.
  NOTE(review): each section prints total but at most limit=100 URLs, so the
  listed URLs may be fewer than the count shown.
--><div class="viewcode-block" id="BaseCrawlerTests.run_pragmar_report">
<a class="viewcode-back" href="../../../../mcp_server_webcrawl.crawlers.base.html#mcp_server_webcrawl.crawlers.base.tests.BaseCrawlerTests.run_pragmar_report">[docs]</a>
<span class="k">def</span> <span class="nf">run_pragmar_report</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">crawler</span><span class="p">:</span> <span class="n">BaseCrawler</span><span class="p">,</span> <span class="n">site_id</span><span class="p">:</span> <span class="nb">int</span><span class="p">,</span> <span class="n">heading</span><span class="p">:</span> <span class="nb">str</span><span class="p">):</span>
<span class="w"> </span><span class="sd">"""</span>
<span class="sd"> Generate a comprehensive report of all resources for a site.</span>
<span class="sd"> Returns a formatted string with counts and URLs by type.</span>
<span class="sd"> """</span>
<span class="n">site_resources</span> <span class="o">=</span> <span class="n">crawler</span><span class="o">.</span><span class="n">get_resources_api</span><span class="p">(</span>
<span class="n">sites</span><span class="o">=</span><span class="p">[</span><span class="n">site_id</span><span class="p">],</span>
<span class="n">query</span><span class="o">=</span><span class="s2">""</span><span class="p">,</span>
<span class="n">limit</span><span class="o">=</span><span class="mi">100</span><span class="p">,</span>
<span class="p">)</span>
<span class="n">html_resources</span> <span class="o">=</span> <span class="n">crawler</span><span class="o">.</span><span class="n">get_resources_api</span><span class="p">(</span>
<span class="n">sites</span><span class="o">=</span><span class="p">[</span><span class="n">site_id</span><span class="p">],</span>
<span class="n">query</span><span class="o">=</span><span class="sa">f</span><span class="s2">"type: </span><span class="si">{</span><span class="n">ResourceResultType</span><span class="o">.</span><span class="n">PAGE</span><span class="o">.</span><span class="n">value</span><span class="si">}</span><span class="s2">"</span><span class="p">,</span>
<span class="n">limit</span><span class="o">=</span><span class="mi">100</span><span class="p">,</span>
<span class="p">)</span>
<span class="n">css_resources</span> <span class="o">=</span> <span class="n">crawler</span><span class="o">.</span><span class="n">get_resources_api</span><span class="p">(</span>
<span class="n">sites</span><span class="o">=</span><span class="p">[</span><span class="n">site_id</span><span class="p">],</span>
<span class="n">query</span><span class="o">=</span><span class="sa">f</span><span class="s2">"type: </span><span class="si">{</span><span class="n">ResourceResultType</span><span class="o">.</span><span class="n">CSS</span><span class="o">.</span><span class="n">value</span><span class="si">}</span><span class="s2">"</span><span class="p">,</span>
<span class="n">limit</span><span class="o">=</span><span class="mi">100</span><span class="p">,</span>
<span class="p">)</span>
<span class="n">js_resources</span> <span class="o">=</span> <span class="n">crawler</span><span class="o">.</span><span class="n">get_resources_api</span><span class="p">(</span>
<span class="n">sites</span><span class="o">=</span><span class="p">[</span><span class="n">site_id</span><span class="p">],</span>
<span class="n">query</span><span class="o">=</span><span class="sa">f</span><span class="s2">"type: </span><span class="si">{</span><span class="n">ResourceResultType</span><span class="o">.</span><span class="n">SCRIPT</span><span class="o">.</span><span class="n">value</span><span class="si">}</span><span class="s2">"</span><span class="p">,</span>
<span class="n">limit</span><span class="o">=</span><span class="mi">100</span><span class="p">,</span>
<span class="p">)</span>
<span class="n">image_resources</span> <span class="o">=</span> <span class="n">crawler</span><span class="o">.</span><span class="n">get_resources_api</span><span class="p">(</span>
<span class="n">sites</span><span class="o">=</span><span class="p">[</span><span class="n">site_id</span><span class="p">],</span>
<span class="n">query</span><span class="o">=</span><span class="sa">f</span><span class="s2">"type: </span><span class="si">{</span><span class="n">ResourceResultType</span><span class="o">.</span><span class="n">IMAGE</span><span class="o">.</span><span class="n">value</span><span class="si">}</span><span class="s2">"</span><span class="p">,</span>
<span class="n">limit</span><span class="o">=</span><span class="mi">100</span><span class="p">,</span>
<span class="p">)</span>
<span class="n">mcp_resources</span> <span class="o">=</span> <span class="n">crawler</span><span class="o">.</span><span class="n">get_resources_api</span><span class="p">(</span>
<span class="n">sites</span><span class="o">=</span><span class="p">[</span><span class="n">site_id</span><span class="p">],</span>
<span class="n">query</span><span class="o">=</span><span class="sa">f</span><span class="s2">"type: html AND (mcp)"</span><span class="p">,</span>
<span class="n">limit</span><span class="o">=</span><span class="mi">100</span><span class="p">,</span>
<span class="p">)</span>
<span class="n">report_lines</span> <span class="o">=</span> <span class="p">[]</span>
<span class="n">sections</span> <span class="o">=</span> <span class="p">[</span>
<span class="p">(</span><span class="s2">"Total pages"</span><span class="p">,</span> <span class="n">site_resources</span><span class="p">),</span>
<span class="p">(</span><span class="s2">"Total HTML"</span><span class="p">,</span> <span class="n">html_resources</span><span class="p">),</span>
<span class="p">(</span><span class="s2">"Total MCP search hits"</span><span class="p">,</span> <span class="n">mcp_resources</span><span class="p">),</span>
<span class="p">(</span><span class="s2">"Total CSS"</span><span class="p">,</span> <span class="n">css_resources</span><span class="p">),</span>
<span class="p">(</span><span class="s2">"Total JS"</span><span class="p">,</span> <span class="n">js_resources</span><span class="p">),</span>
<span class="p">(</span><span class="s2">"Total Images"</span><span class="p">,</span> <span class="n">image_resources</span><span class="p">)</span>
<span class="p">]</span>
<span class="k">for</span> <span class="n">i</span><span class="p">,</span> <span class="p">(</span><span class="n">section_name</span><span class="p">,</span> <span class="n">resource_obj</span><span class="p">)</span> <span class="ow">in</span> <span class="nb">enumerate</span><span class="p">(</span><span class="n">sections</span><span class="p">):</span>
<span class="n">report_lines</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="sa">f</span><span class="s2">"</span><span class="si">{</span><span class="n">section_name</span><span class="si">}</span><span class="s2">: </span><span class="si">{</span><span class="n">resource_obj</span><span class="o">.</span><span class="n">total</span><span class="si">}</span><span class="s2">"</span><span class="p">)</span>
<span class="k">for</span> <span class="n">resource</span> <span class="ow">in</span> <span class="n">resource_obj</span><span class="o">.</span><span class="n">_results</span><span class="p">:</span>
<span class="n">report_lines</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="n">resource</span><span class="o">.</span><span class="n">url</span><span class="p">)</span>
<span class="k">if</span> <span class="n">i</span> <span class="o"><</span> <span class="nb">len</span><span class="p">(</span><span class="n">sections</span><span class="p">)</span> <span class="o">-</span> <span class="mi">1</span><span class="p">:</span>
<span class="n">report_lines</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="s2">""</span><span class="p">)</span>
<span class="n">now</span> <span class="o">=</span> <span class="n">datetime</span><span class="o">.</span><span class="n">now</span><span class="p">()</span>
<span class="n">lines_together</span> <span class="o">=</span> <span class="s2">"</span><span class="se">\n</span><span class="s2">"</span><span class="o">.</span><span class="n">join</span><span class="p">(</span><span class="n">report_lines</span><span class="p">)</span>
<span class="k">return</span> <span class="sa">f</span><span class="s2">"""</span>
<span class="s2">**********************************************************************************</span>
<span class="s2">* </span><span class="si">{</span><span class="n">heading</span><span class="si">}</span><span class="s2"> </span><span class="si">{</span><span class="n">now</span><span class="o">.</span><span class="n">isoformat</span><span class="p">()</span><span class="si">}</span><span class="s2"> *</span>
<span class="s2">**********************************************************************************</span>
<span class="si">{</span><span class="n">lines_together</span><span class="si">}</span>
<span class="s2">"""</span></div>
<span class="k">def</span> <span class="nf">__run_pragmar_search_tests_field_status</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">crawler</span><span class="p">:</span> <span class="n">BaseCrawler</span><span class="p">,</span> <span class="n">site_id</span><span class="p">:</span> <span class="nb">int</span><span class="p">)</span> <span class="o">-></span> <span class="kc">None</span><span class="p">:</span>
<span class="c1"># status code filtering</span>
<span class="n">status_resources</span> <span class="o">=</span> <span class="n">crawler</span><span class="o">.</span><span class="n">get_resources_api</span><span class="p">(</span>
<span class="n">sites</span><span class="o">=</span><span class="p">[</span><span class="n">site_id</span><span class="p">],</span>
<span class="n">query</span><span class="o">=</span><span class="sa">f</span><span class="s2">"status: 200"</span><span class="p">,</span>
<span class="n">limit</span><span class="o">=</span><span class="mi">5</span><span class="p">,</span>
<span class="p">)</span>
<span class="bp">self</span><span class="o">.</span><span class="n">assertTrue</span><span class="p">(</span><span class="n">status_resources</span><span class="o">.</span><span class="n">total</span> <span class="o">></span> <span class="mi">0</span><span class="p">,</span> <span class="s2">"Status filtering should return results"</span><span class="p">)</span>
<span class="k">for</span> <span class="n">resource</span> <span class="ow">in</span> <span class="n">status_resources</span><span class="o">.</span><span class="n">_results</span><span class="p">:</span>
<span class="bp">self</span><span class="o">.</span><span class="n">assertEqual</span><span class="p">(</span><span class="n">resource</span><span class="o">.</span><span class="n">status</span><span class="p">,</span> <span class="mi">200</span><span class="p">)</span>
<span class="c1"># status code filtering</span>
<span class="n">appstat_resources</span> <span class="o">=</span> <span class="n">crawler</span><span class="o">.</span><span class="n">get_resources_api</span><span class="p">(</span>
<span class="n">sites</span><span class="o">=</span><span class="p">[</span><span class="n">site_id</span><span class="p">],</span>
<span class="n">query</span><span class="o">=</span><span class="sa">f</span><span class="s2">"status: 200 AND url: https://pragmar.com/appstat*"</span><span class="p">,</span>
<span class="n">limit</span><span class="o">=</span><span class="mi">5</span><span class="p">,</span>
<span class="p">)</span>
<span class="bp">self</span><span class="o">.</span><span class="n">assertTrue</span><span class="p">(</span><span class="n">appstat_resources</span><span class="o">.</span><span class="n">total</span> <span class="o">></span> <span class="mi">0</span><span class="p">,</span> <span class="s2">"Status filtering should return results"</span><span class="p">)</span>
<span class="bp">self</span><span class="o">.</span><span class="n">assertGreaterEqual</span><span class="p">(</span><span class="nb">len</span><span class="p">(</span><span class="n">appstat_resources</span><span class="o">.</span><span class="n">_results</span><span class="p">),</span> <span class="mi">3</span><span class="p">,</span> <span class="sa">f</span><span class="s2">"Should have at least 3 results in appstat resources"</span><span class="p">)</span>
<span class="c1"># multiple status codes</span>
<span class="n">multi_status_resources</span> <span class="o">=</span> <span class="n">crawler</span><span class="o">.</span><span class="n">get_resources_api</span><span class="p">(</span>
<span class="n">query</span><span class="o">=</span><span class="sa">f</span><span class="s2">"status: 200 OR status: 404"</span><span class="p">,</span>
<span class="p">)</span>
<span class="k">if</span> <span class="n">multi_status_resources</span><span class="o">.</span><span class="n">total</span> <span class="o">></span> <span class="mi">0</span><span class="p">:</span>
<span class="n">found_statuses</span> <span class="o">=</span> <span class="p">{</span><span class="n">r</span><span class="o">.</span><span class="n">status</span> <span class="k">for</span> <span class="n">r</span> <span class="ow">in</span> <span class="n">multi_status_resources</span><span class="o">.</span><span class="n">_results</span><span class="p">}</span>
<span class="k">for</span> <span class="n">status</span> <span class="ow">in</span> <span class="n">found_statuses</span><span class="p">:</span>
<span class="bp">self</span><span class="o">.</span><span class="n">assertIn</span><span class="p">(</span><span class="n">status</span><span class="p">,</span> <span class="p">[</span><span class="mi">200</span><span class="p">,</span> <span class="mi">404</span><span class="p">])</span>
<span class="k">def</span> <span class="nf">__run_pragmar_search_tests_field_headers</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">crawler</span><span class="p">:</span> <span class="n">BaseCrawler</span><span class="p">,</span> <span class="n">site_id</span><span class="p">:</span> <span class="nb">int</span><span class="p">)</span> <span class="o">-></span> <span class="kc">None</span><span class="p">:</span>
<span class="c1"># supported crawls only (genuine headers data)</span>
<span class="k">if</span> <span class="ow">not</span> <span class="bp">self</span><span class="o">.</span><span class="vm">__class__</span><span class="o">.</span><span class="vm">__name__</span> <span class="ow">in</span> <span class="p">(</span><span class="s2">"InterroBotTests"</span><span class="p">,</span><span class="s2">"KatanaTests"</span><span class="p">,</span> <span class="s2">"WarcTests"</span><span class="p">):</span>
<span class="k">return</span>
<span class="n">appstat_any</span> <span class="o">=</span> <span class="n">crawler</span><span class="o">.</span><span class="n">get_resources_api</span><span class="p">(</span>
<span class="n">sites</span><span class="o">=</span><span class="p">[</span><span class="n">site_id</span><span class="p">],</span>
<span class="n">query</span><span class="o">=</span><span class="sa">f</span><span class="s2">"appstat"</span><span class="p">,</span>
<span class="n">extras</span><span class="o">=</span><span class="p">[],</span>
<span class="n">limit</span><span class="o">=</span><span class="mi">1</span><span class="p">,</span>
<span class="p">)</span>
<span class="n">appstat_headers_js</span> <span class="o">=</span> <span class="n">crawler</span><span class="o">.</span><span class="n">get_resources_api</span><span class="p">(</span>
<span class="n">sites</span><span class="o">=</span><span class="p">[</span><span class="n">site_id</span><span class="p">],</span>
<span class="n">query</span><span class="o">=</span><span class="sa">f</span><span class="s2">"appstat AND headers: javascript"</span><span class="p">,</span>
<span class="n">extras</span><span class="o">=</span><span class="p">[],</span>
<span class="n">limit</span><span class="o">=</span><span class="mi">1</span><span class="p">,</span>
<span class="p">)</span>
<span class="c1"># https://pragmar.com/media/static/scripts/js/appstat.min.js</span>
<span class="bp">self</span><span class="o">.</span><span class="n">assertEqual</span><span class="p">(</span><span class="n">appstat_headers_js</span><span class="o">.</span><span class="n">total</span><span class="p">,</span> <span class="mi">1</span><span class="p">,</span> <span class="s2">"Should have exactly one resource in database (appstat.min.js)"</span><span class="p">)</span>
<span class="n">appstat_headers_nojs</span> <span class="o">=</span> <span class="n">crawler</span><span class="o">.</span><span class="n">get_resources_api</span><span class="p">(</span>
<span class="n">sites</span><span class="o">=</span><span class="p">[</span><span class="n">site_id</span><span class="p">],</span>
<span class="n">query</span><span class="o">=</span><span class="sa">f</span><span class="s2">"appstat NOT headers: javascript"</span><span class="p">,</span>
<span class="n">extras</span><span class="o">=</span><span class="p">[],</span>
<span class="n">limit</span><span class="o">=</span><span class="mi">1</span><span class="p">,</span>
<span class="p">)</span>
<span class="bp">self</span><span class="o">.</span><span class="n">assertGreater</span><span class="p">(</span><span class="n">appstat_headers_nojs</span><span class="o">.</span><span class="n">total</span><span class="p">,</span> <span class="mi">1</span><span class="p">,</span> <span class="s2">"Should have many appstat non-js resources in database"</span><span class="p">)</span>
<span class="n">appstat_sum</span><span class="p">:</span> <span class="nb">int</span> <span class="o">=</span> <span class="n">appstat_headers_js</span><span class="o">.</span><span class="n">total</span> <span class="o">+</span> <span class="n">appstat_headers_nojs</span><span class="o">.</span><span class="n">total</span>
<span class="bp">self</span><span class="o">.</span><span class="n">assertEqual</span><span class="p">(</span><span class="n">appstat_sum</span><span class="p">,</span> <span class="n">appstat_any</span><span class="o">.</span><span class="n">total</span><span class="p">,</span> <span class="s2">"appstat non-js + js resources should sum to all appstat"</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">__run_pragmar_search_tests_field_content</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">crawler</span><span class="p">:</span> <span class="n">BaseCrawler</span><span class="p">,</span> <span class="n">site_id</span><span class="p">:</span> <span class="nb">int</span><span class="p">)</span> <span class="o">-></span> <span class="kc">None</span><span class="p">:</span>
<span class="n">mcp_any</span> <span class="o">=</span> <span class="n">crawler</span><span class="o">.</span><span class="n">get_resources_api</span><span class="p">(</span>
<span class="n">sites</span><span class="o">=</span><span class="p">[</span><span class="n">site_id</span><span class="p">],</span>
<span class="n">query</span><span class="o">=</span><span class="sa">f</span><span class="s2">"mcp"</span><span class="p">,</span>
<span class="n">extras</span><span class="o">=</span><span class="p">[],</span>
<span class="n">limit</span><span class="o">=</span><span class="mi">1</span><span class="p">,</span>
<span class="p">)</span>
<span class="n">mcp_content_configuration</span> <span class="o">=</span> <span class="n">crawler</span><span class="o">.</span><span class="n">get_resources_api</span><span class="p">(</span>
<span class="n">sites</span><span class="o">=</span><span class="p">[</span><span class="n">site_id</span><span class="p">],</span>
<span class="n">query</span><span class="o">=</span><span class="sa">f</span><span class="s2">"mcp AND content: configuration"</span><span class="p">,</span>
<span class="n">extras</span><span class="o">=</span><span class="p">[],</span>
<span class="n">limit</span><span class="o">=</span><span class="mi">1</span><span class="p">,</span>
<span class="p">)</span>
<span class="c1"># https://pragmar.com/mcp-server-webcrawl/</span>
<span class="bp">self</span><span class="o">.</span><span class="n">assertGreaterEqual</span><span class="p">(</span><span class="n">mcp_content_configuration</span><span class="o">.</span><span class="n">total</span><span class="p">,</span> <span class="mi">1</span><span class="p">,</span> <span class="s2">"Should have one, possibly more resources (mcp-server-webcrawl)"</span><span class="p">)</span>
<span class="n">mcp_content_no_configuration</span> <span class="o">=</span> <span class="n">crawler</span><span class="o">.</span><span class="n">get_resources_api</span><span class="p">(</span>
<span class="n">sites</span><span class="o">=</span><span class="p">[</span><span class="n">site_id</span><span class="p">],</span>
<span class="n">query</span><span class="o">=</span><span class="sa">f</span><span class="s2">"mcp NOT content: configuration"</span><span class="p">,</span>
<span class="n">extras</span><span class="o">=</span><span class="p">[],</span>
<span class="n">limit</span><span class="o">=</span><span class="mi">1</span><span class="p">,</span>
<span class="p">)</span>
<span class="bp">self</span><span class="o">.</span><span class="n">assertGreater</span><span class="p">(</span><span class="n">mcp_content_no_configuration</span><span class="o">.</span><span class="n">total</span><span class="p">,</span> <span class="mi">1</span><span class="p">,</span> <span class="s2">"Should have many mcp non-configuration resources"</span><span class="p">)</span>
<span class="n">mcp_sum</span><span class="p">:</span> <span class="nb">int</span> <span class="o">=</span> <span class="n">mcp_content_configuration</span><span class="o">.</span><span class="n">total</span> <span class="o">+</span> <span class="n">mcp_content_no_configuration</span><span class="o">.</span><span class="n">total</span>
<span class="bp">self</span><span class="o">.</span><span class="n">assertEqual</span><span class="p">(</span><span class="n">mcp_sum</span><span class="p">,</span> <span class="n">mcp_any</span><span class="o">.</span><span class="n">total</span><span class="p">,</span> <span class="s2">"mcp non-config + config resources should sum to all mcp"</span><span class="p">)</span>
<span class="n">mcp_html_content_config</span> <span class="o">=</span> <span class="n">crawler</span><span class="o">.</span><span class="n">get_resources_api</span><span class="p">(</span>
<span class="n">sites</span><span class="o">=</span><span class="p">[</span><span class="n">site_id</span><span class="p">],</span>
<span class="n">query</span><span class="o">=</span><span class="sa">f</span><span class="s2">"type: html AND mcp AND content: configuration"</span><span class="p">,</span>
<span class="n">extras</span><span class="o">=</span><span class="p">[],</span>
<span class="n">limit</span><span class="o">=</span><span class="mi">1</span><span class="p">,</span>
<span class="p">)</span>
<span class="bp">self</span><span class="o">.</span><span class="n">assertTrue</span><span class="p">(</span>
<span class="n">mcp_html_content_config</span><span class="o">.</span><span class="n">total</span> <span class="o"><=</span> <span class="n">mcp_content_configuration</span><span class="o">.</span><span class="n">total</span><span class="p">,</span>
<span class="s2">"Adding type constraint should not increase results"</span>
<span class="p">)</span>
<span class="n">wildcard_content_search</span> <span class="o">=</span> <span class="n">crawler</span><span class="o">.</span><span class="n">get_resources_api</span><span class="p">(</span>
<span class="n">sites</span><span class="o">=</span><span class="p">[</span><span class="n">site_id</span><span class="p">],</span>
<span class="n">query</span><span class="o">=</span><span class="sa">f</span><span class="s1">'content: config*'</span><span class="p">,</span>
<span class="n">extras</span><span class="o">=</span><span class="p">[],</span>
<span class="n">limit</span><span class="o">=</span><span class="mi">1</span><span class="p">,</span>
<span class="p">)</span>
<span class="n">exact_config_search</span> <span class="o">=</span> <span class="n">crawler</span><span class="o">.</span><span class="n">get_resources_api</span><span class="p">(</span>
<span class="n">sites</span><span class="o">=</span><span class="p">[</span><span class="n">site_id</span><span class="p">],</span>
<span class="n">query</span><span class="o">=</span><span class="sa">f</span><span class="s1">'content: configuration'</span><span class="p">,</span>
<span class="n">extras</span><span class="o">=</span><span class="p">[],</span>
<span class="n">limit</span><span class="o">=</span><span class="mi">1</span><span class="p">,</span>
<span class="p">)</span>
<span class="bp">self</span><span class="o">.</span><span class="n">assertTrue</span><span class="p">(</span>
<span class="n">wildcard_content_search</span><span class="o">.</span><span class="n">total</span> <span class="o">>=</span> <span class="n">exact_config_search</span><span class="o">.</span><span class="n">total</span><span class="p">,</span>
<span class="s2">"Wildcard content search should return at least as many results as exact match"</span>
<span class="p">)</span>
<span class="k">def</span> <span class="nf">__run_pragmar_search_tests_field_type</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">crawler</span><span class="p">:</span> <span class="n">BaseCrawler</span><span class="p">,</span> <span class="n">site_id</span><span class="p">:</span> <span class="nb">int</span><span class="p">,</span> <span class="n">site_resources</span><span class="p">:</span><span class="n">BaseJsonApi</span><span class="p">)</span> <span class="o">-></span> <span class="kc">None</span><span class="p">:</span>
<span class="n">html_resources</span> <span class="o">=</span> <span class="n">crawler</span><span class="o">.</span><span class="n">get_resources_api</span><span class="p">(</span>
<span class="n">sites</span><span class="o">=</span><span class="p">[</span><span class="n">site_id</span><span class="p">],</span>
<span class="n">query</span><span class="o">=</span><span class="s2">"type: html"</span><span class="p">,</span>
<span class="n">extras</span><span class="o">=</span><span class="p">[],</span>
<span class="n">limit</span><span class="o">=</span><span class="mi">1</span><span class="p">,</span>
<span class="p">)</span>
<span class="c1"># page count varies by crawler, 10 is conservative low end</span>
<span class="bp">self</span><span class="o">.</span><span class="n">assertGreater</span><span class="p">(</span><span class="n">html_resources</span><span class="o">.</span><span class="n">total</span><span class="p">,</span> <span class="mi">10</span><span class="p">,</span> <span class="s2">"Should have greater than 10 HTML resources"</span><span class="p">)</span>
<span class="n">not_html_resources</span> <span class="o">=</span> <span class="n">crawler</span><span class="o">.</span><span class="n">get_resources_api</span><span class="p">(</span>
<span class="n">sites</span><span class="o">=</span><span class="p">[</span><span class="n">site_id</span><span class="p">],</span>
<span class="n">query</span><span class="o">=</span><span class="s2">"NOT type: html"</span><span class="p">,</span>
<span class="n">extras</span><span class="o">=</span><span class="p">[],</span>
<span class="n">limit</span><span class="o">=</span><span class="mi">1</span><span class="p">,</span>
<span class="p">)</span>
<span class="c1"># wget is HTML-only fixture</span>
<span class="bp">self</span><span class="o">.</span><span class="n">assertGreater</span><span class="p">(</span><span class="n">not_html_resources</span><span class="o">.</span><span class="n">total</span><span class="p">,</span> <span class="mi">10</span><span class="p">,</span> <span class="s2">"Should have greater than 10 non-HTML resources"</span><span class="p">)</span>
<span class="n">html_sum</span><span class="p">:</span> <span class="nb">int</span> <span class="o">=</span> <span class="n">html_resources</span><span class="o">.</span><span class="n">total</span> <span class="o">+</span> <span class="n">not_html_resources</span><span class="o">.</span><span class="n">total</span>
<span class="bp">self</span><span class="o">.</span><span class="n">assertEqual</span><span class="p">(</span><span class="n">html_sum</span><span class="p">,</span> <span class="n">site_resources</span><span class="o">.</span><span class="n">total</span><span class="p">,</span> <span class="s2">"HTML + non-HTML should sum to all resources"</span><span class="p">)</span>
<span class="c1"># keyword + type combination</span>
<span class="n">appstat_any</span> <span class="o">=</span> <span class="n">crawler</span><span class="o">.</span><span class="n">get_resources_api</span><span class="p">(</span>
<span class="n">sites</span><span class="o">=</span><span class="p">[</span><span class="n">site_id</span><span class="p">],</span>
<span class="n">query</span><span class="o">=</span><span class="s2">"appstat"</span><span class="p">,</span>
<span class="n">limit</span><span class="o">=</span><span class="mi">10</span><span class="p">,</span>
<span class="p">)</span>
<span class="n">appstat_script</span> <span class="o">=</span> <span class="n">crawler</span><span class="o">.</span><span class="n">get_resources_api</span><span class="p">(</span>
<span class="n">sites</span><span class="o">=</span><span class="p">[</span><span class="n">site_id</span><span class="p">],</span>
<span class="n">query</span><span class="o">=</span><span class="s2">"appstat AND type: script"</span><span class="p">,</span>
<span class="n">extras</span><span class="o">=</span><span class="p">[],</span>
<span class="n">limit</span><span class="o">=</span><span class="mi">1</span><span class="p">,</span>
<span class="p">)</span>
<span class="c1"># https://pragmar.com/media/static/scripts/js/appstat.min.js</span>
<span class="bp">self</span><span class="o">.</span><span class="n">assertEqual</span><span class="p">(</span><span class="n">appstat_script</span><span class="o">.</span><span class="n">total</span><span class="p">,</span> <span class="mi">1</span><span class="p">,</span> <span class="s2">"Should have exactly one appstat script (appstat.min.js)"</span><span class="p">)</span>
<span class="n">appstat_not_script</span> <span class="o">=</span> <span class="n">crawler</span><span class="o">.</span><span class="n">get_resources_api</span><span class="p">(</span>
<span class="n">sites</span><span class="o">=</span><span class="p">[</span><span class="n">site_id</span><span class="p">],</span>
<span class="n">query</span><span class="o">=</span><span class="s2">"appstat NOT type: script"</span><span class="p">,</span>
<span class="n">extras</span><span class="o">=</span><span class="p">[],</span>
<span class="n">limit</span><span class="o">=</span><span class="mi">1</span><span class="p">,</span>
<span class="p">)</span>
<span class="bp">self</span><span class="o">.</span><span class="n">assertGreater</span><span class="p">(</span><span class="n">appstat_not_script</span><span class="o">.</span><span class="n">total</span><span class="p">,</span> <span class="mi">1</span><span class="p">,</span> <span class="s2">"Should have many appstat non-script resources"</span><span class="p">)</span>
<span class="n">appstat_sum</span><span class="p">:</span> <span class="nb">int</span> <span class="o">=</span> <span class="n">appstat_script</span><span class="o">.</span><span class="n">total</span> <span class="o">+</span> <span class="n">appstat_not_script</span><span class="o">.</span><span class="n">total</span>
<span class="bp">self</span><span class="o">.</span><span class="n">assertEqual</span><span class="p">(</span><span class="n">appstat_sum</span><span class="p">,</span> <span class="n">appstat_any</span><span class="o">.</span><span class="n">total</span><span class="p">,</span> <span class="s2">"appstat script + non-script should sum to all appstat"</span><span class="p">)</span>
<span class="c1"># type OR combinations</span>
<span class="n">html_or_img</span> <span class="o">=</span> <span class="n">crawler</span><span class="o">.</span><span class="n">get_resources_api</span><span class="p">(</span>
<span class="n">sites</span><span class="o">=</span><span class="p">[</span><span class="n">site_id</span><span class="p">],</span>
<span class="n">query</span><span class="o">=</span><span class="s2">"type: html OR type: img"</span><span class="p">,</span>
<span class="n">extras</span><span class="o">=</span><span class="p">[],</span>
<span class="n">limit</span><span class="o">=</span><span class="mi">1</span><span class="p">,</span>
<span class="p">)</span>
<span class="bp">self</span><span class="o">.</span><span class="n">assertGreater</span><span class="p">(</span><span class="n">html_or_img</span><span class="o">.</span><span class="n">total</span><span class="p">,</span> <span class="mi">20</span><span class="p">,</span> <span class="s2">"HTML + IMG should be greater than 20 resources"</span><span class="p">)</span>
<span class="n">img_resources</span> <span class="o">=</span> <span class="n">crawler</span><span class="o">.</span><span class="n">get_resources_api</span><span class="p">(</span>
<span class="n">sites</span><span class="o">=</span><span class="p">[</span><span class="n">site_id</span><span class="p">],</span>
<span class="n">query</span><span class="o">=</span><span class="s2">"type: img"</span><span class="p">,</span>
<span class="n">extras</span><span class="o">=</span><span class="p">[],</span>
<span class="n">limit</span><span class="o">=</span><span class="mi">1</span><span class="p">,</span>
<span class="p">)</span>
<span class="bp">self</span><span class="o">.</span><span class="n">assertTrue</span><span class="p">(</span>
<span class="n">html_or_img</span><span class="o">.</span><span class="n">total</span> <span class="o">>=</span> <span class="n">html_resources</span><span class="o">.</span><span class="n">total</span><span class="p">,</span>
<span class="s2">"OR should include all HTML resources"</span>
<span class="p">)</span>
<span class="bp">self</span><span class="o">.</span><span class="n">assertTrue</span><span class="p">(</span>
<span class="n">html_or_img</span><span class="o">.</span><span class="n">total</span> <span class="o">>=</span> <span class="n">img_resources</span><span class="o">.</span><span class="n">total</span><span class="p">,</span>
<span class="s2">"OR should include all IMG resources"</span>
<span class="p">)</span>
<span class="c1"># combined filtering</span>
<span class="n">combined_resources</span> <span class="o">=</span> <span class="n">crawler</span><span class="o">.</span><span class="n">get_resources_api</span><span class="p">(</span>
<span class="n">sites</span><span class="o">=</span><span class="p">[</span><span class="n">site_id</span><span class="p">],</span>
<span class="n">query</span><span class="o">=</span> <span class="sa">f</span><span class="s2">"style AND type: </span><span class="si">{</span><span class="n">ResourceResultType</span><span class="o">.</span><span class="n">PAGE</span><span class="o">.</span><span class="n">value</span><span class="si">}</span><span class="s2">"</span><span class="p">,</span>
<span class="n">fields</span><span class="o">=</span><span class="p">[],</span>
<span class="n">sort</span><span class="o">=</span><span class="s2">"+url"</span><span class="p">,</span>
<span class="n">limit</span><span class="o">=</span><span class="mi">3</span><span class="p">,</span>
<span class="p">)</span>
<span class="k">if</span> <span class="n">combined_resources</span><span class="o">.</span><span class="n">total</span> <span class="o">></span> <span class="mi">0</span><span class="p">:</span>
<span class="k">for</span> <span class="n">resource</span> <span class="ow">in</span> <span class="n">combined_resources</span><span class="o">.</span><span class="n">_results</span><span class="p">:</span>
<span class="bp">self</span><span class="o">.</span><span class="n">assertEqual</span><span class="p">(</span><span class="n">resource</span><span class="o">.</span><span class="n">site</span><span class="p">,</span> <span class="n">site_id</span><span class="p">)</span>
<span class="bp">self</span><span class="o">.</span><span class="n">assertEqual</span><span class="p">(</span><span class="n">resource</span><span class="o">.</span><span class="n">type</span><span class="p">,</span> <span class="n">ResourceResultType</span><span class="o">.</span><span class="n">PAGE</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">__run_pragmar_search_tests_fulltext</span><span class="p">(</span>
<span class="bp">self</span><span class="p">,</span>
<span class="n">crawler</span><span class="p">:</span> <span class="n">BaseCrawler</span><span class="p">,</span>
<span class="n">site_id</span><span class="p">:</span> <span class="nb">int</span><span class="p">,</span>
<span class="n">site_resources</span><span class="p">:</span><span class="n">BaseJsonApi</span>
<span class="p">)</span> <span class="o">-></span> <span class="kc">None</span><span class="p">:</span>
<span class="c1"># Boolean workout</span>
<span class="c1"># result counts are fragile, intersections should not be</span>
<span class="c1"># counts are worth the fragility, for now</span>
<span class="n">boolean_primary_resources</span> <span class="o">=</span> <span class="n">crawler</span><span class="o">.</span><span class="n">get_resources_api</span><span class="p">(</span>
<span class="n">sites</span><span class="o">=</span><span class="p">[</span><span class="n">site_id</span><span class="p">],</span>
<span class="n">query</span><span class="o">=</span><span class="sa">f</span><span class="s2">"type: html AND (</span><span class="si">{</span><span class="bp">self</span><span class="o">.</span><span class="n">__PRAGMAR_PRIMARY_KEYWORD</span><span class="si">}</span><span class="s2">)"</span><span class="p">,</span>
<span class="n">limit</span><span class="o">=</span><span class="mi">4</span><span class="p">,</span>
<span class="p">)</span>
<span class="c1"># varies by crawler, katana doesn't crawl /help/ depth by default</span>
<span class="bp">self</span><span class="o">.</span><span class="n">assertTrue</span><span class="p">(</span><span class="n">boolean_primary_resources</span> <span class="o">.</span><span class="n">total</span> <span class="o">></span> <span class="mi">0</span><span class="p">,</span> <span class="sa">f</span><span class="s2">"Primary search should return results"</span><span class="p">)</span>
<span class="n">boolean_secondary_resources</span> <span class="o">=</span> <span class="n">crawler</span><span class="o">.</span><span class="n">get_resources_api</span><span class="p">(</span>
<span class="n">sites</span><span class="o">=</span><span class="p">[</span><span class="n">site_id</span><span class="p">],</span>
<span class="n">query</span><span class="o">=</span><span class="sa">f</span><span class="s2">"type: html AND (</span><span class="si">{</span><span class="bp">self</span><span class="o">.</span><span class="n">__PRAGMAR_SECONDARY_KEYWORD</span><span class="si">}</span><span class="s2">)"</span><span class="p">,</span>
<span class="n">limit</span><span class="o">=</span><span class="mi">12</span><span class="p">,</span>
<span class="p">)</span>
<span class="c1"># re: all these > 0 checks, result counts vary by crawler, all have default crawl behaviors/depths/externals</span>
<span class="bp">self</span><span class="o">.</span><span class="n">assertTrue</span><span class="p">(</span><span class="n">boolean_secondary_resources</span><span class="o">.</span><span class="n">total</span> <span class="o">></span> <span class="mi">0</span><span class="p">,</span> <span class="sa">f</span><span class="s2">"Secondary search should return results"</span><span class="p">)</span>
<span class="c1"># AND</span>
<span class="n">primary_and_secondary_resources</span> <span class="o">=</span> <span class="n">crawler</span><span class="o">.</span><span class="n">get_resources_api</span><span class="p">(</span>
<span class="n">sites</span><span class="o">=</span><span class="p">[</span><span class="n">site_id</span><span class="p">],</span>
<span class="n">query</span><span class="o">=</span><span class="sa">f</span><span class="s2">"type: html AND (</span><span class="si">{</span><span class="bp">self</span><span class="o">.</span><span class="n">__PRAGMAR_PRIMARY_KEYWORD</span><span class="si">}</span><span class="s2"> AND </span><span class="si">{</span><span class="bp">self</span><span class="o">.</span><span class="n">__PRAGMAR_SECONDARY_KEYWORD</span><span class="si">}</span><span class="s2">)"</span><span class="p">,</span>
<span class="n">limit</span><span class="o">=</span><span class="mi">1</span><span class="p">,</span>
<span class="p">)</span>
<span class="bp">self</span><span class="o">.</span><span class="n">assertTrue</span><span class="p">(</span><span class="n">primary_and_secondary_resources</span><span class="o">.</span><span class="n">total</span> <span class="o">>=</span> <span class="mi">0</span><span class="p">,</span> <span class="sa">f</span><span class="s2">"Primary AND Secondary should return results"</span><span class="p">)</span>
<span class="c1"># OR</span>
<span class="n">primary_or_secondary_resources</span> <span class="o">=</span> <span class="n">crawler</span><span class="o">.</span><span class="n">get_resources_api</span><span class="p">(</span>
<span class="n">sites</span><span class="o">=</span><span class="p">[</span><span class="n">site_id</span><span class="p">],</span>
<span class="n">query</span><span class="o">=</span><span class="sa">f</span><span class="s2">"type: html AND (</span><span class="si">{</span><span class="bp">self</span><span class="o">.</span><span class="n">__PRAGMAR_PRIMARY_KEYWORD</span><span class="si">}</span><span class="s2"> OR </span><span class="si">{</span><span class="bp">self</span><span class="o">.</span><span class="n">__PRAGMAR_SECONDARY_KEYWORD</span><span class="si">}</span><span class="s2">)"</span><span class="p">,</span>
<span class="n">limit</span><span class="o">=</span><span class="mi">1</span><span class="p">,</span>
<span class="p">)</span>
<span class="bp">self</span><span class="o">.</span><span class="n">assertTrue</span><span class="p">(</span><span class="n">primary_or_secondary_resources</span><span class="o">.</span><span class="n">total</span> <span class="o">></span> <span class="mi">0</span><span class="p">,</span> <span class="sa">f</span><span class="s2">"Primary OR Secondary should return results (union)"</span><span class="p">)</span>
<span class="c1"># NOT</span>
<span class="n">primary_not_secondary_resources</span> <span class="o">=</span> <span class="n">crawler</span><span class="o">.</span><span class="n">get_resources_api</span><span class="p">(</span>
<span class="n">sites</span><span class="o">=</span><span class="p">[</span><span class="n">site_id</span><span class="p">],</span>
<span class="n">query</span><span class="o">=</span><span class="sa">f</span><span class="s2">"type: html AND (</span><span class="si">{</span><span class="bp">self</span><span class="o">.</span><span class="n">__PRAGMAR_PRIMARY_KEYWORD</span><span class="si">}</span><span class="s2"> NOT </span><span class="si">{</span><span class="bp">self</span><span class="o">.</span><span class="n">__PRAGMAR_SECONDARY_KEYWORD</span><span class="si">}</span><span class="s2">)"</span><span class="p">,</span>
<span class="n">limit</span><span class="o">=</span><span class="mi">1</span><span class="p">,</span>
<span class="p">)</span>
<span class="n">secondary_not_primary_resources</span> <span class="o">=</span> <span class="n">crawler</span><span class="o">.</span><span class="n">get_resources_api</span><span class="p">(</span>
<span class="n">sites</span><span class="o">=</span><span class="p">[</span><span class="n">site_id</span><span class="p">],</span>
<span class="n">query</span><span class="o">=</span><span class="sa">f</span><span class="s2">"type: html AND (</span><span class="si">{</span><span class="bp">self</span><span class="o">.</span><span class="n">__PRAGMAR_SECONDARY_KEYWORD</span><span class="si">}</span><span class="s2"> NOT </span><span class="si">{</span><span class="bp">self</span><span class="o">.</span><span class="n">__PRAGMAR_PRIMARY_KEYWORD</span><span class="si">}</span><span class="s2">)"</span><span class="p">,</span>
<span class="n">limit</span><span class="o">=</span><span class="mi">1</span><span class="p">,</span>
<span class="p">)</span>
<span class="bp">self</span><span class="o">.</span><span class="n">assertTrue</span><span class="p">(</span><span class="n">secondary_not_primary_resources</span><span class="o">.</span><span class="n">total</span> <span class="o">>=</span> <span class="mi">0</span><span class="p">,</span> <span class="sa">f</span><span class="s2">"Secondary NOT Primary should return results"</span><span class="p">)</span>
<span class="c1"># logical relationships</span>
<span class="bp">self</span><span class="o">.</span><span class="n">assertEqual</span><span class="p">(</span>
<span class="n">primary_and_secondary_resources</span><span class="o">.</span><span class="n">total</span><span class="p">,</span>
<span class="n">boolean_primary_resources</span> <span class="o">.</span><span class="n">total</span> <span class="o">+</span> <span class="n">boolean_secondary_resources</span><span class="o">.</span><span class="n">total</span> <span class="o">-</span> <span class="n">primary_or_secondary_resources</span><span class="o">.</span><span class="n">total</span><span class="p">,</span>
<span class="s2">"Intersection should equal A + B - Union (inclusion-exclusion principle)"</span>
<span class="p">)</span>
<span class="bp">self</span><span class="o">.</span><span class="n">assertEqual</span><span class="p">(</span>
<span class="n">primary_not_secondary_resources</span><span class="o">.</span><span class="n">total</span> <span class="o">+</span> <span class="n">primary_and_secondary_resources</span><span class="o">.</span><span class="n">total</span><span class="p">,</span>
<span class="n">boolean_primary_resources</span> <span class="o">.</span><span class="n">total</span><span class="p">,</span>
<span class="s2">"Primary NOT Secondary + Primary AND Secondary should equal total Primary results"</span>
<span class="p">)</span>
<span class="bp">self</span><span class="o">.</span><span class="n">assertEqual</span><span class="p">(</span>
<span class="n">secondary_not_primary_resources</span><span class="o">.</span><span class="n">total</span> <span class="o">+</span> <span class="n">primary_and_secondary_resources</span><span class="o">.</span><span class="n">total</span><span class="p">,</span>
<span class="n">boolean_secondary_resources</span><span class="o">.</span><span class="n">total</span><span class="p">,</span>
<span class="s2">"Secondary NOT Primary + Primary AND Secondary should equal total Secondary results"</span>
<span class="p">)</span>
<span class="bp">self</span><span class="o">.</span><span class="n">assertEqual</span><span class="p">(</span>
<span class="n">primary_not_secondary_resources</span><span class="o">.</span><span class="n">total</span> <span class="o">+</span> <span class="n">secondary_not_primary_resources</span><span class="o">.</span><span class="n">total</span> <span class="o">+</span> <span class="n">primary_and_secondary_resources</span><span class="o">.</span><span class="n">total</span><span class="p">,</span>
<span class="n">primary_or_secondary_resources</span><span class="o">.</span><span class="n">total</span><span class="p">,</span>
<span class="s2">"Sum of exclusive sets plus intersection should equal union"</span>
<span class="p">)</span>
<span class="c1"># complex boolean with field constraints</span>
<span class="n">primary_and_html_resources</span> <span class="o">=</span> <span class="n">crawler</span><span class="o">.</span><span class="n">get_resources_api</span><span class="p">(</span>
<span class="n">sites</span><span class="o">=</span><span class="p">[</span><span class="n">site_id</span><span class="p">],</span>
<span class="n">query</span><span class="o">=</span><span class="sa">f</span><span class="s2">"type: html AND (</span><span class="si">{</span><span class="bp">self</span><span class="o">.</span><span class="n">__PRAGMAR_PRIMARY_KEYWORD</span><span class="si">}</span><span class="s2">)"</span><span class="p">,</span>
<span class="n">limit</span><span class="o">=</span><span class="mi">1</span><span class="p">,</span>
<span class="p">)</span>
<span class="bp">self</span><span class="o">.</span><span class="n">assertTrue</span><span class="p">(</span><span class="n">primary_and_html_resources</span><span class="o">.</span><span class="n">total</span> <span class="o">></span> <span class="mi">0</span><span class="p">,</span> <span class="sa">f</span><span class="s2">"Primary AND type:html should return results"</span><span class="p">)</span>
<span class="bp">self</span><span class="o">.</span><span class="n">assertTrue</span><span class="p">(</span>
<span class="n">primary_and_html_resources</span><span class="o">.</span><span class="n">total</span> <span class="o"><=</span> <span class="n">boolean_primary_resources</span> <span class="o">.</span><span class="n">total</span><span class="p">,</span>
<span class="s2">"Adding AND constraints should not increase result count"</span>
<span class="p">)</span>
<span class="c1"># Parentheses grouping</span>
<span class="n">grouped_resources</span> <span class="o">=</span> <span class="n">crawler</span><span class="o">.</span><span class="n">get_resources_api</span><span class="p">(</span>
<span class="n">sites</span><span class="o">=</span><span class="p">[</span><span class="n">site_id</span><span class="p">],</span>
<span class="n">query</span><span class="o">=</span><span class="sa">f</span><span class="s2">"type: html AND (</span><span class="si">{</span><span class="bp">self</span><span class="o">.</span><span class="n">__PRAGMAR_PRIMARY_KEYWORD</span><span class="si">}</span><span class="s2"> OR </span><span class="si">{</span><span class="bp">self</span><span class="o">.</span><span class="n">__PRAGMAR_SECONDARY_KEYWORD</span><span class="si">}</span><span class="s2">)"</span><span class="p">,</span>
<span class="n">limit</span><span class="o">=</span><span class="mi">1</span><span class="p">,</span>
<span class="p">)</span>
<span class="bp">self</span><span class="o">.</span><span class="n">assertTrue</span><span class="p">(</span><span class="n">grouped_resources</span><span class="o">.</span><span class="n">total</span> <span class="o">></span> <span class="mi">0</span><span class="p">,</span> <span class="sa">f</span><span class="s2">"Grouped OR with HTML filter should return results"</span><span class="p">)</span>
<span class="n">hyphenated_resources</span> <span class="o">=</span> <span class="n">crawler</span><span class="o">.</span><span class="n">get_resources_api</span><span class="p">(</span>
<span class="n">sites</span><span class="o">=</span><span class="p">[</span><span class="n">site_id</span><span class="p">],</span>
<span class="n">query</span><span class="o">=</span><span class="bp">self</span><span class="o">.</span><span class="n">__PRAGMAR_HYPHENATED_KEYWORD</span><span class="p">,</span>
<span class="n">limit</span><span class="o">=</span><span class="mi">1</span><span class="p">,</span>
<span class="p">)</span>
<span class="bp">self</span><span class="o">.</span><span class="n">assertTrue</span><span class="p">(</span><span class="n">hyphenated_resources</span><span class="o">.</span><span class="n">total</span> <span class="o">></span> <span class="mi">0</span><span class="p">,</span> <span class="sa">f</span><span class="s2">"Keyword '</span><span class="si">{</span><span class="bp">self</span><span class="o">.</span><span class="n">__PRAGMAR_HYPHENATED_KEYWORD</span><span class="si">}</span><span class="s2">' should return results"</span><span class="p">)</span>
<span class="n">double_or_resources</span> <span class="o">=</span> <span class="n">crawler</span><span class="o">.</span><span class="n">get_resources_api</span><span class="p">(</span>
<span class="n">sites</span><span class="o">=</span><span class="p">[</span><span class="n">site_id</span><span class="p">],</span>
<span class="n">query</span><span class="o">=</span><span class="sa">f</span><span class="s2">"(</span><span class="si">{</span><span class="bp">self</span><span class="o">.</span><span class="n">__PRAGMAR_PRIMARY_KEYWORD</span><span class="si">}</span><span class="s2"> OR </span><span class="si">{</span><span class="bp">self</span><span class="o">.</span><span class="n">__PRAGMAR_SECONDARY_KEYWORD</span><span class="si">}</span><span class="s2"> OR moffitor)"</span>
<span class="p">)</span>
<span class="bp">self</span><span class="o">.</span><span class="n">assertGreater</span><span class="p">(</span>
<span class="n">double_or_resources</span><span class="o">.</span><span class="n">total</span><span class="p">,</span> <span class="mi">0</span><span class="p">,</span>
<span class="sa">f</span><span class="s2">"OR query should return some results"</span>
<span class="p">)</span>
<span class="bp">self</span><span class="o">.</span><span class="n">assertLessEqual</span><span class="p">(</span>
<span class="n">double_or_resources</span><span class="o">.</span><span class="n">total</span><span class="p">,</span> <span class="n">site_resources</span><span class="o">.</span><span class="n">total</span><span class="p">,</span>
<span class="sa">f</span><span class="s2">"OR query should be less than, or equal to all results"</span>
<span class="p">)</span>
<span class="n">parens_or_and_resources</span> <span class="o">=</span> <span class="n">crawler</span><span class="o">.</span><span class="n">get_resources_api</span><span class="p">(</span>
<span class="n">sites</span><span class="o">=</span><span class="p">[</span><span class="n">site_id</span><span class="p">],</span>
<span class="n">query</span><span class="o">=</span><span class="sa">f</span><span class="s2">"(</span><span class="si">{</span><span class="bp">self</span><span class="o">.</span><span class="n">__PRAGMAR_PRIMARY_KEYWORD</span><span class="si">}</span><span class="s2"> OR </span><span class="si">{</span><span class="bp">self</span><span class="o">.</span><span class="n">__PRAGMAR_SECONDARY_KEYWORD</span><span class="si">}</span><span class="s2">) AND collaborations "</span>
<span class="p">)</span>
<span class="c1"># respect the AND, there should be only one result</span>
<span class="c1"># (A OR B) AND C vs. A OR B AND C</span>
<span class="bp">self</span><span class="o">.</span><span class="n">assertEqual</span><span class="p">(</span>
<span class="n">parens_or_and_resources</span><span class="o">.</span><span class="n">total</span><span class="p">,</span> <span class="mi">1</span><span class="p">,</span>
<span class="sa">f</span><span class="s2">"(A OR B) AND C should be 1 result (AND collaborations, unless fixture changed)"</span>
<span class="p">)</span>
<span class="n">parens_or_and_resources_reverse</span> <span class="o">=</span> <span class="n">crawler</span><span class="o">.</span><span class="n">get_resources_api</span><span class="p">(</span>
<span class="n">sites</span><span class="o">=</span><span class="p">[</span><span class="n">site_id</span><span class="p">],</span>
<span class="n">query</span><span class="o">=</span><span class="sa">f</span><span class="s2">"collaborations AND (</span><span class="si">{</span><span class="bp">self</span><span class="o">.</span><span class="n">__PRAGMAR_PRIMARY_KEYWORD</span><span class="si">}</span><span class="s2"> OR </span><span class="si">{</span><span class="bp">self</span><span class="o">.</span><span class="n">__PRAGMAR_SECONDARY_KEYWORD</span><span class="si">}</span><span class="s2">) "</span>
<span class="p">)</span>
<span class="c1"># respect the AND, there should be only one result</span>
<span class="c1"># A AND (B OR C) vs. A AND B OR C</span>
<span class="bp">self</span><span class="o">.</span><span class="n">assertEqual</span><span class="p">(</span>
<span class="n">parens_or_and_resources_reverse</span><span class="o">.</span><span class="n">total</span><span class="p">,</span> <span class="mi">1</span><span class="p">,</span>
<span class="sa">f</span><span class="s2">"A AND (B OR C) should be 1 result (collaborations AND, unless fixture changed)"</span>
<span class="p">)</span>
<span class="n">wide_type_resources</span> <span class="o">=</span> <span class="n">crawler</span><span class="o">.</span><span class="n">get_resources_api</span><span class="p">(</span>
<span class="n">sites</span><span class="o">=</span><span class="p">[</span><span class="n">site_id</span><span class="p">],</span>
<span class="n">query</span><span class="o">=</span><span class="sa">f</span><span class="s2">"type: script OR type: style OR type: iframe OR type: font OR type: text OR type: rss OR type: other"</span>
<span class="p">)</span>
<span class="bp">self</span><span class="o">.</span><span class="n">assertLess</span><span class="p">(</span>
<span class="n">wide_type_resources</span><span class="o">.</span><span class="n">total</span><span class="p">,</span> <span class="n">site_resources</span><span class="o">.</span><span class="n">total</span><span class="p">,</span>
<span class="sa">f</span><span class="s2">"A long chained OR should not return all results"</span>
<span class="p">)</span>
<span class="bp">self</span><span class="o">.</span><span class="n">assertGreater</span><span class="p">(</span>
<span class="n">wide_type_resources</span><span class="o">.</span><span class="n">total</span><span class="p">,</span> <span class="mi">0</span><span class="p">,</span>
<span class="sa">f</span><span class="s2">"A long chained OR should return some results"</span>
<span class="p">)</span>
<span class="n">complex_and</span> <span class="o">=</span> <span class="n">crawler</span><span class="o">.</span><span class="n">get_resources_api</span><span class="p">(</span>
<span class="n">sites</span><span class="o">=</span><span class="p">[</span><span class="n">site_id</span><span class="p">],</span>
<span class="n">query</span><span class="o">=</span><span class="sa">f</span><span class="s2">"</span><span class="si">{</span><span class="bp">self</span><span class="o">.</span><span class="n">__PRAGMAR_PRIMARY_KEYWORD</span><span class="si">}</span><span class="s2"> AND type:html AND status:200"</span>
<span class="p">)</span>
<span class="bp">self</span><span class="o">.</span><span class="n">assertTrue</span><span class="p">(</span><span class="n">complex_and</span><span class="o">.</span><span class="n">total</span> <span class="o"><=</span> <span class="n">boolean_primary_resources</span> <span class="o">.</span><span class="n">total</span><span class="p">,</span>
<span class="s2">"Adding AND conditions should not increase results"</span><span class="p">)</span>
<span class="n">grouped_or</span> <span class="o">=</span> <span class="n">crawler</span><span class="o">.</span><span class="n">get_resources_api</span><span class="p">(</span>
<span class="n">sites</span><span class="o">=</span><span class="p">[</span><span class="n">site_id</span><span class="p">],</span>
<span class="n">query</span><span class="o">=</span><span class="sa">f</span><span class="s2">"(</span><span class="si">{</span><span class="bp">self</span><span class="o">.</span><span class="n">__PRAGMAR_PRIMARY_KEYWORD</span><span class="si">}</span><span class="s2"> OR </span><span class="si">{</span><span class="bp">self</span><span class="o">.</span><span class="n">__PRAGMAR_SECONDARY_KEYWORD</span><span class="si">}</span><span class="s2">) AND type:html AND status:200"</span>
<span class="p">)</span>
<span class="bp">self</span><span class="o">.</span><span class="n">assertTrue</span><span class="p">(</span><span class="n">grouped_or</span><span class="o">.</span><span class="n">total</span> <span class="o"><=</span> <span class="n">primary_or_secondary_resources</span><span class="o">.</span><span class="n">total</span><span class="p">,</span>
<span class="s2">"Adding AND conditions to OR should not increase results"</span><span class="p">)</span>
<span class="c1"># URL OR parsing, url is a special case, an fts5 field searched with SQL LIKE</span>
<span class="n">url_or_simple</span> <span class="o">=</span> <span class="n">crawler</span><span class="o">.</span><span class="n">get_resources_api</span><span class="p">(</span>
<span class="n">sites</span><span class="o">=</span><span class="p">[</span><span class="n">site_id</span><span class="p">],</span> <span class="n">query</span><span class="o">=</span><span class="s2">"url: pragmar.com OR url: example.com"</span><span class="p">,</span> <span class="n">limit</span><span class="o">=</span><span class="mi">1</span><span class="p">)</span>
<span class="n">url_or_with_type</span> <span class="o">=</span> <span class="n">crawler</span><span class="o">.</span><span class="n">get_resources_api</span><span class="p">(</span>
<span class="n">sites</span><span class="o">=</span><span class="p">[</span><span class="n">site_id</span><span class="p">],</span> <span class="n">query</span><span class="o">=</span><span class="s2">"type: html AND (url: pragmar.com OR url: example.com)"</span><span class="p">,</span> <span class="n">limit</span><span class="o">=</span><span class="mi">1</span><span class="p">)</span>
<span class="n">html_total</span> <span class="o">=</span> <span class="n">crawler</span><span class="o">.</span><span class="n">get_resources_api</span><span class="p">(</span>
<span class="n">sites</span><span class="o">=</span><span class="p">[</span><span class="n">site_id</span><span class="p">],</span> <span class="n">query</span><span class="o">=</span><span class="s2">"type: html"</span><span class="p">,</span> <span class="n">limit</span><span class="o">=</span><span class="mi">1</span><span class="p">)</span>
<span class="bp">self</span><span class="o">.</span><span class="n">assertTrue</span><span class="p">(</span><span class="n">url_or_with_type</span><span class="o">.</span><span class="n">total</span> <span class="o"><=</span> <span class="n">url_or_simple</span><span class="o">.</span><span class="n">total</span><span class="p">,</span>
<span class="sa">f</span><span class="s2">"AND constraint should not increase results"</span><span class="p">)</span>
<span class="bp">self</span><span class="o">.</span><span class="n">assertTrue</span><span class="p">(</span><span class="n">url_or_with_type</span><span class="o">.</span><span class="n">total</span> <span class="o"><=</span> <span class="n">html_total</span><span class="o">.</span><span class="n">total</span><span class="p">,</span>
<span class="sa">f</span><span class="s2">"URL filter should not exceed HTML total"</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">__run_pragmar_search_tests_extras</span><span class="p">(</span>
<span class="bp">self</span><span class="p">,</span>
<span class="n">crawler</span><span class="p">:</span> <span class="n">BaseCrawler</span><span class="p">,</span>
<span class="n">site_id</span><span class="p">:</span> <span class="nb">int</span><span class="p">,</span>
<span class="n">site_resources</span><span class="p">:</span><span class="n">BaseJsonApi</span><span class="p">,</span>
<span class="n">primary_resources</span><span class="p">:</span><span class="n">BaseJsonApi</span><span class="p">,</span>
<span class="n">secondary_resources</span><span class="p">:</span><span class="n">BaseJsonApi</span><span class="p">,</span>
<span class="p">)</span> <span class="o">-></span> <span class="kc">None</span><span class="p">:</span>
<span class="n">snippet_resources</span> <span class="o">=</span> <span class="n">crawler</span><span class="o">.</span><span class="n">get_resources_api</span><span class="p">(</span>
<span class="n">sites</span><span class="o">=</span><span class="p">[</span><span class="n">site_id</span><span class="p">],</span>
<span class="n">query</span><span class="o">=</span><span class="sa">f</span><span class="s2">"</span><span class="si">{</span><span class="bp">self</span><span class="o">.</span><span class="n">__PRAGMAR_PRIMARY_KEYWORD</span><span class="si">}</span><span class="s2"> AND type: html"</span><span class="p">,</span>
<span class="n">extras</span><span class="o">=</span><span class="p">[</span><span class="s2">"snippets"</span><span class="p">],</span>
<span class="n">limit</span><span class="o">=</span><span class="mi">1</span><span class="p">,</span>
<span class="p">)</span>
<span class="bp">self</span><span class="o">.</span><span class="n">assertIn</span><span class="p">(</span><span class="s2">"snippets"</span><span class="p">,</span> <span class="n">snippet_resources</span><span class="o">.</span><span class="n">_results</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span><span class="o">.</span><span class="n">to_dict</span><span class="p">()[</span><span class="s2">"extras"</span><span class="p">],</span>
<span class="s2">"First result should have snippets in extras"</span><span class="p">)</span>
<span class="n">xpath_count_resources</span> <span class="o">=</span> <span class="n">crawler</span><span class="o">.</span><span class="n">get_resources_api</span><span class="p">(</span>
<span class="n">sites</span><span class="o">=</span><span class="p">[</span><span class="n">site_id</span><span class="p">],</span>
<span class="n">query</span><span class="o">=</span><span class="bp">self</span><span class="o">.</span><span class="n">__PRAGMAR_PRIMARY_KEYWORD</span><span class="p">,</span>
<span class="n">extras</span><span class="o">=</span><span class="p">[</span><span class="s2">"markdown"</span><span class="p">],</span>
<span class="n">limit</span><span class="o">=</span><span class="mi">1</span><span class="p">,</span>
<span class="p">)</span>
<span class="bp">self</span><span class="o">.</span><span class="n">assertIn</span><span class="p">(</span><span class="s2">"markdown"</span><span class="p">,</span> <span class="n">xpath_count_resources</span><span class="o">.</span><span class="n">_results</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span><span class="o">.</span><span class="n">to_dict</span><span class="p">()[</span><span class="s2">"extras"</span><span class="p">],</span>
<span class="s2">"First result should have markdown in extras"</span><span class="p">)</span>
<span class="n">xpath_count_resources</span> <span class="o">=</span> <span class="n">crawler</span><span class="o">.</span><span class="n">get_resources_api</span><span class="p">(</span>
<span class="n">sites</span><span class="o">=</span><span class="p">[</span><span class="n">site_id</span><span class="p">],</span>
<span class="n">query</span><span class="o">=</span><span class="s2">"url: pragmar.com AND status: 200"</span><span class="p">,</span>
<span class="n">extras</span><span class="o">=</span><span class="p">[</span><span class="s2">"xpath"</span><span class="p">],</span>
<span class="n">extrasXpath</span><span class="o">=</span><span class="p">[</span><span class="s2">"count(//h1)"</span><span class="p">],</span>
<span class="n">limit</span><span class="o">=</span><span class="mi">1</span><span class="p">,</span>
<span class="n">sort</span><span class="o">=</span><span class="s2">"-url"</span>
<span class="p">)</span>
<span class="bp">self</span><span class="o">.</span><span class="n">assertIn</span><span class="p">(</span><span class="s2">"xpath"</span><span class="p">,</span> <span class="n">xpath_count_resources</span><span class="o">.</span><span class="n">_results</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span><span class="o">.</span><span class="n">to_dict</span><span class="p">()[</span><span class="s2">"extras"</span><span class="p">],</span>
<span class="s2">"First result should have xpath in extras"</span><span class="p">)</span>
<span class="bp">self</span><span class="o">.</span><span class="n">assertEqual</span><span class="p">(</span><span class="nb">len</span><span class="p">(</span><span class="n">xpath_count_resources</span><span class="o">.</span><span class="n">_results</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span><span class="o">.</span><span class="n">to_dict</span><span class="p">()[</span><span class="s2">"extras"</span><span class="p">][</span><span class="s2">"xpath"</span><span class="p">]),</span>
<span class="mi">1</span><span class="p">,</span> <span class="s2">"Should be exactly one H1 hit in xpath extras"</span><span class="p">)</span>
<span class="c1"># this test inadvertently also covers t_URL_FIELD parser testing</span>
<span class="n">xpath_h1_text_resources</span> <span class="o">=</span> <span class="n">crawler</span><span class="o">.</span><span class="n">get_resources_api</span><span class="p">(</span>
<span class="n">sites</span><span class="o">=</span><span class="p">[</span><span class="n">site_id</span><span class="p">],</span>
<span class="n">query</span><span class="o">=</span><span class="s2">"url: https://pragmar.com AND status: 200"</span><span class="p">,</span>
<span class="n">extras</span><span class="o">=</span><span class="p">[</span><span class="s2">"xpath"</span><span class="p">],</span>
<span class="n">extrasXpath</span><span class="o">=</span><span class="p">[</span><span class="s2">"//h1/text()"</span><span class="p">],</span>
<span class="n">limit</span><span class="o">=</span><span class="mi">1</span><span class="p">,</span>
<span class="n">sort</span><span class="o">=</span><span class="s2">"+url"</span>
<span class="p">)</span>
<span class="bp">self</span><span class="o">.</span><span class="n">assertIn</span><span class="p">(</span><span class="s2">"xpath"</span><span class="p">,</span> <span class="n">xpath_h1_text_resources</span><span class="o">.</span><span class="n">_results</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span><span class="o">.</span><span class="n">to_dict</span><span class="p">()[</span><span class="s2">"extras"</span><span class="p">],</span>
<span class="s2">"First result should have xpath in extras"</span><span class="p">)</span>
<span class="bp">self</span><span class="o">.</span><span class="n">assertTrue</span><span class="p">(</span> <span class="n">xpath_h1_text_resources</span><span class="o">.</span><span class="n">_results</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span><span class="o">.</span><span class="n">to_dict</span><span class="p">()[</span><span class="s2">"extras"</span><span class="p">]</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span><span class="p">,</span>
<span class="s2">"Should have pragmar in fixture h1"</span><span class="p">)</span>
<span class="c1"># should be pragmar homepage, assert "pragmar" in h1</span>
<span class="n">first_xpath_result</span> <span class="o">=</span> <span class="n">xpath_h1_text_resources</span><span class="o">.</span><span class="n">_results</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span><span class="o">.</span><span class="n">to_dict</span><span class="p">()[</span><span class="s2">"extras"</span><span class="p">][</span><span class="s2">"xpath"</span><span class="p">][</span><span class="mi">0</span><span class="p">][</span><span class="s2">"value"</span><span class="p">]</span><span class="o">.</span><span class="n">lower</span><span class="p">()</span>
<span class="bp">self</span><span class="o">.</span><span class="n">assertTrue</span><span class="p">(</span><span class="s2">"pragmar"</span> <span class="ow">in</span> <span class="n">first_xpath_result</span><span class="p">,</span>
<span class="sa">f</span><span class="s2">"Should have pragmar in fixture homepage h1 (</span><span class="si">{</span><span class="n">first_xpath_result</span><span class="si">}</span><span class="s2">)"</span><span class="p">)</span>
<span class="n">combined_resources</span> <span class="o">=</span> <span class="n">crawler</span><span class="o">.</span><span class="n">get_resources_api</span><span class="p">(</span>
<span class="n">sites</span><span class="o">=</span><span class="p">[</span><span class="n">site_id</span><span class="p">],</span>
<span class="n">query</span><span class="o">=</span><span class="bp">self</span><span class="o">.</span><span class="n">__PRAGMAR_PRIMARY_KEYWORD</span><span class="p">,</span>
<span class="n">extras</span><span class="o">=</span><span class="p">[</span><span class="s2">"snippets"</span><span class="p">,</span> <span class="s2">"markdown"</span><span class="p">],</span>
<span class="n">limit</span><span class="o">=</span><span class="mi">1</span><span class="p">,</span>
<span class="p">)</span>
<span class="n">first_result</span> <span class="o">=</span> <span class="n">combined_resources</span><span class="o">.</span><span class="n">_results</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span><span class="o">.</span><span class="n">to_dict</span><span class="p">()</span>
<span class="bp">self</span><span class="o">.</span><span class="n">assertIn</span><span class="p">(</span><span class="s2">"extras"</span><span class="p">,</span> <span class="n">first_result</span><span class="p">,</span> <span class="s2">"First result should have extras field"</span><span class="p">)</span>
<span class="bp">self</span><span class="o">.</span><span class="n">assertIn</span><span class="p">(</span><span class="s2">"snippets"</span><span class="p">,</span> <span class="n">first_result</span><span class="p">[</span><span class="s2">"extras"</span><span class="p">],</span> <span class="s2">"First result should have snippets in extras"</span><span class="p">)</span>
<span class="bp">self</span><span class="o">.</span><span class="n">assertIn</span><span class="p">(</span><span class="s2">"markdown"</span><span class="p">,</span> <span class="n">first_result</span><span class="p">[</span><span class="s2">"extras"</span><span class="p">],</span> <span class="s2">"First result should have markdown in extras"</span><span class="p">)</span>
<span class="bp">self</span><span class="o">.</span><span class="n">assertTrue</span><span class="p">(</span><span class="n">primary_resources</span><span class="o">.</span><span class="n">total</span> <span class="o"><=</span> <span class="n">site_resources</span><span class="o">.</span><span class="n">total</span><span class="p">,</span>
<span class="s2">"Search should return less than or equivalent results to site total"</span><span class="p">)</span>
<span class="bp">self</span><span class="o">.</span><span class="n">assertTrue</span><span class="p">(</span><span class="n">secondary_resources</span><span class="o">.</span><span class="n">total</span> <span class="o"><=</span> <span class="n">site_resources</span><span class="o">.</span><span class="n">total</span><span class="p">,</span>
<span class="s2">"Search should return less than or equivalent results to site total"</span><span class="p">)</span></div>
</pre></div>
</div>
</div>
<footer>
<hr/>
<div role="contentinfo">
<p>© Copyright 2025, pragmar.</p>
</div>
Built with <a href="https://www.sphinx-doc.org/">Sphinx</a> using a
<a href="https://github.com/readthedocs/sphinx_rtd_theme">theme</a>
provided by <a href="https://readthedocs.org">Read the Docs</a>.
</footer>
</div>
</div>
</section>
</div>
<script>
// Register a jQuery ready callback that switches on the theme's navigation.
jQuery(function enableThemeNavigation() {
    // NOTE(review): the boolean is presumably the theme's sticky-nav flag;
    // confirm against the sphinx_rtd_theme script before changing it.
    var navigationFlag = true;
    SphinxRtdTheme.Navigation.enable(navigationFlag);
});
</script>
</body>
</html>
```