This is page 34 of 35. Use http://codebase.md/pragmar/mcp_server_webcrawl/utils/guides.html?lines=true&page={x} to view the full context.
# Directory Structure
```
├── .gitignore
├── CONTRIBUTING.md
├── docs
│ ├── _images
│ │ ├── interactive.document.webp
│ │ ├── interactive.search.webp
│ │ └── mcpswc.svg
│ ├── _modules
│ │ ├── index.html
│ │ ├── mcp_server_webcrawl
│ │ │ ├── crawlers
│ │ │ │ ├── archivebox
│ │ │ │ │ ├── adapter.html
│ │ │ │ │ ├── crawler.html
│ │ │ │ │ └── tests.html
│ │ │ │ ├── base
│ │ │ │ │ ├── adapter.html
│ │ │ │ │ ├── api.html
│ │ │ │ │ ├── crawler.html
│ │ │ │ │ ├── indexed.html
│ │ │ │ │ └── tests.html
│ │ │ │ ├── httrack
│ │ │ │ │ ├── adapter.html
│ │ │ │ │ ├── crawler.html
│ │ │ │ │ └── tests.html
│ │ │ │ ├── interrobot
│ │ │ │ │ ├── adapter.html
│ │ │ │ │ ├── crawler.html
│ │ │ │ │ └── tests.html
│ │ │ │ ├── katana
│ │ │ │ │ ├── adapter.html
│ │ │ │ │ ├── crawler.html
│ │ │ │ │ └── tests.html
│ │ │ │ ├── siteone
│ │ │ │ │ ├── adapter.html
│ │ │ │ │ ├── crawler.html
│ │ │ │ │ └── tests.html
│ │ │ │ ├── warc
│ │ │ │ │ ├── adapter.html
│ │ │ │ │ ├── crawler.html
│ │ │ │ │ └── tests.html
│ │ │ │ └── wget
│ │ │ │ ├── adapter.html
│ │ │ │ ├── crawler.html
│ │ │ │ └── tests.html
│ │ │ ├── crawlers.html
│ │ │ ├── extras
│ │ │ │ ├── markdown.html
│ │ │ │ ├── regex.html
│ │ │ │ ├── snippets.html
│ │ │ │ ├── thumbnails.html
│ │ │ │ └── xpath.html
│ │ │ ├── interactive
│ │ │ │ ├── highlights.html
│ │ │ │ ├── search.html
│ │ │ │ ├── session.html
│ │ │ │ └── ui.html
│ │ │ ├── main.html
│ │ │ ├── models
│ │ │ │ ├── resources.html
│ │ │ │ └── sites.html
│ │ │ ├── templates
│ │ │ │ └── tests.html
│ │ │ ├── utils
│ │ │ │ ├── blobs.html
│ │ │ │ ├── cli.html
│ │ │ │ ├── logger.html
│ │ │ │ ├── querycache.html
│ │ │ │ ├── server.html
│ │ │ │ └── tools.html
│ │ │ └── utils.html
│ │ └── re.html
│ ├── _sources
│ │ ├── guides
│ │ │ ├── archivebox.rst.txt
│ │ │ ├── httrack.rst.txt
│ │ │ ├── interrobot.rst.txt
│ │ │ ├── katana.rst.txt
│ │ │ ├── siteone.rst.txt
│ │ │ ├── warc.rst.txt
│ │ │ └── wget.rst.txt
│ │ ├── guides.rst.txt
│ │ ├── index.rst.txt
│ │ ├── installation.rst.txt
│ │ ├── interactive.rst.txt
│ │ ├── mcp_server_webcrawl.crawlers.archivebox.rst.txt
│ │ ├── mcp_server_webcrawl.crawlers.base.rst.txt
│ │ ├── mcp_server_webcrawl.crawlers.httrack.rst.txt
│ │ ├── mcp_server_webcrawl.crawlers.interrobot.rst.txt
│ │ ├── mcp_server_webcrawl.crawlers.katana.rst.txt
│ │ ├── mcp_server_webcrawl.crawlers.rst.txt
│ │ ├── mcp_server_webcrawl.crawlers.siteone.rst.txt
│ │ ├── mcp_server_webcrawl.crawlers.warc.rst.txt
│ │ ├── mcp_server_webcrawl.crawlers.wget.rst.txt
│ │ ├── mcp_server_webcrawl.extras.rst.txt
│ │ ├── mcp_server_webcrawl.interactive.rst.txt
│ │ ├── mcp_server_webcrawl.models.rst.txt
│ │ ├── mcp_server_webcrawl.rst.txt
│ │ ├── mcp_server_webcrawl.templates.rst.txt
│ │ ├── mcp_server_webcrawl.utils.rst.txt
│ │ ├── modules.rst.txt
│ │ ├── prompts.rst.txt
│ │ └── usage.rst.txt
│ ├── _static
│ │ ├── _sphinx_javascript_frameworks_compat.js
│ │ ├── basic.css
│ │ ├── css
│ │ │ ├── badge_only.css
│ │ │ ├── fonts
│ │ │ │ ├── fontawesome-webfont.eot
│ │ │ │ ├── fontawesome-webfont.svg
│ │ │ │ ├── fontawesome-webfont.ttf
│ │ │ │ ├── fontawesome-webfont.woff
│ │ │ │ ├── fontawesome-webfont.woff2
│ │ │ │ ├── lato-bold-italic.woff
│ │ │ │ ├── lato-bold-italic.woff2
│ │ │ │ ├── lato-bold.woff
│ │ │ │ ├── lato-bold.woff2
│ │ │ │ ├── lato-normal-italic.woff
│ │ │ │ ├── lato-normal-italic.woff2
│ │ │ │ ├── lato-normal.woff
│ │ │ │ ├── lato-normal.woff2
│ │ │ │ ├── Roboto-Slab-Bold.woff
│ │ │ │ ├── Roboto-Slab-Bold.woff2
│ │ │ │ ├── Roboto-Slab-Regular.woff
│ │ │ │ └── Roboto-Slab-Regular.woff2
│ │ │ └── theme.css
│ │ ├── doctools.js
│ │ ├── documentation_options.js
│ │ ├── file.png
│ │ ├── fonts
│ │ │ ├── Lato
│ │ │ │ ├── lato-bold.eot
│ │ │ │ ├── lato-bold.ttf
│ │ │ │ ├── lato-bold.woff
│ │ │ │ ├── lato-bold.woff2
│ │ │ │ ├── lato-bolditalic.eot
│ │ │ │ ├── lato-bolditalic.ttf
│ │ │ │ ├── lato-bolditalic.woff
│ │ │ │ ├── lato-bolditalic.woff2
│ │ │ │ ├── lato-italic.eot
│ │ │ │ ├── lato-italic.ttf
│ │ │ │ ├── lato-italic.woff
│ │ │ │ ├── lato-italic.woff2
│ │ │ │ ├── lato-regular.eot
│ │ │ │ ├── lato-regular.ttf
│ │ │ │ ├── lato-regular.woff
│ │ │ │ └── lato-regular.woff2
│ │ │ └── RobotoSlab
│ │ │ ├── roboto-slab-v7-bold.eot
│ │ │ ├── roboto-slab-v7-bold.ttf
│ │ │ ├── roboto-slab-v7-bold.woff
│ │ │ ├── roboto-slab-v7-bold.woff2
│ │ │ ├── roboto-slab-v7-regular.eot
│ │ │ ├── roboto-slab-v7-regular.ttf
│ │ │ ├── roboto-slab-v7-regular.woff
│ │ │ └── roboto-slab-v7-regular.woff2
│ │ ├── images
│ │ │ ├── interactive.document.png
│ │ │ ├── interactive.document.webp
│ │ │ ├── interactive.search.png
│ │ │ ├── interactive.search.webp
│ │ │ └── mcpswc.svg
│ │ ├── jquery.js
│ │ ├── js
│ │ │ ├── badge_only.js
│ │ │ ├── theme.js
│ │ │ └── versions.js
│ │ ├── language_data.js
│ │ ├── minus.png
│ │ ├── plus.png
│ │ ├── pygments.css
│ │ ├── searchtools.js
│ │ └── sphinx_highlight.js
│ ├── .buildinfo
│ ├── .nojekyll
│ ├── genindex.html
│ ├── guides
│ │ ├── archivebox.html
│ │ ├── httrack.html
│ │ ├── interrobot.html
│ │ ├── katana.html
│ │ ├── siteone.html
│ │ ├── warc.html
│ │ └── wget.html
│ ├── guides.html
│ ├── index.html
│ ├── installation.html
│ ├── interactive.html
│ ├── mcp_server_webcrawl.crawlers.archivebox.html
│ ├── mcp_server_webcrawl.crawlers.base.html
│ ├── mcp_server_webcrawl.crawlers.html
│ ├── mcp_server_webcrawl.crawlers.httrack.html
│ ├── mcp_server_webcrawl.crawlers.interrobot.html
│ ├── mcp_server_webcrawl.crawlers.katana.html
│ ├── mcp_server_webcrawl.crawlers.siteone.html
│ ├── mcp_server_webcrawl.crawlers.warc.html
│ ├── mcp_server_webcrawl.crawlers.wget.html
│ ├── mcp_server_webcrawl.extras.html
│ ├── mcp_server_webcrawl.html
│ ├── mcp_server_webcrawl.interactive.html
│ ├── mcp_server_webcrawl.models.html
│ ├── mcp_server_webcrawl.templates.html
│ ├── mcp_server_webcrawl.utils.html
│ ├── modules.html
│ ├── objects.inv
│ ├── prompts.html
│ ├── py-modindex.html
│ ├── search.html
│ ├── searchindex.js
│ └── usage.html
├── LICENSE
├── MANIFEST.in
├── prompts
│ ├── audit404.md
│ ├── auditfiles.md
│ ├── auditperf.md
│ ├── auditseo.md
│ ├── gopher.md
│ ├── README.md
│ └── testsearch.md
├── pyproject.toml
├── README.md
├── setup.py
├── sphinx
│ ├── _static
│ │ └── images
│ │ ├── interactive.document.png
│ │ ├── interactive.document.webp
│ │ ├── interactive.search.png
│ │ ├── interactive.search.webp
│ │ └── mcpswc.svg
│ ├── _templates
│ │ └── layout.html
│ ├── conf.py
│ ├── guides
│ │ ├── archivebox.rst
│ │ ├── httrack.rst
│ │ ├── interrobot.rst
│ │ ├── katana.rst
│ │ ├── siteone.rst
│ │ ├── warc.rst
│ │ └── wget.rst
│ ├── guides.rst
│ ├── index.rst
│ ├── installation.rst
│ ├── interactive.rst
│ ├── make.bat
│ ├── Makefile
│ ├── mcp_server_webcrawl.crawlers.archivebox.rst
│ ├── mcp_server_webcrawl.crawlers.base.rst
│ ├── mcp_server_webcrawl.crawlers.httrack.rst
│ ├── mcp_server_webcrawl.crawlers.interrobot.rst
│ ├── mcp_server_webcrawl.crawlers.katana.rst
│ ├── mcp_server_webcrawl.crawlers.rst
│ ├── mcp_server_webcrawl.crawlers.siteone.rst
│ ├── mcp_server_webcrawl.crawlers.warc.rst
│ ├── mcp_server_webcrawl.crawlers.wget.rst
│ ├── mcp_server_webcrawl.extras.rst
│ ├── mcp_server_webcrawl.interactive.rst
│ ├── mcp_server_webcrawl.models.rst
│ ├── mcp_server_webcrawl.rst
│ ├── mcp_server_webcrawl.templates.rst
│ ├── mcp_server_webcrawl.utils.rst
│ ├── modules.rst
│ ├── prompts.rst
│ ├── readme.txt
│ └── usage.rst
└── src
└── mcp_server_webcrawl
├── __init__.py
├── crawlers
│ ├── __init__.py
│ ├── archivebox
│ │ ├── __init__.py
│ │ ├── adapter.py
│ │ ├── crawler.py
│ │ └── tests.py
│ ├── base
│ │ ├── __init__.py
│ │ ├── adapter.py
│ │ ├── api.py
│ │ ├── crawler.py
│ │ ├── indexed.py
│ │ └── tests.py
│ ├── httrack
│ │ ├── __init__.py
│ │ ├── adapter.py
│ │ ├── crawler.py
│ │ └── tests.py
│ ├── interrobot
│ │ ├── __init__.py
│ │ ├── adapter.py
│ │ ├── crawler.py
│ │ └── tests.py
│ ├── katana
│ │ ├── __init__.py
│ │ ├── adapter.py
│ │ ├── crawler.py
│ │ └── tests.py
│ ├── siteone
│ │ ├── __init__.py
│ │ ├── adapter.py
│ │ ├── crawler.py
│ │ └── tests.py
│ ├── warc
│ │ ├── __init__.py
│ │ ├── adapter.py
│ │ ├── crawler.py
│ │ └── tests.py
│ └── wget
│ ├── __init__.py
│ ├── adapter.py
│ ├── crawler.py
│ └── tests.py
├── extras
│ ├── __init__.py
│ ├── markdown.py
│ ├── regex.py
│ ├── snippets.py
│ ├── thumbnails.py
│ └── xpath.py
├── interactive
│ ├── __init__.py
│ ├── highlights.py
│ ├── search.py
│ ├── session.py
│ ├── ui.py
│ └── views
│ ├── base.py
│ ├── document.py
│ ├── help.py
│ ├── requirements.py
│ ├── results.py
│ └── searchform.py
├── main.py
├── models
│ ├── __init__.py
│ ├── base.py
│ ├── resources.py
│ └── sites.py
├── settings.py
├── templates
│ ├── __init__.py
│ ├── markdown.xslt
│ ├── tests_core.html
│ └── tests.py
└── utils
├── __init__.py
├── cli.py
├── logger.py
├── parser.py
├── parsetab.py
├── search.py
├── server.py
├── tests.py
└── tools.py
```
# Files
--------------------------------------------------------------------------------
/docs/_modules/mcp_server_webcrawl/crawlers/base/tests.html:
--------------------------------------------------------------------------------
```html
1 |
2 |
3 | <!DOCTYPE html>
4 | <html class="writer-html5" lang="en" data-content_root="../../../../">
5 | <head>
6 | <meta charset="utf-8" />
7 | <meta name="viewport" content="width=device-width, initial-scale=1.0" />
8 | <title>mcp_server_webcrawl.crawlers.base.tests — mcp-server-webcrawl documentation</title>
9 | <link rel="stylesheet" type="text/css" href="../../../../_static/pygments.css?v=80d5e7a1" />
10 | <link rel="stylesheet" type="text/css" href="../../../../_static/css/theme.css?v=e59714d7" />
11 |
12 |
13 | <script src="../../../../_static/jquery.js?v=5d32c60e"></script>
14 | <script src="../../../../_static/_sphinx_javascript_frameworks_compat.js?v=2cd50e6c"></script>
15 | <script src="../../../../_static/documentation_options.js?v=5929fcd5"></script>
16 | <script src="../../../../_static/doctools.js?v=888ff710"></script>
17 | <script src="../../../../_static/sphinx_highlight.js?v=dc90522c"></script>
18 | <script src="../../../../_static/js/theme.js"></script>
19 | <link rel="index" title="Index" href="../../../../genindex.html" />
20 | <link rel="search" title="Search" href="../../../../search.html" />
21 | </head>
22 |
23 | <body class="wy-body-for-nav">
24 | <div class="wy-grid-for-nav">
25 | <nav data-toggle="wy-nav-shift" class="wy-nav-side">
26 | <div class="wy-side-scroll">
27 | <div class="wy-side-nav-search" >
28 |
29 |
30 |
31 | <a href="../../../../index.html" class="icon icon-home">
32 | mcp-server-webcrawl
33 | </a>
34 | <div role="search">
35 | <form id="rtd-search-form" class="wy-form" action="../../../../search.html" method="get">
36 | <input type="text" name="q" placeholder="Search docs" aria-label="Search docs" />
37 | <input type="hidden" name="check_keywords" value="yes" />
38 | <input type="hidden" name="area" value="default" />
39 | </form>
40 | </div>
41 | </div><div class="wy-menu wy-menu-vertical" data-spy="affix" role="navigation" aria-label="Navigation menu">
42 | <p class="caption" role="heading"><span class="caption-text">Contents:</span></p>
43 | <ul>
44 | <li class="toctree-l1"><a class="reference internal" href="../../../../installation.html">Installation</a></li>
45 | <li class="toctree-l1"><a class="reference internal" href="../../../../guides.html">Setup Guides</a></li>
46 | <li class="toctree-l1"><a class="reference internal" href="../../../../usage.html">Usage</a></li>
47 | <li class="toctree-l1"><a class="reference internal" href="../../../../prompts.html">Prompt Routines</a></li>
48 | <li class="toctree-l1"><a class="reference internal" href="../../../../interactive.html">Interactive Mode</a></li>
49 | <li class="toctree-l1"><a class="reference internal" href="../../../../modules.html">mcp_server_webcrawl</a></li>
50 | </ul>
51 |
52 | </div>
53 | </div>
54 | </nav>
55 |
56 | <section data-toggle="wy-nav-shift" class="wy-nav-content-wrap"><nav class="wy-nav-top" aria-label="Mobile navigation menu" >
57 | <i data-toggle="wy-nav-top" class="fa fa-bars"></i>
58 | <a href="../../../../index.html">mcp-server-webcrawl</a>
59 | </nav>
60 |
61 | <div class="wy-nav-content">
62 | <div class="rst-content">
63 | <div role="navigation" aria-label="Page navigation">
64 | <ul class="wy-breadcrumbs">
65 | <li><a href="../../../../index.html" class="icon icon-home" aria-label="Home"></a></li>
66 | <li class="breadcrumb-item"><a href="../../../index.html">Module code</a></li>
67 | <li class="breadcrumb-item"><a href="../../crawlers.html">mcp_server_webcrawl.crawlers</a></li>
68 | <li class="breadcrumb-item active">mcp_server_webcrawl.crawlers.base.tests</li>
69 | <li class="wy-breadcrumbs-aside">
70 | </li>
71 | </ul>
72 | <hr/>
73 | </div>
74 | <div role="main" class="document" itemscope="itemscope" itemtype="http://schema.org/Article">
75 | <div itemprop="articleBody">
76 |
77 | <h1>Source code for mcp_server_webcrawl.crawlers.base.tests</h1><div class="highlight"><pre>
78 | <span></span><span class="kn">import</span> <span class="nn">sys</span>
79 | <span class="kn">import</span> <span class="nn">unittest</span>
80 | <span class="kn">import</span> <span class="nn">asyncio</span>
81 |
82 | <span class="kn">from</span> <span class="nn">typing</span> <span class="kn">import</span> <span class="n">Final</span>
83 | <span class="kn">from</span> <span class="nn">datetime</span> <span class="kn">import</span> <span class="n">datetime</span>
84 | <span class="kn">from</span> <span class="nn">logging</span> <span class="kn">import</span> <span class="n">Logger</span>
85 |
86 | <span class="kn">from</span> <span class="nn">mcp_server_webcrawl.crawlers.base.crawler</span> <span class="kn">import</span> <span class="n">BaseCrawler</span>
87 | <span class="kn">from</span> <span class="nn">mcp_server_webcrawl.crawlers.wget.crawler</span> <span class="kn">import</span> <span class="n">WgetCrawler</span>
88 | <span class="kn">from</span> <span class="nn">mcp_server_webcrawl.models.resources</span> <span class="kn">import</span> <span class="n">ResourceResultType</span>
89 | <span class="kn">from</span> <span class="nn">mcp_server_webcrawl.crawlers.base.api</span> <span class="kn">import</span> <span class="n">BaseJsonApi</span>
90 | <span class="kn">from</span> <span class="nn">mcp_server_webcrawl.utils.logger</span> <span class="kn">import</span> <span class="n">get_logger</span>
91 |
92 | <span class="n">logger</span><span class="p">:</span> <span class="n">Logger</span> <span class="o">=</span> <span class="n">get_logger</span><span class="p">()</span>
93 |
94 |
95 | <div class="viewcode-block" id="BaseCrawlerTests">
96 | <a class="viewcode-back" href="../../../../mcp_server_webcrawl.crawlers.base.html#mcp_server_webcrawl.crawlers.base.tests.BaseCrawlerTests">[docs]</a>
97 | <span class="k">class</span> <span class="nc">BaseCrawlerTests</span><span class="p">(</span><span class="n">unittest</span><span class="o">.</span><span class="n">TestCase</span><span class="p">):</span>
98 |
99 | <span class="n">__PRAGMAR_PRIMARY_KEYWORD</span><span class="p">:</span> <span class="n">Final</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="s2">"crawler"</span>
100 | <span class="n">__PRAGMAR_SECONDARY_KEYWORD</span><span class="p">:</span> <span class="n">Final</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="s2">"privacy"</span>
101 | <span class="n">__PRAGMAR_HYPHENATED_KEYWORD</span><span class="p">:</span> <span class="n">Final</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="s2">"one-click"</span>
102 |
103 | <div class="viewcode-block" id="BaseCrawlerTests.setUp">
104 | <a class="viewcode-back" href="../../../../mcp_server_webcrawl.crawlers.base.html#mcp_server_webcrawl.crawlers.base.tests.BaseCrawlerTests.setUp">[docs]</a>
105 | <span class="k">def</span> <span class="nf">setUp</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
106 | <span class="c1"># quiet asyncio error on tests, occurring after successful completion</span>
107 | <span class="k">if</span> <span class="n">sys</span><span class="o">.</span><span class="n">platform</span> <span class="o">==</span> <span class="s2">"win32"</span><span class="p">:</span>
108 | <span class="n">asyncio</span><span class="o">.</span><span class="n">set_event_loop_policy</span><span class="p">(</span><span class="n">asyncio</span><span class="o">.</span><span class="n">WindowsSelectorEventLoopPolicy</span><span class="p">())</span></div>
109 |
110 |
111 |
112 | <div class="viewcode-block" id="BaseCrawlerTests.run_pragmar_search_tests">
113 | <a class="viewcode-back" href="../../../../mcp_server_webcrawl.crawlers.base.html#mcp_server_webcrawl.crawlers.base.tests.BaseCrawlerTests.run_pragmar_search_tests">[docs]</a>
114 | <span class="k">def</span> <span class="nf">run_pragmar_search_tests</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">crawler</span><span class="p">:</span> <span class="n">BaseCrawler</span><span class="p">,</span> <span class="n">site_id</span><span class="p">:</span> <span class="nb">int</span><span class="p">):</span>
115 | <span class="w"> </span><span class="sd">"""</span>
116 | <span class="sd"> Run a battery of database checks on the crawler and Boolean validation</span>
117 | <span class="sd"> """</span>
118 |
119 | <span class="n">resources_json</span> <span class="o">=</span> <span class="n">crawler</span><span class="o">.</span><span class="n">get_resources_api</span><span class="p">()</span>
120 | <span class="bp">self</span><span class="o">.</span><span class="n">assertTrue</span><span class="p">(</span><span class="n">resources_json</span><span class="o">.</span><span class="n">total</span> <span class="o">></span> <span class="mi">0</span><span class="p">,</span> <span class="s2">"Should have some resources in database"</span><span class="p">)</span>
121 |
122 | <span class="n">site_resources</span> <span class="o">=</span> <span class="n">crawler</span><span class="o">.</span><span class="n">get_resources_api</span><span class="p">(</span><span class="n">sites</span><span class="o">=</span><span class="p">[</span><span class="n">site_id</span><span class="p">])</span>
123 | <span class="bp">self</span><span class="o">.</span><span class="n">assertTrue</span><span class="p">(</span><span class="n">site_resources</span><span class="o">.</span><span class="n">total</span> <span class="o">></span> <span class="mi">0</span><span class="p">,</span> <span class="s2">"Pragmar site should have resources"</span><span class="p">)</span>
124 |
125 | <span class="n">primary_resources</span> <span class="o">=</span> <span class="n">crawler</span><span class="o">.</span><span class="n">get_resources_api</span><span class="p">(</span>
126 | <span class="n">sites</span><span class="o">=</span><span class="p">[</span><span class="n">site_id</span><span class="p">],</span>
127 | <span class="n">query</span><span class="o">=</span><span class="bp">self</span><span class="o">.</span><span class="n">__PRAGMAR_PRIMARY_KEYWORD</span><span class="p">,</span>
128 | <span class="n">fields</span><span class="o">=</span><span class="p">[</span><span class="s2">"content"</span><span class="p">,</span> <span class="s2">"headers"</span><span class="p">],</span>
129 | <span class="n">limit</span><span class="o">=</span><span class="mi">1</span><span class="p">,</span>
130 | <span class="p">)</span>
131 |
132 | <span class="bp">self</span><span class="o">.</span><span class="n">assertTrue</span><span class="p">(</span><span class="n">primary_resources</span><span class="o">.</span><span class="n">total</span> <span class="o">></span> <span class="mi">0</span><span class="p">,</span> <span class="sa">f</span><span class="s2">"Keyword '</span><span class="si">{</span><span class="bp">self</span><span class="o">.</span><span class="n">__PRAGMAR_PRIMARY_KEYWORD</span><span class="si">}</span><span class="s2">' should return results"</span><span class="p">)</span>
133 |
134 | <span class="n">secondary_resources</span> <span class="o">=</span> <span class="n">crawler</span><span class="o">.</span><span class="n">get_resources_api</span><span class="p">(</span>
135 | <span class="n">sites</span><span class="o">=</span><span class="p">[</span><span class="n">site_id</span><span class="p">],</span>
136 | <span class="n">query</span><span class="o">=</span><span class="bp">self</span><span class="o">.</span><span class="n">__PRAGMAR_SECONDARY_KEYWORD</span><span class="p">,</span>
137 | <span class="n">limit</span><span class="o">=</span><span class="mi">1</span><span class="p">,</span>
138 | <span class="p">)</span>
139 | <span class="bp">self</span><span class="o">.</span><span class="n">assertTrue</span><span class="p">(</span><span class="n">secondary_resources</span><span class="o">.</span><span class="n">total</span> <span class="o">></span> <span class="mi">0</span><span class="p">,</span> <span class="sa">f</span><span class="s2">"Keyword '</span><span class="si">{</span><span class="bp">self</span><span class="o">.</span><span class="n">__PRAGMAR_SECONDARY_KEYWORD</span><span class="si">}</span><span class="s2">' should return results"</span><span class="p">)</span>
140 |
141 | <span class="bp">self</span><span class="o">.</span><span class="n">__run_pragmar_search_tests_fulltext</span><span class="p">(</span><span class="n">crawler</span><span class="p">,</span> <span class="n">site_id</span><span class="p">,</span> <span class="n">site_resources</span><span class="p">)</span>
142 | <span class="bp">self</span><span class="o">.</span><span class="n">__run_pragmar_search_tests_field_status</span><span class="p">(</span><span class="n">crawler</span><span class="p">,</span> <span class="n">site_id</span><span class="p">)</span>
143 | <span class="bp">self</span><span class="o">.</span><span class="n">__run_pragmar_search_tests_field_headers</span><span class="p">(</span><span class="n">crawler</span><span class="p">,</span> <span class="n">site_id</span><span class="p">)</span>
144 | <span class="bp">self</span><span class="o">.</span><span class="n">__run_pragmar_search_tests_field_content</span><span class="p">(</span><span class="n">crawler</span><span class="p">,</span> <span class="n">site_id</span><span class="p">)</span>
145 | <span class="bp">self</span><span class="o">.</span><span class="n">__run_pragmar_search_tests_field_type</span><span class="p">(</span><span class="n">crawler</span><span class="p">,</span> <span class="n">site_id</span><span class="p">,</span> <span class="n">site_resources</span><span class="p">)</span>
146 | <span class="bp">self</span><span class="o">.</span><span class="n">__run_pragmar_search_tests_extras</span><span class="p">(</span><span class="n">crawler</span><span class="p">,</span> <span class="n">site_id</span><span class="p">,</span> <span class="n">site_resources</span><span class="p">,</span> <span class="n">primary_resources</span><span class="p">,</span> <span class="n">secondary_resources</span><span class="p">)</span></div>
147 |
148 |
149 |
150 | <div class="viewcode-block" id="BaseCrawlerTests.run_pragmar_image_tests">
151 | <a class="viewcode-back" href="../../../../mcp_server_webcrawl.crawlers.base.html#mcp_server_webcrawl.crawlers.base.tests.BaseCrawlerTests.run_pragmar_image_tests">[docs]</a>
152 | <span class="k">def</span> <span class="nf">run_pragmar_image_tests</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">crawler</span><span class="p">:</span> <span class="n">BaseCrawler</span><span class="p">,</span> <span class="n">pragmar_site_id</span><span class="p">:</span> <span class="nb">int</span><span class="p">):</span>
153 | <span class="w"> </span><span class="sd">"""</span>
154 | <span class="sd"> Test InterroBot-specific image handling and thumbnails.</span>
155 | <span class="sd"> """</span>
156 | <span class="n">img_results</span> <span class="o">=</span> <span class="n">crawler</span><span class="o">.</span><span class="n">get_resources_api</span><span class="p">(</span><span class="n">sites</span><span class="o">=</span><span class="p">[</span><span class="n">pragmar_site_id</span><span class="p">],</span> <span class="n">query</span><span class="o">=</span><span class="s2">"type: img"</span><span class="p">,</span> <span class="n">limit</span><span class="o">=</span><span class="mi">5</span><span class="p">)</span>
157 | <span class="bp">self</span><span class="o">.</span><span class="n">assertTrue</span><span class="p">(</span><span class="n">img_results</span><span class="o">.</span><span class="n">total</span> <span class="o">></span> <span class="mi">0</span><span class="p">,</span> <span class="s2">"Image type filter should return results"</span><span class="p">)</span>
158 | <span class="bp">self</span><span class="o">.</span><span class="n">assertTrue</span><span class="p">(</span>
159 | <span class="nb">all</span><span class="p">(</span><span class="n">r</span><span class="o">.</span><span class="n">type</span><span class="o">.</span><span class="n">value</span> <span class="o">==</span> <span class="s2">"img"</span> <span class="k">for</span> <span class="n">r</span> <span class="ow">in</span> <span class="n">img_results</span><span class="o">.</span><span class="n">_results</span><span class="p">),</span>
160 | <span class="s2">"All filtered resources should have type 'img'"</span>
161 | <span class="p">)</span></div>
162 |
163 |
164 | <div class="viewcode-block" id="BaseCrawlerTests.run_sites_resources_tests">
165 | <a class="viewcode-back" href="../../../../mcp_server_webcrawl.crawlers.base.html#mcp_server_webcrawl.crawlers.base.tests.BaseCrawlerTests.run_sites_resources_tests">[docs]</a>
166 | <span class="k">def</span> <span class="nf">run_sites_resources_tests</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">crawler</span><span class="p">:</span> <span class="n">BaseCrawler</span><span class="p">,</span> <span class="n">pragmar_site_id</span><span class="p">:</span> <span class="nb">int</span><span class="p">,</span> <span class="n">example_site_id</span><span class="p">:</span> <span class="nb">int</span><span class="p">):</span>
167 |
168 | <span class="n">resources_json</span> <span class="o">=</span> <span class="n">crawler</span><span class="o">.</span><span class="n">get_resources_api</span><span class="p">()</span>
169 | <span class="bp">self</span><span class="o">.</span><span class="n">assertTrue</span><span class="p">(</span><span class="n">resources_json</span><span class="o">.</span><span class="n">total</span> <span class="o">></span> <span class="mi">0</span><span class="p">,</span> <span class="s2">"Should have some resources in database"</span><span class="p">)</span>
170 |
171 | <span class="n">site_resources</span> <span class="o">=</span> <span class="n">crawler</span><span class="o">.</span><span class="n">get_resources_api</span><span class="p">(</span><span class="n">sites</span><span class="o">=</span><span class="p">[</span><span class="n">pragmar_site_id</span><span class="p">])</span>
172 | <span class="bp">self</span><span class="o">.</span><span class="n">assertTrue</span><span class="p">(</span><span class="n">site_resources</span><span class="o">.</span><span class="n">total</span> <span class="o">></span> <span class="mi">0</span><span class="p">,</span> <span class="s2">"Pragmar site should have resources"</span><span class="p">)</span>
173 |
174 | <span class="c1"># basic resource retrieval</span>
175 | <span class="n">resources_json</span> <span class="o">=</span> <span class="n">crawler</span><span class="o">.</span><span class="n">get_resources_api</span><span class="p">()</span>
176 | <span class="bp">self</span><span class="o">.</span><span class="n">assertTrue</span><span class="p">(</span><span class="n">resources_json</span><span class="o">.</span><span class="n">total</span> <span class="o">></span> <span class="mi">0</span><span class="p">)</span>
177 |
178 | <span class="c1"># fulltext keyword search</span>
179 | <span class="n">query_keyword1</span> <span class="o">=</span> <span class="s2">"privacy"</span>
180 |
181 | <span class="n">timestamp_resources</span> <span class="o">=</span> <span class="n">crawler</span><span class="o">.</span><span class="n">get_resources_api</span><span class="p">(</span>
182 | <span class="n">sites</span><span class="o">=</span><span class="p">[</span><span class="n">pragmar_site_id</span><span class="p">],</span>
183 | <span class="n">query</span><span class="o">=</span><span class="n">query_keyword1</span><span class="p">,</span>
184 | <span class="n">fields</span><span class="o">=</span><span class="p">[</span><span class="s2">"created"</span><span class="p">,</span> <span class="s2">"modified"</span><span class="p">,</span> <span class="s2">"time"</span><span class="p">],</span>
185 | <span class="n">limit</span><span class="o">=</span><span class="mi">5</span><span class="p">,</span>
186 | <span class="p">)</span>
187 | <span class="bp">self</span><span class="o">.</span><span class="n">assertTrue</span><span class="p">(</span><span class="n">timestamp_resources</span><span class="o">.</span><span class="n">total</span> <span class="o">></span> <span class="mi">0</span><span class="p">,</span> <span class="s2">"Search query should return results"</span><span class="p">)</span>
188 | <span class="k">for</span> <span class="n">resource</span> <span class="ow">in</span> <span class="n">timestamp_resources</span><span class="o">.</span><span class="n">_results</span><span class="p">:</span>
189 | <span class="n">resource_dict</span> <span class="o">=</span> <span class="n">resource</span><span class="o">.</span><span class="n">to_dict</span><span class="p">()</span>
190 | <span class="bp">self</span><span class="o">.</span><span class="n">assertIsNotNone</span><span class="p">(</span><span class="n">resource_dict</span><span class="p">[</span><span class="s2">"created"</span><span class="p">],</span> <span class="s2">"Created timestamp should not be None"</span><span class="p">)</span>
191 | <span class="bp">self</span><span class="o">.</span><span class="n">assertIsNotNone</span><span class="p">(</span><span class="n">resource_dict</span><span class="p">[</span><span class="s2">"modified"</span><span class="p">],</span> <span class="s2">"Modified timestamp should not be None"</span><span class="p">)</span>
192 | <span class="bp">self</span><span class="o">.</span><span class="n">assertIsNotNone</span><span class="p">(</span><span class="n">resource_dict</span><span class="p">[</span><span class="s2">"time"</span><span class="p">],</span> <span class="s2">"Time should not be None"</span><span class="p">)</span>
193 |
194 | <span class="c1"># resource ID filtering</span>
195 | <span class="k">if</span> <span class="n">resources_json</span><span class="o">.</span><span class="n">total</span> <span class="o">></span> <span class="mi">0</span><span class="p">:</span>
196 | <span class="n">first_resource</span> <span class="o">=</span> <span class="n">resources_json</span><span class="o">.</span><span class="n">_results</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span>
197 | <span class="n">id_resources</span> <span class="o">=</span> <span class="n">crawler</span><span class="o">.</span><span class="n">get_resources_api</span><span class="p">(</span>
198 | <span class="n">sites</span><span class="o">=</span><span class="p">[</span><span class="n">first_resource</span><span class="o">.</span><span class="n">site</span><span class="p">],</span>
199 | <span class="n">query</span><span class="o">=</span><span class="sa">f</span><span class="s2">"id: </span><span class="si">{</span><span class="n">first_resource</span><span class="o">.</span><span class="n">id</span><span class="si">}</span><span class="s2">"</span><span class="p">,</span>
200 | <span class="n">limit</span><span class="o">=</span><span class="mi">1</span><span class="p">,</span>
201 | <span class="p">)</span>
202 | <span class="bp">self</span><span class="o">.</span><span class="n">assertEqual</span><span class="p">(</span><span class="n">id_resources</span><span class="o">.</span><span class="n">total</span><span class="p">,</span> <span class="mi">1</span><span class="p">)</span>
203 | <span class="bp">self</span><span class="o">.</span><span class="n">assertEqual</span><span class="p">(</span><span class="n">id_resources</span><span class="o">.</span><span class="n">_results</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span><span class="o">.</span><span class="n">id</span><span class="p">,</span> <span class="n">first_resource</span><span class="o">.</span><span class="n">id</span><span class="p">)</span>
204 |
205 | <span class="c1"># site filtering</span>
206 | <span class="n">site_resources</span> <span class="o">=</span> <span class="n">crawler</span><span class="o">.</span><span class="n">get_resources_api</span><span class="p">(</span><span class="n">sites</span><span class="o">=</span><span class="p">[</span><span class="n">pragmar_site_id</span><span class="p">])</span>
207 | <span class="bp">self</span><span class="o">.</span><span class="n">assertTrue</span><span class="p">(</span><span class="n">site_resources</span><span class="o">.</span><span class="n">total</span> <span class="o">></span> <span class="mi">0</span><span class="p">,</span> <span class="s2">"Site filtering should return results"</span><span class="p">)</span>
208 | <span class="k">for</span> <span class="n">resource</span> <span class="ow">in</span> <span class="n">site_resources</span><span class="o">.</span><span class="n">_results</span><span class="p">:</span>
209 | <span class="bp">self</span><span class="o">.</span><span class="n">assertEqual</span><span class="p">(</span><span class="n">resource</span><span class="o">.</span><span class="n">site</span><span class="p">,</span> <span class="n">pragmar_site_id</span><span class="p">)</span>
210 |
211 | <span class="c1"># type filtering for HTML pages</span>
212 | <span class="n">html_resources</span> <span class="o">=</span> <span class="n">crawler</span><span class="o">.</span><span class="n">get_resources_api</span><span class="p">(</span>
213 | <span class="n">sites</span><span class="o">=</span><span class="p">[</span><span class="n">pragmar_site_id</span><span class="p">],</span>
214 | <span class="n">query</span><span class="o">=</span> <span class="sa">f</span><span class="s2">"type: </span><span class="si">{</span><span class="n">ResourceResultType</span><span class="o">.</span><span class="n">PAGE</span><span class="o">.</span><span class="n">value</span><span class="si">}</span><span class="s2">"</span><span class="p">,</span>
215 | <span class="p">)</span>
216 | <span class="bp">self</span><span class="o">.</span><span class="n">assertTrue</span><span class="p">(</span><span class="n">html_resources</span><span class="o">.</span><span class="n">total</span> <span class="o">></span> <span class="mi">0</span><span class="p">,</span> <span class="s2">"HTML filtering should return results"</span><span class="p">)</span>
217 | <span class="k">for</span> <span class="n">resource</span> <span class="ow">in</span> <span class="n">html_resources</span><span class="o">.</span><span class="n">_results</span><span class="p">:</span>
218 | <span class="bp">self</span><span class="o">.</span><span class="n">assertEqual</span><span class="p">(</span><span class="n">resource</span><span class="o">.</span><span class="n">type</span><span class="p">,</span> <span class="n">ResourceResultType</span><span class="o">.</span><span class="n">PAGE</span><span class="p">)</span>
219 |
220 | <span class="c1"># type filtering for multiple resource types</span>
221 | <span class="n">mixed_resources</span> <span class="o">=</span> <span class="n">crawler</span><span class="o">.</span><span class="n">get_resources_api</span><span class="p">(</span>
222 | <span class="n">sites</span><span class="o">=</span><span class="p">[</span><span class="n">pragmar_site_id</span><span class="p">],</span>
223 | <span class="n">query</span><span class="o">=</span> <span class="sa">f</span><span class="s2">"type: </span><span class="si">{</span><span class="n">ResourceResultType</span><span class="o">.</span><span class="n">PAGE</span><span class="o">.</span><span class="n">value</span><span class="si">}</span><span class="s2"> OR type: </span><span class="si">{</span><span class="n">ResourceResultType</span><span class="o">.</span><span class="n">SCRIPT</span><span class="o">.</span><span class="n">value</span><span class="si">}</span><span class="s2">"</span><span class="p">,</span>
224 | <span class="p">)</span>
225 | <span class="k">if</span> <span class="n">mixed_resources</span><span class="o">.</span><span class="n">total</span> <span class="o">></span> <span class="mi">0</span><span class="p">:</span>
226 | <span class="n">types_found</span> <span class="o">=</span> <span class="p">{</span><span class="n">r</span><span class="o">.</span><span class="n">type</span> <span class="k">for</span> <span class="n">r</span> <span class="ow">in</span> <span class="n">mixed_resources</span><span class="o">.</span><span class="n">_results</span><span class="p">}</span>
227 | <span class="bp">self</span><span class="o">.</span><span class="n">assertTrue</span><span class="p">(</span>
228 | <span class="nb">len</span><span class="p">(</span><span class="n">types_found</span><span class="p">)</span> <span class="o">></span> <span class="mi">0</span><span class="p">,</span>
229 | <span class="s2">"Should find at least one of the requested resource types"</span>
230 | <span class="p">)</span>
231 | <span class="k">for</span> <span class="n">resource_type</span> <span class="ow">in</span> <span class="n">types_found</span><span class="p">:</span>
232 | <span class="bp">self</span><span class="o">.</span><span class="n">assertIn</span><span class="p">(</span>
233 | <span class="n">resource_type</span><span class="p">,</span>
234 | <span class="p">[</span><span class="n">ResourceResultType</span><span class="o">.</span><span class="n">PAGE</span><span class="p">,</span> <span class="n">ResourceResultType</span><span class="o">.</span><span class="n">SCRIPT</span><span class="p">]</span>
235 | <span class="p">)</span>
236 |
237 | <span class="c1"># custom fields in response</span>
238 | <span class="n">custom_fields</span> <span class="o">=</span> <span class="p">[</span><span class="s2">"content"</span><span class="p">,</span> <span class="s2">"headers"</span><span class="p">,</span> <span class="s2">"time"</span><span class="p">]</span>
239 | <span class="n">field_resources</span> <span class="o">=</span> <span class="n">crawler</span><span class="o">.</span><span class="n">get_resources_api</span><span class="p">(</span>
240 | <span class="n">query</span><span class="o">=</span><span class="s2">"type: html"</span><span class="p">,</span>
241 | <span class="n">sites</span><span class="o">=</span><span class="p">[</span><span class="n">pragmar_site_id</span><span class="p">],</span>
242 | <span class="n">fields</span><span class="o">=</span><span class="n">custom_fields</span><span class="p">,</span>
243 | <span class="n">limit</span><span class="o">=</span><span class="mi">1</span><span class="p">,</span>
244 | <span class="p">)</span>
245 | <span class="bp">self</span><span class="o">.</span><span class="n">assertTrue</span><span class="p">(</span><span class="n">field_resources</span><span class="o">.</span><span class="n">total</span> <span class="o">></span> <span class="mi">0</span><span class="p">)</span>
246 | <span class="n">resource_dict</span> <span class="o">=</span> <span class="n">field_resources</span><span class="o">.</span><span class="n">_results</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span><span class="o">.</span><span class="n">to_dict</span><span class="p">()</span>
247 | <span class="k">for</span> <span class="n">field</span> <span class="ow">in</span> <span class="n">custom_fields</span><span class="p">:</span>
248 | <span class="bp">self</span><span class="o">.</span><span class="n">assertIn</span><span class="p">(</span><span class="n">field</span><span class="p">,</span> <span class="n">resource_dict</span><span class="p">,</span> <span class="sa">f</span><span class="s2">"Field '</span><span class="si">{</span><span class="n">field</span><span class="si">}</span><span class="s2">' should be in response"</span><span class="p">)</span>
249 |
250 | <span class="n">asc_resources</span> <span class="o">=</span> <span class="n">crawler</span><span class="o">.</span><span class="n">get_resources_api</span><span class="p">(</span><span class="n">sites</span><span class="o">=</span><span class="p">[</span><span class="n">pragmar_site_id</span><span class="p">],</span> <span class="n">sort</span><span class="o">=</span><span class="s2">"+url"</span><span class="p">)</span>
251 | <span class="k">if</span> <span class="n">asc_resources</span><span class="o">.</span><span class="n">total</span> <span class="o">></span> <span class="mi">1</span><span class="p">:</span>
252 | <span class="bp">self</span><span class="o">.</span><span class="n">assertTrue</span><span class="p">(</span><span class="n">asc_resources</span><span class="o">.</span><span class="n">_results</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span><span class="o">.</span><span class="n">url</span> <span class="o"><=</span> <span class="n">asc_resources</span><span class="o">.</span><span class="n">_results</span><span class="p">[</span><span class="mi">1</span><span class="p">]</span><span class="o">.</span><span class="n">url</span><span class="p">)</span>
253 |
254 | <span class="n">desc_resources</span> <span class="o">=</span> <span class="n">crawler</span><span class="o">.</span><span class="n">get_resources_api</span><span class="p">(</span><span class="n">sites</span><span class="o">=</span><span class="p">[</span><span class="n">pragmar_site_id</span><span class="p">],</span> <span class="n">sort</span><span class="o">=</span><span class="s2">"-url"</span><span class="p">)</span>
255 | <span class="k">if</span> <span class="n">desc_resources</span><span class="o">.</span><span class="n">total</span> <span class="o">></span> <span class="mi">1</span><span class="p">:</span>
256 | <span class="bp">self</span><span class="o">.</span><span class="n">assertTrue</span><span class="p">(</span><span class="n">desc_resources</span><span class="o">.</span><span class="n">_results</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span><span class="o">.</span><span class="n">url</span> <span class="o">>=</span> <span class="n">desc_resources</span><span class="o">.</span><span class="n">_results</span><span class="p">[</span><span class="mi">1</span><span class="p">]</span><span class="o">.</span><span class="n">url</span><span class="p">)</span>
257 |
258 | <span class="n">limit_resources</span> <span class="o">=</span> <span class="n">crawler</span><span class="o">.</span><span class="n">get_resources_api</span><span class="p">(</span><span class="n">sites</span><span class="o">=</span><span class="p">[</span><span class="n">pragmar_site_id</span><span class="p">],</span> <span class="n">limit</span><span class="o">=</span><span class="mi">3</span><span class="p">)</span>
259 | <span class="bp">self</span><span class="o">.</span><span class="n">assertTrue</span><span class="p">(</span><span class="nb">len</span><span class="p">(</span><span class="n">limit_resources</span><span class="o">.</span><span class="n">_results</span><span class="p">)</span> <span class="o"><=</span> <span class="mi">3</span><span class="p">)</span>
260 |
261 | <span class="n">offset_resources</span> <span class="o">=</span> <span class="n">crawler</span><span class="o">.</span><span class="n">get_resources_api</span><span class="p">(</span><span class="n">sites</span><span class="o">=</span><span class="p">[</span><span class="n">pragmar_site_id</span><span class="p">],</span> <span class="n">offset</span><span class="o">=</span><span class="mi">2</span><span class="p">,</span> <span class="n">limit</span><span class="o">=</span><span class="mi">2</span><span class="p">)</span>
262 | <span class="bp">self</span><span class="o">.</span><span class="n">assertTrue</span><span class="p">(</span><span class="nb">len</span><span class="p">(</span><span class="n">offset_resources</span><span class="o">.</span><span class="n">_results</span><span class="p">)</span> <span class="o"><=</span> <span class="mi">2</span><span class="p">)</span>
263 | <span class="k">if</span> <span class="n">resources_json</span><span class="o">.</span><span class="n">total</span> <span class="o">></span> <span class="mi">4</span><span class="p">:</span>
264 | <span class="bp">self</span><span class="o">.</span><span class="n">assertNotEqual</span><span class="p">(</span>
265 | <span class="n">resources_json</span><span class="o">.</span><span class="n">_results</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span><span class="o">.</span><span class="n">id</span><span class="p">,</span>
266 | <span class="n">offset_resources</span><span class="o">.</span><span class="n">_results</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span><span class="o">.</span><span class="n">id</span><span class="p">,</span>
267 | <span class="s2">"Offset results should differ from first page"</span>
268 | <span class="p">)</span>
269 |
270 | <span class="c1"># multi-site search, verify we got results from both sites</span>
271 | <span class="c1"># limit 100 sees all the pages, otherwise ArchiveBox needs -url</span>
272 | <span class="c1"># and everything else +url to float unique sites in a small result set</span>
273 | <span class="c1"># limit 100 is slower but more resilient</span>
274 | <span class="n">multisite_resources</span> <span class="o">=</span> <span class="n">crawler</span><span class="o">.</span><span class="n">get_resources_api</span><span class="p">(</span>
275 | <span class="n">sites</span><span class="o">=</span><span class="p">[</span><span class="n">example_site_id</span><span class="p">,</span> <span class="n">pragmar_site_id</span><span class="p">],</span>
276 | <span class="n">query</span><span class="o">=</span> <span class="sa">f</span><span class="s2">"type: </span><span class="si">{</span><span class="n">ResourceResultType</span><span class="o">.</span><span class="n">PAGE</span><span class="o">.</span><span class="n">value</span><span class="si">}</span><span class="s2">"</span><span class="p">,</span>
277 | <span class="n">sort</span><span class="o">=</span><span class="s2">"+url"</span><span class="p">,</span>
278 | <span class="n">limit</span><span class="o">=</span><span class="mi">100</span><span class="p">,</span>
279 | <span class="p">)</span>
280 |
281 | <span class="n">found_sites</span> <span class="o">=</span> <span class="nb">set</span><span class="p">()</span>
282 | <span class="k">for</span> <span class="n">resource</span> <span class="ow">in</span> <span class="n">multisite_resources</span><span class="o">.</span><span class="n">_results</span><span class="p">:</span>
283 | <span class="n">found_sites</span><span class="o">.</span><span class="n">add</span><span class="p">(</span><span class="n">resource</span><span class="o">.</span><span class="n">site</span><span class="p">)</span>
284 | <span class="bp">self</span><span class="o">.</span><span class="n">assertEqual</span><span class="p">(</span><span class="nb">len</span><span class="p">(</span><span class="n">found_sites</span><span class="p">),</span> <span class="mi">2</span><span class="p">,</span> <span class="s2">"Should have results from both sites"</span><span class="p">)</span></div>
285 |
286 |
287 | <div class="viewcode-block" id="BaseCrawlerTests.run_pragmar_tokenizer_tests">
288 | <a class="viewcode-back" href="../../../../mcp_server_webcrawl.crawlers.base.html#mcp_server_webcrawl.crawlers.base.tests.BaseCrawlerTests.run_pragmar_tokenizer_tests">[docs]</a>
289 | <span class="k">def</span> <span class="nf">run_pragmar_tokenizer_tests</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">crawler</span><span class="p">:</span> <span class="n">BaseCrawler</span><span class="p">,</span> <span class="n">site_id</span><span class="p">:</span><span class="nb">int</span><span class="p">):</span>
290 | <span class="w"> </span><span class="sd">"""</span>
291 | <span class="sd"> fts hyphens and underscores are particularly challenging, thus</span>
292 | <span class="sd"> have a dedicated test. these must be configured in multiple places</span>
293 | <span class="sd"> including CREATE TABLE ... tokenizer, as well as handled by the query</span>
294 | <span class="sd"> parser.</span>
295 | <span class="sd"> """</span>
296 |
297 | <span class="n">mcp_resources_keyword</span> <span class="o">=</span> <span class="n">crawler</span><span class="o">.</span><span class="n">get_resources_api</span><span class="p">(</span>
298 | <span class="n">sites</span><span class="o">=</span><span class="p">[</span><span class="n">site_id</span><span class="p">],</span>
299 | <span class="n">query</span><span class="o">=</span><span class="s1">'"mcp-server-webcrawl"'</span><span class="p">,</span>
300 | <span class="n">fields</span><span class="o">=</span><span class="p">[],</span>
301 | <span class="n">limit</span><span class="o">=</span><span class="mi">1</span><span class="p">,</span>
302 | <span class="p">)</span>
303 | <span class="n">mcp_resources_quoted</span> <span class="o">=</span> <span class="n">crawler</span><span class="o">.</span><span class="n">get_resources_api</span><span class="p">(</span>
304 | <span class="n">sites</span><span class="o">=</span><span class="p">[</span><span class="n">site_id</span><span class="p">],</span>
305 | <span class="n">query</span><span class="o">=</span><span class="s1">'"mcp-server-webcrawl"'</span><span class="p">,</span>
306 | <span class="n">fields</span><span class="o">=</span><span class="p">[],</span>
307 | <span class="n">limit</span><span class="o">=</span><span class="mi">1</span><span class="p">,</span>
308 | <span class="p">)</span>
309 | <span class="bp">self</span><span class="o">.</span><span class="n">assertTrue</span><span class="p">(</span><span class="n">mcp_resources_keyword</span><span class="o">.</span><span class="n">total</span> <span class="o">></span> <span class="mi">0</span><span class="p">,</span> <span class="s2">"Should find mcp-server-webcrawl in HTML"</span><span class="p">)</span>
310 | <span class="bp">self</span><span class="o">.</span><span class="n">assertTrue</span><span class="p">(</span><span class="n">mcp_resources_quoted</span><span class="o">.</span><span class="n">total</span> <span class="o">></span> <span class="mi">0</span><span class="p">,</span> <span class="s2">"Should find </span><span class="se">\"</span><span class="s2">mcp-server-webcrawl</span><span class="se">\"</span><span class="s2"> (phrase) in HTML"</span><span class="p">)</span>
311 | <span class="bp">self</span><span class="o">.</span><span class="n">assertTrue</span><span class="p">(</span><span class="n">mcp_resources_quoted</span><span class="o">.</span><span class="n">total</span> <span class="o">==</span> <span class="n">mcp_resources_keyword</span><span class="o">.</span><span class="n">total</span><span class="p">,</span> <span class="s2">"Quoted and unquoted equivalence expected"</span><span class="p">)</span>
312 | <span class="n">mcp_resources_wildcarded</span> <span class="o">=</span> <span class="n">crawler</span><span class="o">.</span><span class="n">get_resources_api</span><span class="p">(</span>
313 | <span class="n">sites</span><span class="o">=</span><span class="p">[</span><span class="n">site_id</span><span class="p">],</span>
314 | <span class="n">query</span><span class="o">=</span><span class="s1">'mcp*'</span><span class="p">,</span>
315 | <span class="n">fields</span><span class="o">=</span><span class="p">[],</span>
316 | <span class="n">limit</span><span class="o">=</span><span class="mi">1</span><span class="p">,</span>
317 | <span class="p">)</span>
318 | <span class="bp">self</span><span class="o">.</span><span class="n">assertTrue</span><span class="p">(</span><span class="n">mcp_resources_wildcarded</span><span class="o">.</span><span class="n">total</span> <span class="o">></span> <span class="mi">0</span><span class="p">,</span> <span class="s2">"Should find mcp-server-* in HTML"</span><span class="p">)</span>
319 |
320 | <span class="n">combo_and_resources_keyword</span> <span class="o">=</span> <span class="n">crawler</span><span class="o">.</span><span class="n">get_resources_api</span><span class="p">(</span>
321 | <span class="n">sites</span><span class="o">=</span><span class="p">[</span><span class="n">site_id</span><span class="p">],</span>
322 | <span class="n">query</span><span class="o">=</span><span class="s1">'"mcp-server-webcrawl" AND "one-click"'</span><span class="p">,</span>
323 | <span class="n">fields</span><span class="o">=</span><span class="p">[],</span>
324 | <span class="n">limit</span><span class="o">=</span><span class="mi">1</span><span class="p">,</span>
325 | <span class="p">)</span>
326 | <span class="n">combo_and_resources_quoted</span> <span class="o">=</span> <span class="n">crawler</span><span class="o">.</span><span class="n">get_resources_api</span><span class="p">(</span>
327 | <span class="n">sites</span><span class="o">=</span><span class="p">[</span><span class="n">site_id</span><span class="p">],</span>
328 | <span class="n">query</span><span class="o">=</span><span class="s1">'mcp-server-webcrawl AND one-click'</span><span class="p">,</span>
329 | <span class="n">fields</span><span class="o">=</span><span class="p">[],</span>
330 | <span class="n">limit</span><span class="o">=</span><span class="mi">1</span><span class="p">,</span>
331 | <span class="p">)</span>
332 | <span class="bp">self</span><span class="o">.</span><span class="n">assertTrue</span><span class="p">(</span><span class="n">combo_and_resources_keyword</span><span class="o">.</span><span class="n">total</span> <span class="o">></span> <span class="mi">0</span><span class="p">,</span> <span class="s2">"Should find mcp-server-webcrawl in HTML"</span><span class="p">)</span>
333 | <span class="bp">self</span><span class="o">.</span><span class="n">assertTrue</span><span class="p">(</span><span class="n">combo_and_resources_quoted</span><span class="o">.</span><span class="n">total</span> <span class="o">></span> <span class="mi">0</span><span class="p">,</span> <span class="s2">"Should find </span><span class="se">\"</span><span class="s2">mcp-server-webcrawl</span><span class="se">\"</span><span class="s2"> (phrase) in HTML"</span><span class="p">)</span>
334 | <span class="bp">self</span><span class="o">.</span><span class="n">assertTrue</span><span class="p">(</span><span class="n">combo_and_resources_keyword</span><span class="o">.</span><span class="n">total</span> <span class="o">==</span> <span class="n">combo_and_resources_quoted</span><span class="o">.</span><span class="n">total</span><span class="p">,</span> <span class="s2">"Quoted and unquoted equivalence expected"</span><span class="p">)</span>
335 |
336 | <span class="n">combo_or_resources_keyword</span> <span class="o">=</span> <span class="n">crawler</span><span class="o">.</span><span class="n">get_resources_api</span><span class="p">(</span>
337 | <span class="n">sites</span><span class="o">=</span><span class="p">[</span><span class="n">site_id</span><span class="p">],</span>
338 | <span class="n">query</span><span class="o">=</span><span class="s1">'"mcp-server-webcrawl" OR "one-click"'</span><span class="p">,</span>
339 | <span class="n">fields</span><span class="o">=</span><span class="p">[],</span>
340 | <span class="n">limit</span><span class="o">=</span><span class="mi">1</span><span class="p">,</span>
341 | <span class="p">)</span>
342 | <span class="n">combo_or_resources_quoted</span> <span class="o">=</span> <span class="n">crawler</span><span class="o">.</span><span class="n">get_resources_api</span><span class="p">(</span>
343 | <span class="n">sites</span><span class="o">=</span><span class="p">[</span><span class="n">site_id</span><span class="p">],</span>
344 | <span class="n">query</span><span class="o">=</span><span class="s1">'mcp-server-webcrawl OR one-click'</span><span class="p">,</span>
345 | <span class="n">fields</span><span class="o">=</span><span class="p">[],</span>
346 | <span class="n">limit</span><span class="o">=</span><span class="mi">1</span><span class="p">,</span>
347 | <span class="p">)</span>
348 | <span class="bp">self</span><span class="o">.</span><span class="n">assertTrue</span><span class="p">(</span><span class="n">combo_or_resources_keyword</span><span class="o">.</span><span class="n">total</span> <span class="o">></span> <span class="mi">0</span><span class="p">,</span> <span class="s2">"Should find mcp-server-webcrawl in HTML"</span><span class="p">)</span>
349 | <span class="bp">self</span><span class="o">.</span><span class="n">assertTrue</span><span class="p">(</span><span class="n">combo_or_resources_quoted</span><span class="o">.</span><span class="n">total</span> <span class="o">></span> <span class="mi">0</span><span class="p">,</span> <span class="s2">"Should find </span><span class="se">\"</span><span class="s2">mcp-server-webcrawl</span><span class="se">\"</span><span class="s2"> (phrase) in HTML"</span><span class="p">)</span>
350 | <span class="bp">self</span><span class="o">.</span><span class="n">assertTrue</span><span class="p">(</span><span class="n">combo_or_resources_keyword</span><span class="o">.</span><span class="n">total</span> <span class="o">==</span> <span class="n">combo_or_resources_quoted</span><span class="o">.</span><span class="n">total</span><span class="p">,</span> <span class="s2">"Quoted and unquoted equivalence expected"</span><span class="p">)</span>
351 |
352 | <span class="n">combo_not_resources_keyword</span> <span class="o">=</span> <span class="n">crawler</span><span class="o">.</span><span class="n">get_resources_api</span><span class="p">(</span>
353 | <span class="n">sites</span><span class="o">=</span><span class="p">[</span><span class="n">site_id</span><span class="p">],</span>
354 | <span class="n">query</span><span class="o">=</span><span class="s1">'"mcp-server-webcrawl" NOT "one-click"'</span><span class="p">,</span>
355 | <span class="n">fields</span><span class="o">=</span><span class="p">[],</span>
356 | <span class="n">limit</span><span class="o">=</span><span class="mi">1</span><span class="p">,</span>
357 | <span class="p">)</span>
358 | <span class="n">combo_not_resources_quoted</span> <span class="o">=</span> <span class="n">crawler</span><span class="o">.</span><span class="n">get_resources_api</span><span class="p">(</span>
359 | <span class="n">sites</span><span class="o">=</span><span class="p">[</span><span class="n">site_id</span><span class="p">],</span>
360 | <span class="n">query</span><span class="o">=</span><span class="s1">'mcp-server-webcrawl NOT one-click'</span><span class="p">,</span>
361 | <span class="n">fields</span><span class="o">=</span><span class="p">[],</span>
362 | <span class="n">limit</span><span class="o">=</span><span class="mi">1</span><span class="p">,</span>
363 | <span class="p">)</span>
364 | <span class="n">combo_and_not_resources_quoted</span> <span class="o">=</span> <span class="n">crawler</span><span class="o">.</span><span class="n">get_resources_api</span><span class="p">(</span>
365 | <span class="n">sites</span><span class="o">=</span><span class="p">[</span><span class="n">site_id</span><span class="p">],</span>
366 | <span class="n">query</span><span class="o">=</span><span class="s1">'mcp-server-webcrawl AND NOT one-click'</span><span class="p">,</span>
367 | <span class="n">fields</span><span class="o">=</span><span class="p">[],</span>
368 | <span class="n">limit</span><span class="o">=</span><span class="mi">1</span><span class="p">,</span>
369 | <span class="p">)</span>
370 | <span class="bp">self</span><span class="o">.</span><span class="n">assertTrue</span><span class="p">(</span><span class="n">combo_not_resources_keyword</span><span class="o">.</span><span class="n">total</span> <span class="o">></span> <span class="mi">0</span><span class="p">,</span> <span class="s2">"Should find mcp-server-webcrawl in HTML"</span><span class="p">)</span>
371 | <span class="bp">self</span><span class="o">.</span><span class="n">assertTrue</span><span class="p">(</span><span class="n">combo_not_resources_quoted</span><span class="o">.</span><span class="n">total</span> <span class="o">></span> <span class="mi">0</span><span class="p">,</span> <span class="s2">"Should find </span><span class="se">\"</span><span class="s2">mcp-server-webcrawl</span><span class="se">\"</span><span class="s2"> (phrase) in HTML"</span><span class="p">)</span>
372 | <span class="bp">self</span><span class="o">.</span><span class="n">assertTrue</span><span class="p">(</span><span class="n">combo_not_resources_keyword</span><span class="o">.</span><span class="n">total</span> <span class="o">==</span> <span class="n">combo_not_resources_quoted</span><span class="o">.</span><span class="n">total</span><span class="p">,</span> <span class="s2">"Quoted and unquoted equivalence expected"</span><span class="p">)</span>
373 | <span class="bp">self</span><span class="o">.</span><span class="n">assertTrue</span><span class="p">(</span><span class="n">combo_not_resources_keyword</span><span class="o">.</span><span class="n">total</span> <span class="o">==</span> <span class="n">combo_and_not_resources_quoted</span><span class="o">.</span><span class="n">total</span><span class="p">,</span> <span class="sa">f</span><span class="s2">"NOT (</span><span class="si">{</span><span class="n">combo_not_resources_keyword</span><span class="o">.</span><span class="n">total</span><span class="si">}</span><span class="s2">) and AND NOT (</span><span class="si">{</span><span class="n">combo_and_not_resources_quoted</span><span class="o">.</span><span class="n">total</span><span class="si">}</span><span class="s2">) equivalence expected"</span><span class="p">)</span>
374 | <span class="bp">self</span><span class="o">.</span><span class="n">assertTrue</span><span class="p">(</span><span class="n">mcp_resources_keyword</span><span class="o">.</span><span class="n">total</span> <span class="o">>=</span> <span class="n">combo_and_resources_keyword</span><span class="o">.</span><span class="n">total</span><span class="p">,</span> <span class="s2">"Total records should be greater or equal to ANDs."</span><span class="p">)</span>
375 | <span class="bp">self</span><span class="o">.</span><span class="n">assertTrue</span><span class="p">(</span><span class="n">mcp_resources_keyword</span><span class="o">.</span><span class="n">total</span> <span class="o"><=</span> <span class="n">combo_or_resources_keyword</span><span class="o">.</span><span class="n">total</span><span class="p">,</span> <span class="s2">"Total records should be less than or equal to ORs."</span><span class="p">)</span>
376 | <span class="bp">self</span><span class="o">.</span><span class="n">assertTrue</span><span class="p">(</span><span class="n">mcp_resources_keyword</span><span class="o">.</span><span class="n">total</span> <span class="o">></span> <span class="n">combo_not_resources_keyword</span><span class="o">.</span><span class="n">total</span><span class="p">,</span> <span class="s2">"Total records should be greater than NOTs."</span><span class="p">)</span></div>
377 |
378 |
379 |
380 |
381 | <div class="viewcode-block" id="BaseCrawlerTests.run_pragmar_site_tests">
382 | <a class="viewcode-back" href="../../../../mcp_server_webcrawl.crawlers.base.html#mcp_server_webcrawl.crawlers.base.tests.BaseCrawlerTests.run_pragmar_site_tests">[docs]</a>
383 | <span class="k">def</span> <span class="nf">run_pragmar_site_tests</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">crawler</span><span class="p">:</span> <span class="n">BaseCrawler</span><span class="p">,</span> <span class="n">site_id</span><span class="p">:</span><span class="nb">int</span><span class="p">):</span>
384 | <span class="w"> </span><span class="sd">"""Check the full site listing, a lookup by site_id, and the created/modified fields of that site."""</span>
385 | <span class="c1"># all sites (at least two are expected)</span>
386 | <span class="n">sites_json</span> <span class="o">=</span> <span class="n">crawler</span><span class="o">.</span><span class="n">get_sites_api</span><span class="p">()</span>
387 | <span class="bp">self</span><span class="o">.</span><span class="n">assertGreaterEqual</span><span class="p">(</span><span class="n">sites_json</span><span class="o">.</span><span class="n">total</span><span class="p">,</span> <span class="mi">2</span><span class="p">)</span>
388 |
389 | <span class="c1"># single site (filtering by id must return exactly one)</span>
390 | <span class="n">site_json</span> <span class="o">=</span> <span class="n">crawler</span><span class="o">.</span><span class="n">get_sites_api</span><span class="p">(</span><span class="n">ids</span><span class="o">=</span><span class="p">[</span><span class="n">site_id</span><span class="p">])</span>
391 | <span class="bp">self</span><span class="o">.</span><span class="n">assertEqual</span><span class="p">(</span><span class="n">site_json</span><span class="o">.</span><span class="n">total</span><span class="p">,</span> <span class="mi">1</span><span class="p">)</span>
392 |
393 | <span class="c1"># site with fields (requested fields must appear in the result dict)</span>
394 | <span class="n">site_field_json</span> <span class="o">=</span> <span class="n">crawler</span><span class="o">.</span><span class="n">get_sites_api</span><span class="p">(</span><span class="n">ids</span><span class="o">=</span><span class="p">[</span><span class="n">site_id</span><span class="p">],</span> <span class="n">fields</span><span class="o">=</span><span class="p">[</span><span class="s2">"created"</span><span class="p">,</span> <span class="s2">"modified"</span><span class="p">])</span>
395 | <span class="n">site_field_result</span> <span class="o">=</span> <span class="n">site_field_json</span><span class="o">.</span><span class="n">_results</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span><span class="o">.</span><span class="n">to_dict</span><span class="p">()</span>
396 | <span class="bp">self</span><span class="o">.</span><span class="n">assertIn</span><span class="p">(</span><span class="s2">"created"</span><span class="p">,</span> <span class="n">site_field_result</span><span class="p">)</span>
397 | <span class="bp">self</span><span class="o">.</span><span class="n">assertIn</span><span class="p">(</span><span class="s2">"modified"</span><span class="p">,</span> <span class="n">site_field_result</span><span class="p">)</span></div>
398 |
399 |
400 | <div class="viewcode-block" id="BaseCrawlerTests.run_pragmar_sort_tests">
401 | <a class="viewcode-back" href="../../../../mcp_server_webcrawl.crawlers.base.html#mcp_server_webcrawl.crawlers.base.tests.BaseCrawlerTests.run_pragmar_sort_tests">[docs]</a>
402 | <span class="k">def</span> <span class="nf">run_pragmar_sort_tests</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">crawler</span><span class="p">:</span> <span class="n">BaseCrawler</span><span class="p">,</span> <span class="n">site_id</span><span class="p">:</span> <span class="nb">int</span><span class="p">):</span>
403 | <span class="w"> </span><span class="sd">"""</span>
404 | <span class="sd"> Check that default order equals +url, that url/size sorts are monotonic, and that "?" sort varies between calls.</span>
405 | <span class="sd"> """</span>
406 | <span class="n">sorted_default</span> <span class="o">=</span> <span class="n">crawler</span><span class="o">.</span><span class="n">get_resources_api</span><span class="p">(</span><span class="n">sites</span><span class="o">=</span><span class="p">[</span><span class="n">site_id</span><span class="p">],</span> <span class="n">limit</span><span class="o">=</span><span class="mi">3</span><span class="p">,</span> <span class="n">fields</span><span class="o">=</span><span class="p">[])</span>
407 | <span class="n">sorted_url_ascending</span> <span class="o">=</span> <span class="n">crawler</span><span class="o">.</span><span class="n">get_resources_api</span><span class="p">(</span><span class="n">sites</span><span class="o">=</span><span class="p">[</span><span class="n">site_id</span><span class="p">],</span> <span class="n">sort</span><span class="o">=</span><span class="s2">"+url"</span><span class="p">,</span> <span class="n">limit</span><span class="o">=</span><span class="mi">3</span><span class="p">,</span> <span class="n">fields</span><span class="o">=</span><span class="p">[])</span>
408 | <span class="n">sorted_url_descending</span> <span class="o">=</span> <span class="n">crawler</span><span class="o">.</span><span class="n">get_resources_api</span><span class="p">(</span><span class="n">sites</span><span class="o">=</span><span class="p">[</span><span class="n">site_id</span><span class="p">],</span> <span class="n">sort</span><span class="o">=</span><span class="s2">"-url"</span><span class="p">,</span> <span class="n">limit</span><span class="o">=</span><span class="mi">3</span><span class="p">,</span> <span class="n">fields</span><span class="o">=</span><span class="p">[])</span>
409 |
410 | <span class="bp">self</span><span class="o">.</span><span class="n">assertGreater</span><span class="p">(</span><span class="n">sorted_url_ascending</span><span class="o">.</span><span class="n">total</span><span class="p">,</span> <span class="mi">0</span><span class="p">,</span> <span class="s2">"Database should contain resources"</span><span class="p">)</span>
411 | <span class="bp">self</span><span class="o">.</span><span class="n">assertGreater</span><span class="p">(</span><span class="n">sorted_url_descending</span><span class="o">.</span><span class="n">total</span><span class="p">,</span> <span class="mi">0</span><span class="p">,</span> <span class="s2">"Database should contain resources"</span><span class="p">)</span>
412 | <span class="k">if</span> <span class="nb">len</span><span class="p">(</span><span class="n">sorted_default</span><span class="o">.</span><span class="n">_results</span><span class="p">)</span> <span class="o">></span> <span class="mi">0</span> <span class="ow">and</span> <span class="nb">len</span><span class="p">(</span><span class="n">sorted_url_ascending</span><span class="o">.</span><span class="n">_results</span><span class="p">)</span> <span class="o">></span> <span class="mi">0</span><span class="p">:</span>
413 | <span class="n">default_urls</span> <span class="o">=</span> <span class="p">[</span><span class="n">r</span><span class="o">.</span><span class="n">url</span> <span class="k">for</span> <span class="n">r</span> <span class="ow">in</span> <span class="n">sorted_default</span><span class="o">.</span><span class="n">_results</span><span class="p">]</span>
414 | <span class="n">ascending_urls</span> <span class="o">=</span> <span class="p">[</span><span class="n">r</span><span class="o">.</span><span class="n">url</span> <span class="k">for</span> <span class="n">r</span> <span class="ow">in</span> <span class="n">sorted_url_ascending</span><span class="o">.</span><span class="n">_results</span><span class="p">]</span>
415 | <span class="bp">self</span><span class="o">.</span><span class="n">assertEqual</span><span class="p">(</span><span class="n">default_urls</span><span class="p">,</span> <span class="n">ascending_urls</span><span class="p">,</span> <span class="s2">"Default sort should match +url sort"</span><span class="p">)</span>
416 |
417 | <span class="n">sorted_size_ascending</span> <span class="o">=</span> <span class="n">crawler</span><span class="o">.</span><span class="n">get_resources_api</span><span class="p">(</span><span class="n">sites</span><span class="o">=</span><span class="p">[</span><span class="n">site_id</span><span class="p">],</span> <span class="n">sort</span><span class="o">=</span><span class="s2">"+size"</span><span class="p">,</span> <span class="n">limit</span><span class="o">=</span><span class="mi">3</span><span class="p">,</span> <span class="n">fields</span><span class="o">=</span><span class="p">[</span><span class="s2">"size"</span><span class="p">])</span>
418 | <span class="n">sorted_size_descending</span> <span class="o">=</span> <span class="n">crawler</span><span class="o">.</span><span class="n">get_resources_api</span><span class="p">(</span><span class="n">sites</span><span class="o">=</span><span class="p">[</span><span class="n">site_id</span><span class="p">],</span> <span class="n">sort</span><span class="o">=</span><span class="s2">"-size"</span><span class="p">,</span> <span class="n">limit</span><span class="o">=</span><span class="mi">3</span><span class="p">,</span> <span class="n">fields</span><span class="o">=</span><span class="p">[</span><span class="s2">"size"</span><span class="p">])</span>
419 | <span class="k">if</span> <span class="nb">len</span><span class="p">(</span><span class="n">sorted_url_ascending</span><span class="o">.</span><span class="n">_results</span><span class="p">)</span> <span class="o">></span> <span class="mi">1</span><span class="p">:</span>
420 | <span class="k">for</span> <span class="n">i</span> <span class="ow">in</span> <span class="nb">range</span><span class="p">(</span><span class="nb">len</span><span class="p">(</span><span class="n">sorted_url_ascending</span><span class="o">.</span><span class="n">_results</span><span class="p">)</span> <span class="o">-</span> <span class="mi">1</span><span class="p">):</span>
421 | <span class="bp">self</span><span class="o">.</span><span class="n">assertLessEqual</span><span class="p">(</span><span class="n">sorted_url_ascending</span><span class="o">.</span><span class="n">_results</span><span class="p">[</span><span class="n">i</span><span class="p">]</span><span class="o">.</span><span class="n">url</span><span class="p">,</span>
422 | <span class="n">sorted_url_ascending</span><span class="o">.</span><span class="n">_results</span><span class="p">[</span><span class="n">i</span> <span class="o">+</span> <span class="mi">1</span><span class="p">]</span><span class="o">.</span><span class="n">url</span><span class="p">,</span> <span class="s2">"URLs should be ascending"</span><span class="p">)</span>
423 | <span class="k">if</span> <span class="nb">len</span><span class="p">(</span><span class="n">sorted_url_descending</span><span class="o">.</span><span class="n">_results</span><span class="p">)</span> <span class="o">></span> <span class="mi">1</span><span class="p">:</span>
424 | <span class="k">for</span> <span class="n">i</span> <span class="ow">in</span> <span class="nb">range</span><span class="p">(</span><span class="nb">len</span><span class="p">(</span><span class="n">sorted_url_descending</span><span class="o">.</span><span class="n">_results</span><span class="p">)</span> <span class="o">-</span> <span class="mi">1</span><span class="p">):</span>
425 | <span class="bp">self</span><span class="o">.</span><span class="n">assertGreaterEqual</span><span class="p">(</span><span class="n">sorted_url_descending</span><span class="o">.</span><span class="n">_results</span><span class="p">[</span><span class="n">i</span><span class="p">]</span><span class="o">.</span><span class="n">url</span><span class="p">,</span>
426 | <span class="n">sorted_url_descending</span><span class="o">.</span><span class="n">_results</span><span class="p">[</span><span class="n">i</span> <span class="o">+</span> <span class="mi">1</span><span class="p">]</span><span class="o">.</span><span class="n">url</span><span class="p">,</span> <span class="s2">"URLs should be descending"</span><span class="p">)</span>
427 | <span class="k">if</span> <span class="nb">len</span><span class="p">(</span><span class="n">sorted_size_ascending</span><span class="o">.</span><span class="n">_results</span><span class="p">)</span> <span class="o">></span> <span class="mi">1</span><span class="p">:</span>
428 | <span class="k">for</span> <span class="n">i</span> <span class="ow">in</span> <span class="nb">range</span><span class="p">(</span><span class="nb">len</span><span class="p">(</span><span class="n">sorted_size_ascending</span><span class="o">.</span><span class="n">_results</span><span class="p">)</span> <span class="o">-</span> <span class="mi">1</span><span class="p">):</span>
429 | <span class="bp">self</span><span class="o">.</span><span class="n">assertLessEqual</span><span class="p">(</span><span class="n">sorted_size_ascending</span><span class="o">.</span><span class="n">_results</span><span class="p">[</span><span class="n">i</span><span class="p">]</span><span class="o">.</span><span class="n">to_dict</span><span class="p">()[</span><span class="s2">"size"</span><span class="p">],</span>
430 | <span class="n">sorted_size_ascending</span><span class="o">.</span><span class="n">_results</span><span class="p">[</span><span class="n">i</span> <span class="o">+</span> <span class="mi">1</span><span class="p">]</span><span class="o">.</span><span class="n">to_dict</span><span class="p">()[</span><span class="s2">"size"</span><span class="p">],</span> <span class="s2">"Sizes should be ascending"</span><span class="p">)</span>
431 | <span class="k">if</span> <span class="nb">len</span><span class="p">(</span><span class="n">sorted_size_descending</span><span class="o">.</span><span class="n">_results</span><span class="p">)</span> <span class="o">></span> <span class="mi">1</span><span class="p">:</span>
432 | <span class="k">for</span> <span class="n">i</span> <span class="ow">in</span> <span class="nb">range</span><span class="p">(</span><span class="nb">len</span><span class="p">(</span><span class="n">sorted_size_descending</span><span class="o">.</span><span class="n">_results</span><span class="p">)</span> <span class="o">-</span> <span class="mi">1</span><span class="p">):</span>
433 | <span class="bp">self</span><span class="o">.</span><span class="n">assertGreaterEqual</span><span class="p">(</span><span class="n">sorted_size_descending</span><span class="o">.</span><span class="n">_results</span><span class="p">[</span><span class="n">i</span><span class="p">]</span><span class="o">.</span><span class="n">to_dict</span><span class="p">()[</span><span class="s2">"size"</span><span class="p">],</span>
434 | <span class="n">sorted_size_descending</span><span class="o">.</span><span class="n">_results</span><span class="p">[</span><span class="n">i</span> <span class="o">+</span> <span class="mi">1</span><span class="p">]</span><span class="o">.</span><span class="n">to_dict</span><span class="p">()[</span><span class="s2">"size"</span><span class="p">],</span> <span class="s2">"Sizes should be descending"</span><span class="p">)</span>
435 |
436 | <span class="n">random_1</span> <span class="o">=</span> <span class="n">crawler</span><span class="o">.</span><span class="n">get_resources_api</span><span class="p">(</span><span class="n">sites</span><span class="o">=</span><span class="p">[</span><span class="n">site_id</span><span class="p">],</span> <span class="n">sort</span><span class="o">=</span><span class="s2">"?"</span><span class="p">,</span> <span class="n">limit</span><span class="o">=</span><span class="mi">20</span><span class="p">,</span> <span class="n">fields</span><span class="o">=</span><span class="p">[])</span>
437 | <span class="n">random_2</span> <span class="o">=</span> <span class="n">crawler</span><span class="o">.</span><span class="n">get_resources_api</span><span class="p">(</span><span class="n">sites</span><span class="o">=</span><span class="p">[</span><span class="n">site_id</span><span class="p">],</span> <span class="n">sort</span><span class="o">=</span><span class="s2">"?"</span><span class="p">,</span> <span class="n">limit</span><span class="o">=</span><span class="mi">20</span><span class="p">,</span> <span class="n">fields</span><span class="o">=</span><span class="p">[])</span>
438 | <span class="bp">self</span><span class="o">.</span><span class="n">assertGreater</span><span class="p">(</span><span class="n">random_1</span><span class="o">.</span><span class="n">total</span><span class="p">,</span> <span class="mi">0</span><span class="p">,</span> <span class="s2">"Random sort should return results"</span><span class="p">)</span>
439 | <span class="k">if</span> <span class="n">random_1</span><span class="o">.</span><span class="n">total</span> <span class="o">>=</span> <span class="mi">10</span><span class="p">:</span>
440 | <span class="bp">self</span><span class="o">.</span><span class="n">assertNotEqual</span><span class="p">([</span><span class="n">r</span><span class="o">.</span><span class="n">id</span> <span class="k">for</span> <span class="n">r</span> <span class="ow">in</span> <span class="n">random_1</span><span class="o">.</span><span class="n">_results</span><span class="p">],</span> <span class="p">[</span><span class="n">r</span><span class="o">.</span><span class="n">id</span> <span class="k">for</span> <span class="n">r</span> <span class="ow">in</span> <span class="n">random_2</span><span class="o">.</span><span class="n">_results</span><span class="p">],</span>
441 | <span class="s2">"Random sort should produce different orders"</span><span class="p">)</span>
442 | <span class="k">else</span><span class="p">:</span>
443 | <span class="n">logger</span><span class="o">.</span><span class="n">info</span><span class="p">(</span><span class="sa">f</span><span class="s2">"Skip randomness verification: Not enough resources (</span><span class="si">{</span><span class="n">random_1</span><span class="o">.</span><span class="n">total</span><span class="si">}</span><span class="s2">)"</span><span class="p">)</span></div>
444 |
445 |
446 | <div class="viewcode-block" id="BaseCrawlerTests.run_pragmar_content_tests">
447 | <a class="viewcode-back" href="../../../../mcp_server_webcrawl.crawlers.base.html#mcp_server_webcrawl.crawlers.base.tests.BaseCrawlerTests.run_pragmar_content_tests">[docs]</a>
448 | <span class="k">def</span> <span class="nf">run_pragmar_content_tests</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">crawler</span><span class="p">:</span> <span class="n">BaseCrawler</span><span class="p">,</span> <span class="n">site_id</span><span class="p">:</span><span class="nb">int</span><span class="p">,</span> <span class="n">html_leniency</span><span class="p">:</span> <span class="nb">bool</span><span class="p">):</span>
449 | <span class="w"> </span><span class="sd">"""Check that page results contain HTML markup (skipped when html_leniency is true) and a Content-Type header, and that script/css type filters return matching types."""</span>
450 | <span class="n">html_resources</span> <span class="o">=</span> <span class="n">crawler</span><span class="o">.</span><span class="n">get_resources_api</span><span class="p">(</span>
451 | <span class="n">sites</span><span class="o">=</span><span class="p">[</span><span class="n">site_id</span><span class="p">],</span>
452 | <span class="n">query</span><span class="o">=</span> <span class="sa">f</span><span class="s2">"type: </span><span class="si">{</span><span class="n">ResourceResultType</span><span class="o">.</span><span class="n">PAGE</span><span class="o">.</span><span class="n">value</span><span class="si">}</span><span class="s2">"</span><span class="p">,</span>
453 | <span class="n">fields</span><span class="o">=</span><span class="p">[</span><span class="s2">"content"</span><span class="p">,</span> <span class="s2">"headers"</span><span class="p">]</span>
454 | <span class="p">)</span>
455 |
456 | <span class="bp">self</span><span class="o">.</span><span class="n">assertGreater</span><span class="p">(</span><span class="n">html_resources</span><span class="o">.</span><span class="n">total</span><span class="p">,</span> <span class="mi">0</span><span class="p">,</span> <span class="s2">"Should find HTML resources"</span><span class="p">)</span>
457 | <span class="k">for</span> <span class="n">resource</span> <span class="ow">in</span> <span class="n">html_resources</span><span class="o">.</span><span class="n">_results</span><span class="p">:</span>
458 | <span class="n">resource_dict</span> <span class="o">=</span> <span class="n">resource</span><span class="o">.</span><span class="n">to_dict</span><span class="p">()</span>
459 | <span class="k">if</span> <span class="s2">"content"</span> <span class="ow">in</span> <span class="n">resource_dict</span><span class="p">:</span>
460 | <span class="n">content</span> <span class="o">=</span> <span class="n">resource_dict</span><span class="p">[</span><span class="s2">"content"</span><span class="p">]</span><span class="o">.</span><span class="n">lower</span><span class="p">()</span>
461 | <span class="bp">self</span><span class="o">.</span><span class="n">assertTrue</span><span class="p">(</span>
462 | <span class="s2">"<!doctype html"</span> <span class="ow">in</span> <span class="n">content</span> <span class="ow">or</span> <span class="c1"># content is lowercased above, so the needle must be lowercase too</span>
463 | <span class="s2">"<html"</span> <span class="ow">in</span> <span class="n">content</span> <span class="ow">or</span>
464 | <span class="s2">"<meta"</span> <span class="ow">in</span> <span class="n">content</span> <span class="ow">or</span>
465 | <span class="n">html_leniency</span><span class="p">,</span>
466 | <span class="sa">f</span><span class="s2">"HTML content should contain HTML markup: </span><span class="si">{</span><span class="n">resource</span><span class="o">.</span><span class="n">url</span><span class="si">}</span><span class="se">\n\n</span><span class="si">{</span><span class="n">resource</span><span class="o">.</span><span class="n">content</span><span class="si">}</span><span class="s2">"</span>
467 | <span class="p">)</span>
468 |
469 | <span class="k">if</span> <span class="s2">"headers"</span> <span class="ow">in</span> <span class="n">resource_dict</span> <span class="ow">and</span> <span class="n">resource_dict</span><span class="p">[</span><span class="s2">"headers"</span><span class="p">]:</span>
470 | <span class="bp">self</span><span class="o">.</span><span class="n">assertTrue</span><span class="p">(</span>
471 | <span class="s2">"Content-Type:"</span> <span class="ow">in</span> <span class="n">resource_dict</span><span class="p">[</span><span class="s2">"headers"</span><span class="p">],</span>
472 | <span class="sa">f</span><span class="s2">"Headers should contain Content-Type: </span><span class="si">{</span><span class="n">resource</span><span class="o">.</span><span class="n">url</span><span class="si">}</span><span class="s2">"</span>
473 | <span class="p">)</span>
474 |
475 | <span class="c1"># script content detection</span>
476 | <span class="n">script_resources</span> <span class="o">=</span> <span class="n">crawler</span><span class="o">.</span><span class="n">get_resources_api</span><span class="p">(</span>
477 | <span class="n">sites</span><span class="o">=</span><span class="p">[</span><span class="n">site_id</span><span class="p">],</span>
478 | <span class="n">query</span><span class="o">=</span> <span class="sa">f</span><span class="s2">"type: </span><span class="si">{</span><span class="n">ResourceResultType</span><span class="o">.</span><span class="n">SCRIPT</span><span class="o">.</span><span class="n">value</span><span class="si">}</span><span class="s2">"</span><span class="p">,</span>
479 | <span class="n">fields</span><span class="o">=</span><span class="p">[</span><span class="s2">"content"</span><span class="p">,</span> <span class="s2">"headers"</span><span class="p">],</span>
480 | <span class="n">limit</span><span class="o">=</span><span class="mi">1</span><span class="p">,</span>
481 | <span class="p">)</span>
482 | <span class="k">if</span> <span class="n">script_resources</span><span class="o">.</span><span class="n">total</span> <span class="o">></span> <span class="mi">0</span><span class="p">:</span>
483 | <span class="k">for</span> <span class="n">resource</span> <span class="ow">in</span> <span class="n">script_resources</span><span class="o">.</span><span class="n">_results</span><span class="p">:</span>
484 | <span class="bp">self</span><span class="o">.</span><span class="n">assertEqual</span><span class="p">(</span><span class="n">resource</span><span class="o">.</span><span class="n">type</span><span class="p">,</span> <span class="n">ResourceResultType</span><span class="o">.</span><span class="n">SCRIPT</span><span class="p">)</span>
485 |
486 | <span class="c1"># css content detection</span>
487 | <span class="n">css_resources</span> <span class="o">=</span> <span class="n">crawler</span><span class="o">.</span><span class="n">get_resources_api</span><span class="p">(</span>
488 | <span class="n">sites</span><span class="o">=</span><span class="p">[</span><span class="n">site_id</span><span class="p">],</span>
489 | <span class="n">query</span><span class="o">=</span> <span class="sa">f</span><span class="s2">"type: </span><span class="si">{</span><span class="n">ResourceResultType</span><span class="o">.</span><span class="n">CSS</span><span class="o">.</span><span class="n">value</span><span class="si">}</span><span class="s2">"</span><span class="p">,</span>
490 | <span class="n">fields</span><span class="o">=</span><span class="p">[</span><span class="s2">"content"</span><span class="p">,</span> <span class="s2">"headers"</span><span class="p">],</span>
491 | <span class="n">limit</span><span class="o">=</span><span class="mi">1</span><span class="p">,</span>
492 | <span class="p">)</span>
493 | <span class="k">if</span> <span class="n">css_resources</span><span class="o">.</span><span class="n">total</span> <span class="o">></span> <span class="mi">0</span><span class="p">:</span>
494 | <span class="k">for</span> <span class="n">resource</span> <span class="ow">in</span> <span class="n">css_resources</span><span class="o">.</span><span class="n">_results</span><span class="p">:</span>
495 | <span class="bp">self</span><span class="o">.</span><span class="n">assertEqual</span><span class="p">(</span><span class="n">resource</span><span class="o">.</span><span class="n">type</span><span class="p">,</span> <span class="n">ResourceResultType</span><span class="o">.</span><span class="n">CSS</span><span class="p">)</span></div>
496 |
497 |
498 | <div class="viewcode-block" id="BaseCrawlerTests.run_pragmar_report">
499 | <a class="viewcode-back" href="../../../../mcp_server_webcrawl.crawlers.base.html#mcp_server_webcrawl.crawlers.base.tests.BaseCrawlerTests.run_pragmar_report">[docs]</a>
500 | <span class="k">def</span> <span class="nf">run_pragmar_report</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">crawler</span><span class="p">:</span> <span class="n">BaseCrawler</span><span class="p">,</span> <span class="n">site_id</span><span class="p">:</span> <span class="nb">int</span><span class="p">,</span> <span class="n">heading</span><span class="p">:</span> <span class="nb">str</span><span class="p">):</span>
501 | <span class="w"> </span><span class="sd">"""</span>
502 | <span class="sd"> Generate a comprehensive report of all resources for a site.</span>
503 | <span class="sd"> Returns a formatted string with counts and URLs by type.</span>
504 | <span class="sd"> """</span>
505 |
506 | <span class="n">site_resources</span> <span class="o">=</span> <span class="n">crawler</span><span class="o">.</span><span class="n">get_resources_api</span><span class="p">(</span>
507 | <span class="n">sites</span><span class="o">=</span><span class="p">[</span><span class="n">site_id</span><span class="p">],</span>
508 | <span class="n">query</span><span class="o">=</span><span class="s2">""</span><span class="p">,</span>
509 | <span class="n">limit</span><span class="o">=</span><span class="mi">100</span><span class="p">,</span>
510 | <span class="p">)</span>
511 |
512 | <span class="n">html_resources</span> <span class="o">=</span> <span class="n">crawler</span><span class="o">.</span><span class="n">get_resources_api</span><span class="p">(</span>
513 | <span class="n">sites</span><span class="o">=</span><span class="p">[</span><span class="n">site_id</span><span class="p">],</span>
514 | <span class="n">query</span><span class="o">=</span><span class="sa">f</span><span class="s2">"type: </span><span class="si">{</span><span class="n">ResourceResultType</span><span class="o">.</span><span class="n">PAGE</span><span class="o">.</span><span class="n">value</span><span class="si">}</span><span class="s2">"</span><span class="p">,</span>
515 | <span class="n">limit</span><span class="o">=</span><span class="mi">100</span><span class="p">,</span>
516 | <span class="p">)</span>
517 |
518 | <span class="n">css_resources</span> <span class="o">=</span> <span class="n">crawler</span><span class="o">.</span><span class="n">get_resources_api</span><span class="p">(</span>
519 | <span class="n">sites</span><span class="o">=</span><span class="p">[</span><span class="n">site_id</span><span class="p">],</span>
520 | <span class="n">query</span><span class="o">=</span><span class="sa">f</span><span class="s2">"type: </span><span class="si">{</span><span class="n">ResourceResultType</span><span class="o">.</span><span class="n">CSS</span><span class="o">.</span><span class="n">value</span><span class="si">}</span><span class="s2">"</span><span class="p">,</span>
521 | <span class="n">limit</span><span class="o">=</span><span class="mi">100</span><span class="p">,</span>
522 | <span class="p">)</span>
523 |
524 | <span class="n">js_resources</span> <span class="o">=</span> <span class="n">crawler</span><span class="o">.</span><span class="n">get_resources_api</span><span class="p">(</span>
525 | <span class="n">sites</span><span class="o">=</span><span class="p">[</span><span class="n">site_id</span><span class="p">],</span>
526 | <span class="n">query</span><span class="o">=</span><span class="sa">f</span><span class="s2">"type: </span><span class="si">{</span><span class="n">ResourceResultType</span><span class="o">.</span><span class="n">SCRIPT</span><span class="o">.</span><span class="n">value</span><span class="si">}</span><span class="s2">"</span><span class="p">,</span>
527 | <span class="n">limit</span><span class="o">=</span><span class="mi">100</span><span class="p">,</span>
528 | <span class="p">)</span>
529 |
530 | <span class="n">image_resources</span> <span class="o">=</span> <span class="n">crawler</span><span class="o">.</span><span class="n">get_resources_api</span><span class="p">(</span>
531 | <span class="n">sites</span><span class="o">=</span><span class="p">[</span><span class="n">site_id</span><span class="p">],</span>
532 | <span class="n">query</span><span class="o">=</span><span class="sa">f</span><span class="s2">"type: </span><span class="si">{</span><span class="n">ResourceResultType</span><span class="o">.</span><span class="n">IMAGE</span><span class="o">.</span><span class="n">value</span><span class="si">}</span><span class="s2">"</span><span class="p">,</span>
533 | <span class="n">limit</span><span class="o">=</span><span class="mi">100</span><span class="p">,</span>
534 | <span class="p">)</span>
535 |
536 | <span class="n">mcp_resources</span> <span class="o">=</span> <span class="n">crawler</span><span class="o">.</span><span class="n">get_resources_api</span><span class="p">(</span>
537 | <span class="n">sites</span><span class="o">=</span><span class="p">[</span><span class="n">site_id</span><span class="p">],</span>
538 | <span class="n">query</span><span class="o">=</span><span class="sa">f</span><span class="s2">"type: html AND (mcp)"</span><span class="p">,</span>
539 | <span class="n">limit</span><span class="o">=</span><span class="mi">100</span><span class="p">,</span>
540 | <span class="p">)</span>
541 |
542 | <span class="n">report_lines</span> <span class="o">=</span> <span class="p">[]</span>
543 | <span class="n">sections</span> <span class="o">=</span> <span class="p">[</span>
544 | <span class="p">(</span><span class="s2">"Total pages"</span><span class="p">,</span> <span class="n">site_resources</span><span class="p">),</span>
545 | <span class="p">(</span><span class="s2">"Total HTML"</span><span class="p">,</span> <span class="n">html_resources</span><span class="p">),</span>
546 | <span class="p">(</span><span class="s2">"Total MCP search hits"</span><span class="p">,</span> <span class="n">mcp_resources</span><span class="p">),</span>
547 | <span class="p">(</span><span class="s2">"Total CSS"</span><span class="p">,</span> <span class="n">css_resources</span><span class="p">),</span>
548 | <span class="p">(</span><span class="s2">"Total JS"</span><span class="p">,</span> <span class="n">js_resources</span><span class="p">),</span>
549 | <span class="p">(</span><span class="s2">"Total Images"</span><span class="p">,</span> <span class="n">image_resources</span><span class="p">)</span>
550 | <span class="p">]</span>
551 |
552 | <span class="k">for</span> <span class="n">i</span><span class="p">,</span> <span class="p">(</span><span class="n">section_name</span><span class="p">,</span> <span class="n">resource_obj</span><span class="p">)</span> <span class="ow">in</span> <span class="nb">enumerate</span><span class="p">(</span><span class="n">sections</span><span class="p">):</span>
553 | <span class="n">report_lines</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="sa">f</span><span class="s2">"</span><span class="si">{</span><span class="n">section_name</span><span class="si">}</span><span class="s2">: </span><span class="si">{</span><span class="n">resource_obj</span><span class="o">.</span><span class="n">total</span><span class="si">}</span><span class="s2">"</span><span class="p">)</span>
554 | <span class="k">for</span> <span class="n">resource</span> <span class="ow">in</span> <span class="n">resource_obj</span><span class="o">.</span><span class="n">_results</span><span class="p">:</span>
555 | <span class="n">report_lines</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="n">resource</span><span class="o">.</span><span class="n">url</span><span class="p">)</span>
556 | <span class="k">if</span> <span class="n">i</span> <span class="o"><</span> <span class="nb">len</span><span class="p">(</span><span class="n">sections</span><span class="p">)</span> <span class="o">-</span> <span class="mi">1</span><span class="p">:</span>
557 | <span class="n">report_lines</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="s2">""</span><span class="p">)</span>
558 |
559 | <span class="n">now</span> <span class="o">=</span> <span class="n">datetime</span><span class="o">.</span><span class="n">now</span><span class="p">()</span>
560 | <span class="n">lines_together</span> <span class="o">=</span> <span class="s2">"</span><span class="se">\n</span><span class="s2">"</span><span class="o">.</span><span class="n">join</span><span class="p">(</span><span class="n">report_lines</span><span class="p">)</span>
561 |
562 | <span class="k">return</span> <span class="sa">f</span><span class="s2">"""</span>
563 | <span class="s2">**********************************************************************************</span>
564 | <span class="s2">* </span><span class="si">{</span><span class="n">heading</span><span class="si">}</span><span class="s2"> </span><span class="si">{</span><span class="n">now</span><span class="o">.</span><span class="n">isoformat</span><span class="p">()</span><span class="si">}</span><span class="s2"> *</span>
565 | <span class="s2">**********************************************************************************</span>
566 | <span class="si">{</span><span class="n">lines_together</span><span class="si">}</span>
567 | <span class="s2">"""</span></div>
568 |
def __run_pragmar_search_tests_field_status(self, crawler: BaseCrawler, site_id: int) -> None:
    """
    Exercise ``status:`` field queries against the crawler's resources API.

    Three searches are run: a plain ``status: 200`` filter, the same filter
    combined with a ``url:`` prefix wildcard, and an OR of two status codes.
    Totals and the ``status`` of every returned resource are asserted.

    Args:
        crawler: crawler under test; only ``get_resources_api`` is called.
        site_id: site the first two queries are restricted to.
    """

    # plain status code filtering
    status_resources = crawler.get_resources_api(
        sites=[site_id],
        query="status: 200",
        limit=5,
    )
    # assertGreater reports the actual total on failure, assertTrue(total > 0) does not
    self.assertGreater(status_resources.total, 0, "Status filtering should return results")
    for resource in status_resources._results:
        self.assertEqual(resource.status, 200)

    # status code filtering combined with a URL prefix wildcard
    appstat_resources = crawler.get_resources_api(
        sites=[site_id],
        query="status: 200 AND url: https://pragmar.com/appstat*",
        limit=5,
    )
    self.assertGreater(appstat_resources.total, 0, "Status filtering should return results")
    self.assertGreaterEqual(len(appstat_resources._results), 3, "Should have at least 3 results in appstat resources")

    # multiple status codes
    # NOTE(review): unlike the two queries above, this call passes no sites=
    # argument -- confirm that searching without a site scope is intentional.
    multi_status_resources = crawler.get_resources_api(
        query="status: 200 OR status: 404",
    )
    if multi_status_resources.total > 0:
        found_statuses = {r.status for r in multi_status_resources._results}
        for status in found_statuses:
            self.assertIn(status, [200, 404])
598 |
def __run_pragmar_search_tests_field_headers(self, crawler: BaseCrawler, site_id: int) -> None:
    """
    Exercise ``headers:`` field queries against the crawler's resources API.

    The keyword ``appstat`` is searched on its own, with
    ``AND headers: javascript`` and with ``NOT headers: javascript``; the two
    filtered totals must add up to the unfiltered total.

    The method returns without asserting anything unless the running test
    class is named ``InterroBotTests``, ``KatanaTests`` or ``WarcTests``.

    Args:
        crawler: crawler under test; only ``get_resources_api`` is called.
        site_id: site every query is restricted to.
    """

    # supported crawls only (genuine headers data)
    if self.__class__.__name__ not in ("InterroBotTests", "KatanaTests", "WarcTests"):
        return

    # only .total is read from each result below
    appstat_any = crawler.get_resources_api(
        sites=[site_id],
        query="appstat",
        extras=[],
        limit=1,
    )

    appstat_headers_js = crawler.get_resources_api(
        sites=[site_id],
        query="appstat AND headers: javascript",
        extras=[],
        limit=1,
    )

    # https://pragmar.com/media/static/scripts/js/appstat.min.js
    self.assertEqual(appstat_headers_js.total, 1, "Should have exactly one resource in database (appstat.min.js)")

    appstat_headers_nojs = crawler.get_resources_api(
        sites=[site_id],
        query="appstat NOT headers: javascript",
        extras=[],
        limit=1,
    )
    self.assertGreater(appstat_headers_nojs.total, 1, "Should have many appstat non-js resources in database")

    # the AND / NOT partitions must cover the keyword-only result set exactly
    appstat_sum: int = appstat_headers_js.total + appstat_headers_nojs.total
    self.assertEqual(appstat_sum, appstat_any.total, "appstat non-js + js resources should sum to all appstat")
632 |
def __run_pragmar_search_tests_field_content(self, crawler: BaseCrawler, site_id: int) -> None:
    """
    Exercise ``content:`` field queries against the crawler's resources API.

    Checks that ``mcp AND content: configuration`` and
    ``mcp NOT content: configuration`` partition the plain ``mcp`` results,
    that adding a ``type: html`` constraint never increases the total, and
    that a ``config*`` wildcard matches at least as many resources as the
    exact term ``configuration``.

    Args:
        crawler: crawler under test; only ``get_resources_api`` is called.
        site_id: site every query is restricted to.
    """

    # only .total is read from each result below
    mcp_any = crawler.get_resources_api(
        sites=[site_id],
        query="mcp",
        extras=[],
        limit=1,
    )

    mcp_content_configuration = crawler.get_resources_api(
        sites=[site_id],
        query="mcp AND content: configuration",
        extras=[],
        limit=1,
    )

    # https://pragmar.com/mcp-server-webcrawl/
    self.assertGreaterEqual(mcp_content_configuration.total, 1, "Should have one, possibly more resources (mcp-server-webcrawl)")

    mcp_content_no_configuration = crawler.get_resources_api(
        sites=[site_id],
        query="mcp NOT content: configuration",
        extras=[],
        limit=1,
    )
    self.assertGreater(mcp_content_no_configuration.total, 1, "Should have many mcp non-configuration resources")

    # the AND / NOT partitions must cover the keyword-only result set exactly
    mcp_sum: int = mcp_content_configuration.total + mcp_content_no_configuration.total
    self.assertEqual(mcp_sum, mcp_any.total, "mcp non-config + config resources should sum to all mcp")

    mcp_html_content_config = crawler.get_resources_api(
        sites=[site_id],
        query="type: html AND mcp AND content: configuration",
        extras=[],
        limit=1,
    )
    # assertLessEqual shows both totals on failure, assertTrue(a <= b) does not
    self.assertLessEqual(
        mcp_html_content_config.total,
        mcp_content_configuration.total,
        "Adding type constraint should not increase results"
    )

    wildcard_content_search = crawler.get_resources_api(
        sites=[site_id],
        query='content: config*',
        extras=[],
        limit=1,
    )
    exact_config_search = crawler.get_resources_api(
        sites=[site_id],
        query='content: configuration',
        extras=[],
        limit=1,
    )
    self.assertGreaterEqual(
        wildcard_content_search.total,
        exact_config_search.total,
        "Wildcard content search should return at least as many results as exact match"
    )
690 |
def __run_pragmar_search_tests_field_type(self, crawler: BaseCrawler, site_id: int, site_resources: BaseJsonApi) -> None:
    """
    Exercise ``type:`` field queries against the crawler's resources API.

    Covers four areas: ``type: html`` versus ``NOT type: html`` partitioning
    the whole site, a keyword combined with ``type: script`` and its negation,
    an OR of two types, and a keyword + type query whose returned resources
    are checked for site and type.

    Args:
        crawler: crawler under test; only ``get_resources_api`` is called.
        site_id: site every query is restricted to.
        site_resources: result object whose ``total`` is taken as the count
            of all resources for the site.
    """

    html_resources = crawler.get_resources_api(
        sites=[site_id],
        query="type: html",
        extras=[],
        limit=1,
    )

    # page count varies by crawler, 10 is conservative low end
    self.assertGreater(html_resources.total, 10, "Should have greater than 10 HTML resources")

    not_html_resources = crawler.get_resources_api(
        sites=[site_id],
        query="NOT type: html",
        extras=[],
        limit=1,
    )
    # NOTE(review): the previous comment here read "wget is HTML-only fixture",
    # which conflicts with requiring more than 10 non-HTML resources -- confirm
    # whether wget runs reach this assertion.
    self.assertGreater(not_html_resources.total, 10, "Should have greater than 10 non-HTML resources")

    # html / not-html must partition the full site result set
    html_sum: int = html_resources.total + not_html_resources.total
    self.assertEqual(html_sum, site_resources.total, "HTML + non-HTML should sum to all resources")

    # keyword + type combination
    appstat_any = crawler.get_resources_api(
        sites=[site_id],
        query="appstat",
        limit=10,
    )

    appstat_script = crawler.get_resources_api(
        sites=[site_id],
        query="appstat AND type: script",
        extras=[],
        limit=1,
    )

    # https://pragmar.com/media/static/scripts/js/appstat.min.js
    self.assertEqual(appstat_script.total, 1, "Should have exactly one appstat script (appstat.min.js)")

    appstat_not_script = crawler.get_resources_api(
        sites=[site_id],
        query="appstat NOT type: script",
        extras=[],
        limit=1,
    )
    self.assertGreater(appstat_not_script.total, 1, "Should have many appstat non-script resources")

    appstat_sum: int = appstat_script.total + appstat_not_script.total
    self.assertEqual(appstat_sum, appstat_any.total, "appstat script + non-script should sum to all appstat")

    # type OR combinations
    html_or_img = crawler.get_resources_api(
        sites=[site_id],
        query="type: html OR type: img",
        extras=[],
        limit=1,
    )

    self.assertGreater(html_or_img.total, 20, "HTML + IMG should be greater than 20 resources")

    img_resources = crawler.get_resources_api(
        sites=[site_id],
        query="type: img",
        extras=[],
        limit=1,
    )
    # assertGreaterEqual shows both totals on failure, assertTrue(a >= b) does not
    self.assertGreaterEqual(
        html_or_img.total,
        html_resources.total,
        "OR should include all HTML resources"
    )
    self.assertGreaterEqual(
        html_or_img.total,
        img_resources.total,
        "OR should include all IMG resources"
    )

    # combined filtering
    combined_resources = crawler.get_resources_api(
        sites=[site_id],
        query=f"style AND type: {ResourceResultType.PAGE.value}",
        fields=[],
        sort="+url",
        limit=3,
    )

    if combined_resources.total > 0:
        for resource in combined_resources._results:
            self.assertEqual(resource.site, site_id)
            self.assertEqual(resource.type, ResourceResultType.PAGE)
781 |
782 | <span class="k">def</span> <span class="nf">__run_pragmar_search_tests_fulltext</span><span class="p">(</span>
783 | <span class="bp">self</span><span class="p">,</span>
784 | <span class="n">crawler</span><span class="p">:</span> <span class="n">BaseCrawler</span><span class="p">,</span>
785 | <span class="n">site_id</span><span class="p">:</span> <span class="nb">int</span><span class="p">,</span>
786 | <span class="n">site_resources</span><span class="p">:</span><span class="n">BaseJsonApi</span>
787 | <span class="p">)</span> <span class="o">-></span> <span class="kc">None</span><span class="p">:</span>
788 |
789 | <span class="c1"># Boolean workout</span>
790 | <span class="c1"># result counts are fragile, intersections should not be</span>
791 | <span class="c1"># counts are worth the fragility, for now</span>
792 |
793 | <span class="n">boolean_primary_resources</span> <span class="o">=</span> <span class="n">crawler</span><span class="o">.</span><span class="n">get_resources_api</span><span class="p">(</span>
794 | <span class="n">sites</span><span class="o">=</span><span class="p">[</span><span class="n">site_id</span><span class="p">],</span>
795 | <span class="n">query</span><span class="o">=</span><span class="sa">f</span><span class="s2">"type: html AND (</span><span class="si">{</span><span class="bp">self</span><span class="o">.</span><span class="n">__PRAGMAR_PRIMARY_KEYWORD</span><span class="si">}</span><span class="s2">)"</span><span class="p">,</span>
796 | <span class="n">limit</span><span class="o">=</span><span class="mi">4</span><span class="p">,</span>
797 | <span class="p">)</span>
798 |
799 | <span class="c1"># varies by crawler, katana doesn't crawl /help/ depth by default</span>
800 | <span class="bp">self</span><span class="o">.</span><span class="n">assertTrue</span><span class="p">(</span><span class="n">boolean_primary_resources</span> <span class="o">.</span><span class="n">total</span> <span class="o">></span> <span class="mi">0</span><span class="p">,</span> <span class="sa">f</span><span class="s2">"Primary search should return results"</span><span class="p">)</span>
801 |
802 | <span class="n">boolean_secondary_resources</span> <span class="o">=</span> <span class="n">crawler</span><span class="o">.</span><span class="n">get_resources_api</span><span class="p">(</span>
803 | <span class="n">sites</span><span class="o">=</span><span class="p">[</span><span class="n">site_id</span><span class="p">],</span>
804 | <span class="n">query</span><span class="o">=</span><span class="sa">f</span><span class="s2">"type: html AND (</span><span class="si">{</span><span class="bp">self</span><span class="o">.</span><span class="n">__PRAGMAR_SECONDARY_KEYWORD</span><span class="si">}</span><span class="s2">)"</span><span class="p">,</span>
805 | <span class="n">limit</span><span class="o">=</span><span class="mi">12</span><span class="p">,</span>
806 | <span class="p">)</span>
807 |
808 | <span class="c1"># re: all these > 0 checks, result counts vary by crawler, all have default crawl behaviors/depths/externals</span>
809 | <span class="bp">self</span><span class="o">.</span><span class="n">assertTrue</span><span class="p">(</span><span class="n">boolean_secondary_resources</span><span class="o">.</span><span class="n">total</span> <span class="o">></span> <span class="mi">0</span><span class="p">,</span> <span class="sa">f</span><span class="s2">"Secondary search should return results"</span><span class="p">)</span>
810 |
811 | <span class="c1"># AND</span>
812 | <span class="n">primary_and_secondary_resources</span> <span class="o">=</span> <span class="n">crawler</span><span class="o">.</span><span class="n">get_resources_api</span><span class="p">(</span>
813 | <span class="n">sites</span><span class="o">=</span><span class="p">[</span><span class="n">site_id</span><span class="p">],</span>
814 | <span class="n">query</span><span class="o">=</span><span class="sa">f</span><span class="s2">"type: html AND (</span><span class="si">{</span><span class="bp">self</span><span class="o">.</span><span class="n">__PRAGMAR_PRIMARY_KEYWORD</span><span class="si">}</span><span class="s2"> AND </span><span class="si">{</span><span class="bp">self</span><span class="o">.</span><span class="n">__PRAGMAR_SECONDARY_KEYWORD</span><span class="si">}</span><span class="s2">)"</span><span class="p">,</span>
815 | <span class="n">limit</span><span class="o">=</span><span class="mi">1</span><span class="p">,</span>
816 | <span class="p">)</span>
817 | <span class="bp">self</span><span class="o">.</span><span class="n">assertTrue</span><span class="p">(</span><span class="n">primary_and_secondary_resources</span><span class="o">.</span><span class="n">total</span> <span class="o">>=</span> <span class="mi">0</span><span class="p">,</span> <span class="sa">f</span><span class="s2">"Primary AND Secondary should return results"</span><span class="p">)</span>
818 |
819 | <span class="c1"># OR</span>
820 | <span class="n">primary_or_secondary_resources</span> <span class="o">=</span> <span class="n">crawler</span><span class="o">.</span><span class="n">get_resources_api</span><span class="p">(</span>
821 | <span class="n">sites</span><span class="o">=</span><span class="p">[</span><span class="n">site_id</span><span class="p">],</span>
822 | <span class="n">query</span><span class="o">=</span><span class="sa">f</span><span class="s2">"type: html AND (</span><span class="si">{</span><span class="bp">self</span><span class="o">.</span><span class="n">__PRAGMAR_PRIMARY_KEYWORD</span><span class="si">}</span><span class="s2"> OR </span><span class="si">{</span><span class="bp">self</span><span class="o">.</span><span class="n">__PRAGMAR_SECONDARY_KEYWORD</span><span class="si">}</span><span class="s2">)"</span><span class="p">,</span>
823 | <span class="n">limit</span><span class="o">=</span><span class="mi">1</span><span class="p">,</span>
824 | <span class="p">)</span>
825 | <span class="bp">self</span><span class="o">.</span><span class="n">assertTrue</span><span class="p">(</span><span class="n">primary_or_secondary_resources</span><span class="o">.</span><span class="n">total</span> <span class="o">></span> <span class="mi">0</span><span class="p">,</span> <span class="sa">f</span><span class="s2">"Primary OR Secondary should return results (union)"</span><span class="p">)</span>
826 |
827 | <span class="c1"># NOT</span>
828 | <span class="n">primary_not_secondary_resources</span> <span class="o">=</span> <span class="n">crawler</span><span class="o">.</span><span class="n">get_resources_api</span><span class="p">(</span>
829 | <span class="n">sites</span><span class="o">=</span><span class="p">[</span><span class="n">site_id</span><span class="p">],</span>
830 | <span class="n">query</span><span class="o">=</span><span class="sa">f</span><span class="s2">"type: html AND (</span><span class="si">{</span><span class="bp">self</span><span class="o">.</span><span class="n">__PRAGMAR_PRIMARY_KEYWORD</span><span class="si">}</span><span class="s2"> NOT </span><span class="si">{</span><span class="bp">self</span><span class="o">.</span><span class="n">__PRAGMAR_SECONDARY_KEYWORD</span><span class="si">}</span><span class="s2">)"</span><span class="p">,</span>
831 | <span class="n">limit</span><span class="o">=</span><span class="mi">1</span><span class="p">,</span>
832 | <span class="p">)</span>
833 |
834 | <span class="n">secondary_not_primary_resources</span> <span class="o">=</span> <span class="n">crawler</span><span class="o">.</span><span class="n">get_resources_api</span><span class="p">(</span>
835 | <span class="n">sites</span><span class="o">=</span><span class="p">[</span><span class="n">site_id</span><span class="p">],</span>
836 | <span class="n">query</span><span class="o">=</span><span class="sa">f</span><span class="s2">"type: html AND (</span><span class="si">{</span><span class="bp">self</span><span class="o">.</span><span class="n">__PRAGMAR_SECONDARY_KEYWORD</span><span class="si">}</span><span class="s2"> NOT </span><span class="si">{</span><span class="bp">self</span><span class="o">.</span><span class="n">__PRAGMAR_PRIMARY_KEYWORD</span><span class="si">}</span><span class="s2">)"</span><span class="p">,</span>
837 | <span class="n">limit</span><span class="o">=</span><span class="mi">1</span><span class="p">,</span>
838 | <span class="p">)</span>
839 | <span class="bp">self</span><span class="o">.</span><span class="n">assertTrue</span><span class="p">(</span><span class="n">secondary_not_primary_resources</span><span class="o">.</span><span class="n">total</span> <span class="o">>=</span> <span class="mi">0</span><span class="p">,</span> <span class="sa">f</span><span class="s2">"Secondary NOT Primary should return results"</span><span class="p">)</span>
840 |
841 | <span class="c1"># logical relationships</span>
842 | <span class="bp">self</span><span class="o">.</span><span class="n">assertEqual</span><span class="p">(</span>
843 | <span class="n">primary_and_secondary_resources</span><span class="o">.</span><span class="n">total</span><span class="p">,</span>
844 | <span class="n">boolean_primary_resources</span> <span class="o">.</span><span class="n">total</span> <span class="o">+</span> <span class="n">boolean_secondary_resources</span><span class="o">.</span><span class="n">total</span> <span class="o">-</span> <span class="n">primary_or_secondary_resources</span><span class="o">.</span><span class="n">total</span><span class="p">,</span>
845 | <span class="s2">"Intersection should equal A + B - Union (inclusion-exclusion principle)"</span>
846 | <span class="p">)</span>
847 |
848 | <span class="bp">self</span><span class="o">.</span><span class="n">assertEqual</span><span class="p">(</span>
849 | <span class="n">primary_not_secondary_resources</span><span class="o">.</span><span class="n">total</span> <span class="o">+</span> <span class="n">primary_and_secondary_resources</span><span class="o">.</span><span class="n">total</span><span class="p">,</span>
850 | <span class="n">boolean_primary_resources</span> <span class="o">.</span><span class="n">total</span><span class="p">,</span>
851 | <span class="s2">"Primary NOT Secondary + Primary AND Secondary should equal total Primary results"</span>
852 | <span class="p">)</span>
853 |
854 | <span class="bp">self</span><span class="o">.</span><span class="n">assertEqual</span><span class="p">(</span>
855 | <span class="n">secondary_not_primary_resources</span><span class="o">.</span><span class="n">total</span> <span class="o">+</span> <span class="n">primary_and_secondary_resources</span><span class="o">.</span><span class="n">total</span><span class="p">,</span>
856 | <span class="n">boolean_secondary_resources</span><span class="o">.</span><span class="n">total</span><span class="p">,</span>
857 | <span class="s2">"Secondary NOT Primary + Primary AND Secondary should equal total Secondary results"</span>
858 | <span class="p">)</span>
859 |
860 | <span class="bp">self</span><span class="o">.</span><span class="n">assertEqual</span><span class="p">(</span>
861 | <span class="n">primary_not_secondary_resources</span><span class="o">.</span><span class="n">total</span> <span class="o">+</span> <span class="n">secondary_not_primary_resources</span><span class="o">.</span><span class="n">total</span> <span class="o">+</span> <span class="n">primary_and_secondary_resources</span><span class="o">.</span><span class="n">total</span><span class="p">,</span>
862 | <span class="n">primary_or_secondary_resources</span><span class="o">.</span><span class="n">total</span><span class="p">,</span>
863 | <span class="s2">"Sum of exclusive sets plus intersection should equal union"</span>
864 | <span class="p">)</span>
865 |
866 | <span class="c1"># complex boolean with field constraints</span>
867 | <span class="n">primary_and_html_resources</span> <span class="o">=</span> <span class="n">crawler</span><span class="o">.</span><span class="n">get_resources_api</span><span class="p">(</span>
868 | <span class="n">sites</span><span class="o">=</span><span class="p">[</span><span class="n">site_id</span><span class="p">],</span>
869 | <span class="n">query</span><span class="o">=</span><span class="sa">f</span><span class="s2">"type: html AND (</span><span class="si">{</span><span class="bp">self</span><span class="o">.</span><span class="n">__PRAGMAR_PRIMARY_KEYWORD</span><span class="si">}</span><span class="s2">)"</span><span class="p">,</span>
870 | <span class="n">limit</span><span class="o">=</span><span class="mi">1</span><span class="p">,</span>
871 | <span class="p">)</span>
872 | <span class="bp">self</span><span class="o">.</span><span class="n">assertTrue</span><span class="p">(</span><span class="n">primary_and_html_resources</span><span class="o">.</span><span class="n">total</span> <span class="o">></span> <span class="mi">0</span><span class="p">,</span> <span class="sa">f</span><span class="s2">"Primary AND type:html should return results"</span><span class="p">)</span>
873 | <span class="bp">self</span><span class="o">.</span><span class="n">assertTrue</span><span class="p">(</span>
874 | <span class="n">primary_and_html_resources</span><span class="o">.</span><span class="n">total</span> <span class="o"><=</span> <span class="n">boolean_primary_resources</span> <span class="o">.</span><span class="n">total</span><span class="p">,</span>
875 | <span class="s2">"Adding AND constraints should not increase result count"</span>
876 | <span class="p">)</span>
877 |
878 | <span class="c1"># Parentheses grouping</span>
879 | <span class="n">grouped_resources</span> <span class="o">=</span> <span class="n">crawler</span><span class="o">.</span><span class="n">get_resources_api</span><span class="p">(</span>
880 | <span class="n">sites</span><span class="o">=</span><span class="p">[</span><span class="n">site_id</span><span class="p">],</span>
881 | <span class="n">query</span><span class="o">=</span><span class="sa">f</span><span class="s2">"type: html AND (</span><span class="si">{</span><span class="bp">self</span><span class="o">.</span><span class="n">__PRAGMAR_PRIMARY_KEYWORD</span><span class="si">}</span><span class="s2"> OR </span><span class="si">{</span><span class="bp">self</span><span class="o">.</span><span class="n">__PRAGMAR_SECONDARY_KEYWORD</span><span class="si">}</span><span class="s2">)"</span><span class="p">,</span>
882 | <span class="n">limit</span><span class="o">=</span><span class="mi">1</span><span class="p">,</span>
883 | <span class="p">)</span>
884 | <span class="bp">self</span><span class="o">.</span><span class="n">assertTrue</span><span class="p">(</span><span class="n">grouped_resources</span><span class="o">.</span><span class="n">total</span> <span class="o">></span> <span class="mi">0</span><span class="p">,</span> <span class="sa">f</span><span class="s2">"Grouped OR with HTML filter should return results"</span><span class="p">)</span>
885 |
886 |
887 | <span class="n">hyphenated_resources</span> <span class="o">=</span> <span class="n">crawler</span><span class="o">.</span><span class="n">get_resources_api</span><span class="p">(</span>
888 | <span class="n">sites</span><span class="o">=</span><span class="p">[</span><span class="n">site_id</span><span class="p">],</span>
889 | <span class="n">query</span><span class="o">=</span><span class="bp">self</span><span class="o">.</span><span class="n">__PRAGMAR_HYPHENATED_KEYWORD</span><span class="p">,</span>
890 | <span class="n">limit</span><span class="o">=</span><span class="mi">1</span><span class="p">,</span>
891 | <span class="p">)</span>
892 | <span class="bp">self</span><span class="o">.</span><span class="n">assertTrue</span><span class="p">(</span><span class="n">hyphenated_resources</span><span class="o">.</span><span class="n">total</span> <span class="o">></span> <span class="mi">0</span><span class="p">,</span> <span class="sa">f</span><span class="s2">"Keyword '</span><span class="si">{</span><span class="bp">self</span><span class="o">.</span><span class="n">__PRAGMAR_HYPHENATED_KEYWORD</span><span class="si">}</span><span class="s2">' should return results"</span><span class="p">)</span>
893 |
894 | <span class="n">double_or_resources</span> <span class="o">=</span> <span class="n">crawler</span><span class="o">.</span><span class="n">get_resources_api</span><span class="p">(</span>
895 | <span class="n">sites</span><span class="o">=</span><span class="p">[</span><span class="n">site_id</span><span class="p">],</span>
896 | <span class="n">query</span><span class="o">=</span><span class="sa">f</span><span class="s2">"(</span><span class="si">{</span><span class="bp">self</span><span class="o">.</span><span class="n">__PRAGMAR_PRIMARY_KEYWORD</span><span class="si">}</span><span class="s2"> OR </span><span class="si">{</span><span class="bp">self</span><span class="o">.</span><span class="n">__PRAGMAR_SECONDARY_KEYWORD</span><span class="si">}</span><span class="s2"> OR moffitor)"</span>
897 | <span class="p">)</span>
898 | <span class="bp">self</span><span class="o">.</span><span class="n">assertGreater</span><span class="p">(</span>
899 | <span class="n">double_or_resources</span><span class="o">.</span><span class="n">total</span><span class="p">,</span> <span class="mi">0</span><span class="p">,</span>
900 | <span class="sa">f</span><span class="s2">"OR query should return some results"</span>
901 | <span class="p">)</span>
902 | <span class="bp">self</span><span class="o">.</span><span class="n">assertLessEqual</span><span class="p">(</span>
903 | <span class="n">double_or_resources</span><span class="o">.</span><span class="n">total</span><span class="p">,</span> <span class="n">site_resources</span><span class="o">.</span><span class="n">total</span><span class="p">,</span>
904 | <span class="sa">f</span><span class="s2">"OR query should be less than, or equal to all results"</span>
905 | <span class="p">)</span>
906 | <span class="n">parens_or_and_resources</span> <span class="o">=</span> <span class="n">crawler</span><span class="o">.</span><span class="n">get_resources_api</span><span class="p">(</span>
907 | <span class="n">sites</span><span class="o">=</span><span class="p">[</span><span class="n">site_id</span><span class="p">],</span>
908 | <span class="n">query</span><span class="o">=</span><span class="sa">f</span><span class="s2">"(</span><span class="si">{</span><span class="bp">self</span><span class="o">.</span><span class="n">__PRAGMAR_PRIMARY_KEYWORD</span><span class="si">}</span><span class="s2"> OR </span><span class="si">{</span><span class="bp">self</span><span class="o">.</span><span class="n">__PRAGMAR_SECONDARY_KEYWORD</span><span class="si">}</span><span class="s2">) AND collaborations "</span>
909 | <span class="p">)</span>
910 | <span class="c1"># respect the AND, there should be only one result</span>
911 | <span class="c1"># (A OR B) AND C vs. A OR B AND C</span>
912 | <span class="bp">self</span><span class="o">.</span><span class="n">assertEqual</span><span class="p">(</span>
913 | <span class="n">parens_or_and_resources</span><span class="o">.</span><span class="n">total</span><span class="p">,</span> <span class="mi">1</span><span class="p">,</span>
914 | <span class="sa">f</span><span class="s2">"(A OR B) AND C should be 1 result (AND collaborations, unless fixture changed)"</span>
915 | <span class="p">)</span>
916 |
917 | <span class="n">parens_or_and_resources_reverse</span> <span class="o">=</span> <span class="n">crawler</span><span class="o">.</span><span class="n">get_resources_api</span><span class="p">(</span>
918 | <span class="n">sites</span><span class="o">=</span><span class="p">[</span><span class="n">site_id</span><span class="p">],</span>
919 | <span class="n">query</span><span class="o">=</span><span class="sa">f</span><span class="s2">"collaborations AND (</span><span class="si">{</span><span class="bp">self</span><span class="o">.</span><span class="n">__PRAGMAR_PRIMARY_KEYWORD</span><span class="si">}</span><span class="s2"> OR </span><span class="si">{</span><span class="bp">self</span><span class="o">.</span><span class="n">__PRAGMAR_SECONDARY_KEYWORD</span><span class="si">}</span><span class="s2">) "</span>
920 | <span class="p">)</span>
921 | <span class="c1"># respect the AND, there should be only one result</span>
922 | <span class="c1"># (A OR B) AND C vs. A OR B AND C</span>
923 | <span class="bp">self</span><span class="o">.</span><span class="n">assertEqual</span><span class="p">(</span>
924 | <span class="n">parens_or_and_resources_reverse</span><span class="o">.</span><span class="n">total</span><span class="p">,</span> <span class="mi">1</span><span class="p">,</span>
925 | <span class="sa">f</span><span class="s2">"A AND (B OR C) should be 1 result (collaborations AND, unless fixture changed)"</span>
926 | <span class="p">)</span>
927 |
928 | <span class="n">wide_type_resources</span> <span class="o">=</span> <span class="n">crawler</span><span class="o">.</span><span class="n">get_resources_api</span><span class="p">(</span>
929 | <span class="n">sites</span><span class="o">=</span><span class="p">[</span><span class="n">site_id</span><span class="p">],</span>
930 | <span class="n">query</span><span class="o">=</span><span class="sa">f</span><span class="s2">"type: script OR type: style OR type: iframe OR type: font OR type: text OR type: rss OR type: other"</span>
931 | <span class="p">)</span>
932 |
933 | <span class="bp">self</span><span class="o">.</span><span class="n">assertLess</span><span class="p">(</span>
934 | <span class="n">wide_type_resources</span><span class="o">.</span><span class="n">total</span><span class="p">,</span> <span class="n">site_resources</span><span class="o">.</span><span class="n">total</span><span class="p">,</span>
935 | <span class="sa">f</span><span class="s2">"A long chained OR should not return all results"</span>
936 | <span class="p">)</span>
937 | <span class="bp">self</span><span class="o">.</span><span class="n">assertGreater</span><span class="p">(</span>
938 | <span class="n">wide_type_resources</span><span class="o">.</span><span class="n">total</span><span class="p">,</span> <span class="mi">0</span><span class="p">,</span>
939 | <span class="sa">f</span><span class="s2">"A long chained OR should return some results"</span>
940 | <span class="p">)</span>
941 |
942 | <span class="n">complex_and</span> <span class="o">=</span> <span class="n">crawler</span><span class="o">.</span><span class="n">get_resources_api</span><span class="p">(</span>
943 | <span class="n">sites</span><span class="o">=</span><span class="p">[</span><span class="n">site_id</span><span class="p">],</span>
944 | <span class="n">query</span><span class="o">=</span><span class="sa">f</span><span class="s2">"</span><span class="si">{</span><span class="bp">self</span><span class="o">.</span><span class="n">__PRAGMAR_PRIMARY_KEYWORD</span><span class="si">}</span><span class="s2"> AND type:html AND status:200"</span>
945 | <span class="p">)</span>
946 |
947 | <span class="bp">self</span><span class="o">.</span><span class="n">assertTrue</span><span class="p">(</span><span class="n">complex_and</span><span class="o">.</span><span class="n">total</span> <span class="o"><=</span> <span class="n">boolean_primary_resources</span> <span class="o">.</span><span class="n">total</span><span class="p">,</span>
948 | <span class="s2">"Adding AND conditions should not increase results"</span><span class="p">)</span>
949 |
950 | <span class="n">grouped_or</span> <span class="o">=</span> <span class="n">crawler</span><span class="o">.</span><span class="n">get_resources_api</span><span class="p">(</span>
951 | <span class="n">sites</span><span class="o">=</span><span class="p">[</span><span class="n">site_id</span><span class="p">],</span>
952 | <span class="n">query</span><span class="o">=</span><span class="sa">f</span><span class="s2">"(</span><span class="si">{</span><span class="bp">self</span><span class="o">.</span><span class="n">__PRAGMAR_PRIMARY_KEYWORD</span><span class="si">}</span><span class="s2"> OR </span><span class="si">{</span><span class="bp">self</span><span class="o">.</span><span class="n">__PRAGMAR_SECONDARY_KEYWORD</span><span class="si">}</span><span class="s2">) AND type:html AND status:200"</span>
953 | <span class="p">)</span>
954 |
955 | <span class="bp">self</span><span class="o">.</span><span class="n">assertTrue</span><span class="p">(</span><span class="n">grouped_or</span><span class="o">.</span><span class="n">total</span> <span class="o"><=</span> <span class="n">primary_or_secondary_resources</span><span class="o">.</span><span class="n">total</span><span class="p">,</span>
956 | <span class="s2">"Adding AND conditions to OR should not increase results"</span><span class="p">)</span>
957 |
958 | <span class="c1"># URL OR parsing, url is a special case, an fts5 field searched with SQL LIKE</span>
959 | <span class="n">url_or_simple</span> <span class="o">=</span> <span class="n">crawler</span><span class="o">.</span><span class="n">get_resources_api</span><span class="p">(</span>
960 | <span class="n">sites</span><span class="o">=</span><span class="p">[</span><span class="n">site_id</span><span class="p">],</span> <span class="n">query</span><span class="o">=</span><span class="s2">"url: pragmar.com OR url: example.com"</span><span class="p">,</span> <span class="n">limit</span><span class="o">=</span><span class="mi">1</span><span class="p">)</span>
961 | <span class="n">url_or_with_type</span> <span class="o">=</span> <span class="n">crawler</span><span class="o">.</span><span class="n">get_resources_api</span><span class="p">(</span>
962 | <span class="n">sites</span><span class="o">=</span><span class="p">[</span><span class="n">site_id</span><span class="p">],</span> <span class="n">query</span><span class="o">=</span><span class="s2">"type: html AND (url: pragmar.com OR url: example.com)"</span><span class="p">,</span> <span class="n">limit</span><span class="o">=</span><span class="mi">1</span><span class="p">)</span>
963 | <span class="n">html_total</span> <span class="o">=</span> <span class="n">crawler</span><span class="o">.</span><span class="n">get_resources_api</span><span class="p">(</span>
964 | <span class="n">sites</span><span class="o">=</span><span class="p">[</span><span class="n">site_id</span><span class="p">],</span> <span class="n">query</span><span class="o">=</span><span class="s2">"type: html"</span><span class="p">,</span> <span class="n">limit</span><span class="o">=</span><span class="mi">1</span><span class="p">)</span>
965 | <span class="bp">self</span><span class="o">.</span><span class="n">assertTrue</span><span class="p">(</span><span class="n">url_or_with_type</span><span class="o">.</span><span class="n">total</span> <span class="o"><=</span> <span class="n">url_or_simple</span><span class="o">.</span><span class="n">total</span><span class="p">,</span>
966 | <span class="sa">f</span><span class="s2">"AND constraint should not increase results"</span><span class="p">)</span>
967 | <span class="bp">self</span><span class="o">.</span><span class="n">assertTrue</span><span class="p">(</span><span class="n">url_or_with_type</span><span class="o">.</span><span class="n">total</span> <span class="o"><=</span> <span class="n">html_total</span><span class="o">.</span><span class="n">total</span><span class="p">,</span>
968 | <span class="sa">f</span><span class="s2">"URL filter should not exceed HTML total"</span><span class="p">)</span>
969 |
970 | <span class="k">def</span> <span class="nf">__run_pragmar_search_tests_extras</span><span class="p">(</span>
971 | <span class="bp">self</span><span class="p">,</span>
972 | <span class="n">crawler</span><span class="p">:</span> <span class="n">BaseCrawler</span><span class="p">,</span>
973 | <span class="n">site_id</span><span class="p">:</span> <span class="nb">int</span><span class="p">,</span>
974 | <span class="n">site_resources</span><span class="p">:</span><span class="n">BaseJsonApi</span><span class="p">,</span>
975 | <span class="n">primary_resources</span><span class="p">:</span><span class="n">BaseJsonApi</span><span class="p">,</span>
976 | <span class="n">secondary_resources</span><span class="p">:</span><span class="n">BaseJsonApi</span><span class="p">,</span>
977 | <span class="p">)</span> <span class="o">-></span> <span class="kc">None</span><span class="p">:</span>
978 |
979 | <span class="n">snippet_resources</span> <span class="o">=</span> <span class="n">crawler</span><span class="o">.</span><span class="n">get_resources_api</span><span class="p">(</span>
980 | <span class="n">sites</span><span class="o">=</span><span class="p">[</span><span class="n">site_id</span><span class="p">],</span>
981 | <span class="n">query</span><span class="o">=</span><span class="sa">f</span><span class="s2">"</span><span class="si">{</span><span class="bp">self</span><span class="o">.</span><span class="n">__PRAGMAR_PRIMARY_KEYWORD</span><span class="si">}</span><span class="s2"> AND type: html"</span><span class="p">,</span>
982 | <span class="n">extras</span><span class="o">=</span><span class="p">[</span><span class="s2">"snippets"</span><span class="p">],</span>
983 | <span class="n">limit</span><span class="o">=</span><span class="mi">1</span><span class="p">,</span>
984 | <span class="p">)</span>
985 | <span class="bp">self</span><span class="o">.</span><span class="n">assertIn</span><span class="p">(</span><span class="s2">"snippets"</span><span class="p">,</span> <span class="n">snippet_resources</span><span class="o">.</span><span class="n">_results</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span><span class="o">.</span><span class="n">to_dict</span><span class="p">()[</span><span class="s2">"extras"</span><span class="p">],</span>
986 | <span class="s2">"First result should have snippets in extras"</span><span class="p">)</span>
987 |
988 | <span class="n">xpath_count_resources</span> <span class="o">=</span> <span class="n">crawler</span><span class="o">.</span><span class="n">get_resources_api</span><span class="p">(</span>
989 | <span class="n">sites</span><span class="o">=</span><span class="p">[</span><span class="n">site_id</span><span class="p">],</span>
990 | <span class="n">query</span><span class="o">=</span><span class="bp">self</span><span class="o">.</span><span class="n">__PRAGMAR_PRIMARY_KEYWORD</span><span class="p">,</span>
991 | <span class="n">extras</span><span class="o">=</span><span class="p">[</span><span class="s2">"markdown"</span><span class="p">],</span>
992 | <span class="n">limit</span><span class="o">=</span><span class="mi">1</span><span class="p">,</span>
993 | <span class="p">)</span>
994 | <span class="bp">self</span><span class="o">.</span><span class="n">assertIn</span><span class="p">(</span><span class="s2">"markdown"</span><span class="p">,</span> <span class="n">xpath_count_resources</span><span class="o">.</span><span class="n">_results</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span><span class="o">.</span><span class="n">to_dict</span><span class="p">()[</span><span class="s2">"extras"</span><span class="p">],</span>
995 | <span class="s2">"First result should have markdown in extras"</span><span class="p">)</span>
996 |
997 | <span class="n">xpath_count_resources</span> <span class="o">=</span> <span class="n">crawler</span><span class="o">.</span><span class="n">get_resources_api</span><span class="p">(</span>
998 | <span class="n">sites</span><span class="o">=</span><span class="p">[</span><span class="n">site_id</span><span class="p">],</span>
999 | <span class="n">query</span><span class="o">=</span><span class="s2">"url: pragmar.com AND status: 200"</span><span class="p">,</span>
1000 | <span class="n">extras</span><span class="o">=</span><span class="p">[</span><span class="s2">"xpath"</span><span class="p">],</span>
1001 | <span class="n">extrasXpath</span><span class="o">=</span><span class="p">[</span><span class="s2">"count(//h1)"</span><span class="p">],</span>
1002 | <span class="n">limit</span><span class="o">=</span><span class="mi">1</span><span class="p">,</span>
1003 | <span class="n">sort</span><span class="o">=</span><span class="s2">"-url"</span>
1004 | <span class="p">)</span>
1005 | <span class="bp">self</span><span class="o">.</span><span class="n">assertIn</span><span class="p">(</span><span class="s2">"xpath"</span><span class="p">,</span> <span class="n">xpath_count_resources</span><span class="o">.</span><span class="n">_results</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span><span class="o">.</span><span class="n">to_dict</span><span class="p">()[</span><span class="s2">"extras"</span><span class="p">],</span>
1006 | <span class="s2">"First result should have xpath in extras"</span><span class="p">)</span>
1007 | <span class="bp">self</span><span class="o">.</span><span class="n">assertEqual</span><span class="p">(</span><span class="nb">len</span><span class="p">(</span><span class="n">xpath_count_resources</span><span class="o">.</span><span class="n">_results</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span><span class="o">.</span><span class="n">to_dict</span><span class="p">()[</span><span class="s2">"extras"</span><span class="p">][</span><span class="s2">"xpath"</span><span class="p">]),</span>
1008 | <span class="mi">1</span><span class="p">,</span> <span class="s2">"Should be exactly one H1 hit in xpath extras"</span><span class="p">)</span>
1009 |
1010 | <span class="c1"># this test inadvertently also covers t_URL_FIELD parser testing</span>
1011 | <span class="n">xpath_h1_text_resources</span> <span class="o">=</span> <span class="n">crawler</span><span class="o">.</span><span class="n">get_resources_api</span><span class="p">(</span>
1012 | <span class="n">sites</span><span class="o">=</span><span class="p">[</span><span class="n">site_id</span><span class="p">],</span>
1013 | <span class="n">query</span><span class="o">=</span><span class="s2">"url: https://pragmar.com AND status: 200"</span><span class="p">,</span>
1014 | <span class="n">extras</span><span class="o">=</span><span class="p">[</span><span class="s2">"xpath"</span><span class="p">],</span>
1015 | <span class="n">extrasXpath</span><span class="o">=</span><span class="p">[</span><span class="s2">"//h1/text()"</span><span class="p">],</span>
1016 | <span class="n">limit</span><span class="o">=</span><span class="mi">1</span><span class="p">,</span>
1017 | <span class="n">sort</span><span class="o">=</span><span class="s2">"+url"</span>
1018 | <span class="p">)</span>
1019 | <span class="bp">self</span><span class="o">.</span><span class="n">assertIn</span><span class="p">(</span><span class="s2">"xpath"</span><span class="p">,</span> <span class="n">xpath_h1_text_resources</span><span class="o">.</span><span class="n">_results</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span><span class="o">.</span><span class="n">to_dict</span><span class="p">()[</span><span class="s2">"extras"</span><span class="p">],</span>
1020 | <span class="s2">"First result should have xpath in extras"</span><span class="p">)</span>
1021 | <span class="bp">self</span><span class="o">.</span><span class="n">assertTrue</span><span class="p">(</span> <span class="n">xpath_h1_text_resources</span><span class="o">.</span><span class="n">_results</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span><span class="o">.</span><span class="n">to_dict</span><span class="p">()[</span><span class="s2">"extras"</span><span class="p">]</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span><span class="p">,</span>
1022 | <span class="s2">"Should have pragmar in fixture h1"</span><span class="p">)</span>
1023 |
1024 | <span class="c1"># should be pragmar homepage, assert "pragmar" in h1</span>
1025 | <span class="n">first_xpath_result</span> <span class="o">=</span> <span class="n">xpath_h1_text_resources</span><span class="o">.</span><span class="n">_results</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span><span class="o">.</span><span class="n">to_dict</span><span class="p">()[</span><span class="s2">"extras"</span><span class="p">][</span><span class="s2">"xpath"</span><span class="p">][</span><span class="mi">0</span><span class="p">][</span><span class="s2">"value"</span><span class="p">]</span><span class="o">.</span><span class="n">lower</span><span class="p">()</span>
1026 | <span class="bp">self</span><span class="o">.</span><span class="n">assertTrue</span><span class="p">(</span><span class="s2">"pragmar"</span> <span class="ow">in</span> <span class="n">first_xpath_result</span><span class="p">,</span>
1027 | <span class="sa">f</span><span class="s2">"Should have pragmar in fixture homepage h1 (</span><span class="si">{</span><span class="n">first_xpath_result</span><span class="si">}</span><span class="s2">)"</span><span class="p">)</span>
1028 |
1029 | <span class="n">combined_resources</span> <span class="o">=</span> <span class="n">crawler</span><span class="o">.</span><span class="n">get_resources_api</span><span class="p">(</span>
1030 | <span class="n">sites</span><span class="o">=</span><span class="p">[</span><span class="n">site_id</span><span class="p">],</span>
1031 | <span class="n">query</span><span class="o">=</span><span class="bp">self</span><span class="o">.</span><span class="n">__PRAGMAR_PRIMARY_KEYWORD</span><span class="p">,</span>
1032 | <span class="n">extras</span><span class="o">=</span><span class="p">[</span><span class="s2">"snippets"</span><span class="p">,</span> <span class="s2">"markdown"</span><span class="p">],</span>
1033 | <span class="n">limit</span><span class="o">=</span><span class="mi">1</span><span class="p">,</span>
1034 | <span class="p">)</span>
1035 | <span class="n">first_result</span> <span class="o">=</span> <span class="n">combined_resources</span><span class="o">.</span><span class="n">_results</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span><span class="o">.</span><span class="n">to_dict</span><span class="p">()</span>
1036 | <span class="bp">self</span><span class="o">.</span><span class="n">assertIn</span><span class="p">(</span><span class="s2">"extras"</span><span class="p">,</span> <span class="n">first_result</span><span class="p">,</span> <span class="s2">"First result should have extras field"</span><span class="p">)</span>
1037 | <span class="bp">self</span><span class="o">.</span><span class="n">assertIn</span><span class="p">(</span><span class="s2">"snippets"</span><span class="p">,</span> <span class="n">first_result</span><span class="p">[</span><span class="s2">"extras"</span><span class="p">],</span> <span class="s2">"First result should have snippets in extras"</span><span class="p">)</span>
1038 | <span class="bp">self</span><span class="o">.</span><span class="n">assertIn</span><span class="p">(</span><span class="s2">"markdown"</span><span class="p">,</span> <span class="n">first_result</span><span class="p">[</span><span class="s2">"extras"</span><span class="p">],</span> <span class="s2">"First result should have markdown in extras"</span><span class="p">)</span>
1039 | <span class="bp">self</span><span class="o">.</span><span class="n">assertTrue</span><span class="p">(</span><span class="n">primary_resources</span><span class="o">.</span><span class="n">total</span> <span class="o">&lt;=</span> <span class="n">site_resources</span><span class="o">.</span><span class="n">total</span><span class="p">,</span>
1040 | <span class="s2">"Search should return less than or equivalent results to site total"</span><span class="p">)</span>
1041 | <span class="bp">self</span><span class="o">.</span><span class="n">assertTrue</span><span class="p">(</span><span class="n">secondary_resources</span><span class="o">.</span><span class="n">total</span> <span class="o">&lt;=</span> <span class="n">site_resources</span><span class="o">.</span><span class="n">total</span><span class="p">,</span>
1042 | <span class="s2">"Search should return less than or equivalent results to site total"</span><span class="p">)</span></div>
1043 |
1044 | </pre></div>
1045 |
1046 | </div>
1047 | </div>
1048 | <footer>
1049 |
1050 | <hr/>
1051 |
1052 | <div role="contentinfo">
1053 | <p>© Copyright 2025, pragmar.</p>
1054 | </div>
1055 |
1056 | Built with <a href="https://www.sphinx-doc.org/">Sphinx</a> using a
1057 | <a href="https://github.com/readthedocs/sphinx_rtd_theme">theme</a>
1058 | provided by <a href="https://readthedocs.org">Read the Docs</a>.
1059 |
1060 |
1061 | </footer>
1062 | </div>
1063 | </div>
1064 | </section>
1065 | </div>
1066 | <script>
1067 | jQuery(function () {
1068 | SphinxRtdTheme.Navigation.enable(true);
1069 | });
1070 | </script>
1071 |
1072 | </body>
1073 | </html>
```