genomoncology/biomcp # codebase.md

This is page 14 of 19. Use http://codebase.md/genomoncology/biomcp?lines=true&page={x} to view the full context.

# Directory Structure

```
├── .github
│   ├── actions
│   │   └── setup-python-env
│   │       └── action.yml
│   ├── dependabot.yml
│   └── workflows
│       ├── ci.yml
│       ├── deploy-docs.yml
│       ├── main.yml.disabled
│       ├── on-release-main.yml
│       └── validate-codecov-config.yml
├── .gitignore
├── .pre-commit-config.yaml
├── BIOMCP_DATA_FLOW.md
├── CHANGELOG.md
├── CNAME
├── codecov.yaml
├── docker-compose.yml
├── Dockerfile
├── docs
│   ├── apis
│   │   ├── error-codes.md
│   │   ├── overview.md
│   │   └── python-sdk.md
│   ├── assets
│   │   ├── biomcp-cursor-locations.png
│   │   ├── favicon.ico
│   │   ├── icon.png
│   │   ├── logo.png
│   │   ├── mcp_architecture.txt
│   │   └── remote-connection
│   │       ├── 00_connectors.png
│   │       ├── 01_add_custom_connector.png
│   │       ├── 02_connector_enabled.png
│   │       ├── 03_connect_to_biomcp.png
│   │       ├── 04_select_google_oauth.png
│   │       └── 05_success_connect.png
│   ├── backend-services-reference
│   │   ├── 01-overview.md
│   │   ├── 02-biothings-suite.md
│   │   ├── 03-cbioportal.md
│   │   ├── 04-clinicaltrials-gov.md
│   │   ├── 05-nci-cts-api.md
│   │   ├── 06-pubtator3.md
│   │   └── 07-alphagenome.md
│   ├── blog
│   │   ├── ai-assisted-clinical-trial-search-analysis.md
│   │   ├── images
│   │   │   ├── deep-researcher-video.png
│   │   │   ├── researcher-announce.png
│   │   │   ├── researcher-drop-down.png
│   │   │   ├── researcher-prompt.png
│   │   │   ├── trial-search-assistant.png
│   │   │   └── what_is_biomcp_thumbnail.png
│   │   └── researcher-persona-resource.md
│   ├── changelog.md
│   ├── CNAME
│   ├── concepts
│   │   ├── 01-what-is-biomcp.md
│   │   ├── 02-the-deep-researcher-persona.md
│   │   └── 03-sequential-thinking-with-the-think-tool.md
│   ├── developer-guides
│   │   ├── 01-server-deployment.md
│   │   ├── 02-contributing-and-testing.md
│   │   ├── 03-third-party-endpoints.md
│   │   ├── 04-transport-protocol.md
│   │   ├── 05-error-handling.md
│   │   ├── 06-http-client-and-caching.md
│   │   ├── 07-performance-optimizations.md
│   │   └── generate_endpoints.py
│   ├── faq-condensed.md
│   ├── FDA_SECURITY.md
│   ├── genomoncology.md
│   ├── getting-started
│   │   ├── 01-quickstart-cli.md
│   │   ├── 02-claude-desktop-integration.md
│   │   └── 03-authentication-and-api-keys.md
│   ├── how-to-guides
│   │   ├── 01-find-articles-and-cbioportal-data.md
│   │   ├── 02-find-trials-with-nci-and-biothings.md
│   │   ├── 03-get-comprehensive-variant-annotations.md
│   │   ├── 04-predict-variant-effects-with-alphagenome.md
│   │   ├── 05-logging-and-monitoring-with-bigquery.md
│   │   └── 06-search-nci-organizations-and-interventions.md
│   ├── index.md
│   ├── policies.md
│   ├── reference
│   │   ├── architecture-diagrams.md
│   │   ├── quick-architecture.md
│   │   ├── quick-reference.md
│   │   └── visual-architecture.md
│   ├── robots.txt
│   ├── stylesheets
│   │   ├── announcement.css
│   │   └── extra.css
│   ├── troubleshooting.md
│   ├── tutorials
│   │   ├── biothings-prompts.md
│   │   ├── claude-code-biomcp-alphagenome.md
│   │   ├── nci-prompts.md
│   │   ├── openfda-integration.md
│   │   ├── openfda-prompts.md
│   │   ├── pydantic-ai-integration.md
│   │   └── remote-connection.md
│   ├── user-guides
│   │   ├── 01-command-line-interface.md
│   │   ├── 02-mcp-tools-reference.md
│   │   └── 03-integrating-with-ides-and-clients.md
│   └── workflows
│       └── all-workflows.md
├── example_scripts
│   ├── mcp_integration.py
│   └── python_sdk.py
├── glama.json
├── LICENSE
├── lzyank.toml
├── Makefile
├── mkdocs.yml
├── package-lock.json
├── package.json
├── pyproject.toml
├── README.md
├── scripts
│   ├── check_docs_in_mkdocs.py
│   ├── check_http_imports.py
│   └── generate_endpoints_doc.py
├── smithery.yaml
├── src
│   └── biomcp
│       ├── __init__.py
│       ├── __main__.py
│       ├── articles
│       │   ├── __init__.py
│       │   ├── autocomplete.py
│       │   ├── fetch.py
│       │   ├── preprints.py
│       │   ├── search_optimized.py
│       │   ├── search.py
│       │   └── unified.py
│       ├── biomarkers
│       │   ├── __init__.py
│       │   └── search.py
│       ├── cbioportal_helper.py
│       ├── circuit_breaker.py
│       ├── cli
│       │   ├── __init__.py
│       │   ├── articles.py
│       │   ├── biomarkers.py
│       │   ├── diseases.py
│       │   ├── health.py
│       │   ├── interventions.py
│       │   ├── main.py
│       │   ├── openfda.py
│       │   ├── organizations.py
│       │   ├── server.py
│       │   ├── trials.py
│       │   └── variants.py
│       ├── connection_pool.py
│       ├── constants.py
│       ├── core.py
│       ├── diseases
│       │   ├── __init__.py
│       │   ├── getter.py
│       │   └── search.py
│       ├── domain_handlers.py
│       ├── drugs
│       │   ├── __init__.py
│       │   └── getter.py
│       ├── exceptions.py
│       ├── genes
│       │   ├── __init__.py
│       │   └── getter.py
│       ├── http_client_simple.py
│       ├── http_client.py
│       ├── individual_tools.py
│       ├── integrations
│       │   ├── __init__.py
│       │   ├── biothings_client.py
│       │   └── cts_api.py
│       ├── interventions
│       │   ├── __init__.py
│       │   ├── getter.py
│       │   └── search.py
│       ├── logging_filter.py
│       ├── metrics_handler.py
│       ├── metrics.py
│       ├── openfda
│       │   ├── __init__.py
│       │   ├── adverse_events_helpers.py
│       │   ├── adverse_events.py
│       │   ├── cache.py
│       │   ├── constants.py
│       │   ├── device_events_helpers.py
│       │   ├── device_events.py
│       │   ├── drug_approvals.py
│       │   ├── drug_labels_helpers.py
│       │   ├── drug_labels.py
│       │   ├── drug_recalls_helpers.py
│       │   ├── drug_recalls.py
│       │   ├── drug_shortages_detail_helpers.py
│       │   ├── drug_shortages_helpers.py
│       │   ├── drug_shortages.py
│       │   ├── exceptions.py
│       │   ├── input_validation.py
│       │   ├── rate_limiter.py
│       │   ├── utils.py
│       │   └── validation.py
│       ├── organizations
│       │   ├── __init__.py
│       │   ├── getter.py
│       │   └── search.py
│       ├── parameter_parser.py
│       ├── prefetch.py
│       ├── query_parser.py
│       ├── query_router.py
│       ├── rate_limiter.py
│       ├── render.py
│       ├── request_batcher.py
│       ├── resources
│       │   ├── __init__.py
│       │   ├── getter.py
│       │   ├── instructions.md
│       │   └── researcher.md
│       ├── retry.py
│       ├── router_handlers.py
│       ├── router.py
│       ├── shared_context.py
│       ├── thinking
│       │   ├── __init__.py
│       │   ├── sequential.py
│       │   └── session.py
│       ├── thinking_tool.py
│       ├── thinking_tracker.py
│       ├── trials
│       │   ├── __init__.py
│       │   ├── getter.py
│       │   ├── nci_getter.py
│       │   ├── nci_search.py
│       │   └── search.py
│       ├── utils
│       │   ├── __init__.py
│       │   ├── cancer_types_api.py
│       │   ├── cbio_http_adapter.py
│       │   ├── endpoint_registry.py
│       │   ├── gene_validator.py
│       │   ├── metrics.py
│       │   ├── mutation_filter.py
│       │   ├── query_utils.py
│       │   ├── rate_limiter.py
│       │   └── request_cache.py
│       ├── variants
│       │   ├── __init__.py
│       │   ├── alphagenome.py
│       │   ├── cancer_types.py
│       │   ├── cbio_external_client.py
│       │   ├── cbioportal_mutations.py
│       │   ├── cbioportal_search_helpers.py
│       │   ├── cbioportal_search.py
│       │   ├── constants.py
│       │   ├── external.py
│       │   ├── filters.py
│       │   ├── getter.py
│       │   ├── links.py
│       │   └── search.py
│       └── workers
│           ├── __init__.py
│           ├── worker_entry_stytch.js
│           ├── worker_entry.js
│           └── worker.py
├── tests
│   ├── bdd
│   │   ├── cli_help
│   │   │   ├── help.feature
│   │   │   └── test_help.py
│   │   ├── conftest.py
│   │   ├── features
│   │   │   └── alphagenome_integration.feature
│   │   ├── fetch_articles
│   │   │   ├── fetch.feature
│   │   │   └── test_fetch.py
│   │   ├── get_trials
│   │   │   ├── get.feature
│   │   │   └── test_get.py
│   │   ├── get_variants
│   │   │   ├── get.feature
│   │   │   └── test_get.py
│   │   ├── search_articles
│   │   │   ├── autocomplete.feature
│   │   │   ├── search.feature
│   │   │   ├── test_autocomplete.py
│   │   │   └── test_search.py
│   │   ├── search_trials
│   │   │   ├── search.feature
│   │   │   └── test_search.py
│   │   ├── search_variants
│   │   │   ├── search.feature
│   │   │   └── test_search.py
│   │   └── steps
│   │       └── test_alphagenome_steps.py
│   ├── config
│   │   └── test_smithery_config.py
│   ├── conftest.py
│   ├── data
│   │   ├── ct_gov
│   │   │   ├── clinical_trials_api_v2.yaml
│   │   │   ├── trials_NCT04280705.json
│   │   │   └── trials_NCT04280705.txt
│   │   ├── myvariant
│   │   │   ├── myvariant_api.yaml
│   │   │   ├── myvariant_field_descriptions.csv
│   │   │   ├── variants_full_braf_v600e.json
│   │   │   ├── variants_full_braf_v600e.txt
│   │   │   └── variants_part_braf_v600_multiple.json
│   │   ├── openfda
│   │   │   ├── drugsfda_detail.json
│   │   │   ├── drugsfda_search.json
│   │   │   ├── enforcement_detail.json
│   │   │   └── enforcement_search.json
│   │   └── pubtator
│   │       ├── pubtator_autocomplete.json
│   │       └── pubtator3_paper.txt
│   ├── integration
│   │   ├── test_openfda_integration.py
│   │   ├── test_preprints_integration.py
│   │   ├── test_simple.py
│   │   └── test_variants_integration.py
│   ├── tdd
│   │   ├── articles
│   │   │   ├── test_autocomplete.py
│   │   │   ├── test_cbioportal_integration.py
│   │   │   ├── test_fetch.py
│   │   │   ├── test_preprints.py
│   │   │   ├── test_search.py
│   │   │   └── test_unified.py
│   │   ├── conftest.py
│   │   ├── drugs
│   │   │   ├── __init__.py
│   │   │   └── test_drug_getter.py
│   │   ├── openfda
│   │   │   ├── __init__.py
│   │   │   ├── test_adverse_events.py
│   │   │   ├── test_device_events.py
│   │   │   ├── test_drug_approvals.py
│   │   │   ├── test_drug_labels.py
│   │   │   ├── test_drug_recalls.py
│   │   │   ├── test_drug_shortages.py
│   │   │   └── test_security.py
│   │   ├── test_biothings_integration_real.py
│   │   ├── test_biothings_integration.py
│   │   ├── test_circuit_breaker.py
│   │   ├── test_concurrent_requests.py
│   │   ├── test_connection_pool.py
│   │   ├── test_domain_handlers.py
│   │   ├── test_drug_approvals.py
│   │   ├── test_drug_recalls.py
│   │   ├── test_drug_shortages.py
│   │   ├── test_endpoint_documentation.py
│   │   ├── test_error_scenarios.py
│   │   ├── test_europe_pmc_fetch.py
│   │   ├── test_mcp_integration.py
│   │   ├── test_mcp_tools.py
│   │   ├── test_metrics.py
│   │   ├── test_nci_integration.py
│   │   ├── test_nci_mcp_tools.py
│   │   ├── test_network_policies.py
│   │   ├── test_offline_mode.py
│   │   ├── test_openfda_unified.py
│   │   ├── test_pten_r173_search.py
│   │   ├── test_render.py
│   │   ├── test_request_batcher.py.disabled
│   │   ├── test_retry.py
│   │   ├── test_router.py
│   │   ├── test_shared_context.py.disabled
│   │   ├── test_unified_biothings.py
│   │   ├── thinking
│   │   │   ├── __init__.py
│   │   │   └── test_sequential.py
│   │   ├── trials
│   │   │   ├── test_backward_compatibility.py
│   │   │   ├── test_getter.py
│   │   │   └── test_search.py
│   │   ├── utils
│   │   │   ├── test_gene_validator.py
│   │   │   ├── test_mutation_filter.py
│   │   │   ├── test_rate_limiter.py
│   │   │   └── test_request_cache.py
│   │   ├── variants
│   │   │   ├── constants.py
│   │   │   ├── test_alphagenome_api_key.py
│   │   │   ├── test_alphagenome_comprehensive.py
│   │   │   ├── test_alphagenome.py
│   │   │   ├── test_cbioportal_mutations.py
│   │   │   ├── test_cbioportal_search.py
│   │   │   ├── test_external_integration.py
│   │   │   ├── test_external.py
│   │   │   ├── test_extract_gene_aa_change.py
│   │   │   ├── test_filters.py
│   │   │   ├── test_getter.py
│   │   │   ├── test_links.py
│   │   │   └── test_search.py
│   │   └── workers
│   │       └── test_worker_sanitization.js
│   └── test_pydantic_ai_integration.py
├── THIRD_PARTY_ENDPOINTS.md
├── tox.ini
├── uv.lock
└── wrangler.toml
```

# Files

--------------------------------------------------------------------------------
/src/biomcp/variants/external.py:
--------------------------------------------------------------------------------

```python
  1 | """External data sources for enhanced variant annotations."""
  2 | 
  3 | import asyncio
  4 | import json
  5 | import logging
  6 | import re
  7 | from typing import Any
  8 | from urllib.parse import quote
  9 | 
 10 | from pydantic import BaseModel, Field
 11 | 
 12 | from .. import http_client
 13 | 
 14 | # Import CBioPortalVariantData from the new module
 15 | from .cbio_external_client import CBioPortalVariantData
 16 | 
 17 | logger = logging.getLogger(__name__)
 18 | 
 19 | # TCGA/GDC API endpoints
 20 | GDC_BASE = "https://api.gdc.cancer.gov"
 21 | GDC_SSMS_ENDPOINT = f"{GDC_BASE}/ssms"  # Simple Somatic Mutations
 22 | 
 23 | # 1000 Genomes API endpoints
 24 | ENSEMBL_REST_BASE = "https://rest.ensembl.org"
 25 | ENSEMBL_VARIATION_ENDPOINT = f"{ENSEMBL_REST_BASE}/variation/human"
 26 | 
 27 | # Import constants
 28 | 
 29 | 
 30 | class TCGAVariantData(BaseModel):
 31 |     """TCGA/GDC variant annotation data."""
 32 | 
 33 |     cosmic_id: str | None = None
 34 |     tumor_types: list[str] = Field(default_factory=list)
 35 |     mutation_frequency: float | None = None
 36 |     mutation_count: int | None = None
 37 |     affected_cases: int | None = None
 38 |     consequence_type: str | None = None
 39 |     clinical_significance: str | None = None
 40 | 
 41 | 
 42 | class ThousandGenomesData(BaseModel):
 43 |     """1000 Genomes variant annotation data."""
 44 | 
 45 |     global_maf: float | None = Field(
 46 |         None, description="Global minor allele frequency"
 47 |     )
 48 |     afr_maf: float | None = Field(None, description="African population MAF")
 49 |     amr_maf: float | None = Field(None, description="American population MAF")
 50 |     eas_maf: float | None = Field(
 51 |         None, description="East Asian population MAF"
 52 |     )
 53 |     eur_maf: float | None = Field(None, description="European population MAF")
 54 |     sas_maf: float | None = Field(
 55 |         None, description="South Asian population MAF"
 56 |     )
 57 |     ancestral_allele: str | None = None
 58 |     most_severe_consequence: str | None = None
 59 | 
 60 | 
 61 | # CBioPortalVariantData is now imported from cbio_external_client.py
 62 | 
 63 | 
 64 | class EnhancedVariantAnnotation(BaseModel):
 65 |     """Enhanced variant annotation combining multiple sources."""
 66 | 
 67 |     variant_id: str
 68 |     tcga: TCGAVariantData | None = None
 69 |     thousand_genomes: ThousandGenomesData | None = None
 70 |     cbioportal: CBioPortalVariantData | None = None
 71 |     error_sources: list[str] = Field(default_factory=list)
 72 | 
 73 | 
 74 | class TCGAClient:
 75 |     """Client for TCGA/GDC API."""
 76 | 
 77 |     async def get_variant_data(
 78 |         self, variant_id: str
 79 |     ) -> TCGAVariantData | None:
 80 |         """Fetch variant data from TCGA/GDC.
 81 | 
 82 |         Args:
 83 |             variant_id: Can be gene AA change (e.g., "BRAF V600E") or genomic coordinates
 84 |         """
 85 |         try:
 86 |             # Determine the search field based on variant_id format
 87 |             # If it looks like "GENE AA_CHANGE" format, use gene_aa_change field
 88 |             if " " in variant_id and not variant_id.startswith("chr"):
 89 |                 search_field = "gene_aa_change"
 90 |                 search_value = variant_id
 91 |             else:
 92 |                 # Otherwise try genomic_dna_change
 93 |                 search_field = "genomic_dna_change"
 94 |                 search_value = variant_id
 95 | 
 96 |             # First, search for the variant
 97 |             params = {
 98 |                 "filters": json.dumps({
 99 |                     "op": "in",
100 |                     "content": {
101 |                         "field": search_field,
102 |                         "value": [search_value],
103 |                     },
104 |                 }),
105 |                 "fields": "cosmic_id,genomic_dna_change,gene_aa_change,ssm_id",
106 |                 "format": "json",
107 |                 "size": "5",  # Get a few in case of multiple matches
108 |             }
109 | 
110 |             response, error = await http_client.request_api(
111 |                 url=GDC_SSMS_ENDPOINT,
112 |                 method="GET",
113 |                 request=params,
114 |                 domain="gdc",
115 |             )
116 | 
117 |             if error or not response:
118 |                 return None
119 | 
120 |             data = response.get("data", {})
121 |             hits = data.get("hits", [])
122 | 
123 |             if not hits:
124 |                 return None
125 | 
126 |             # Get the first hit
127 |             hit = hits[0]
128 |             ssm_id = hit.get("ssm_id")
129 |             cosmic_id = hit.get("cosmic_id")
130 | 
131 |             # For gene_aa_change searches, verify we have the right variant
132 |             if search_field == "gene_aa_change":
133 |                 gene_aa_changes = hit.get("gene_aa_change", [])
134 |                 if (
135 |                     isinstance(gene_aa_changes, list)
136 |                     and search_value not in gene_aa_changes
137 |                 ):
138 |                     # This SSM has multiple AA changes, but not the one we're looking for
139 |                     return None
140 | 
141 |             if not ssm_id:
142 |                 return None
143 | 
144 |             # Now query SSM occurrences to get project information
145 |             occ_params = {
146 |                 "filters": json.dumps({
147 |                     "op": "in",
148 |                     "content": {"field": "ssm.ssm_id", "value": [ssm_id]},
149 |                 }),
150 |                 "fields": "case.project.project_id",
151 |                 "format": "json",
152 |                 "size": "2000",  # Get more occurrences
153 |             }
154 | 
155 |             occ_response, occ_error = await http_client.request_api(
156 |                 url="https://api.gdc.cancer.gov/ssm_occurrences",
157 |                 method="GET",
158 |                 request=occ_params,
159 |                 domain="gdc",
160 |             )
161 | 
162 |             if occ_error or not occ_response:
163 |                 # Return basic info without occurrence data
164 |                 cosmic_id_str = (
165 |                     cosmic_id[0]
166 |                     if isinstance(cosmic_id, list) and cosmic_id
167 |                     else cosmic_id
168 |                 )
169 |                 return TCGAVariantData(
170 |                     cosmic_id=cosmic_id_str,
171 |                     tumor_types=[],
172 |                     affected_cases=0,
173 |                     consequence_type="missense_variant",  # Most COSMIC variants are missense
174 |                 )
175 | 
176 |             # Process occurrence data
177 |             occ_data = occ_response.get("data", {})
178 |             occ_hits = occ_data.get("hits", [])
179 | 
180 |             # Count by project
181 |             project_counts = {}
182 |             for occ in occ_hits:
183 |                 case = occ.get("case", {})
184 |                 project = case.get("project", {})
185 |                 if project_id := project.get("project_id"):
186 |                     project_counts[project_id] = (
187 |                         project_counts.get(project_id, 0) + 1
188 |                     )
189 | 
190 |             # Extract tumor types
191 |             tumor_types = []
192 |             total_cases = 0
193 |             for project_id, count in project_counts.items():
194 |                 # Extract tumor type from project ID
195 |                 # TCGA format: "TCGA-LUAD" -> "LUAD"
196 |                 # Other formats: "MMRF-COMMPASS" -> "MMRF-COMMPASS", "CPTAC-3" -> "CPTAC-3"
197 |                 if project_id.startswith("TCGA-") and "-" in project_id:
198 |                     tumor_type = project_id.split("-")[-1]
199 |                     tumor_types.append(tumor_type)
200 |                 else:
201 |                     # For non-TCGA projects, use the full project ID
202 |                     tumor_types.append(project_id)
203 |                 total_cases += count
204 | 
205 |             # Handle cosmic_id as list
206 |             cosmic_id_str = (
207 |                 cosmic_id[0]
208 |                 if isinstance(cosmic_id, list) and cosmic_id
209 |                 else cosmic_id
210 |             )
211 | 
212 |             return TCGAVariantData(
213 |                 cosmic_id=cosmic_id_str,
214 |                 tumor_types=tumor_types,
215 |                 affected_cases=total_cases,
216 |                 consequence_type="missense_variant",  # Default for now
217 |             )
218 | 
219 |         except (KeyError, ValueError, TypeError, IndexError) as e:
220 |             # Log the error for debugging while gracefully handling API response issues
221 |             # KeyError: Missing expected fields in API response
222 |             # ValueError: Invalid data format or conversion issues
223 |             # TypeError: Unexpected data types in response
224 |             # IndexError: Array access issues with response data
225 |             logger.warning(
226 |                 f"Failed to fetch TCGA variant data for {variant_id}: {type(e).__name__}: {e}"
227 |             )
228 |             return None
229 | 
230 | 
231 | class ThousandGenomesClient:
232 |     """Client for 1000 Genomes data via Ensembl REST API."""
233 | 
234 |     def _extract_population_frequencies(
235 |         self, populations: list[dict]
236 |     ) -> dict[str, Any]:
237 |         """Extract population frequencies from Ensembl response."""
238 |         # Note: Multiple entries per population (one per allele), we want the alternate allele frequency
239 |         # The reference allele will have higher frequency for rare variants
240 |         pop_data: dict[str, float] = {}
241 | 
242 |         for pop in populations:
243 |             pop_name = pop.get("population", "")
244 |             frequency = pop.get("frequency", 0)
245 | 
246 |             # Map 1000 Genomes population codes - taking the minor allele frequency
247 |             if pop_name == "1000GENOMES:phase_3:ALL":
248 |                 if "global_maf" not in pop_data or frequency < pop_data.get(
249 |                     "global_maf", 1
250 |                 ):
251 |                     pop_data["global_maf"] = frequency
252 |             elif pop_name == "1000GENOMES:phase_3:AFR":
253 |                 if "afr_maf" not in pop_data or frequency < pop_data.get(
254 |                     "afr_maf", 1
255 |                 ):
256 |                     pop_data["afr_maf"] = frequency
257 |             elif pop_name == "1000GENOMES:phase_3:AMR":
258 |                 if "amr_maf" not in pop_data or frequency < pop_data.get(
259 |                     "amr_maf", 1
260 |                 ):
261 |                     pop_data["amr_maf"] = frequency
262 |             elif pop_name == "1000GENOMES:phase_3:EAS":
263 |                 if "eas_maf" not in pop_data or frequency < pop_data.get(
264 |                     "eas_maf", 1
265 |                 ):
266 |                     pop_data["eas_maf"] = frequency
267 |             elif pop_name == "1000GENOMES:phase_3:EUR":
268 |                 if "eur_maf" not in pop_data or frequency < pop_data.get(
269 |                     "eur_maf", 1
270 |                 ):
271 |                     pop_data["eur_maf"] = frequency
272 |             elif pop_name == "1000GENOMES:phase_3:SAS" and (
273 |                 "sas_maf" not in pop_data
274 |                 or frequency < pop_data.get("sas_maf", 1)
275 |             ):
276 |                 pop_data["sas_maf"] = frequency
277 | 
278 |         return pop_data
279 | 
280 |     async def get_variant_data(
281 |         self, variant_id: str
282 |     ) -> ThousandGenomesData | None:
283 |         """Fetch variant data from 1000 Genomes via Ensembl."""
284 |         try:
285 |             # Try to get rsID or use the variant ID directly
286 |             encoded_id = quote(variant_id, safe="")
287 |             url = f"{ENSEMBL_VARIATION_ENDPOINT}/{encoded_id}"
288 | 
289 |             # Request with pops=1 to get population data
290 |             params = {"content-type": "application/json", "pops": "1"}
291 | 
292 |             response, error = await http_client.request_api(
293 |                 url=url,
294 |                 method="GET",
295 |                 request=params,
296 |                 domain="ensembl",
297 |             )
298 | 
299 |             if error or not response:
300 |                 return None
301 | 
302 |             # Extract population frequencies
303 |             populations = response.get("populations", [])
304 |             pop_data = self._extract_population_frequencies(populations)
305 | 
306 |             # Get most severe consequence
307 |             consequence = None
308 |             if mappings := response.get("mappings", []):
309 |                 # Extract consequences from transcript consequences
310 |                 all_consequences = []
311 |                 for mapping in mappings:
312 |                     if transcript_consequences := mapping.get(
313 |                         "transcript_consequences", []
314 |                     ):
315 |                         for tc in transcript_consequences:
316 |                             if consequence_terms := tc.get(
317 |                                 "consequence_terms", []
318 |                             ):
319 |                                 all_consequences.extend(consequence_terms)
320 | 
321 |                 if all_consequences:
322 |                     # Take the first unique consequence
323 |                     seen = set()
324 |                     unique_consequences = []
325 |                     for c in all_consequences:
326 |                         if c not in seen:
327 |                             seen.add(c)
328 |                             unique_consequences.append(c)
329 |                     consequence = (
330 |                         unique_consequences[0] if unique_consequences else None
331 |                     )
332 | 
333 |             # Only return data if we found population frequencies
334 |             if pop_data:
335 |                 return ThousandGenomesData(
336 |                     **pop_data,
337 |                     ancestral_allele=response.get("ancestral_allele"),
338 |                     most_severe_consequence=consequence,
339 |                 )
340 |             else:
341 |                 # No population data found
342 |                 return None
343 | 
344 |         except (KeyError, ValueError, TypeError, AttributeError) as e:
345 |             # Log the error for debugging while gracefully handling API response issues
346 |             # KeyError: Missing expected fields in API response
347 |             # ValueError: Invalid data format or conversion issues
348 |             # TypeError: Unexpected data types in response
349 |             # AttributeError: Missing attributes on response objects
350 |             logger.warning(
351 |                 f"Failed to fetch 1000 Genomes data for {variant_id}: {type(e).__name__}: {e}"
352 |             )
353 |             return None
354 | 
355 | 
356 | class ExternalVariantAggregator:
357 |     """Aggregates variant data from multiple external sources."""
358 | 
359 |     def __init__(self):
360 |         self.tcga_client = TCGAClient()
361 |         self.thousand_genomes_client = ThousandGenomesClient()
362 |         # Import here to avoid circular imports
363 |         from .cbio_external_client import CBioPortalExternalClient
364 | 
365 |         self.cbioportal_client = CBioPortalExternalClient()
366 | 
367 |     def _extract_gene_aa_change(
368 |         self, variant_data: dict[str, Any]
369 |     ) -> str | None:
370 |         """Extract gene and AA change in format like 'BRAF V600A' from variant data."""
371 |         logger.info("_extract_gene_aa_change called")
372 |         try:
373 |             # First try to get gene name from CADD data
374 |             gene_name = None
375 |             if (cadd := variant_data.get("cadd")) and (
376 |                 gene := cadd.get("gene")
377 |             ):
378 |                 gene_name = gene.get("genename")
379 | 
380 |             # If not found in CADD, try other sources
381 |             if not gene_name:
382 |                 # Try docm
383 |                 if docm := variant_data.get("docm"):
384 |                     gene_name = docm.get("gene") or docm.get("genename")
385 | 
386 |                 # Try dbnsfp
387 |                 if not gene_name and (dbnsfp := variant_data.get("dbnsfp")):
388 |                     gene_name = dbnsfp.get("genename")
389 | 
390 |             if not gene_name:
391 |                 return None
392 | 
393 |             # Now try to get the protein change
394 |             aa_change = None
395 | 
396 |             # Try to get from docm first (it has clean p.V600A format)
397 |             if (docm := variant_data.get("docm")) and (
398 |                 aa := docm.get("aa_change")
399 |             ):
400 |                 # Convert p.V600A to V600A
401 |                 aa_change = aa.replace("p.", "")
402 | 
403 |             # Try hgvsp if not found
404 |             if (
405 |                 not aa_change
406 |                 and (hgvsp_list := variant_data.get("hgvsp"))
407 |                 and isinstance(hgvsp_list, list)
408 |                 and hgvsp_list
409 |             ):
410 |                 # Take the first one and clean it
411 |                 hgvsp = hgvsp_list[0]
412 |                 # Remove p. prefix
413 |                 aa_change = hgvsp.replace("p.", "")
414 |                 # Handle formats like Val600Ala -> V600A
415 |                 if "Val" in aa_change or "Ala" in aa_change:
416 |                     # Try to extract the short form
417 |                     match = re.search(r"[A-Z]\d+[A-Z]", aa_change)
418 |                     if match:
419 |                         aa_change = match.group()
420 | 
421 |             # Try CADD data
422 |             if (
423 |                 not aa_change
424 |                 and (cadd := variant_data.get("cadd"))
425 |                 and (gene_info := cadd.get("gene"))
426 |                 and (prot := gene_info.get("prot"))
427 |             ):
428 |                 protpos = prot.get("protpos")
429 |                 if protpos and cadd.get("oaa") and cadd.get("naa"):
430 |                     aa_change = f"{cadd['oaa']}{protpos}{cadd['naa']}"
431 | 
432 |             if gene_name and aa_change:
433 |                 result = f"{gene_name} {aa_change}"
434 |                 logger.info(f"Extracted gene/AA change: {result}")
435 |                 return result
436 | 
437 |             logger.warning(
438 |                 f"Failed to extract gene/AA change: gene_name={gene_name}, aa_change={aa_change}"
439 |             )
440 |             return None
441 |         except (
442 |             KeyError,
443 |             ValueError,
444 |             TypeError,
445 |             AttributeError,
446 |             re.error,
447 |         ) as e:
448 |             # Log the error for debugging while gracefully handling data extraction issues
449 |             # KeyError: Missing expected fields in variant data
450 |             # ValueError: Invalid data format or conversion issues
451 |             # TypeError: Unexpected data types in variant data
452 |             # AttributeError: Missing attributes on data objects
453 |             # re.error: Regular expression matching errors
454 |             logger.warning(
455 |                 f"Failed to extract gene/AA change from variant data: {type(e).__name__}: {e}"
456 |             )
457 |             return None
458 | 
459 |     async def get_enhanced_annotations(
460 |         self,
461 |         variant_id: str,
462 |         include_tcga: bool = True,
463 |         include_1000g: bool = True,
464 |         include_cbioportal: bool = True,
465 |         variant_data: dict[str, Any] | None = None,
466 |     ) -> EnhancedVariantAnnotation:
467 |         """Fetch and aggregate variant annotations from external sources.
468 | 
469 |         Args:
470 |             variant_id: The variant identifier (rsID or HGVS)
471 |             include_tcga: Whether to include TCGA data
472 |             include_1000g: Whether to include 1000 Genomes data
473 |             include_cbioportal: Whether to include cBioPortal data
474 |             variant_data: Optional variant data from MyVariant.info to extract gene/protein info
475 |         """
476 |         logger.info(
477 |             f"get_enhanced_annotations called for {variant_id}, include_cbioportal={include_cbioportal}"
478 |         )
479 |         tasks: list[Any] = []
480 |         task_names = []
481 | 
482 |         # Extract gene/AA change once for sources that need it
483 |         gene_aa_change = None
484 |         if variant_data:
485 |             logger.info(
486 |                 f"Extracting gene/AA from variant_data keys: {list(variant_data.keys())}"
487 |             )
488 |             gene_aa_change = self._extract_gene_aa_change(variant_data)
489 |         else:
490 |             logger.warning("No variant_data provided for gene/AA extraction")
491 | 
492 |         if include_tcga:
493 |             # Try to extract gene and protein change from variant data for TCGA
494 |             tcga_id = gene_aa_change if gene_aa_change else variant_id
495 |             tasks.append(self.tcga_client.get_variant_data(tcga_id))
496 |             task_names.append("tcga")
497 | 
498 |         if include_1000g:
499 |             tasks.append(
500 |                 self.thousand_genomes_client.get_variant_data(variant_id)
501 |             )
502 |             task_names.append("thousand_genomes")
503 | 
504 |         if include_cbioportal and gene_aa_change:
505 |             # cBioPortal requires gene/AA format
506 |             logger.info(
507 |                 f"Adding cBioPortal task with gene_aa_change: {gene_aa_change}"
508 |             )
509 |             tasks.append(
510 |                 self.cbioportal_client.get_variant_data(gene_aa_change)
511 |             )
512 |             task_names.append("cbioportal")
513 |         elif include_cbioportal and not gene_aa_change:
514 |             logger.warning(
515 |                 "Skipping cBioPortal: no gene/AA change could be extracted"
516 |             )
517 | 
518 |         # Run all queries in parallel
519 |         results = await asyncio.gather(*tasks, return_exceptions=True)
520 | 
521 |         # Build the enhanced annotation
522 |         annotation = EnhancedVariantAnnotation(variant_id=variant_id)
523 | 
524 |         for _i, (result, name) in enumerate(
525 |             zip(results, task_names, strict=False)
526 |         ):
527 |             if isinstance(result, Exception):
528 |                 annotation.error_sources.append(name)
529 |             elif result is not None:
530 |                 setattr(annotation, name, result)
531 |             else:
532 |                 # No data found for this source
533 |                 pass
534 | 
535 |         return annotation
536 | 
537 | 
538 | def format_enhanced_annotations(
539 |     annotation: EnhancedVariantAnnotation,
540 | ) -> dict[str, Any]:
541 |     """Format enhanced annotations for display."""
542 |     formatted: dict[str, Any] = {
543 |         "variant_id": annotation.variant_id,
544 |         "external_annotations": {},
545 |     }
546 | 
547 |     external_annot = formatted["external_annotations"]
548 | 
549 |     if annotation.tcga:
550 |         external_annot["tcga"] = {
551 |             "tumor_types": annotation.tcga.tumor_types,
552 |             "affected_cases": annotation.tcga.affected_cases,
553 |             "cosmic_id": annotation.tcga.cosmic_id,
554 |             "consequence": annotation.tcga.consequence_type,
555 |         }
556 | 
557 |     if annotation.thousand_genomes:
558 |         external_annot["1000_genomes"] = {
559 |             "global_maf": annotation.thousand_genomes.global_maf,
560 |             "population_frequencies": {
561 |                 "african": annotation.thousand_genomes.afr_maf,
562 |                 "american": annotation.thousand_genomes.amr_maf,
563 |                 "east_asian": annotation.thousand_genomes.eas_maf,
564 |                 "european": annotation.thousand_genomes.eur_maf,
565 |                 "south_asian": annotation.thousand_genomes.sas_maf,
566 |             },
567 |             "ancestral_allele": annotation.thousand_genomes.ancestral_allele,
568 |             "consequence": annotation.thousand_genomes.most_severe_consequence,
569 |         }
570 | 
571 |     if annotation.cbioportal:
572 |         cbio_data: dict[str, Any] = {
573 |             "studies": annotation.cbioportal.studies,
574 |             "total_cases": annotation.cbioportal.total_cases,
575 |         }
576 | 
577 |         # Add cancer type distribution if available
578 |         if annotation.cbioportal.cancer_type_distribution:
579 |             cbio_data["cancer_types"] = (
580 |                 annotation.cbioportal.cancer_type_distribution
581 |             )
582 | 
583 |         # Add mutation type distribution if available
584 |         if annotation.cbioportal.mutation_types:
585 |             cbio_data["mutation_types"] = annotation.cbioportal.mutation_types
586 | 
587 |         # Add hotspot count if > 0
588 |         if annotation.cbioportal.hotspot_count > 0:
589 |             cbio_data["hotspot_samples"] = annotation.cbioportal.hotspot_count
590 | 
591 |         # Add mean VAF if available
592 |         if annotation.cbioportal.mean_vaf is not None:
593 |             cbio_data["mean_vaf"] = annotation.cbioportal.mean_vaf
594 | 
595 |         # Add sample type distribution if available
596 |         if annotation.cbioportal.sample_types:
597 |             cbio_data["sample_types"] = annotation.cbioportal.sample_types
598 | 
599 |         external_annot["cbioportal"] = cbio_data
600 | 
601 |     if annotation.error_sources:
602 |         external_annot["errors"] = annotation.error_sources
603 | 
604 |     return formatted
605 | 
```

--------------------------------------------------------------------------------
/tests/tdd/trials/test_search.py:
--------------------------------------------------------------------------------

```python
  1 | import pytest
  2 | 
  3 | from biomcp.trials.search import (
  4 |     CLOSED_STATUSES,
  5 |     AgeGroup,
  6 |     DateField,
  7 |     InterventionType,
  8 |     LineOfTherapy,
  9 |     PrimaryPurpose,
 10 |     RecruitingStatus,
 11 |     SortOrder,
 12 |     SponsorType,
 13 |     StudyDesign,
 14 |     StudyType,
 15 |     TrialPhase,
 16 |     TrialQuery,
 17 |     _build_biomarker_expression_essie,
 18 |     _build_brain_mets_essie,
 19 |     _build_excluded_mutations_essie,
 20 |     _build_line_of_therapy_essie,
 21 |     _build_prior_therapy_essie,
 22 |     _build_progression_essie,
 23 |     _build_required_mutations_essie,
 24 |     _inject_ids,
 25 |     convert_query,
 26 | )
 27 | 
 28 | 
 29 | @pytest.mark.asyncio
 30 | async def test_convert_query_basic_parameters():
 31 |     """Test basic parameter conversion from TrialQuery to API format."""
 32 |     query = TrialQuery(conditions=["lung cancer"])
 33 |     params = await convert_query(query)
 34 | 
 35 |     assert "markupFormat" in params
 36 |     assert params["markupFormat"] == ["markdown"]
 37 |     assert "query.cond" in params
 38 |     assert params["query.cond"] == ["lung cancer"]
 39 |     assert "filter.overallStatus" in params
 40 |     assert "RECRUITING" in params["filter.overallStatus"][0]
 41 | 
 42 | 
 43 | @pytest.mark.asyncio
 44 | async def test_convert_query_multiple_conditions():
 45 |     """Test conversion of multiple conditions to API format."""
 46 |     query = TrialQuery(conditions=["lung cancer", "metastatic"])
 47 |     params = await convert_query(query)
 48 | 
 49 |     assert "query.cond" in params
 50 |     # The query should contain the original terms, but may have expanded synonyms
 51 |     cond_value = params["query.cond"][0]
 52 |     assert "lung cancer" in cond_value
 53 |     assert "metastatic" in cond_value
 54 |     assert cond_value.startswith("(") and cond_value.endswith(")")
 55 | 
 56 | 
 57 | @pytest.mark.asyncio
 58 | async def test_convert_query_terms_parameter():
 59 |     """Test conversion of terms parameter to API format."""
 60 |     query = TrialQuery(terms=["immunotherapy"])
 61 |     params = await convert_query(query)
 62 | 
 63 |     assert "query.term" in params
 64 |     assert params["query.term"] == ["immunotherapy"]
 65 | 
 66 | 
 67 | @pytest.mark.asyncio
 68 | async def test_convert_query_interventions_parameter():
 69 |     """Test conversion of interventions parameter to API format."""
 70 |     query = TrialQuery(interventions=["pembrolizumab"])
 71 |     params = await convert_query(query)
 72 | 
 73 |     assert "query.intr" in params
 74 |     assert params["query.intr"] == ["pembrolizumab"]
 75 | 
 76 | 
 77 | @pytest.mark.asyncio
 78 | async def test_convert_query_nct_ids():
 79 |     """Test conversion of NCT IDs to API format."""
 80 |     query = TrialQuery(nct_ids=["NCT04179552"])
 81 |     params = await convert_query(query)
 82 | 
 83 |     assert "query.id" in params
 84 |     assert params["query.id"] == ["NCT04179552"]
 85 |     # Note: The implementation keeps filter.overallStatus when using nct_ids
 86 |     # So we don't assert its absence
 87 | 
 88 | 
 89 | @pytest.mark.asyncio
 90 | async def test_convert_query_recruiting_status():
 91 |     """Test conversion of recruiting status to API format."""
 92 |     # Test open status
 93 |     query = TrialQuery(recruiting_status=RecruitingStatus.OPEN)
 94 |     params = await convert_query(query)
 95 | 
 96 |     assert "filter.overallStatus" in params
 97 |     assert "RECRUITING" in params["filter.overallStatus"][0]
 98 | 
 99 |     # Test closed status
100 |     query = TrialQuery(recruiting_status=RecruitingStatus.CLOSED)
101 |     params = await convert_query(query)
102 | 
103 |     assert "filter.overallStatus" in params
104 |     assert all(
105 |         status in params["filter.overallStatus"][0]
106 |         for status in CLOSED_STATUSES
107 |     )
108 | 
109 |     # Test any status
110 |     query = TrialQuery(recruiting_status=RecruitingStatus.ANY)
111 |     params = await convert_query(query)
112 | 
113 |     assert "filter.overallStatus" not in params
114 | 
115 | 
116 | @pytest.mark.asyncio
117 | async def test_convert_query_location_parameters():
118 |     """Test conversion of location parameters to API format."""
119 |     query = TrialQuery(lat=40.7128, long=-74.0060, distance=10)
120 |     params = await convert_query(query)
121 | 
122 |     assert "filter.geo" in params
123 |     assert params["filter.geo"] == ["distance(40.7128,-74.006,10mi)"]
124 | 
125 | 
126 | @pytest.mark.asyncio
127 | async def test_convert_query_study_type():
128 |     """Test conversion of study type to API format."""
129 |     query = TrialQuery(study_type=StudyType.INTERVENTIONAL)
130 |     params = await convert_query(query)
131 | 
132 |     assert "filter.advanced" in params
133 |     assert "AREA[StudyType]Interventional" in params["filter.advanced"][0]
134 | 
135 | 
136 | @pytest.mark.asyncio
137 | async def test_convert_query_phase():
138 |     """Test conversion of phase to API format."""
139 |     query = TrialQuery(phase=TrialPhase.PHASE3)
140 |     params = await convert_query(query)
141 | 
142 |     assert "filter.advanced" in params
143 |     assert "AREA[Phase]PHASE3" in params["filter.advanced"][0]
144 | 
145 | 
146 | @pytest.mark.asyncio
147 | async def test_convert_query_date_range():
148 |     """Test conversion of date range to API format."""
149 |     query = TrialQuery(
150 |         min_date="2020-01-01",
151 |         max_date="2020-12-31",
152 |         date_field=DateField.LAST_UPDATE,
153 |     )
154 |     params = await convert_query(query)
155 | 
156 |     assert "filter.advanced" in params
157 |     assert (
158 |         "AREA[LastUpdatePostDate]RANGE[2020-01-01,2020-12-31]"
159 |         in params["filter.advanced"][0]
160 |     )
161 | 
162 |     # Test min date only
163 |     query = TrialQuery(
164 |         min_date="2021-01-01",
165 |         date_field=DateField.STUDY_START,
166 |     )
167 |     params = await convert_query(query)
168 | 
169 |     assert "filter.advanced" in params
170 |     assert (
171 |         "AREA[StartDate]RANGE[2021-01-01,MAX]" in params["filter.advanced"][0]
172 |     )
173 | 
174 | 
175 | @pytest.mark.asyncio
176 | async def test_convert_query_sort_order():
177 |     """Test conversion of sort order to API format."""
178 |     query = TrialQuery(sort=SortOrder.RELEVANCE)
179 |     params = await convert_query(query)
180 | 
181 |     assert "sort" in params
182 |     assert params["sort"] == ["@relevance"]
183 | 
184 |     query = TrialQuery(sort=SortOrder.LAST_UPDATE)
185 |     params = await convert_query(query)
186 | 
187 |     assert "sort" in params
188 |     assert params["sort"] == ["LastUpdatePostDate:desc"]
189 | 
190 | 
191 | @pytest.mark.asyncio
192 | async def test_convert_query_intervention_type():
193 |     """Test conversion of intervention type to API format."""
194 |     query = TrialQuery(intervention_type=InterventionType.DRUG)
195 |     params = await convert_query(query)
196 | 
197 |     assert "filter.advanced" in params
198 |     assert "AREA[InterventionType]Drug" in params["filter.advanced"][0]
199 | 
200 | 
201 | @pytest.mark.asyncio
202 | async def test_convert_query_sponsor_type():
203 |     """Test conversion of sponsor type to API format."""
204 |     query = TrialQuery(sponsor_type=SponsorType.ACADEMIC)
205 |     params = await convert_query(query)
206 | 
207 |     assert "filter.advanced" in params
208 |     assert "AREA[SponsorType]Academic" in params["filter.advanced"][0]
209 | 
210 | 
211 | @pytest.mark.asyncio
212 | async def test_convert_query_study_design():
213 |     """Test conversion of study design to API format."""
214 |     query = TrialQuery(study_design=StudyDesign.RANDOMIZED)
215 |     params = await convert_query(query)
216 | 
217 |     assert "filter.advanced" in params
218 |     assert "AREA[StudyDesign]Randomized" in params["filter.advanced"][0]
219 | 
220 | 
221 | @pytest.mark.asyncio
222 | async def test_convert_query_age_group():
223 |     """Test conversion of age group to API format."""
224 |     query = TrialQuery(age_group=AgeGroup.ADULT)
225 |     params = await convert_query(query)
226 | 
227 |     assert "filter.advanced" in params
228 |     assert "AREA[StdAge]Adult" in params["filter.advanced"][0]
229 | 
230 | 
231 | @pytest.mark.asyncio
232 | async def test_convert_query_primary_purpose():
233 |     """Test conversion of primary purpose to API format."""
234 |     query = TrialQuery(primary_purpose=PrimaryPurpose.TREATMENT)
235 |     params = await convert_query(query)
236 | 
237 |     assert "filter.advanced" in params
238 |     assert (
239 |         "AREA[DesignPrimaryPurpose]Treatment" in params["filter.advanced"][0]
240 |     )
241 | 
242 | 
243 | @pytest.mark.asyncio
244 | async def test_convert_query_next_page_hash():
245 |     """Test conversion of next_page_hash to API format."""
246 |     query = TrialQuery(next_page_hash="abc123")
247 |     params = await convert_query(query)
248 | 
249 |     assert "pageToken" in params
250 |     assert params["pageToken"] == ["abc123"]
251 | 
252 | 
253 | @pytest.mark.asyncio
254 | async def test_convert_query_complex_parameters():
255 |     """Test conversion of multiple parameters to API format."""
256 |     query = TrialQuery(
257 |         conditions=["diabetes"],
258 |         terms=["obesity"],
259 |         interventions=["metformin"],
260 |         primary_purpose=PrimaryPurpose.TREATMENT,
261 |         study_type=StudyType.INTERVENTIONAL,
262 |         intervention_type=InterventionType.DRUG,
263 |         recruiting_status=RecruitingStatus.OPEN,
264 |         phase=TrialPhase.PHASE3,
265 |         age_group=AgeGroup.ADULT,
266 |         sort=SortOrder.RELEVANCE,
267 |     )
268 |     params = await convert_query(query)
269 | 
270 |     assert "query.cond" in params
271 |     # Disease synonym expansion may add synonyms to diabetes
272 |     assert "diabetes" in params["query.cond"][0]
273 |     assert "query.term" in params
274 |     assert params["query.term"] == ["obesity"]
275 |     assert "query.intr" in params
276 |     assert params["query.intr"] == ["metformin"]
277 |     assert "filter.advanced" in params
278 |     assert (
279 |         "AREA[DesignPrimaryPurpose]Treatment" in params["filter.advanced"][0]
280 |     )
281 |     assert "AREA[StudyType]Interventional" in params["filter.advanced"][0]
282 |     assert "AREA[InterventionType]Drug" in params["filter.advanced"][0]
283 |     assert "AREA[Phase]PHASE3" in params["filter.advanced"][0]
284 |     assert "AREA[StdAge]Adult" in params["filter.advanced"][0]
285 |     assert "filter.overallStatus" in params
286 |     assert "RECRUITING" in params["filter.overallStatus"][0]
287 |     assert "sort" in params
288 |     assert params["sort"] == ["@relevance"]
289 | 
290 | 
291 | # Test TrialQuery field validation for CLI input processing
292 | # noinspection PyTypeChecker
293 | def test_trial_query_field_validation_basic():
294 |     """Test basic field validation for TrialQuery."""
295 |     # Test list fields conversion
296 |     query = TrialQuery(conditions="diabetes")
297 |     assert query.conditions == ["diabetes"]
298 | 
299 |     query = TrialQuery(interventions="metformin")
300 |     assert query.interventions == ["metformin"]
301 | 
302 |     query = TrialQuery(terms="blood glucose")
303 |     assert query.terms == ["blood glucose"]
304 | 
305 |     query = TrialQuery(nct_ids="NCT01234567")
306 |     assert query.nct_ids == ["NCT01234567"]
307 | 
308 | 
309 | # noinspection PyTypeChecker
310 | def test_trial_query_field_validation_recruiting_status():
311 |     """Test recruiting status field validation."""
312 |     # Exact match uppercase
313 |     query = TrialQuery(recruiting_status="OPEN")
314 |     assert query.recruiting_status == RecruitingStatus.OPEN
315 | 
316 |     # Exact match lowercase
317 |     query = TrialQuery(recruiting_status="closed")
318 |     assert query.recruiting_status == RecruitingStatus.CLOSED
319 | 
320 |     # Invalid value
321 |     with pytest.raises(ValueError) as excinfo:
322 |         TrialQuery(recruiting_status="invalid")
323 |     assert "validation error for TrialQuery" in str(excinfo.value)
324 | 
325 | 
326 | # noinspection PyTypeChecker
327 | @pytest.mark.asyncio
328 | async def test_trial_query_field_validation_combined():
329 |     """Test combined parameters validation."""
330 |     query = TrialQuery(
331 |         conditions=["diabetes", "obesity"],
332 |         interventions="metformin",
333 |         recruiting_status="open",
334 |         study_type="interventional",
335 |         lat=40.7128,
336 |         long=-74.0060,
337 |         distance=10,
338 |     )
339 | 
340 |     assert query.conditions == ["diabetes", "obesity"]
341 |     assert query.interventions == ["metformin"]
342 |     assert query.recruiting_status == RecruitingStatus.OPEN
343 |     assert query.study_type == StudyType.INTERVENTIONAL
344 |     assert query.lat == 40.7128
345 |     assert query.long == -74.0060
346 |     assert query.distance == 10
347 | 
348 |     # Check that the query can be converted to parameters properly
349 |     params = await convert_query(query)
350 |     assert "query.cond" in params
351 |     # The query should contain the original terms, but may have expanded synonyms
352 |     cond_value = params["query.cond"][0]
353 |     assert "diabetes" in cond_value
354 |     assert "obesity" in cond_value
355 |     assert cond_value.startswith("(") and cond_value.endswith(")")
356 |     assert "query.intr" in params
357 |     assert "metformin" in params["query.intr"][0]
358 |     assert "filter.geo" in params
359 |     assert "distance(40.7128,-74.006,10mi)" in params["filter.geo"][0]
360 | 
361 | 
362 | # noinspection PyTypeChecker
363 | @pytest.mark.asyncio
364 | async def test_trial_query_field_validation_terms():
365 |     """Test terms parameter validation."""
366 |     # Single term as string
367 |     query = TrialQuery(terms="cancer")
368 |     assert query.terms == ["cancer"]
369 | 
370 |     # Multiple terms as list
371 |     query = TrialQuery(terms=["cancer", "therapy"])
372 |     assert query.terms == ["cancer", "therapy"]
373 | 
374 |     # Check parameter generation
375 |     params = await convert_query(query)
376 |     assert "query.term" in params
377 |     assert "(cancer OR therapy)" in params["query.term"][0]
378 | 
379 | 
380 | # noinspection PyTypeChecker
381 | @pytest.mark.asyncio
382 | async def test_trial_query_field_validation_nct_ids():
383 |     """Test NCT IDs parameter validation."""
384 |     # Single NCT ID
385 |     query = TrialQuery(nct_ids="NCT01234567")
386 |     assert query.nct_ids == ["NCT01234567"]
387 | 
388 |     # Multiple NCT IDs
389 |     query = TrialQuery(nct_ids=["NCT01234567", "NCT89012345"])
390 |     assert query.nct_ids == ["NCT01234567", "NCT89012345"]
391 | 
392 |     # Check parameter generation
393 |     params = await convert_query(query)
394 |     assert "query.id" in params
395 |     assert "NCT01234567,NCT89012345" in params["query.id"][0]
396 | 
397 | 
398 | # noinspection PyTypeChecker
399 | @pytest.mark.asyncio
400 | async def test_trial_query_field_validation_date_range():
401 |     """Test date range parameters validation."""
402 |     # Min date only with date field
403 |     query = TrialQuery(min_date="2020-01-01", date_field=DateField.STUDY_START)
404 |     assert query.min_date == "2020-01-01"
405 |     assert query.date_field == DateField.STUDY_START
406 | 
407 |     # Min and max date with date field using lazy mapping
408 |     query = TrialQuery(
409 |         min_date="2020-01-01",
410 |         max_date="2021-12-31",
411 |         date_field="last update",  # space not underscore.
412 |     )
413 |     assert query.min_date == "2020-01-01"
414 |     assert query.max_date == "2021-12-31"
415 |     assert query.date_field == DateField.LAST_UPDATE
416 | 
417 |     # Check parameter generation
418 |     params = await convert_query(query)
419 |     assert "filter.advanced" in params
420 |     assert (
421 |         "AREA[LastUpdatePostDate]RANGE[2020-01-01,2021-12-31]"
422 |         in params["filter.advanced"][0]
423 |     )
424 | 
425 | 
426 | # noinspection PyTypeChecker
427 | def test_trial_query_field_validation_primary_purpose():
428 |     """Test primary purpose parameter validation."""
429 |     # Exact match uppercase
430 |     query = TrialQuery(primary_purpose=PrimaryPurpose.TREATMENT)
431 |     assert query.primary_purpose == PrimaryPurpose.TREATMENT
432 | 
433 |     # Exact match lowercase
434 |     query = TrialQuery(primary_purpose=PrimaryPurpose.PREVENTION)
435 |     assert query.primary_purpose == PrimaryPurpose.PREVENTION
436 | 
437 |     # Case-insensitive
438 |     query = TrialQuery(primary_purpose="ScReeNING")
439 |     assert query.primary_purpose == PrimaryPurpose.SCREENING
440 | 
441 |     # Invalid
442 |     with pytest.raises(ValueError):
443 |         TrialQuery(primary_purpose="invalid")
444 | 
445 | 
446 | def test_inject_ids_with_many_ids_and_condition():
447 |     """Test _inject_ids function with 300 IDs and a condition to ensure filter.ids is used."""
448 |     # Create a params dict with a condition (indicating other filters present)
449 |     params = {
450 |         "query.cond": ["melanoma"],
451 |         "format": ["json"],
452 |         "markupFormat": ["markdown"],
453 |     }
454 | 
455 |     # Generate 300 NCT IDs
456 |     nct_ids = [f"NCT{str(i).zfill(8)}" for i in range(1, 301)]
457 | 
458 |     # Call _inject_ids with has_other_filters=True
459 |     _inject_ids(params, nct_ids, has_other_filters=True)
460 | 
461 |     # Assert that filter.ids is used (not query.id)
462 |     assert "filter.ids" in params
463 |     assert "query.id" not in params
464 | 
465 |     # Verify the IDs are properly formatted
466 |     ids_param = params["filter.ids"][0]
467 |     assert ids_param.startswith("NCT")
468 |     assert "NCT00000001" in ids_param
469 |     assert "NCT00000300" in ids_param
470 | 
471 |     # Verify it's a comma-separated list
472 |     assert "," in ids_param
473 |     assert ids_param.count(",") == 299  # 300 IDs = 299 commas
474 | 
475 | 
476 | def test_inject_ids_without_other_filters():
477 |     """Test _inject_ids function with only NCT IDs (no other filters)."""
478 |     # Create a minimal params dict
479 |     params = {
480 |         "format": ["json"],
481 |         "markupFormat": ["markdown"],
482 |     }
483 | 
484 |     # Use a small number of NCT IDs
485 |     nct_ids = ["NCT00000001", "NCT00000002", "NCT00000003"]
486 | 
487 |     # Call _inject_ids with has_other_filters=False
488 |     _inject_ids(params, nct_ids, has_other_filters=False)
489 | 
490 |     # Assert that query.id is used (not filter.ids) for small lists
491 |     assert "query.id" in params
492 |     assert "filter.ids" not in params
493 | 
494 |     # Verify the format
495 |     assert params["query.id"][0] == "NCT00000001,NCT00000002,NCT00000003"
496 | 
497 | 
498 | def test_inject_ids_large_list_without_filters():
499 |     """Test _inject_ids with a large ID list but no other filters."""
500 |     params = {
501 |         "format": ["json"],
502 |         "markupFormat": ["markdown"],
503 |     }
504 | 
505 |     # Generate enough IDs to exceed 1800 character limit
506 |     nct_ids = [f"NCT{str(i).zfill(8)}" for i in range(1, 201)]  # ~2200 chars
507 | 
508 |     # Call _inject_ids with has_other_filters=False
509 |     _inject_ids(params, nct_ids, has_other_filters=False)
510 | 
511 |     # Assert that filter.ids is used for large lists even without other filters
512 |     assert "filter.ids" in params
513 |     assert "query.id" not in params
514 | 
515 | 
516 | # Tests for new Essie builder functions
517 | def test_build_prior_therapy_essie():
518 |     """Test building Essie fragments for prior therapies."""
519 |     # Single therapy
520 |     fragments = _build_prior_therapy_essie(["osimertinib"])
521 |     assert len(fragments) == 1
522 |     assert (
523 |         fragments[0]
524 |         == 'AREA[EligibilityCriteria]("osimertinib" AND (prior OR previous OR received))'
525 |     )
526 | 
527 |     # Multiple therapies
528 |     fragments = _build_prior_therapy_essie(["osimertinib", "erlotinib"])
529 |     assert len(fragments) == 2
530 |     assert (
531 |         fragments[0]
532 |         == 'AREA[EligibilityCriteria]("osimertinib" AND (prior OR previous OR received))'
533 |     )
534 |     assert (
535 |         fragments[1]
536 |         == 'AREA[EligibilityCriteria]("erlotinib" AND (prior OR previous OR received))'
537 |     )
538 | 
539 |     # Empty strings are filtered out
540 |     fragments = _build_prior_therapy_essie(["osimertinib", "", "erlotinib"])
541 |     assert len(fragments) == 2
542 | 
543 | 
544 | def test_build_progression_essie():
545 |     """Test building Essie fragments for progression on therapy."""
546 |     fragments = _build_progression_essie(["pembrolizumab"])
547 |     assert len(fragments) == 1
548 |     assert (
549 |         fragments[0]
550 |         == 'AREA[EligibilityCriteria]("pembrolizumab" AND (progression OR resistant OR refractory))'
551 |     )
552 | 
553 | 
554 | def test_build_required_mutations_essie():
555 |     """Test building Essie fragments for required mutations."""
556 |     fragments = _build_required_mutations_essie(["EGFR L858R", "T790M"])
557 |     assert len(fragments) == 2
558 |     assert fragments[0] == 'AREA[EligibilityCriteria]("EGFR L858R")'
559 |     assert fragments[1] == 'AREA[EligibilityCriteria]("T790M")'
560 | 
561 | 
562 | def test_build_excluded_mutations_essie():
563 |     """Test building Essie fragments for excluded mutations."""
564 |     fragments = _build_excluded_mutations_essie(["KRAS G12C"])
565 |     assert len(fragments) == 1
566 |     assert fragments[0] == 'AREA[EligibilityCriteria](NOT "KRAS G12C")'
567 | 
568 | 
569 | def test_build_biomarker_expression_essie():
570 |     """Test building Essie fragments for biomarker expression."""
571 |     biomarkers = {"PD-L1": "≥50%", "TMB": "≥10 mut/Mb"}
572 |     fragments = _build_biomarker_expression_essie(biomarkers)
573 |     assert len(fragments) == 2
574 |     assert 'AREA[EligibilityCriteria]("PD-L1" AND "≥50%")' in fragments
575 |     assert 'AREA[EligibilityCriteria]("TMB" AND "≥10 mut/Mb")' in fragments
576 | 
577 |     # Empty values are filtered out
578 |     biomarkers = {"PD-L1": "≥50%", "TMB": "", "HER2": "positive"}
579 |     fragments = _build_biomarker_expression_essie(biomarkers)
580 |     assert len(fragments) == 2
581 | 
582 | 
583 | def test_build_line_of_therapy_essie():
584 |     """Test building Essie fragment for line of therapy."""
585 |     # First line
586 |     fragment = _build_line_of_therapy_essie(LineOfTherapy.FIRST_LINE)
587 |     assert (
588 |         fragment
589 |         == 'AREA[EligibilityCriteria]("first line" OR "first-line" OR "1st line" OR "frontline" OR "treatment naive" OR "previously untreated")'
590 |     )
591 | 
592 |     # Second line
593 |     fragment = _build_line_of_therapy_essie(LineOfTherapy.SECOND_LINE)
594 |     assert (
595 |         fragment
596 |         == 'AREA[EligibilityCriteria]("second line" OR "second-line" OR "2nd line" OR "one prior line" OR "1 prior line")'
597 |     )
598 | 
599 |     # Third line plus
600 |     fragment = _build_line_of_therapy_essie(LineOfTherapy.THIRD_LINE_PLUS)
601 |     assert (
602 |         fragment
603 |         == 'AREA[EligibilityCriteria]("third line" OR "third-line" OR "3rd line" OR "≥2 prior" OR "at least 2 prior" OR "heavily pretreated")'
604 |     )
605 | 
606 | 
607 | def test_build_brain_mets_essie():
608 |     """Test building Essie fragment for brain metastases filter."""
609 |     # Allow brain mets (no filter)
610 |     fragment = _build_brain_mets_essie(True)
611 |     assert fragment == ""
612 | 
613 |     # Exclude brain mets
614 |     fragment = _build_brain_mets_essie(False)
615 |     assert fragment == 'AREA[EligibilityCriteria](NOT "brain metastases")'
616 | 
617 | 
618 | @pytest.mark.asyncio
619 | async def test_convert_query_with_eligibility_fields():
620 |     """Test conversion of query with new eligibility-focused fields."""
621 |     query = TrialQuery(
622 |         conditions=["lung cancer"],
623 |         prior_therapies=["osimertinib"],
624 |         progression_on=["erlotinib"],
625 |         required_mutations=["EGFR L858R"],
626 |         excluded_mutations=["T790M"],
627 |         biomarker_expression={"PD-L1": "≥50%"},
628 |         line_of_therapy=LineOfTherapy.SECOND_LINE,
629 |         allow_brain_mets=False,
630 |     )
631 |     params = await convert_query(query)
632 | 
633 |     # Check that query.term contains all the Essie fragments
634 |     assert "query.term" in params
635 |     term = params["query.term"][0]
636 | 
637 |     # Prior therapy
638 |     assert (
639 |         'AREA[EligibilityCriteria]("osimertinib" AND (prior OR previous OR received))'
640 |         in term
641 |     )
642 | 
643 |     # Progression
644 |     assert (
645 |         'AREA[EligibilityCriteria]("erlotinib" AND (progression OR resistant OR refractory))'
646 |         in term
647 |     )
648 | 
649 |     # Required mutation
650 |     assert 'AREA[EligibilityCriteria]("EGFR L858R")' in term
651 | 
652 |     # Excluded mutation
653 |     assert 'AREA[EligibilityCriteria](NOT "T790M")' in term
654 | 
655 |     # Biomarker expression
656 |     assert 'AREA[EligibilityCriteria]("PD-L1" AND "≥50%")' in term
657 | 
658 |     # Line of therapy
659 |     assert 'AREA[EligibilityCriteria]("second line" OR "second-line"' in term
660 | 
661 |     # Brain mets exclusion
662 |     assert 'AREA[EligibilityCriteria](NOT "brain metastases")' in term
663 | 
664 |     # All fragments should be combined with AND
665 |     assert " AND " in term
666 | 
667 | 
668 | @pytest.mark.asyncio
669 | async def test_convert_query_with_custom_fields_and_page_size():
670 |     """Test conversion of query with custom return fields and page size."""
671 |     query = TrialQuery(
672 |         conditions=["diabetes"],
673 |         return_fields=["NCTId", "BriefTitle", "OverallStatus"],
674 |         page_size=100,
675 |     )
676 |     params = await convert_query(query)
677 | 
678 |     assert "fields" in params
679 |     assert params["fields"] == ["NCTId,BriefTitle,OverallStatus"]
680 | 
681 |     assert "pageSize" in params
682 |     assert params["pageSize"] == ["100"]
683 | 
684 | 
685 | @pytest.mark.asyncio
686 | async def test_convert_query_eligibility_with_existing_terms():
687 |     """Test that eligibility Essie fragments are properly combined with existing terms."""
688 |     query = TrialQuery(
689 |         terms=["immunotherapy"],
690 |         prior_therapies=["chemotherapy"],
691 |     )
692 |     params = await convert_query(query)
693 | 
694 |     assert "query.term" in params
695 |     term = params["query.term"][0]
696 | 
697 |     # Should contain both the original term and the new Essie fragment
698 |     assert "immunotherapy" in term
699 |     assert (
700 |         'AREA[EligibilityCriteria]("chemotherapy" AND (prior OR previous OR received))'
701 |         in term
702 |     )
703 |     # Should be combined with AND
704 |     assert "immunotherapy AND AREA[EligibilityCriteria]" in term
705 | 
```

--------------------------------------------------------------------------------
/tests/data/pubtator/pubtator3_paper.txt:
--------------------------------------------------------------------------------

```
  1 | Nucleic Acids Research, 2024, 52, W540–W546
  2 | https://doi.org/10.1093/nar/gkae235
  3 | Advance access publication date: 4 April 2024
  4 | Web Server issue
  5 | 
  6 | PubTator 3.0: an AI-powered literature resource for
  7 | unlocking biomedical knowledge
  8 | Chih-Hsuan Wei † , Alexis Allot † , Po-Ting Lai , Robert Leaman , Shubo Tian , Ling Luo ,
  9 | Qiao Jin , Zhizheng Wang , Qingyu Chen and Zhiyong Lu *
 10 | National Center for Biotechnology Information (NCBI), National Library of Medicine (NLM), National Institutes of Health (NIH),
 11 | Bethesda, MD 20894, USA
 12 | To whom correspondence should be addressed. Tel: +1 301 594 7089; Email: [email protected]
 13 | The first two authors should be regarded as Joint First Authors.
 14 | Present addresses:
 15 | Alexis Allot, The Neuro (Montreal Neurological Institute-Hospital), McGill University, Montreal, Quebec H3A 2B4, Canada.
 16 | Ling Luo, School of Computer Science and Technology, Dalian University of Technology, 116024 Dalian, China.
 17 | Qingyu Chen, Biomedical Informatics and Data Science, Yale School of Medicine, New Haven, CT 06510, USA.
 18 | †
 19 | 
 20 | Abstract
 21 | PubTator 3.0 (https://www.ncbi.nlm.nih.gov/research/pubtator3/) is a biomedical literature resource using state-of-the-art AI techniques to offer
 22 | semantic and relation searches for key concepts like proteins, genetic variants, diseases and chemicals. It currently provides over one billion
 23 | entity and relation annotations across approximately 36 million PubMed abstracts and 6 million full-text articles from the PMC open access
 24 | subset, updated weekly. PubTator 3.0’s online interface and API utilize these precomputed entity relations and synonyms to provide advanced
 25 | search capabilities and enable large-scale analyses, streamlining many complex information needs. We showcase the retrieval quality of PubTator
 26 | 3.0 using a series of entity pair queries, demonstrating that PubTator 3.0 retrieves a greater number of articles than either PubMed or Google
 27 | Scholar, with higher precision in the top 20 results. We further show that integrating ChatGPT (GPT-4) with PubTator APIs dramatically improves
 28 | the factuality and verifiability of its responses. In summary, PubTator 3.0 offers a comprehensive set of features and tools that allow researchers
 29 | to navigate the ever-expanding wealth of biomedical literature, expediting research and unlocking valuable insights for scientific discovery.
 30 | 
 31 | Graphical abstract
 32 | 
 33 | Introduction
 34 | The biomedical literature is a primary resource to address information needs across the biological and clinical sciences (1),
 35 | however the requirements for literature search vary widely.
 36 | Activities such as formulating a research hypothesis require
 37 | an exploratory approach, whereas tasks like interpreting the
 38 | clinical significance of genetic variants are more focused.
 39 | Traditional keyword-based search methods have long
 40 | formed the foundation of biomedical literature search (2).
 41 | While generally effective for basic search, these methods also
 42 | have significant limitations, such as missing relevant articles
 43 | 
 44 | due to differing terminology or including irrelevant articles because surface-level term matches cannot adequately represent
 45 | the required association between query terms. These limitations cost time and risk information needs remaining unmet.
 46 | Natural language processing (NLP) methods provide substantial value for creating bioinformatics resources (3–5), and
 47 | may improve literature search by enabling semantic and relation search (6). In semantic search, users indicate specific
 48 | concepts of interest (entities) for which the system has precomputed matches regardless of the terminology used. Relation search increases precision by allowing users to specify the
 49 | 
 50 | Received: January 18, 2024. Revised: March 2, 2024. Editorial Decision: March 16, 2024. Accepted: March 21, 2024
 51 | Published by Oxford University Press on behalf of Nucleic Acids Research 2024.
 52 | This work is written by (a) US Government employee(s) and is in the public domain in the US.
 53 | 
 54 | Downloaded from https://academic.oup.com/nar/article/52/W1/W540/7640526 by guest on 10 March 2025
 55 | 
 56 | *
 57 | 
 58 | W541
 59 | 
 60 | Nucleic Acids Research, 2024, Vol. 52, Web Server issue
 61 | 
 62 | type of relationship desired between entities, such as whether
 63 | a chemical enhances or reduces expression of a gene. In this regard, we present PubTator 3.0, a novel resource engineered to
 64 | support semantic and relation search in the biomedical literature. Its search capabilities allow users to explore automated
 65 | entity annotations for six key biomedical entities: genes, diseases, chemicals, genetic variants, species, and cell lines. PubTator 3.0 also identifies and makes searchable 12 common
 66 | types of relations between entities, enhancing its utility for
 67 | both targeted and exploratory searches. Focusing on relations
 68 | and entity types of interest across the biomedical sciences allows PubTator 3.0 to retrieve information precisely while providing broad utility (see detailed comparisons with its predecessor in Supplementary Table S1).
 69 | 
 70 | The PubTator 3.0 online interface, illustrated in Figure 1
 71 | and Supplementary Figure S1, is designed for interactive literature exploration, supporting semantic, relation, keyword,
 72 | and Boolean queries. An auto-complete function provides semantic search suggestions to assist users with query formulation. For example, it automatically suggests replacing either ‘COVID-19 or "SARS-CoV-2 infection’ with the semantic term ‘@DISEASE_COVID_19 . Relation queries – new to
 73 | PubTator 3.0 – provide increased precision, allowing users
 74 | to target articles which discuss specific relationships between
 75 | entities.
 76 | PubTator 3.0 offers unified search results, simultaneously
 77 | searching approximately 36 million PubMed abstracts and
 78 | over 6 million full-text articles from the PMC Open Access Subset (PMC-OA), improving access to the substantial
 79 | amount of relevant information present in the article full text
 80 | (7). Search results are prioritized based on the depth of the relationship between the query terms: articles containing identifiable relations between semantic terms receive the highest
 81 | priority, while articles where semantic or keyword terms cooccur nearby (e.g. within the same sentence) receive secondary
 82 | priority. Search results are also prioritized based on the article
 83 | section where the match appears (e.g. matches within the title receive higher priority). Users can further refine results by
 84 | employing filters, narrowing articles returned to specific publication types, journals, or article sections.
 85 | PubTator 3.0 is supported by an NLP pipeline, depicted in
 86 | Figure 2A. This pipeline, run weekly, first identifies articles
 87 | newly added to PubMed and PMC-OA. Articles are then processed through three major steps: (i) named entity recognition,
 88 | provided by the recently developed deep-learning transformer
 89 | model AIONER (8), (ii) identifier mapping and (iii) relation
 90 | extraction, performed by BioREx (9) of 12 common types of
 91 | relations (described in Supplementary Table S2).
 92 | In total, PubTator 3.0 contains over 1.6 billion entity annotations (4.6 million unique identifiers) and 33 million relations
 93 | (8.8 million unique pairs). It provides enhanced entity recognition and normalization performance over its previous version,
 94 | PubTator 2 (10), also known as PubTator Central (Figure 2B
 95 | and Supplementary Table S3). We show the relation extraction performance of PubTator 3.0 in Figure 2C and its comparison results to the previous state-of-the-art systems (11–13)
 96 | on the BioCreative V Chemical-Disease Relation (14) corpus,
 97 | finding that PubTator 3.0 provided substantially higher accuracy. Moreover, when evaluating a randomized sample of
 98 | entity pair queries compared to PubMed and Google Scholar,
 99 | 
100 | Materials and methods
101 | Data sources and article processing
102 | PubTator 3.0 downloads new articles weekly from the BioC
103 | PubMed API (https://www.ncbi.nlm.nih.gov/research/bionlp/
104 | APIs/BioC-PubMed/) and the BioC PMC API (https://www.
105 | ncbi.nlm.nih.gov/research/bionlp/APIs/BioC-PMC/) in BioCXML format (16). Local abbreviations are identified using
106 | Ab3P (17). Article text and extracted data are stored internally using MongoDB and indexed for search with Solr, ensuring robust and scalable accessibility unconstrained by external
107 | dependencies such as the NCBI eUtils API.
108 | 
109 | Entity recognition and normalization/linking
110 | PubTator 3.0 uses AIONER (8), a recently developed named
111 | entity recognition (NER) model, to recognize entities of six
112 | types: genes/proteins, chemicals, diseases, species, genetic
113 | variants, and cell lines. AIONER utilizes a flexible tagging
114 | scheme to integrate training data created separately into a
115 | single resource. These training datasets include NLM-Gene
116 | (18), NLM-Chem (19), NCBI-Disease (20), BC5CDR (14),
117 | tmVar3 (21), Species-800 (22), BioID (23) and BioRED (15).
118 | This consolidation creates a larger training set, improving
119 | the model’s ability to generalize to unseen data. Furthermore,
120 | it enables recognizing multiple entity types simultaneously,
121 | enhancing efficiency and simplifying the challenge of distinguishing boundaries between entities that reference others,
122 | such as the disorder ‘Alpha-1 antitrypsin deficiency’ and the
123 | protein ‘Alpha-1 antitrypsin’. We previously evaluated the performance of AIONER on 14 benchmark datasets (8), including the test sets for the aforementioned training sets. This evaluation demonstrated that AIONER’s performance surpasses
124 | or matches previous state-of-the-art methods.
125 | Entity mentions found by AIONER are normalized (linked)
126 | to a unique identifier in an appropriate entity database. Normalization is performed by a module designed for (or adapted
127 | to) each entity type, using the latest version. The recentlyupgraded GNorm2 system (24) normalizes genes to NCBI
128 | Gene identifiers and species mentions to NCBI Taxonomy.
129 | tmVar3 (21), also recently upgraded, normalizes genetic variants; it uses dbSNP identifiers for variants listed in dbSNP
130 | and HGNV format otherwise. Chemicals are normalized by
131 | the NLM-Chem tagger (19) to MeSH identifiers (25). TaggerOne (26) normalizes diseases to MeSH and cell lines to
132 | Cellosaurus (27) using a new normalization-only mode. This
133 | mode only applies the normalization model, which converts
134 | both mentions and lexicon names into high-dimensional TFIDF vectors and learns a mapping, as before. However, it
135 | now augments the training data by mapping each lexicon
136 | name to itself, resulting in a large performance improvement for names present in the lexicon but not in the annotated training data. These enhancements provide a significant overall improvement in entity normalization performance (Supplementary Table S3).
137 | 
138 | Relation extraction
139 | Relations for PubTator 3.0 are extracted by the unified relation extraction model BioREx (9), designed to simulta-
140 | 
141 | Downloaded from https://academic.oup.com/nar/article/52/W1/W540/7640526 by guest on 10 March 2025
142 | 
143 | System overview
144 | 
145 | PubTator 3.0 consistently returns a greater number of articles with higher precision in the top 20 results (Figure 2D and
146 | Supplementary Table S4).
147 | 
148 | W542
149 | 
150 | Nucleic Acids Research, 2024, Vol. 52, Web Server issue
151 | 
152 | neously extract 12 types of relations across eight entity
153 | type pairs: chemical–chemical, chemical–disease, chemical–
154 | gene, chemical–variant, disease–gene, disease–variant, gene–
155 | gene and variant–variant. Detailed definitions of these relation types and their corresponding entity pairs are presented in
156 | Supplementary Table S2. Deep-learning methods for relation
157 | extraction, such as BioREx, require ample training data. However, training data for relation extraction is fragmented into
158 | many datasets, often tailored to specific entity pairs. BioREx
159 | overcomes this limitation with a data-centric approach, reconciling discrepancies between disparate training datasets to
160 | construct a comprehensive, unified dataset.
161 | We evaluated the relations extracted by BioREx using performance on manually annotated relation extraction datasets
162 | as well as a comparative analysis between BioREx and notable
163 | comparable systems. BioREx established a new performance
164 | benchmark on the BioRED corpus test set (15), elevating the
165 | performance from 74.4% (F-score) to 79.6%, and demonstrating higher performance than alternative models such as
166 | transfer learning (TL), multi-task learning (MTL), and stateof-the-art models trained on isolated datasets (9). For PubTator 3.0, we replaced its deep learning module, PubMedBERT
167 | (28), with LinkBERT (29), further increasing the performance
168 | to 82.0%. Furthermore, we conducted a comparative analysis between BioREx and SemRep (11), a widely used rule-
169 | 
170 | based method for extracting diverse relations, the CD-REST
171 | (13) system, and the previous state-of-the-art system (12), using the BioCreative V Chemical Disease Relation corpus test
172 | set (14). Our evaluation demonstrated that PubTator 3.0 provided substantially higher F-score than previous methods.
173 | 
174 | Programmatic access and data formats
175 | PubTator 3.0 offers programmatic access through its
176 | API and bulk download. The API (https://www.ncbi.
177 | nlm.nih.gov/research/pubtator3/) supports keyword, entity and relation search, and also supports exporting
178 | annotations in XML and JSON-based BioC (16) formats and tab-delimited free text. The PubTator 3.0 FTP
179 | site (https://ftp.ncbi.nlm.nih.gov/pub/lu/PubTator3) provides bulk downloads of annotated articles and extraction
180 | summaries for entities and relations. Programmatic access supports more flexible query options; for example,
181 | the information need ‘what chemicals reduce expression
182 | of JAK1?’ can be answered directly via API (e.g. https:
183 | //www.ncbi.nlm.nih.gov/research/pubtator3-api/relations?
184 | e1=@GENE_JAK1&type=negative_correlate&e2=Chemical)
185 | or by filtering the bulk relations file. Additionally, the PubTator 3.0 API supports annotation of user-defined free
186 | text.
187 | 
188 | Downloaded from https://academic.oup.com/nar/article/52/W1/W540/7640526 by guest on 10 March 2025
189 | 
190 | Figure 1. PubTator 3.0 system overview and search results page: 1. Query auto-complete enhances search accuracy and synonym matching. 2. Natural
191 | language processing (NLP)-enhanced relevance: Search results are prioritized according to the strength of the relationship between the entities queried.
192 | 3. Users can further refine results with facet filters—section, journal and type. 4. Search results include highlighted entity snippets explaining relevance.
193 | 5. Histogram visualizes number of results by publication year. 6. Entity highlighting can be switched on or off according to user preference.
194 | 
195 | Nucleic Acids Research, 2024, Vol. 52, Web Server issue
196 | 
197 | W543
198 | 
199 | Case study I: entity relation queries
200 | We analyzed the retrieval quality of PubTator 3.0 by preparing a series of 12 entity pairs to serve as case studies for
201 | comparison between PubTator 3.0, PubMed and Google
202 | Scholar. To provide an equal comparison, we filtered about
203 | 30% of the Google Scholar results for articles not present
204 | in PubMed. To ensure that the number of results would
205 | remain low enough to allow filtering Google Scholar results for articles not in PubMed, we identified entity pairs
206 | first discussed together in the literature in 2022 or later. We
207 | then randomly selected two entity pairs of each of the following types: disease/gene, chemical/disease, chemical/gene,
208 | chemical/chemical, gene/gene and disease/variant. None of
209 | 
210 | the relation pairs selected appears in the training set. The
211 | comparison was performed with respect to a snapshot of the
212 | search results returned by all search engines on 19 May 2023.
213 | We manually evaluated the top 20 results for each system and
214 | each query; articles were judged to be relevant if they mentioned both entities in the query and supported a relationship
215 | between them. Two curators independently judged each article, and discrepancies were discussed until agreement. The
216 | curators were not blinded to the retrieval method but were
217 | required to record the text supporting the relationship, if relevant. This experiment evaluated the relevance of the top 20
218 | results for each retrieval method, regardless of whether the
219 | article appeared in PubMed.
220 | 
221 | Downloaded from https://academic.oup.com/nar/article/52/W1/W540/7640526 by guest on 10 March 2025
222 | 
223 | Figure 2. (A) The PubTator 3.0 processing pipeline: AIONER (8) identifies six types of entities in PubMed abstracts and PMC-OA full-text articles. Entity
224 | annotations are associated with database identifiers by specialized mappers and BioREx (9) identifies relations between entities. Extracted data is
225 | stored in MongoDB and made searchable using Solr. (B) Entity recognition performance for each entity type compared with PubTator2 (also known as
226 | PubTatorCentral) (13) on the BioRED corpus (15). (C) Relation extraction performance compared with SemRep (11) and notable previous best systems
227 | (12,13) on the BioCreative V Chemical-Disease Relation (14) corpus. (D) Comparison of information retrieval for PubTator 3.0, PubMed, and Google
228 | Scholar for entity pair queries, with respect to total article count and top-20 article precision.
229 | 
230 | W544
231 | 
232 | Case study II: retrieval-augmented generation
233 | In the era of large language models (LLMs), PubTator 3.0 can
234 | also enhance their factual accuracy via retrieval augmented
235 | generation. Despite their strong language ability, LLMs are
236 | prone to generating incorrect assertions, sometimes known
237 | as hallucinations (30,31). For example, when requested to
238 | cite sources for questions such as ‘which diseases can doxorubicin treat’, GPT-4 frequently provides seemingly plausible but nonexistent references. Augmenting GPT-4 with PubTator 3.0 APIs can anchor the model’s response to verifiable
239 | references via the extracted relations, significantly reducing
240 | hallucinations.
241 | We assessed the citation accuracy of responses from three
242 | GPT-4 variations: PubTator-augmented GPT-4, PubMedaugmented GPT-4 and standard GPT-4. We performed a qualitative evaluation based on eight questions selected as follows. We identified entities mentioned in the PubMed query
243 | logs and randomly selected from entities searched both frequently and rarely. We then identified the common queries for
244 | each entity that request relational information and adapted
245 | one into a natural language question. Each question is therefore grounded on common information needs of real PubMed
246 | users. For example, the questions ‘What can be caused by
247 | tocilizumab?’ and ‘What can be treated by doxorubicin?’
248 | are adapted from the user queries ‘tocilizumab side effects’
249 | and ‘doxorubicin treatment’ respectively. Such questions typically require extracting information from multiple articles
250 | and an understanding of biomedical entities and relationship descriptions. Supplementary Table S5 lists the questions
251 | chosen.
252 | We augmented the GPT-4 large language model (LLM) with
253 | PubTator 3.0 via the function calling mechanism of the OpenAI ChatCompletion API. This integration involved prompt-
254 | 
255 | ing GPT-4 with descriptions of three PubTator APIs: (i) find
256 | entity ID, which retrieves PubTator entity identifiers; (ii) find
257 | related entities, which identifies related entities based on an
258 | input entity and specified relations and (iii) export relevant
259 | search results, which returns PubMed article identifiers containing textual evidence for specific entity relationships. Our
260 | instructions prompted GPT-4 to decompose user questions
261 | into sub-questions addressable by these APIs, execute the
262 | function calls, and synthesize the responses into a coherent final answer. Our prompt promoted a summarized response by
263 | instructing GPT-4 to start its message with ‘Summary:’ and requested the response include citations to the articles providing
264 | evidence. The PubMed augmentation experiments provided
265 | GPT-4 with access to PubMed database search via the National Center for Biotechnology Information (NCBI) E-utils
266 | APIs (32). We used Azure OpenAI Services (version 2023-0701-preview) and GPT-4 (version 2023-06-13) and set the decoding temperature to zero to obtain deterministic outputs.
267 | The full prompts are provided in Supplementary Table S6.
268 | PubTator-augmented GPT-4 generally processed the questions in three steps: (i) finding the standard entity identifiers, (ii) finding its related entity identifiers and (iii) searching PubMed articles. For example, to answer ‘What drugs can
269 | treat breast cancer?’, GPT-4 first found the PubTator entity
270 | identifier for breast cancer (@DISEASE_Breast_Cancer) using
271 | the Find Entity ID API. It then used the Find Related Entities
272 | API to identify entities related to @DISEASE_Breast_Cancer
273 | through a ‘treat’ relation. For demonstration purposes, we
274 | limited the maximum number of output entities to five. Finally,
275 | GPT-4 called the Export Relevant Search Results API for the
276 | PubMed article identifiers containing evidence for these relationships. The raw responses to each prompt for each method
277 | are provided in Supplementary Table S6.
278 | We manually evaluated the accuracy of the citations in
279 | the responses by reviewing each PubMed article and verifying whether each PubMed article cited supported the
280 | stated relationship (e.g. Tamoxifen treating breast cancer).
281 | Supplementary Table S5 reports the proportion of the cited
282 | articles with valid supporting evidence for each method. GPT4 frequently generated fabricated citations, widely known
283 | as the hallucination issue. While PubMed-augmented GPT-4
284 | showed a higher proportion of accurate citations, some articles cited did not support the relation claims. This is likely
285 | because PubMed is based on keyword and Boolean search and
286 | does not support queries for specific relationships. Responses
287 | generated by PubTator-augmented GPT-4 demonstrated the
288 | highest level of citation accuracy, underscoring the potential of PubTator 3.0 as a high-quality knowledge source for
289 | addressing biomedical information needs through retrievalaugmented generation with LLMs such as GPT-4. In our experiment, using Azure for ChatGPT, the cost was approximately $1 for two questions with GPT-4-Turbo, or 40 questions when downgraded to GPT-3.5-Turbo, including the cost
290 | of input/output tokens.
291 | 
292 | Discussion
293 | Previous versions of PubTator have fulfilled over one billion
294 | API requests since 2015, supporting a wide range of research
295 | applications. Numerous studies have harnessed PubTator annotations for disease-specific gene research, including efforts
296 | to prioritize candidate genes (33), determine gene–phenotype
297 | associations (34), and identify the genetic underpinnings of
298 | 
299 | Downloaded from https://academic.oup.com/nar/article/52/W1/W540/7640526 by guest on 10 March 2025
300 | 
301 | Our analysis is summarized in Figure 2D, and
302 | Supplementary Table S4 presents a detailed comparison
303 | of the quality of retrieved results between PubTator 3.0,
304 | PubMed and Google Scholar. Our results demonstrate that
305 | PubTator 3.0 retrieves a greater number of articles than the
306 | comparison systems and its precision is higher for the top
307 | 20 results. For instance, PubTator 3.0 returned 346 articles
308 | for the query ‘GLPG0634 + ulcerative colitis’, and manual
309 | review of the top 20 articles showed that all contained
310 | statements about an association between GLPG0634 and
311 | ulcerative colitis. In contrast, PubMed only returned a total
312 | of 18 articles, with only 12 mentioning an association. Moreover, when searching for ‘COVID19 + PON1’, PubTator 3.0
313 | returns 212 articles in PubMed, surpassing the 43 articles
314 | obtained from Google Scholar, only 29 of which are sourced
315 | from PubMed. These disparities can be attributed to several
316 | factors: (i) PubTator 3.0’s search includes full texts available
317 | in PMC-OA, resulting in significantly broader coverage of
318 | articles, (ii) entity normalization improves recall, for example,
319 | by matching ‘paraoxonase 1’ to ‘PON1’, (iii) PubTator 3.0
320 | prioritizes articles containing relations between the query
321 | entities, (iv) Pubtator 3.0 prioritizes articles where the entities
322 | appear nearby, rather than distant paragraphs. Across the 12
323 | information retrieval case studies, PubTator 3.0 demonstrated
324 | an overall precision of 90.0% for the top 20 articles (216 out
325 | of 240), which is significantly higher than PubMed’s precision
326 | of 81.6% (84 out of 103) and Google Scholar’s precision of
327 | 48.5% (98 out of 202).
328 | 
329 | Nucleic Acids Research, 2024, Vol. 52, Web Server issue
330 | 
331 | W545
332 | 
333 | Nucleic Acids Research, 2024, Vol. 52, Web Server issue
334 | 
335 | Conclusion
336 | PubTator 3.0 offers a comprehensive set of features and tools
337 | that allow researchers to navigate the ever-expanding wealth
338 | of biomedical literature, expediting research and unlocking
339 | valuable insights for scientific discovery. The PubTator 3.0 interface, API, and bulk file downloads are available at https:
340 | //www.ncbi.nlm.nih.gov/research/pubtator3/.
341 | 
342 | Data availability
343 | Data is available through the online interface at https://
344 | www.ncbi.nlm.nih.gov/research/pubtator3/, through the API
345 | at https://www.ncbi.nlm.nih.gov/research/pubtator3/api or
346 | bulk FTP download at https://ftp.ncbi.nlm.nih.gov/pub/lu/
347 | PubTator3/.
348 | The source code for each component of PubTator 3.0
349 | is openly accessible. The AIONER named entity recognizer
350 | is available at https://github.com/ncbi/AIONER. GNorm2,
351 | for gene name normalization, is available at https://github.
352 | com/ncbi/GNorm2. The tmVar3 variant name normalizer
353 | is available at https://github.com/ncbi/tmVar3. The NLMChem Tagger, for chemical name normalization, is available
354 | at https://ftp.ncbi.nlm.nih.gov/pub/lu/NLMChem. The TaggerOne system, for disease and cell line normalization, is available at https://www.ncbi.nlm.nih.gov/research/bionlp/Tools/
355 | taggerone. The BioREx relation extraction system is available
356 | at https://github.com/ncbi/BioREx. The code for customizing
357 | ChatGPT with the PubTator 3.0 API is available at https:
358 | //github.com/ncbi-nlp/pubtator-gpt. The details of the applications, performance, evaluation data, and citations for each
359 | tool are shown in Supplementary Table S7. All source code is
360 | also available at https://doi.org/10.5281/zenodo.10839630.
361 | 
362 | Supplementary data
363 | Supplementary Data are available at NAR Online.
364 | 
365 | Funding
366 | Intramural Research Program of the National Library of
367 | Medicine (NLM), National Institutes of Health; ODSS Support of the Exploration of Cloud in NIH Intramural Research.
368 | Funding for open access charge: Intramural Research Program
369 | of the National Library of Medicine, National Institutes of
370 | Health.
371 | 
372 | Conflict of interest statement
373 | None declared.
374 | 
```

--------------------------------------------------------------------------------
/src/biomcp/utils/endpoint_registry.py:
--------------------------------------------------------------------------------

```python
  1 | """Registry for tracking all external HTTP endpoints used by BioMCP."""
  2 | 
  3 | from dataclasses import dataclass, field
  4 | from enum import Enum
  5 | from pathlib import Path
  6 | from typing import Any
  7 | from urllib.parse import urlparse
  8 | 
  9 | 
 10 | class EndpointCategory(str, Enum):
 11 |     """Categories of external endpoints."""
 12 | 
 13 |     BIOMEDICAL_LITERATURE = "biomedical_literature"
 14 |     CLINICAL_TRIALS = "clinical_trials"
 15 |     VARIANT_DATABASES = "variant_databases"
 16 |     CANCER_GENOMICS = "cancer_genomics"
 17 |     HEALTH_MONITORING = "health_monitoring"
 18 |     REGULATORY_DATA = "regulatory_data"
 19 | 
 20 | 
 21 | class DataType(str, Enum):
 22 |     """Types of data accessed from endpoints."""
 23 | 
 24 |     RESEARCH_ARTICLES = "research_articles"
 25 |     CLINICAL_TRIAL_DATA = "clinical_trial_data"
 26 |     GENETIC_VARIANTS = "genetic_variants"
 27 |     CANCER_MUTATIONS = "cancer_mutations"
 28 |     GENE_ANNOTATIONS = "gene_annotations"
 29 |     SERVICE_STATUS = "service_status"
 30 |     ADVERSE_EVENTS = "adverse_events"
 31 |     DRUG_LABELS = "drug_labels"
 32 |     DEVICE_EVENTS = "device_events"
 33 | 
 34 | 
 35 | @dataclass
 36 | class EndpointInfo:
 37 |     """Information about an external endpoint."""
 38 | 
 39 |     url: str
 40 |     category: EndpointCategory
 41 |     data_types: list[DataType] = field(default_factory=list)
 42 |     description: str = ""
 43 |     compliance_notes: str = ""
 44 |     rate_limit: str | None = None
 45 |     authentication: str | None = None
 46 | 
 47 |     @property
 48 |     def domain(self) -> str:
 49 |         """Extract domain from URL."""
 50 |         parsed = urlparse(self.url)
 51 |         return parsed.netloc
 52 | 
 53 | 
 54 | class EndpointRegistry:
 55 |     """Registry for tracking all external endpoints."""
 56 | 
 57 |     def __init__(self):
 58 |         self._endpoints: dict[str, EndpointInfo] = {}
 59 |         self._initialize_known_endpoints()
 60 | 
 61 |     def _initialize_known_endpoints(self):
 62 |         """Initialize registry with known endpoints."""
 63 |         # PubMed/PubTator3
 64 |         self.register(
 65 |             "pubtator3_search",
 66 |             EndpointInfo(
 67 |                 url="https://www.ncbi.nlm.nih.gov/research/pubtator3-api/search/",
 68 |                 category=EndpointCategory.BIOMEDICAL_LITERATURE,
 69 |                 data_types=[DataType.RESEARCH_ARTICLES],
 70 |                 description="PubTator3 API for searching biomedical literature with entity annotations",
 71 |                 compliance_notes="Public NIH/NCBI service, no PII transmitted",
 72 |                 rate_limit="20 requests/second",
 73 |             ),
 74 |         )
 75 | 
 76 |         self.register(
 77 |             "pubtator3_export",
 78 |             EndpointInfo(
 79 |                 url="https://www.ncbi.nlm.nih.gov/research/pubtator3-api/publications/export/biocjson",
 80 |                 category=EndpointCategory.BIOMEDICAL_LITERATURE,
 81 |                 data_types=[DataType.RESEARCH_ARTICLES],
 82 |                 description="PubTator3 API for fetching full article annotations in BioC-JSON format",
 83 |                 compliance_notes="Public NIH/NCBI service, no PII transmitted",
 84 |                 rate_limit="20 requests/second",
 85 |             ),
 86 |         )
 87 | 
 88 |         self.register(
 89 |             "pubtator3_autocomplete",
 90 |             EndpointInfo(
 91 |                 url="https://www.ncbi.nlm.nih.gov/research/pubtator3-api/entity/autocomplete/",
 92 |                 category=EndpointCategory.BIOMEDICAL_LITERATURE,
 93 |                 data_types=[DataType.GENE_ANNOTATIONS],
 94 |                 description="PubTator3 API for entity name autocomplete suggestions",
 95 |                 compliance_notes="Public NIH/NCBI service, no PII transmitted",
 96 |                 rate_limit="20 requests/second",
 97 |             ),
 98 |         )
 99 | 
100 |         # ClinicalTrials.gov
101 |         self.register(
102 |             "clinicaltrials_search",
103 |             EndpointInfo(
104 |                 url="https://clinicaltrials.gov/api/v2/studies",
105 |                 category=EndpointCategory.CLINICAL_TRIALS,
106 |                 data_types=[DataType.CLINICAL_TRIAL_DATA],
107 |                 description="ClinicalTrials.gov API v2 for searching clinical trials",
108 |                 compliance_notes="Public NIH service, may contain trial participant criteria",
109 |                 rate_limit="10 requests/second",
110 |             ),
111 |         )
112 | 
113 |         # MyVariant.info
114 |         self.register(
115 |             "myvariant_query",
116 |             EndpointInfo(
117 |                 url="https://myvariant.info/v1/query",
118 |                 category=EndpointCategory.VARIANT_DATABASES,
119 |                 data_types=[DataType.GENETIC_VARIANTS],
120 |                 description="MyVariant.info API for querying genetic variants",
121 |                 compliance_notes="Public service aggregating variant databases, no patient data",
122 |                 rate_limit="1000 requests/hour (anonymous)",
123 |             ),
124 |         )
125 | 
126 |         self.register(
127 |             "myvariant_variant",
128 |             EndpointInfo(
129 |                 url="https://myvariant.info/v1/variant",
130 |                 category=EndpointCategory.VARIANT_DATABASES,
131 |                 data_types=[DataType.GENETIC_VARIANTS],
132 |                 description="MyVariant.info API for fetching specific variant details",
133 |                 compliance_notes="Public service aggregating variant databases, no patient data",
134 |                 rate_limit="1000 requests/hour (anonymous)",
135 |             ),
136 |         )
137 | 
138 |         # Preprint servers
139 |         self.register(
140 |             "biorxiv_api",
141 |             EndpointInfo(
142 |                 url="https://api.biorxiv.org/details/biorxiv",
143 |                 category=EndpointCategory.BIOMEDICAL_LITERATURE,
144 |                 data_types=[DataType.RESEARCH_ARTICLES],
145 |                 description="bioRxiv API for searching biology preprints",
146 |                 compliance_notes="Public preprint server, no PII transmitted",
147 |                 rate_limit="Not specified",
148 |             ),
149 |         )
150 | 
151 |         self.register(
152 |             "medrxiv_api",
153 |             EndpointInfo(
154 |                 url="https://api.biorxiv.org/details/medrxiv",
155 |                 category=EndpointCategory.BIOMEDICAL_LITERATURE,
156 |                 data_types=[DataType.RESEARCH_ARTICLES],
157 |                 description="medRxiv API for searching medical preprints",
158 |                 compliance_notes="Public preprint server, no PII transmitted",
159 |                 rate_limit="Not specified",
160 |             ),
161 |         )
162 | 
163 |         self.register(
164 |             "europe_pmc",
165 |             EndpointInfo(
166 |                 url="https://www.ebi.ac.uk/europepmc/webservices/rest/search",
167 |                 category=EndpointCategory.BIOMEDICAL_LITERATURE,
168 |                 data_types=[DataType.RESEARCH_ARTICLES],
169 |                 description="Europe PMC REST API for searching biomedical literature",
170 |                 compliance_notes="Public EMBL-EBI service, no PII transmitted",
171 |                 rate_limit="Not specified",
172 |             ),
173 |         )
174 | 
175 |         # External variant sources
176 |         self.register(
177 |             "gdc_ssms",
178 |             EndpointInfo(
179 |                 url="https://api.gdc.cancer.gov/ssms",
180 |                 category=EndpointCategory.VARIANT_DATABASES,
181 |                 data_types=[DataType.CANCER_MUTATIONS],
182 |                 description="NCI GDC API for somatic mutations",
183 |                 compliance_notes="Public NCI service, aggregate cancer genomics data",
184 |                 rate_limit="Not specified",
185 |             ),
186 |         )
187 | 
188 |         self.register(
189 |             "gdc_ssm_occurrences",
190 |             EndpointInfo(
191 |                 url="https://api.gdc.cancer.gov/ssm_occurrences",
192 |                 category=EndpointCategory.VARIANT_DATABASES,
193 |                 data_types=[DataType.CANCER_MUTATIONS],
194 |                 description="NCI GDC API for mutation occurrences in cancer samples",
195 |                 compliance_notes="Public NCI service, aggregate cancer genomics data",
196 |                 rate_limit="Not specified",
197 |             ),
198 |         )
199 | 
200 |         self.register(
201 |             "ensembl_variation",
202 |             EndpointInfo(
203 |                 url="https://rest.ensembl.org/variation/human",
204 |                 category=EndpointCategory.VARIANT_DATABASES,
205 |                 data_types=[DataType.GENETIC_VARIANTS],
206 |                 description="Ensembl REST API for human genetic variation data",
207 |                 compliance_notes="Public EMBL-EBI service, population genetics data",
208 |                 rate_limit="15 requests/second",
209 |             ),
210 |         )
211 | 
212 |         self.register(
213 |             "cbioportal_api",
214 |             EndpointInfo(
215 |                 url="https://www.cbioportal.org/api",
216 |                 category=EndpointCategory.CANCER_GENOMICS,
217 |                 data_types=[
218 |                     DataType.CANCER_MUTATIONS,
219 |                     DataType.CLINICAL_TRIAL_DATA,
220 |                 ],
221 |                 description="cBioPortal API for cancer genomics data",
222 |                 compliance_notes="Public MSKCC/Dana-Farber service, aggregate cancer genomics",
223 |                 rate_limit="5 requests/second",
224 |                 authentication="Optional API token for increased rate limits",
225 |             ),
226 |         )
227 | 
228 |         # Specific cBioPortal endpoints
229 |         self.register(
230 |             "cbioportal_genes",
231 |             EndpointInfo(
232 |                 url="https://www.cbioportal.org/api/genes",
233 |                 category=EndpointCategory.CANCER_GENOMICS,
234 |                 data_types=[DataType.GENE_ANNOTATIONS],
235 |                 description="cBioPortal API for gene information",
236 |                 compliance_notes="Public MSKCC/Dana-Farber service, gene metadata",
237 |                 rate_limit="5 requests/second",
238 |             ),
239 |         )
240 | 
241 |         self.register(
242 |             "cbioportal_cancer_types",
243 |             EndpointInfo(
244 |                 url="https://www.cbioportal.org/api/cancer-types",
245 |                 category=EndpointCategory.CANCER_GENOMICS,
246 |                 data_types=[DataType.CANCER_MUTATIONS],
247 |                 description="cBioPortal API for cancer type hierarchy",
248 |                 compliance_notes="Public MSKCC/Dana-Farber service, cancer type metadata",
249 |                 rate_limit="5 requests/second",
250 |             ),
251 |         )
252 | 
253 |         self.register(
254 |             "cbioportal_molecular_profiles",
255 |             EndpointInfo(
256 |                 url="https://www.cbioportal.org/api/molecular-profiles",
257 |                 category=EndpointCategory.CANCER_GENOMICS,
258 |                 data_types=[DataType.CANCER_MUTATIONS],
259 |                 description="cBioPortal API for molecular profiles",
260 |                 compliance_notes="Public MSKCC/Dana-Farber service, study metadata",
261 |                 rate_limit="5 requests/second",
262 |             ),
263 |         )
264 | 
265 |         self.register(
266 |             "cbioportal_mutations",
267 |             EndpointInfo(
268 |                 url="https://www.cbioportal.org/api/mutations",
269 |                 category=EndpointCategory.CANCER_GENOMICS,
270 |                 data_types=[DataType.CANCER_MUTATIONS],
271 |                 description="cBioPortal API for mutation data",
272 |                 compliance_notes="Public MSKCC/Dana-Farber service, aggregate mutation data",
273 |                 rate_limit="5 requests/second",
274 |             ),
275 |         )
276 | 
277 |         self.register(
278 |             "cbioportal_studies",
279 |             EndpointInfo(
280 |                 url="https://www.cbioportal.org/api/studies",
281 |                 category=EndpointCategory.CANCER_GENOMICS,
282 |                 data_types=[
283 |                     DataType.CLINICAL_TRIAL_DATA,
284 |                     DataType.CANCER_MUTATIONS,
285 |                 ],
286 |                 description="cBioPortal API for cancer studies",
287 |                 compliance_notes="Public MSKCC/Dana-Farber service, study metadata",
288 |                 rate_limit="5 requests/second",
289 |             ),
290 |         )
291 | 
292 |         # BioThings Suite APIs
293 |         self.register(
294 |             "mygene_query",
295 |             EndpointInfo(
296 |                 url="https://mygene.info/v3/query",
297 |                 category=EndpointCategory.VARIANT_DATABASES,
298 |                 data_types=[DataType.GENE_ANNOTATIONS],
299 |                 description="MyGene.info API for querying gene information",
300 |                 compliance_notes="Public BioThings service, gene annotation data",
301 |                 rate_limit="10 requests/second",
302 |             ),
303 |         )
304 | 
305 |         self.register(
306 |             "mygene_gene",
307 |             EndpointInfo(
308 |                 url="https://mygene.info/v3/gene",
309 |                 category=EndpointCategory.VARIANT_DATABASES,
310 |                 data_types=[DataType.GENE_ANNOTATIONS],
311 |                 description="MyGene.info API for fetching specific gene details",
312 |                 compliance_notes="Public BioThings service, gene annotation data",
313 |                 rate_limit="10 requests/second",
314 |             ),
315 |         )
316 | 
317 |         self.register(
318 |             "mydisease_query",
319 |             EndpointInfo(
320 |                 url="https://mydisease.info/v1/query",
321 |                 category=EndpointCategory.VARIANT_DATABASES,
322 |                 data_types=[DataType.GENE_ANNOTATIONS],
323 |                 description="MyDisease.info API for querying disease information",
324 |                 compliance_notes="Public BioThings service, disease ontology data",
325 |                 rate_limit="10 requests/second",
326 |             ),
327 |         )
328 | 
329 |         self.register(
330 |             "mydisease_disease",
331 |             EndpointInfo(
332 |                 url="https://mydisease.info/v1/disease",
333 |                 category=EndpointCategory.VARIANT_DATABASES,
334 |                 data_types=[DataType.GENE_ANNOTATIONS],
335 |                 description="MyDisease.info API for fetching specific disease details",
336 |                 compliance_notes="Public BioThings service, disease ontology data",
337 |                 rate_limit="10 requests/second",
338 |             ),
339 |         )
340 | 
341 |         self.register(
342 |             "mychem_query",
343 |             EndpointInfo(
344 |                 url="https://mychem.info/v1/query",
345 |                 category=EndpointCategory.VARIANT_DATABASES,
346 |                 data_types=[DataType.GENE_ANNOTATIONS],
347 |                 description="MyChem.info API for querying drug/chemical information",
348 |                 compliance_notes="Public BioThings service, drug/chemical annotation data",
349 |                 rate_limit="10 requests/second",
350 |             ),
351 |         )
352 | 
353 |         self.register(
354 |             "mychem_chem",
355 |             EndpointInfo(
356 |                 url="https://mychem.info/v1/chem",
357 |                 category=EndpointCategory.VARIANT_DATABASES,
358 |                 data_types=[DataType.GENE_ANNOTATIONS],
359 |                 description="MyChem.info API for fetching specific drug/chemical details",
360 |                 compliance_notes="Public BioThings service, drug/chemical annotation data",
361 |                 rate_limit="10 requests/second",
362 |             ),
363 |         )
364 | 
365 |         # NCI Clinical Trials Search API
366 |         self.register(
367 |             "nci_trials",
368 |             EndpointInfo(
369 |                 url="https://clinicaltrialsapi.cancer.gov/api/v2/trials",
370 |                 category=EndpointCategory.CLINICAL_TRIALS,
371 |                 data_types=[DataType.CLINICAL_TRIAL_DATA],
372 |                 description="NCI Clinical Trials Search API for cancer trials",
373 |                 compliance_notes="Public NCI service, cancer trial data",
374 |                 rate_limit="Not specified",
375 |                 authentication="Optional NCI_API_KEY for increased access",
376 |             ),
377 |         )
378 | 
379 |         self.register(
380 |             "nci_organizations",
381 |             EndpointInfo(
382 |                 url="https://clinicaltrialsapi.cancer.gov/api/v2/organizations",
383 |                 category=EndpointCategory.CLINICAL_TRIALS,
384 |                 data_types=[DataType.CLINICAL_TRIAL_DATA],
385 |                 description="NCI API for cancer research organizations",
386 |                 compliance_notes="Public NCI service, organization metadata",
387 |                 rate_limit="Not specified",
388 |                 authentication="Optional NCI_API_KEY for increased access",
389 |             ),
390 |         )
391 | 
392 |         self.register(
393 |             "nci_diseases",
394 |             EndpointInfo(
395 |                 url="https://clinicaltrialsapi.cancer.gov/api/v2/diseases",
396 |                 category=EndpointCategory.CLINICAL_TRIALS,
397 |                 data_types=[DataType.CLINICAL_TRIAL_DATA],
398 |                 description="NCI API for cancer disease vocabulary",
399 |                 compliance_notes="Public NCI service, disease ontology",
400 |                 rate_limit="Not specified",
401 |                 authentication="Optional NCI_API_KEY for increased access",
402 |             ),
403 |         )
404 | 
405 |         self.register(
406 |             "nci_interventions",
407 |             EndpointInfo(
408 |                 url="https://clinicaltrialsapi.cancer.gov/api/v2/interventions",
409 |                 category=EndpointCategory.CLINICAL_TRIALS,
410 |                 data_types=[DataType.CLINICAL_TRIAL_DATA],
411 |                 description="NCI API for cancer treatment interventions",
412 |                 compliance_notes="Public NCI service, intervention metadata",
413 |                 rate_limit="Not specified",
414 |                 authentication="Optional NCI_API_KEY for increased access",
415 |             ),
416 |         )
417 | 
418 |         self.register(
419 |             "nci_biomarkers",
420 |             EndpointInfo(
421 |                 url="https://clinicaltrialsapi.cancer.gov/api/v2/biomarkers",
422 |                 category=EndpointCategory.CLINICAL_TRIALS,
423 |                 data_types=[DataType.CLINICAL_TRIAL_DATA],
424 |                 description="NCI API for biomarkers used in clinical trials",
425 |                 compliance_notes="Public NCI service, biomarker metadata",
426 |                 rate_limit="Not specified",
427 |                 authentication="Optional NCI_API_KEY for increased access",
428 |             ),
429 |         )
430 | 
431 |         # OpenFDA APIs
432 |         self.register(
433 |             "openfda_drug_events",
434 |             EndpointInfo(
435 |                 url="https://api.fda.gov/drug/event.json",
436 |                 category=EndpointCategory.REGULATORY_DATA,
437 |                 data_types=[DataType.ADVERSE_EVENTS],
438 |                 description="FDA Adverse Event Reporting System (FAERS) for drug safety data",
439 |                 compliance_notes="Public FDA service, voluntary adverse event reports, no PII",
440 |                 rate_limit="40 requests/minute (240 with API key)",
441 |                 authentication="Optional OPENFDA_API_KEY for increased rate limits",
442 |             ),
443 |         )
444 | 
445 |         self.register(
446 |             "openfda_drug_labels",
447 |             EndpointInfo(
448 |                 url="https://api.fda.gov/drug/label.json",
449 |                 category=EndpointCategory.REGULATORY_DATA,
450 |                 data_types=[DataType.DRUG_LABELS],
451 |                 description="FDA Structured Product Labeling (SPL) for drug prescribing information",
452 |                 compliance_notes="Public FDA service, official drug labeling data",
453 |                 rate_limit="40 requests/minute (240 with API key)",
454 |                 authentication="Optional OPENFDA_API_KEY for increased rate limits",
455 |             ),
456 |         )
457 | 
458 |         self.register(
459 |             "openfda_device_events",
460 |             EndpointInfo(
461 |                 url="https://api.fda.gov/device/event.json",
462 |                 category=EndpointCategory.REGULATORY_DATA,
463 |                 data_types=[DataType.DEVICE_EVENTS],
464 |                 description="FDA MAUDE database for medical device adverse events",
465 |                 compliance_notes="Public FDA service, device malfunction and adverse event reports",
466 |                 rate_limit="40 requests/minute (240 with API key)",
467 |                 authentication="Optional OPENFDA_API_KEY for increased rate limits",
468 |             ),
469 |         )
470 | 
471 |         self.register(
472 |             "openfda_drugsfda",
473 |             EndpointInfo(
474 |                 url="https://api.fda.gov/drug/drugsfda.json",
475 |                 category=EndpointCategory.REGULATORY_DATA,
476 |                 data_types=[DataType.DRUG_LABELS],
477 |                 description="FDA Drugs@FDA database for drug approval information",
478 |                 compliance_notes="Public FDA service, official drug approval records",
479 |                 rate_limit="40 requests/minute (240 with API key)",
480 |                 authentication="Optional OPENFDA_API_KEY for increased rate limits",
481 |             ),
482 |         )
483 | 
484 |         self.register(
485 |             "openfda_drug_enforcement",
486 |             EndpointInfo(
487 |                 url="https://api.fda.gov/drug/enforcement.json",
488 |                 category=EndpointCategory.REGULATORY_DATA,
489 |                 data_types=[DataType.ADVERSE_EVENTS],
490 |                 description="FDA Enforcement database for drug recall information",
491 |                 compliance_notes="Public FDA service, drug recall and enforcement actions",
492 |                 rate_limit="40 requests/minute (240 with API key)",
493 |                 authentication="Optional OPENFDA_API_KEY for increased rate limits",
494 |             ),
495 |         )
496 | 
497 |         # Note: Drug shortage endpoint is not yet available via OpenFDA
498 |         # Using placeholder for future migration when FDA provides official endpoint
499 |         self.register(
500 |             "fda_drug_shortages",
501 |             EndpointInfo(
502 |                 url="https://www.fda.gov/media/169066/download",
503 |                 category=EndpointCategory.REGULATORY_DATA,
504 |                 data_types=[DataType.DRUG_LABELS],
505 |                 description="FDA Drug Shortages database (cached locally)",
506 |                 compliance_notes="Public FDA service, drug shortage status information",
507 |                 rate_limit="Cached with 24-hour TTL",
508 |                 authentication="None required",
509 |             ),
510 |         )
511 | 
512 |     def register(self, key: str, endpoint: EndpointInfo):
513 |         """Register an endpoint for tracking.
514 | 
515 |         Args:
516 |             key: Unique identifier for the endpoint
517 |             endpoint: Endpoint metadata including URL, description, and compliance notes
518 |         """
519 |         self._endpoints[key] = endpoint
520 | 
521 |     def get_all_endpoints(self) -> dict[str, EndpointInfo]:
522 |         """Get all registered endpoints.
523 | 
524 |         Returns:
525 |             Dictionary mapping endpoint keys to their metadata
526 |         """
527 |         return self._endpoints.copy()
528 | 
529 |     def get_endpoints_by_category(
530 |         self, category: EndpointCategory
531 |     ) -> dict[str, EndpointInfo]:
532 |         """Get endpoints filtered by category.
533 | 
534 |         Args:
535 |             category: The category to filter by
536 | 
537 |         Returns:
538 |             Dictionary of endpoints belonging to the specified category
539 |         """
540 |         return {
541 |             key: info
542 |             for key, info in self._endpoints.items()
543 |             if info.category == category
544 |         }
545 | 
546 |     def get_unique_domains(self) -> set[str]:
547 |         """Get all unique domains accessed by BioMCP.
548 | 
549 |         Returns:
550 |             Set of unique domain names (e.g., 'api.ncbi.nlm.nih.gov')
551 |         """
552 |         return {info.domain for info in self._endpoints.values()}
553 | 
554 |     def generate_markdown_report(self) -> str:
555 |         """Generate markdown documentation of all endpoints."""
556 |         lines = [
557 |             "# Third-Party Endpoints Used by BioMCP",
558 |             "",
559 |             "_This file is auto-generated from the endpoint registry._",
560 |             "",
561 |             "## Overview",
562 |             "",
563 |             f"BioMCP connects to {len(self.get_unique_domains())} external domains across {len(self._endpoints)} endpoints.",
564 |             "",
565 |             "## Endpoints by Category",
566 |             "",
567 |         ]
568 | 
569 |         # Group by category
570 |         for category in EndpointCategory:
571 |             endpoints = self.get_endpoints_by_category(category)
572 |             if not endpoints:
573 |                 continue
574 | 
575 |             lines.append(f"### {category.value.replace('_', ' ').title()}")
576 |             lines.append("")
577 | 
578 |             for key, info in sorted(endpoints.items()):
579 |                 lines.append(f"#### {key}")
580 |                 lines.append("")
581 |                 lines.append(f"- **URL**: `{info.url}`")
582 |                 lines.append(f"- **Description**: {info.description}")
583 |                 lines.append(
584 |                     f"- **Data Types**: {', '.join(dt.value for dt in info.data_types)}"
585 |                 )
586 |                 lines.append(
587 |                     f"- **Rate Limit**: {info.rate_limit or 'Not specified'}"
588 |                 )
589 | 
590 |                 if info.authentication:
591 |                     lines.append(
592 |                         f"- **Authentication**: {info.authentication}"
593 |                     )
594 | 
595 |                 if info.compliance_notes:
596 |                     lines.append(
597 |                         f"- **Compliance Notes**: {info.compliance_notes}"
598 |                     )
599 | 
600 |                 lines.append("")
601 | 
602 |         # Add summary section
603 |         lines.extend([
604 |             "## Domain Summary",
605 |             "",
606 |             "| Domain               | Category              | Endpoints |",
607 |             "| -------------------- | --------------------- | --------- |",
608 |         ])
609 | 
610 |         domain_stats: dict[str, dict[str, Any]] = {}
611 |         for info in self._endpoints.values():
612 |             domain = info.domain
613 |             if domain not in domain_stats:
614 |                 domain_stats[domain] = {
615 |                     "category": info.category.value,
616 |                     "count": 0,
617 |                 }
618 |             domain_stats[domain]["count"] = (
619 |                 int(domain_stats[domain]["count"]) + 1
620 |             )
621 | 
622 |         for domain, stats in sorted(domain_stats.items()):
623 |             lines.append(
624 |                 f"| {domain} | {stats['category']} | {stats['count']} |"
625 |             )
626 | 
627 |         lines.extend([
628 |             "",
629 |             "## Compliance and Privacy",
630 |             "",
631 |             "All endpoints accessed by BioMCP:",
632 |             "",
633 |             "- Use publicly available APIs",
634 |             "- Do not transmit personally identifiable information (PII)",
635 |             "- Access only aggregate or de-identified data",
636 |             "- Comply with respective terms of service",
637 |             "",
638 |             "## Network Control",
639 |             "",
640 |             "For air-gapped or restricted environments, BioMCP supports:",
641 |             "",
642 |             "- Offline mode via `BIOMCP_OFFLINE=true` environment variable",
643 |             "- Custom proxy configuration via standard HTTP(S)\\_PROXY variables",
644 |             "- SSL certificate pinning for enhanced security",
645 |             "",
646 |         ])
647 | 
648 |         return "\n".join(lines)
649 | 
650 |     def save_markdown_report(self, output_path: Path | None = None):
651 |         """Save markdown report to file."""
652 |         if output_path is None:
653 |             output_path = (
654 |                 Path(__file__).parent.parent.parent
655 |                 / "THIRD_PARTY_ENDPOINTS.md"
656 |             )
657 | 
658 |         output_path.write_text(self.generate_markdown_report())
659 |         return output_path
660 | 
661 | 
662 | # Global registry instance
663 | _registry = EndpointRegistry()
664 | 
665 | 
666 | def get_registry() -> EndpointRegistry:
667 |     """Get the global endpoint registry."""
668 |     return _registry
669 | 
```

--------------------------------------------------------------------------------
/tests/data/ct_gov/clinical_trials_api_v2.yaml:
--------------------------------------------------------------------------------

```yaml
  1 | openapi: "3.0.3"
  2 | info:
  3 |   title: "ClinicalTrials.gov REST API"
  4 |   description:
  5 |     "This API is made available to provide users meta data, statistics,\
  6 |     \ and the most recent version of the clinical trials available on ClinicalTrials.gov."
  7 |   version: "2.0.3"
  8 | tags:
  9 |   - name: "Studies"
 10 |     description: "Related to clinical trial studies"
 11 |   - name: "Stats"
 12 |     description: "Data statistics"
 13 |   - name: "Version"
 14 |     description: "Version info"
 15 | servers:
 16 |   - url: "https://clinicaltrials.gov/api/v2"
 17 |     description: "This server"
 18 | paths:
 19 |   /studies:
 20 |     get:
 21 |       summary: "Studies"
 22 |       description:
 23 |         "Returns data of studies matching query and filter parameters.\
 24 |         \ The studies are returned page by page.\nIf response contains `nextPageToken`,\
 25 |         \ use its value in `pageToken` to get next page.\nThe last page will not contain\
 26 |         \ `nextPageToken`. A page may have empty `studies` array.\nRequest for each\
 27 |         \ subsequent page **must** have the same parameters as for the first page,\
 28 |         \ except\n`countTotal`, `pageSize`, and `pageToken` parameters.\n\nIf neither\
 29 |         \ queries nor filters are set, all studies will be returned.\nIf any query\
 30 |         \ parameter contains only NCT IDs (comma- and/or space-separated), filters\
 31 |         \ are ignored.\n\n`query.*` parameters are in [Essie expression syntax](/find-studies/constructing-complex-search-queries).\n\
 32 |         Those parameters affect ranking of studies, if sorted by relevance. See `sort`\
 33 |         \ parameter for details.\n\n`filter.*` and `postFilter.*` parameters have\
 34 |         \ same effect as there is no aggregation calculation. \nBoth are available\
 35 |         \ just to simplify applying parameters from search request.\nBoth do not affect\
 36 |         \ ranking of studies.\n\nNote: When trying JSON format in your browser, do\
 37 |         \ not set too large `pageSize` parameter, if `fields` is\nunlimited. That\
 38 |         \ may return too much data for the browser to parse and render."
 39 |       tags:
 40 |         - "Studies"
 41 |       operationId: "listStudies"
 42 |       parameters:
 43 |         - name: "format"
 44 |           in: "query"
 45 |           description:
 46 |             "Must be one of the following:\n* `csv`- return CSV table with\
 47 |             \ one page of study data; first page will contain header with column names;\
 48 |             \ available fields are listed on [CSV Download](/data-api/about-api/csv-download)\
 49 |             \ page\n* `json`- return JSON with one page of study data; every study object\
 50 |             \ is placed in a separate line; `markup` type fields format depends on `markupFormat`\
 51 |             \ parameter"
 52 |           required: false
 53 |           schema:
 54 |             type: "string"
 55 |             enum:
 56 |               - "csv"
 57 |               - "json"
 58 |             default: "json"
 59 |         - name: "markupFormat"
 60 |           in: "query"
 61 |           description:
 62 |             "Format of `markup` type fields:\n* `markdown`- [markdown](https://spec.commonmark.org/0.28/)\
 63 |             \ format\n* `legacy`- compatible with classic PRS\n\nApplicable only to\
 64 |             \ `json` format."
 65 |           required: false
 66 |           schema:
 67 |             type: "string"
 68 |             enum:
 69 |               - "markdown"
 70 |               - "legacy"
 71 |             default: "markdown"
 72 |         - name: "query.cond"
 73 |           in: "query"
 74 |           description:
 75 |             "\"Conditions or disease\" query in [Essie expression syntax](/find-studies/constructing-complex-search-queries).\
 76 |             \ See \"ConditionSearch Area\" on [Search Areas](/data-api/about-api/search-areas#ConditionSearch)\
 77 |             \ for more details."
 78 |           required: false
 79 |           schema:
 80 |             type: "string"
 81 |           examples:
 82 |             example1:
 83 |               value: "lung cancer"
 84 |             example2:
 85 |               value: "(head OR neck) AND pain"
 86 |         - name: "query.term"
 87 |           in: "query"
 88 |           description:
 89 |             "\"Other terms\" query in [Essie expression syntax](/find-studies/constructing-complex-search-queries).\
 90 |             \ See \"BasicSearch Area\" on [Search Areas](/data-api/about-api/search-areas#BasicSearch)\
 91 |             \ for more details."
 92 |           required: false
 93 |           schema:
 94 |             type: "string"
 95 |           examples:
 96 |             example1:
 97 |               value: "AREA[LastUpdatePostDate]RANGE[2023-01-15,MAX]"
 98 |         - name: "query.locn"
 99 |           in: "query"
100 |           description:
101 |             "\"Location terms\" query in [Essie expression syntax](/find-studies/constructing-complex-search-queries).\
102 |             \ See \"LocationSearch Area\" on [Search Areas](/data-api/about-api/search-areas#LocationSearch)\
103 |             \ for more details."
104 |           required: false
105 |           schema:
106 |             type: "string"
107 |         - name: "query.titles"
108 |           in: "query"
109 |           description:
110 |             "\"Title / acronym\" query in [Essie expression syntax](/find-studies/constructing-complex-search-queries).\
111 |             \ See \"TitleSearch Area\" on [Search Areas](/data-api/about-api/search-areas#TitleSearch)\
112 |             \ for more details."
113 |           required: false
114 |           schema:
115 |             type: "string"
116 |         - name: "query.intr"
117 |           in: "query"
118 |           description:
119 |             "\"Intervention / treatment\" query in [Essie expression syntax](/find-studies/constructing-complex-search-queries).\
120 |             \ See \"InterventionSearch Area\" on [Search Areas](/data-api/about-api/search-areas#InterventionSearch)\
121 |             \ for more details."
122 |           required: false
123 |           schema:
124 |             type: "string"
125 |         - name: "query.outc"
126 |           in: "query"
127 |           description:
128 |             "\"Outcome measure\" query in [Essie expression syntax](/find-studies/constructing-complex-search-queries).\
129 |             \ See \"OutcomeSearch Area\" on [Search Areas](/data-api/about-api/search-areas#OutcomeSearch)\
130 |             \ for more details."
131 |           required: false
132 |           schema:
133 |             type: "string"
134 |         - name: "query.spons"
135 |           in: "query"
136 |           description:
137 |             "\"Sponsor / collaborator\" query in [Essie expression syntax](/find-studies/constructing-complex-search-queries).\
138 |             \ See \"SponsorSearch Area\" on [Search Areas](/data-api/about-api/search-areas#SponsorSearch)\
139 |             \ for more details."
140 |           required: false
141 |           schema:
142 |             type: "string"
143 |         - name: "query.lead"
144 |           in: "query"
145 |           description:
146 |             "Searches in \"LeadSponsorName\" field. See [Study Data Structure](/data-api/about-api/study-data-structure#LeadSponsorName)\
147 |             \ for more details. The query is in [Essie expression syntax](/find-studies/constructing-complex-search-queries)."
148 |           required: false
149 |           schema:
150 |             type: "string"
151 |         - name: "query.id"
152 |           in: "query"
153 |           description:
154 |             "\"Study IDs\" query in [Essie expression syntax](/find-studies/constructing-complex-search-queries).\
155 |             \ See \"IdSearch Area\" on [Search Areas](/data-api/about-api/search-areas#IdSearch)\
156 |             \ for more details."
157 |           required: false
158 |           schema:
159 |             type: "string"
160 |         - name: "query.patient"
161 |           in: "query"
162 |           description:
163 |             "See \"PatientSearch Area\" on [Search Areas](/data-api/about-api/search-areas#PatientSearch)\
164 |             \ for more details."
165 |           required: false
166 |           schema:
167 |             type: "string"
168 |         - name: "filter.overallStatus"
169 |           in: "query"
170 |           style: "pipeDelimited"
171 |           explode: false
172 |           description: "Filter by comma- or pipe-separated list of statuses"
173 |           required: false
174 |           schema:
175 |             type: "array"
176 |             items:
177 |               $ref: "#/components/schemas/Status"
178 |           examples:
179 |             example1:
180 |               value:
181 |                 - "NOT_YET_RECRUITING"
182 |                 - "RECRUITING"
183 |             example2:
184 |               value:
185 |                 - "COMPLETED"
186 |         - name: "filter.geo"
187 |           in: "query"
188 |           description:
189 |             "Filter by geo-function. Currently only distance function is\
190 |             \ supported.\nFormat: `distance(latitude,longitude,distance)`"
191 |           required: false
192 |           schema:
193 |             type: "string"
194 |             pattern:
195 |               "^distance\\(-?\\d+(\\.\\d+)?,-?\\d+(\\.\\d+)?,\\d+(\\.\\d+)?(km|mi)?\\\
196 |               )$"
197 |           examples:
198 |             example1:
199 |               value: "distance(39.0035707,-77.1013313,50mi)"
200 |         - name: "filter.ids"
201 |           in: "query"
202 |           style: "pipeDelimited"
203 |           explode: false
204 |           description:
205 |             "Filter by comma- or pipe-separated list of NCT IDs (a.k.a. ClinicalTrials.gov\
206 |             \ identifiers).\nThe provided IDs will be searched in [NCTId](data-api/about-api/study-data-structure#NCTId)\
207 |             \ and\n[NCTIdAlias](data-api/about-api/study-data-structure#NCTIdAlias)\
208 |             \ fields."
209 |           required: false
210 |           schema:
211 |             type: "array"
212 |             items:
213 |               type: "string"
214 |               pattern: "^[Nn][Cc][Tt]0*[1-9]\\d{0,7}$"
215 |           examples:
216 |             example1:
217 |               value:
218 |                 - "NCT04852770"
219 |                 - "NCT01728545"
220 |                 - "NCT02109302"
221 |         - name: "filter.advanced"
222 |           in: "query"
223 |           description: "Filter by query in [Essie expression syntax](/find-studies/constructing-complex-search-queries)"
224 |           required: false
225 |           schema:
226 |             type: "string"
227 |           examples:
228 |             example1:
229 |               value: "AREA[StartDate]2022"
230 |             example2:
231 |               value:
232 |                 "AREA[MinimumAge]RANGE[MIN, 16 years] AND AREA[MaximumAge]RANGE[16\
233 |                 \ years, MAX]"
234 |         - name: "filter.synonyms"
235 |           in: "query"
236 |           style: "pipeDelimited"
237 |           explode: false
238 |           description:
239 |             "Filter by comma- or pipe-separated list of `area`:`synonym_id`\
240 |             \ pairs"
241 |           required: false
242 |           schema:
243 |             type: "array"
244 |             items:
245 |               type: "string"
246 |           examples:
247 |             example1:
248 |               value:
249 |                 - "ConditionSearch:1651367"
250 |                 - "BasicSearch:2013558"
251 |         - name: "postFilter.overallStatus"
252 |           in: "query"
253 |           style: "pipeDelimited"
254 |           explode: false
255 |           description: "Filter by comma- or pipe-separated list of statuses"
256 |           required: false
257 |           schema:
258 |             type: "array"
259 |             items:
260 |               $ref: "#/components/schemas/Status"
261 |           examples:
262 |             example1:
263 |               value:
264 |                 - "NOT_YET_RECRUITING"
265 |                 - "RECRUITING"
266 |             example2:
267 |               value:
268 |                 - "COMPLETED"
269 |         - name: "postFilter.geo"
270 |           in: "query"
271 |           description:
272 |             "Filter by geo-function. Currently only distance function is\
273 |             \ supported.\nFormat: `distance(latitude,longitude,distance)`"
274 |           required: false
275 |           schema:
276 |             type: "string"
277 |             pattern:
278 |               "^distance\\(-?\\d+(\\.\\d+)?,-?\\d+(\\.\\d+)?,\\d+(\\.\\d+)?(km|mi)?\\\
279 |               )$"
280 |           examples:
281 |             example1:
282 |               value: "distance(39.0035707,-77.1013313,50mi)"
283 |         - name: "postFilter.ids"
284 |           in: "query"
285 |           style: "pipeDelimited"
286 |           explode: false
287 |           description:
288 |             "Filter by comma- or pipe-separated list of NCT IDs (a.k.a. ClinicalTrials.gov\
289 |             \ identifiers).\nThe provided IDs will be searched in [NCTId](data-api/about-api/study-data-structure#NCTId)\
290 |             \ and\n[NCTIdAlias](data-api/about-api/study-data-structure#NCTIdAlias)\
291 |             \ fields."
292 |           required: false
293 |           schema:
294 |             type: "array"
295 |             items:
296 |               type: "string"
297 |               pattern: "^[Nn][Cc][Tt]0*[1-9]\\d{0,7}$"
298 |           examples:
299 |             example1:
300 |               value:
301 |                 - "NCT04852770"
302 |                 - "NCT01728545"
303 |                 - "NCT02109302"
304 |         - name: "postFilter.advanced"
305 |           in: "query"
306 |           description: "Filter by query in [Essie expression syntax](/find-studies/constructing-complex-search-queries)"
307 |           required: false
308 |           schema:
309 |             type: "string"
310 |           examples:
311 |             example1:
312 |               value: "AREA[StartDate]2022"
313 |             example2:
314 |               value:
315 |                 "AREA[MinimumAge]RANGE[MIN, 16 years] AND AREA[MaximumAge]RANGE[16\
316 |                 \ years, MAX]"
317 |         - name: "postFilter.synonyms"
318 |           in: "query"
319 |           style: "pipeDelimited"
320 |           explode: false
321 |           description:
322 |             "Filter by comma- or pipe-separated list of `area`:`synonym_id`\
323 |             \ pairs"
324 |           required: false
325 |           schema:
326 |             type: "array"
327 |             items:
328 |               type: "string"
329 |           examples:
330 |             example1:
331 |               value:
332 |                 - "ConditionSearch:1651367"
333 |                 - "BasicSearch:2013558"
334 |         - name: "aggFilters"
335 |           in: "query"
336 |           description:
337 |             "Apply aggregation filters, aggregation counts will not be provided.\n\
338 |             The value is comma- or pipe-separated list of pairs `filter_id`:`space-separated\
339 |             \ list of option keys` for the checked options."
340 |           required: false
341 |           schema:
342 |             type: "string"
343 |           examples:
344 |             example1:
345 |               value: "results:with,status:com"
346 |             example2:
347 |               value: "status:not rec,sex:f,healthy:y"
348 |         - name: "geoDecay"
349 |           in: "query"
350 |           description:
351 |             "Set proximity factor by distance from `filter.geo` location\
352 |             \ to the closest [LocationGeoPoint](/data-api/about-api/study-data-structure#LocationGeoPoint)\
353 |             \ of a study.\nIgnored, if `filter.geo` parameter is not set or response\
354 |             \ contains more than 10,000 studies."
355 |           required: false
356 |           schema:
357 |             type: "string"
358 |             pattern:
359 |               "^func:(gauss|exp|linear),scale:(\\d+(\\.\\d+)?(km|mi)),offset:(\\\
360 |               d+(\\.\\d+)?(km|mi)),decay:(\\d+(\\.\\d+)?)$"
361 |             default: "func:exp,scale:300mi,offset:0mi,decay:0.5"
362 |           examples:
363 |             example1:
364 |               value: "func:linear,scale:100km,offset:10km,decay:0.1"
365 |             example2:
366 |               value: "func:gauss,scale:500mi,offset:0mi,decay:0.3"
367 |         - name: "fields"
368 |           in: "query"
369 |           style: "pipeDelimited"
370 |           explode: false
371 |           description:
372 |             "If specified, must be non-empty comma- or pipe-separated list\
373 |             \ of fields to return. If unspecified, all fields will be returned.\nOrder\
374 |             \ of the fields does not matter.\n\nFor `csv` format, specify list of columns.\
375 |             \ The column names are available on [CSV Download](/data-api/about-api/csv-download).\n\
376 |             \nFor `json` format, every list item is either area name, piece name, field\
377 |             \ name, or special name.\nIf a piece or a field is a branch node, all descendant\
378 |             \ fields will be included.\nAll area names are available on [Search Areas](/data-api/about-api/search-areas),\n\
379 |             the piece and field names — on [Data Structure](/data-api/about-api/study-data-structure)\
380 |             \ and also can be retrieved at `/studies/metadata` endpoint.\nThere is a\
381 |             \ special name, `@query`, which expands to all fields queried by search."
382 |           required: false
383 |           schema:
384 |             type: "array"
385 |             minItems: 1
386 |             items:
387 |               type: "string"
388 |               pattern: "^([a-zA-Z][a-zA-Z0-9\\-. ]*)|(@query)$"
389 |           examples:
390 |             example1:
391 |               value:
392 |                 - "NCTId"
393 |                 - "BriefTitle"
394 |                 - "OverallStatus"
395 |                 - "HasResults"
396 |             example2:
397 |               value: "ProtocolSection"
398 |         - name: "sort"
399 |           in: "query"
400 |           style: "pipeDelimited"
401 |           explode: false
402 |           description:
403 |             "Comma- or pipe-separated list of sorting options of the studies.\
404 |             \ The returning studies are not sorted by default for a performance reason.\n\
405 |             Every list item contains a field/piece name and an optional sort direction\
406 |             \ (`asc` for ascending or `desc` for descending)\nafter colon character.\n\
407 |             \nAll piece and field names can be found on [Data Structure](/data-api/about-api/study-data-structure)\
408 |             \ and also can be retrieved\nat `/studies/metadata` endpoint. Currently,\
409 |             \ only date and numeric fields are allowed for sorting.\nThere is a special\
410 |             \ \"field\" `@relevance` to sort by relevance to a search query.\n\nStudies\
411 |             \ missing sort field are always last. Default sort direction:\n* Date field\
412 |             \ - `desc`\n* Numeric field - `asc`\n* `@relevance` - `desc`"
413 |           required: false
414 |           schema:
415 |             type: "array"
416 |             maxItems: 2
417 |             default: []
418 |             items:
419 |               type: "string"
420 |               pattern: "^(([a-zA-Z][a-zA-Z0-9\\-. ]*)|(@relevance))(:(asc|desc))?$"
421 |           examples:
422 |             example1:
423 |               value:
424 |                 - "@relevance"
425 |             example2:
426 |               value:
427 |                 - "LastUpdatePostDate"
428 |             example3:
429 |               value:
430 |                 - "EnrollmentCount:desc"
431 |                 - "NumArmGroups"
432 |         - name: "countTotal"
433 |           in: "query"
434 |           description:
435 |             "Count total number of studies in all pages and return `totalCount`\
436 |             \ field with first page, if `true`.\nFor CSV, the result can be found in\
437 |             \ `x-total-count` response header.\nThe parameter is ignored for the subsequent\
438 |             \ pages."
439 |           required: false
440 |           schema:
441 |             type: "boolean"
442 |             default: false
443 |         - name: "pageSize"
444 |           in: "query"
445 |           description:
446 |             "Page size is maximum number of studies to return in response.\
447 |             \ It does not have to be the same for every page.\nIf not specified or set\
448 |             \ to 0, the default value will be used. It will be coerced down to  1,000,\
449 |             \ if greater than that."
450 |           required: false
451 |           schema:
452 |             type: "integer"
453 |             format: "int32"
454 |             minimum: 0
455 |             default: 10
456 |           examples:
457 |             example1:
458 |               value: 2
459 |             example2:
460 |               value: 100
461 |         - name: "pageToken"
462 |           in: "query"
463 |           description:
464 |             "Token to get next page. Set it to a `nextPageToken` value returned\
465 |             \ with the previous page in JSON format.\nFor CSV, it can be found in `x-next-page-token`\
466 |             \ response header.\nDo not specify it for first page."
467 |           required: false
468 |           schema:
469 |             type: "string"
470 |       responses:
471 |         "200":
472 |           description: "OK"
473 |           content:
474 |             application/json:
475 |               schema:
476 |                 $ref: "#/components/schemas/PagedStudies"
477 |               example:
478 |                 totalCount: 438897
479 |                 studies:
480 |                   - protocolSection:
481 |                       identificationModule:
482 |                         nctId: "NCT03540771"
483 |                         briefTitle:
484 |                           "Introducing Palliative Care (PC) Within the Treatment\
485 |                           \ of End Stage Liver Disease (ESLD)"
486 |                       statusModule:
487 |                         overallStatus: "RECRUITING"
488 |                     hasResults: false
489 |                   - protocolSection:
490 |                       identificationModule:
491 |                         nctId: "NCT03630471"
492 |                         briefTitle:
493 |                           "Effectiveness of a Problem-solving Intervention\
494 |                           \ for Common Adolescent Mental Health Problems in India"
495 |                       statusModule:
496 |                         overallStatus: "COMPLETED"
497 |                     hasResults: false
498 |                   - protocolSection:
499 |                       identificationModule:
500 |                         nctId: "NCT00587795"
501 |                         briefTitle:
502 |                           "Orthopedic Study of the Aircast StabilAir Wrist\
503 |                           \ Fracture Brace"
504 |                       statusModule:
505 |                         overallStatus: "TERMINATED"
506 |                     hasResults: true
507 |                 nextPageToken: "abracadabra"
508 |         "400":
509 |           description: "Bad Request"
510 |           content:
511 |             text/plain:
512 |               schema:
513 |                 $ref: "#/components/schemas/errorMessage"
514 |   /studies/{nctId}:
515 |     get:
516 |       summary: "Single Study"
517 |       description: "Returns data of a single study."
518 |       tags:
519 |         - "Studies"
520 |       operationId: "fetchStudy"
521 |       parameters:
522 |         - name: "nctId"
523 |           in: "path"
524 |           description:
525 |             "NCT Number of a study. If found in [NCTIdAlias](data-api/about-api/study-data-structure#NCTIdAlias)\
526 |             \ field,\n301 HTTP redirect to the actual study will be returned."
527 |           required: true
528 |           schema:
529 |             type: "string"
530 |             pattern: "^[Nn][Cc][Tt]0*[1-9]\\d{0,7}$"
531 |           examples:
532 |             example1:
533 |               value: "NCT00841061"
534 |             example2:
535 |               value: "NCT04000165"
536 |         - name: "format"
537 |           in: "query"
538 |           description:
539 |             "Must be one of the following:\n* `csv`- return CSV table; available\
540 |             \ fields are listed on [CSV Download](/data-api/about-api/csv-download)\n\
541 |             * `json`- return JSON object; format of `markup` fields depends on `markupFormat`\
542 |             \ parameter\n* `json.zip`- put JSON object into a .json file and download\
543 |             \ it as zip archive; field values of type `markup` are in [markdown](https://spec.commonmark.org/0.28/)\
544 |             \ format\n* `fhir.json` - return FHIR JSON; fields are not customizable;\
545 |             \ see [Access Data in FHIR](/data-api/fhir)\n* `ris`- return RIS record;\
546 |             \ available tags are listed on [RIS Download](/data-api/about-api/ris-download)"
547 |           required: false
548 |           schema:
549 |             type: "string"
550 |             enum:
551 |               - "csv"
552 |               - "json"
553 |               - "json.zip"
554 |               - "fhir.json"
555 |               - "ris"
556 |             default: "json"
557 |         - name: "markupFormat"
558 |           in: "query"
559 |           description:
560 |             "Format of `markup` type fields:\n* `markdown`- [markdown](https://spec.commonmark.org/0.28/)\
561 |             \ format\n* `legacy`- compatible with classic PRS\n\nApplicable only to\
562 |             \ `json` format."
563 |           required: false
564 |           schema:
565 |             type: "string"
566 |             enum:
567 |               - "markdown"
568 |               - "legacy"
569 |             default: "markdown"
570 |         - name: "fields"
571 |           in: "query"
572 |           style: "pipeDelimited"
573 |           explode: false
574 |           description:
575 |             "If specified, must be non-empty comma- or pipe-separated list\
576 |             \ of fields to return. If unspecified, all fields will be returned.\nOrder\
577 |             \ of the fields does not matter.\n\nFor `csv` format, specify list of columns.\
578 |             \ The column names are available on [CSV Download](/data-api/about-api/csv-download).\n\
579 |             \nFor `json` and `json.zip` formats, every list item is either area name,\
580 |             \ piece name, or field name.\nIf a piece or a field is a branch node, all\
581 |             \ descendant fields will be included.\nAll area names are available on [Search\
582 |             \ Areas](/data-api/about-api/search-areas),\nthe piece and field names -\
583 |             \ on [Data Structure](/data-api/about-api/study-data-structure) and also\
584 |             \ can be retrieved at `/studies/metadata` endpoint.\n\nFor `fhir.json` format,\
585 |             \ all available fields are returned and this parameter must be unspecified.\n\
586 |             \nFor `ris` format, specify list of tags. The tag names are available on\
587 |             \ [RIS Download](/data-api/about-api/ris-download)."
588 |           required: false
589 |           schema:
590 |             type: "array"
591 |             minItems: 1
592 |             items:
593 |               type: "string"
594 |               pattern: "^[a-zA-Z][a-zA-Z0-9\\-. ]*$"
595 |           examples:
596 |             example1:
597 |               value:
598 |                 - "NCTId"
599 |                 - "BriefTitle"
600 |                 - "Reference"
601 |             example2:
602 |               value:
603 |                 - "ConditionsModule"
604 |                 - "EligibilityModule"
605 |       responses:
606 |         "200":
607 |           description: "OK"
608 |           content:
609 |             text/csv:
610 |               schema:
611 |                 $ref: "#/components/schemas/StudiesCsv"
612 |             application/json:
613 |               schema:
614 |                 $ref: "#/components/schemas/Study"
615 |             application/zip:
616 |               schema:
617 |                 $ref: "#/components/schemas/StudiesZip"
618 |             application/fhir+json:
619 |               schema:
620 |                 $ref: "#/components/schemas/StudyFhir"
621 |         "301":
622 |           description: "Moved Permanently"
623 |           content: {}
624 |         "400":
625 |           description: "Bad Request"
626 |           content:
627 |             text/plain:
628 |               schema:
629 |                 $ref: "#/components/schemas/errorMessage"
630 |         "404":
631 |           description: "Not Found"
632 |           content:
633 |             text/plain:
634 |               schema:
635 |                 $ref: "#/components/schemas/errorMessage"
636 |   /studies/metadata:
637 |     get:
638 |       summary: "Data Model Fields"
639 |       description: "Returns study data model fields."
640 |       tags:
641 |         - "Studies"
642 |       operationId: "studiesMetadata"
643 |       parameters:
644 |         - name: "includeIndexedOnly"
645 |           in: "query"
646 |           description: "Include indexed-only fields, if `true`"
647 |           required: false
648 |           schema:
649 |             type: "boolean"
650 |             default: false
651 |         - name: "includeHistoricOnly"
652 |           in: "query"
653 |           description: "Include fields available only in historic data, if `true`"
654 |           required: false
655 |           schema:
656 |             type: "boolean"
657 |             default: false
658 |       responses:
659 |         "200":
660 |           description: "OK"
661 |           content:
662 |             application/json:
663 |               schema:
664 |                 $ref: "#/components/schemas/FieldNodeList"
665 |         "400":
666 |           description: "Bad Request"
667 |           content:
668 |             text/plain:
669 |               schema:
670 |                 $ref: "#/components/schemas/errorMessage"
671 | 
```