This is page 14 of 19. Use http://codebase.md/genomoncology/biomcp?lines=true&page={x} to view the full context. # Directory Structure ``` ├── .github │ ├── actions │ │ └── setup-python-env │ │ └── action.yml │ ├── dependabot.yml │ └── workflows │ ├── ci.yml │ ├── deploy-docs.yml │ ├── main.yml.disabled │ ├── on-release-main.yml │ └── validate-codecov-config.yml ├── .gitignore ├── .pre-commit-config.yaml ├── BIOMCP_DATA_FLOW.md ├── CHANGELOG.md ├── CNAME ├── codecov.yaml ├── docker-compose.yml ├── Dockerfile ├── docs │ ├── apis │ │ ├── error-codes.md │ │ ├── overview.md │ │ └── python-sdk.md │ ├── assets │ │ ├── biomcp-cursor-locations.png │ │ ├── favicon.ico │ │ ├── icon.png │ │ ├── logo.png │ │ ├── mcp_architecture.txt │ │ └── remote-connection │ │ ├── 00_connectors.png │ │ ├── 01_add_custom_connector.png │ │ ├── 02_connector_enabled.png │ │ ├── 03_connect_to_biomcp.png │ │ ├── 04_select_google_oauth.png │ │ └── 05_success_connect.png │ ├── backend-services-reference │ │ ├── 01-overview.md │ │ ├── 02-biothings-suite.md │ │ ├── 03-cbioportal.md │ │ ├── 04-clinicaltrials-gov.md │ │ ├── 05-nci-cts-api.md │ │ ├── 06-pubtator3.md │ │ └── 07-alphagenome.md │ ├── blog │ │ ├── ai-assisted-clinical-trial-search-analysis.md │ │ ├── images │ │ │ ├── deep-researcher-video.png │ │ │ ├── researcher-announce.png │ │ │ ├── researcher-drop-down.png │ │ │ ├── researcher-prompt.png │ │ │ ├── trial-search-assistant.png │ │ │ └── what_is_biomcp_thumbnail.png │ │ └── researcher-persona-resource.md │ ├── changelog.md │ ├── CNAME │ ├── concepts │ │ ├── 01-what-is-biomcp.md │ │ ├── 02-the-deep-researcher-persona.md │ │ └── 03-sequential-thinking-with-the-think-tool.md │ ├── developer-guides │ │ ├── 01-server-deployment.md │ │ ├── 02-contributing-and-testing.md │ │ ├── 03-third-party-endpoints.md │ │ ├── 04-transport-protocol.md │ │ ├── 05-error-handling.md │ │ ├── 06-http-client-and-caching.md │ │ ├── 07-performance-optimizations.md │ │ └── generate_endpoints.py │ ├── faq-condensed.md │ 
├── FDA_SECURITY.md │ ├── genomoncology.md │ ├── getting-started │ │ ├── 01-quickstart-cli.md │ │ ├── 02-claude-desktop-integration.md │ │ └── 03-authentication-and-api-keys.md │ ├── how-to-guides │ │ ├── 01-find-articles-and-cbioportal-data.md │ │ ├── 02-find-trials-with-nci-and-biothings.md │ │ ├── 03-get-comprehensive-variant-annotations.md │ │ ├── 04-predict-variant-effects-with-alphagenome.md │ │ ├── 05-logging-and-monitoring-with-bigquery.md │ │ └── 06-search-nci-organizations-and-interventions.md │ ├── index.md │ ├── policies.md │ ├── reference │ │ ├── architecture-diagrams.md │ │ ├── quick-architecture.md │ │ ├── quick-reference.md │ │ └── visual-architecture.md │ ├── robots.txt │ ├── stylesheets │ │ ├── announcement.css │ │ └── extra.css │ ├── troubleshooting.md │ ├── tutorials │ │ ├── biothings-prompts.md │ │ ├── claude-code-biomcp-alphagenome.md │ │ ├── nci-prompts.md │ │ ├── openfda-integration.md │ │ ├── openfda-prompts.md │ │ ├── pydantic-ai-integration.md │ │ └── remote-connection.md │ ├── user-guides │ │ ├── 01-command-line-interface.md │ │ ├── 02-mcp-tools-reference.md │ │ └── 03-integrating-with-ides-and-clients.md │ └── workflows │ └── all-workflows.md ├── example_scripts │ ├── mcp_integration.py │ └── python_sdk.py ├── glama.json ├── LICENSE ├── lzyank.toml ├── Makefile ├── mkdocs.yml ├── package-lock.json ├── package.json ├── pyproject.toml ├── README.md ├── scripts │ ├── check_docs_in_mkdocs.py │ ├── check_http_imports.py │ └── generate_endpoints_doc.py ├── smithery.yaml ├── src │ └── biomcp │ ├── __init__.py │ ├── __main__.py │ ├── articles │ │ ├── __init__.py │ │ ├── autocomplete.py │ │ ├── fetch.py │ │ ├── preprints.py │ │ ├── search_optimized.py │ │ ├── search.py │ │ └── unified.py │ ├── biomarkers │ │ ├── __init__.py │ │ └── search.py │ ├── cbioportal_helper.py │ ├── circuit_breaker.py │ ├── cli │ │ ├── __init__.py │ │ ├── articles.py │ │ ├── biomarkers.py │ │ ├── diseases.py │ │ ├── health.py │ │ ├── interventions.py │ │ ├── main.py │ │ 
├── openfda.py │ │ ├── organizations.py │ │ ├── server.py │ │ ├── trials.py │ │ └── variants.py │ ├── connection_pool.py │ ├── constants.py │ ├── core.py │ ├── diseases │ │ ├── __init__.py │ │ ├── getter.py │ │ └── search.py │ ├── domain_handlers.py │ ├── drugs │ │ ├── __init__.py │ │ └── getter.py │ ├── exceptions.py │ ├── genes │ │ ├── __init__.py │ │ └── getter.py │ ├── http_client_simple.py │ ├── http_client.py │ ├── individual_tools.py │ ├── integrations │ │ ├── __init__.py │ │ ├── biothings_client.py │ │ └── cts_api.py │ ├── interventions │ │ ├── __init__.py │ │ ├── getter.py │ │ └── search.py │ ├── logging_filter.py │ ├── metrics_handler.py │ ├── metrics.py │ ├── openfda │ │ ├── __init__.py │ │ ├── adverse_events_helpers.py │ │ ├── adverse_events.py │ │ ├── cache.py │ │ ├── constants.py │ │ ├── device_events_helpers.py │ │ ├── device_events.py │ │ ├── drug_approvals.py │ │ ├── drug_labels_helpers.py │ │ ├── drug_labels.py │ │ ├── drug_recalls_helpers.py │ │ ├── drug_recalls.py │ │ ├── drug_shortages_detail_helpers.py │ │ ├── drug_shortages_helpers.py │ │ ├── drug_shortages.py │ │ ├── exceptions.py │ │ ├── input_validation.py │ │ ├── rate_limiter.py │ │ ├── utils.py │ │ └── validation.py │ ├── organizations │ │ ├── __init__.py │ │ ├── getter.py │ │ └── search.py │ ├── parameter_parser.py │ ├── prefetch.py │ ├── query_parser.py │ ├── query_router.py │ ├── rate_limiter.py │ ├── render.py │ ├── request_batcher.py │ ├── resources │ │ ├── __init__.py │ │ ├── getter.py │ │ ├── instructions.md │ │ └── researcher.md │ ├── retry.py │ ├── router_handlers.py │ ├── router.py │ ├── shared_context.py │ ├── thinking │ │ ├── __init__.py │ │ ├── sequential.py │ │ └── session.py │ ├── thinking_tool.py │ ├── thinking_tracker.py │ ├── trials │ │ ├── __init__.py │ │ ├── getter.py │ │ ├── nci_getter.py │ │ ├── nci_search.py │ │ └── search.py │ ├── utils │ │ ├── __init__.py │ │ ├── cancer_types_api.py │ │ ├── cbio_http_adapter.py │ │ ├── endpoint_registry.py │ │ ├── 
gene_validator.py │ │ ├── metrics.py │ │ ├── mutation_filter.py │ │ ├── query_utils.py │ │ ├── rate_limiter.py │ │ └── request_cache.py │ ├── variants │ │ ├── __init__.py │ │ ├── alphagenome.py │ │ ├── cancer_types.py │ │ ├── cbio_external_client.py │ │ ├── cbioportal_mutations.py │ │ ├── cbioportal_search_helpers.py │ │ ├── cbioportal_search.py │ │ ├── constants.py │ │ ├── external.py │ │ ├── filters.py │ │ ├── getter.py │ │ ├── links.py │ │ └── search.py │ └── workers │ ├── __init__.py │ ├── worker_entry_stytch.js │ ├── worker_entry.js │ └── worker.py ├── tests │ ├── bdd │ │ ├── cli_help │ │ │ ├── help.feature │ │ │ └── test_help.py │ │ ├── conftest.py │ │ ├── features │ │ │ └── alphagenome_integration.feature │ │ ├── fetch_articles │ │ │ ├── fetch.feature │ │ │ └── test_fetch.py │ │ ├── get_trials │ │ │ ├── get.feature │ │ │ └── test_get.py │ │ ├── get_variants │ │ │ ├── get.feature │ │ │ └── test_get.py │ │ ├── search_articles │ │ │ ├── autocomplete.feature │ │ │ ├── search.feature │ │ │ ├── test_autocomplete.py │ │ │ └── test_search.py │ │ ├── search_trials │ │ │ ├── search.feature │ │ │ └── test_search.py │ │ ├── search_variants │ │ │ ├── search.feature │ │ │ └── test_search.py │ │ └── steps │ │ └── test_alphagenome_steps.py │ ├── config │ │ └── test_smithery_config.py │ ├── conftest.py │ ├── data │ │ ├── ct_gov │ │ │ ├── clinical_trials_api_v2.yaml │ │ │ ├── trials_NCT04280705.json │ │ │ └── trials_NCT04280705.txt │ │ ├── myvariant │ │ │ ├── myvariant_api.yaml │ │ │ ├── myvariant_field_descriptions.csv │ │ │ ├── variants_full_braf_v600e.json │ │ │ ├── variants_full_braf_v600e.txt │ │ │ └── variants_part_braf_v600_multiple.json │ │ ├── openfda │ │ │ ├── drugsfda_detail.json │ │ │ ├── drugsfda_search.json │ │ │ ├── enforcement_detail.json │ │ │ └── enforcement_search.json │ │ └── pubtator │ │ ├── pubtator_autocomplete.json │ │ └── pubtator3_paper.txt │ ├── integration │ │ ├── test_openfda_integration.py │ │ ├── test_preprints_integration.py │ │ ├── 
test_simple.py │ │ └── test_variants_integration.py │ ├── tdd │ │ ├── articles │ │ │ ├── test_autocomplete.py │ │ │ ├── test_cbioportal_integration.py │ │ │ ├── test_fetch.py │ │ │ ├── test_preprints.py │ │ │ ├── test_search.py │ │ │ └── test_unified.py │ │ ├── conftest.py │ │ ├── drugs │ │ │ ├── __init__.py │ │ │ └── test_drug_getter.py │ │ ├── openfda │ │ │ ├── __init__.py │ │ │ ├── test_adverse_events.py │ │ │ ├── test_device_events.py │ │ │ ├── test_drug_approvals.py │ │ │ ├── test_drug_labels.py │ │ │ ├── test_drug_recalls.py │ │ │ ├── test_drug_shortages.py │ │ │ └── test_security.py │ │ ├── test_biothings_integration_real.py │ │ ├── test_biothings_integration.py │ │ ├── test_circuit_breaker.py │ │ ├── test_concurrent_requests.py │ │ ├── test_connection_pool.py │ │ ├── test_domain_handlers.py │ │ ├── test_drug_approvals.py │ │ ├── test_drug_recalls.py │ │ ├── test_drug_shortages.py │ │ ├── test_endpoint_documentation.py │ │ ├── test_error_scenarios.py │ │ ├── test_europe_pmc_fetch.py │ │ ├── test_mcp_integration.py │ │ ├── test_mcp_tools.py │ │ ├── test_metrics.py │ │ ├── test_nci_integration.py │ │ ├── test_nci_mcp_tools.py │ │ ├── test_network_policies.py │ │ ├── test_offline_mode.py │ │ ├── test_openfda_unified.py │ │ ├── test_pten_r173_search.py │ │ ├── test_render.py │ │ ├── test_request_batcher.py.disabled │ │ ├── test_retry.py │ │ ├── test_router.py │ │ ├── test_shared_context.py.disabled │ │ ├── test_unified_biothings.py │ │ ├── thinking │ │ │ ├── __init__.py │ │ │ └── test_sequential.py │ │ ├── trials │ │ │ ├── test_backward_compatibility.py │ │ │ ├── test_getter.py │ │ │ └── test_search.py │ │ ├── utils │ │ │ ├── test_gene_validator.py │ │ │ ├── test_mutation_filter.py │ │ │ ├── test_rate_limiter.py │ │ │ └── test_request_cache.py │ │ ├── variants │ │ │ ├── constants.py │ │ │ ├── test_alphagenome_api_key.py │ │ │ ├── test_alphagenome_comprehensive.py │ │ │ ├── test_alphagenome.py │ │ │ ├── test_cbioportal_mutations.py │ │ │ ├── 
test_cbioportal_search.py │ │ │ ├── test_external_integration.py │ │ │ ├── test_external.py │ │ │ ├── test_extract_gene_aa_change.py │ │ │ ├── test_filters.py │ │ │ ├── test_getter.py │ │ │ ├── test_links.py │ │ │ └── test_search.py │ │ └── workers │ │ └── test_worker_sanitization.js │ └── test_pydantic_ai_integration.py ├── THIRD_PARTY_ENDPOINTS.md ├── tox.ini ├── uv.lock └── wrangler.toml ``` # Files -------------------------------------------------------------------------------- /src/biomcp/variants/external.py: -------------------------------------------------------------------------------- ```python 1 | """External data sources for enhanced variant annotations.""" 2 | 3 | import asyncio 4 | import json 5 | import logging 6 | import re 7 | from typing import Any 8 | from urllib.parse import quote 9 | 10 | from pydantic import BaseModel, Field 11 | 12 | from .. import http_client 13 | 14 | # Import CBioPortalVariantData from the new module 15 | from .cbio_external_client import CBioPortalVariantData 16 | 17 | logger = logging.getLogger(__name__) 18 | 19 | # TCGA/GDC API endpoints 20 | GDC_BASE = "https://api.gdc.cancer.gov" 21 | GDC_SSMS_ENDPOINT = f"{GDC_BASE}/ssms" # Simple Somatic Mutations 22 | 23 | # 1000 Genomes API endpoints 24 | ENSEMBL_REST_BASE = "https://rest.ensembl.org" 25 | ENSEMBL_VARIATION_ENDPOINT = f"{ENSEMBL_REST_BASE}/variation/human" 26 | 27 | # Import constants 28 | 29 | 30 | class TCGAVariantData(BaseModel): 31 | """TCGA/GDC variant annotation data.""" 32 | 33 | cosmic_id: str | None = None 34 | tumor_types: list[str] = Field(default_factory=list) 35 | mutation_frequency: float | None = None 36 | mutation_count: int | None = None 37 | affected_cases: int | None = None 38 | consequence_type: str | None = None 39 | clinical_significance: str | None = None 40 | 41 | 42 | class ThousandGenomesData(BaseModel): 43 | """1000 Genomes variant annotation data.""" 44 | 45 | global_maf: float | None = Field( 46 | None, description="Global minor allele 
frequency" 47 | ) 48 | afr_maf: float | None = Field(None, description="African population MAF") 49 | amr_maf: float | None = Field(None, description="American population MAF") 50 | eas_maf: float | None = Field( 51 | None, description="East Asian population MAF" 52 | ) 53 | eur_maf: float | None = Field(None, description="European population MAF") 54 | sas_maf: float | None = Field( 55 | None, description="South Asian population MAF" 56 | ) 57 | ancestral_allele: str | None = None 58 | most_severe_consequence: str | None = None 59 | 60 | 61 | # CBioPortalVariantData is now imported from cbio_external_client.py 62 | 63 | 64 | class EnhancedVariantAnnotation(BaseModel): 65 | """Enhanced variant annotation combining multiple sources.""" 66 | 67 | variant_id: str 68 | tcga: TCGAVariantData | None = None 69 | thousand_genomes: ThousandGenomesData | None = None 70 | cbioportal: CBioPortalVariantData | None = None 71 | error_sources: list[str] = Field(default_factory=list) 72 | 73 | 74 | class TCGAClient: 75 | """Client for TCGA/GDC API.""" 76 | 77 | async def get_variant_data( 78 | self, variant_id: str 79 | ) -> TCGAVariantData | None: 80 | """Fetch variant data from TCGA/GDC. 
81 | 82 | Args: 83 | variant_id: Can be gene AA change (e.g., "BRAF V600E") or genomic coordinates 84 | """ 85 | try: 86 | # Determine the search field based on variant_id format 87 | # If it looks like "GENE AA_CHANGE" format, use gene_aa_change field 88 | if " " in variant_id and not variant_id.startswith("chr"): 89 | search_field = "gene_aa_change" 90 | search_value = variant_id 91 | else: 92 | # Otherwise try genomic_dna_change 93 | search_field = "genomic_dna_change" 94 | search_value = variant_id 95 | 96 | # First, search for the variant 97 | params = { 98 | "filters": json.dumps({ 99 | "op": "in", 100 | "content": { 101 | "field": search_field, 102 | "value": [search_value], 103 | }, 104 | }), 105 | "fields": "cosmic_id,genomic_dna_change,gene_aa_change,ssm_id", 106 | "format": "json", 107 | "size": "5", # Get a few in case of multiple matches 108 | } 109 | 110 | response, error = await http_client.request_api( 111 | url=GDC_SSMS_ENDPOINT, 112 | method="GET", 113 | request=params, 114 | domain="gdc", 115 | ) 116 | 117 | if error or not response: 118 | return None 119 | 120 | data = response.get("data", {}) 121 | hits = data.get("hits", []) 122 | 123 | if not hits: 124 | return None 125 | 126 | # Get the first hit 127 | hit = hits[0] 128 | ssm_id = hit.get("ssm_id") 129 | cosmic_id = hit.get("cosmic_id") 130 | 131 | # For gene_aa_change searches, verify we have the right variant 132 | if search_field == "gene_aa_change": 133 | gene_aa_changes = hit.get("gene_aa_change", []) 134 | if ( 135 | isinstance(gene_aa_changes, list) 136 | and search_value not in gene_aa_changes 137 | ): 138 | # This SSM has multiple AA changes, but not the one we're looking for 139 | return None 140 | 141 | if not ssm_id: 142 | return None 143 | 144 | # Now query SSM occurrences to get project information 145 | occ_params = { 146 | "filters": json.dumps({ 147 | "op": "in", 148 | "content": {"field": "ssm.ssm_id", "value": [ssm_id]}, 149 | }), 150 | "fields": "case.project.project_id", 
151 | "format": "json", 152 | "size": "2000", # Get more occurrences 153 | } 154 | 155 | occ_response, occ_error = await http_client.request_api( 156 | url="https://api.gdc.cancer.gov/ssm_occurrences", 157 | method="GET", 158 | request=occ_params, 159 | domain="gdc", 160 | ) 161 | 162 | if occ_error or not occ_response: 163 | # Return basic info without occurrence data 164 | cosmic_id_str = ( 165 | cosmic_id[0] 166 | if isinstance(cosmic_id, list) and cosmic_id 167 | else cosmic_id 168 | ) 169 | return TCGAVariantData( 170 | cosmic_id=cosmic_id_str, 171 | tumor_types=[], 172 | affected_cases=0, 173 | consequence_type="missense_variant", # Most COSMIC variants are missense 174 | ) 175 | 176 | # Process occurrence data 177 | occ_data = occ_response.get("data", {}) 178 | occ_hits = occ_data.get("hits", []) 179 | 180 | # Count by project 181 | project_counts = {} 182 | for occ in occ_hits: 183 | case = occ.get("case", {}) 184 | project = case.get("project", {}) 185 | if project_id := project.get("project_id"): 186 | project_counts[project_id] = ( 187 | project_counts.get(project_id, 0) + 1 188 | ) 189 | 190 | # Extract tumor types 191 | tumor_types = [] 192 | total_cases = 0 193 | for project_id, count in project_counts.items(): 194 | # Extract tumor type from project ID 195 | # TCGA format: "TCGA-LUAD" -> "LUAD" 196 | # Other formats: "MMRF-COMMPASS" -> "MMRF-COMMPASS", "CPTAC-3" -> "CPTAC-3" 197 | if project_id.startswith("TCGA-") and "-" in project_id: 198 | tumor_type = project_id.split("-")[-1] 199 | tumor_types.append(tumor_type) 200 | else: 201 | # For non-TCGA projects, use the full project ID 202 | tumor_types.append(project_id) 203 | total_cases += count 204 | 205 | # Handle cosmic_id as list 206 | cosmic_id_str = ( 207 | cosmic_id[0] 208 | if isinstance(cosmic_id, list) and cosmic_id 209 | else cosmic_id 210 | ) 211 | 212 | return TCGAVariantData( 213 | cosmic_id=cosmic_id_str, 214 | tumor_types=tumor_types, 215 | affected_cases=total_cases, 216 | 
consequence_type="missense_variant", # Default for now 217 | ) 218 | 219 | except (KeyError, ValueError, TypeError, IndexError) as e: 220 | # Log the error for debugging while gracefully handling API response issues 221 | # KeyError: Missing expected fields in API response 222 | # ValueError: Invalid data format or conversion issues 223 | # TypeError: Unexpected data types in response 224 | # IndexError: Array access issues with response data 225 | logger.warning( 226 | f"Failed to fetch TCGA variant data for {variant_id}: {type(e).__name__}: {e}" 227 | ) 228 | return None 229 | 230 | 231 | class ThousandGenomesClient: 232 | """Client for 1000 Genomes data via Ensembl REST API.""" 233 | 234 | def _extract_population_frequencies( 235 | self, populations: list[dict] 236 | ) -> dict[str, Any]: 237 | """Extract population frequencies from Ensembl response.""" 238 | # Note: Multiple entries per population (one per allele), we want the alternate allele frequency 239 | # The reference allele will have higher frequency for rare variants 240 | pop_data: dict[str, float] = {} 241 | 242 | for pop in populations: 243 | pop_name = pop.get("population", "") 244 | frequency = pop.get("frequency", 0) 245 | 246 | # Map 1000 Genomes population codes - taking the minor allele frequency 247 | if pop_name == "1000GENOMES:phase_3:ALL": 248 | if "global_maf" not in pop_data or frequency < pop_data.get( 249 | "global_maf", 1 250 | ): 251 | pop_data["global_maf"] = frequency 252 | elif pop_name == "1000GENOMES:phase_3:AFR": 253 | if "afr_maf" not in pop_data or frequency < pop_data.get( 254 | "afr_maf", 1 255 | ): 256 | pop_data["afr_maf"] = frequency 257 | elif pop_name == "1000GENOMES:phase_3:AMR": 258 | if "amr_maf" not in pop_data or frequency < pop_data.get( 259 | "amr_maf", 1 260 | ): 261 | pop_data["amr_maf"] = frequency 262 | elif pop_name == "1000GENOMES:phase_3:EAS": 263 | if "eas_maf" not in pop_data or frequency < pop_data.get( 264 | "eas_maf", 1 265 | ): 266 | 
pop_data["eas_maf"] = frequency 267 | elif pop_name == "1000GENOMES:phase_3:EUR": 268 | if "eur_maf" not in pop_data or frequency < pop_data.get( 269 | "eur_maf", 1 270 | ): 271 | pop_data["eur_maf"] = frequency 272 | elif pop_name == "1000GENOMES:phase_3:SAS" and ( 273 | "sas_maf" not in pop_data 274 | or frequency < pop_data.get("sas_maf", 1) 275 | ): 276 | pop_data["sas_maf"] = frequency 277 | 278 | return pop_data 279 | 280 | async def get_variant_data( 281 | self, variant_id: str 282 | ) -> ThousandGenomesData | None: 283 | """Fetch variant data from 1000 Genomes via Ensembl.""" 284 | try: 285 | # Try to get rsID or use the variant ID directly 286 | encoded_id = quote(variant_id, safe="") 287 | url = f"{ENSEMBL_VARIATION_ENDPOINT}/{encoded_id}" 288 | 289 | # Request with pops=1 to get population data 290 | params = {"content-type": "application/json", "pops": "1"} 291 | 292 | response, error = await http_client.request_api( 293 | url=url, 294 | method="GET", 295 | request=params, 296 | domain="ensembl", 297 | ) 298 | 299 | if error or not response: 300 | return None 301 | 302 | # Extract population frequencies 303 | populations = response.get("populations", []) 304 | pop_data = self._extract_population_frequencies(populations) 305 | 306 | # Get most severe consequence 307 | consequence = None 308 | if mappings := response.get("mappings", []): 309 | # Extract consequences from transcript consequences 310 | all_consequences = [] 311 | for mapping in mappings: 312 | if transcript_consequences := mapping.get( 313 | "transcript_consequences", [] 314 | ): 315 | for tc in transcript_consequences: 316 | if consequence_terms := tc.get( 317 | "consequence_terms", [] 318 | ): 319 | all_consequences.extend(consequence_terms) 320 | 321 | if all_consequences: 322 | # Take the first unique consequence 323 | seen = set() 324 | unique_consequences = [] 325 | for c in all_consequences: 326 | if c not in seen: 327 | seen.add(c) 328 | unique_consequences.append(c) 329 | 
consequence = ( 330 | unique_consequences[0] if unique_consequences else None 331 | ) 332 | 333 | # Only return data if we found population frequencies 334 | if pop_data: 335 | return ThousandGenomesData( 336 | **pop_data, 337 | ancestral_allele=response.get("ancestral_allele"), 338 | most_severe_consequence=consequence, 339 | ) 340 | else: 341 | # No population data found 342 | return None 343 | 344 | except (KeyError, ValueError, TypeError, AttributeError) as e: 345 | # Log the error for debugging while gracefully handling API response issues 346 | # KeyError: Missing expected fields in API response 347 | # ValueError: Invalid data format or conversion issues 348 | # TypeError: Unexpected data types in response 349 | # AttributeError: Missing attributes on response objects 350 | logger.warning( 351 | f"Failed to fetch 1000 Genomes data for {variant_id}: {type(e).__name__}: {e}" 352 | ) 353 | return None 354 | 355 | 356 | class ExternalVariantAggregator: 357 | """Aggregates variant data from multiple external sources.""" 358 | 359 | def __init__(self): 360 | self.tcga_client = TCGAClient() 361 | self.thousand_genomes_client = ThousandGenomesClient() 362 | # Import here to avoid circular imports 363 | from .cbio_external_client import CBioPortalExternalClient 364 | 365 | self.cbioportal_client = CBioPortalExternalClient() 366 | 367 | def _extract_gene_aa_change( 368 | self, variant_data: dict[str, Any] 369 | ) -> str | None: 370 | """Extract gene and AA change in format like 'BRAF V600A' from variant data.""" 371 | logger.info("_extract_gene_aa_change called") 372 | try: 373 | # First try to get gene name from CADD data 374 | gene_name = None 375 | if (cadd := variant_data.get("cadd")) and ( 376 | gene := cadd.get("gene") 377 | ): 378 | gene_name = gene.get("genename") 379 | 380 | # If not found in CADD, try other sources 381 | if not gene_name: 382 | # Try docm 383 | if docm := variant_data.get("docm"): 384 | gene_name = docm.get("gene") or docm.get("genename") 
385 | 386 | # Try dbnsfp 387 | if not gene_name and (dbnsfp := variant_data.get("dbnsfp")): 388 | gene_name = dbnsfp.get("genename") 389 | 390 | if not gene_name: 391 | return None 392 | 393 | # Now try to get the protein change 394 | aa_change = None 395 | 396 | # Try to get from docm first (it has clean p.V600A format) 397 | if (docm := variant_data.get("docm")) and ( 398 | aa := docm.get("aa_change") 399 | ): 400 | # Convert p.V600A to V600A 401 | aa_change = aa.replace("p.", "") 402 | 403 | # Try hgvsp if not found 404 | if ( 405 | not aa_change 406 | and (hgvsp_list := variant_data.get("hgvsp")) 407 | and isinstance(hgvsp_list, list) 408 | and hgvsp_list 409 | ): 410 | # Take the first one and clean it 411 | hgvsp = hgvsp_list[0] 412 | # Remove p. prefix 413 | aa_change = hgvsp.replace("p.", "") 414 | # Handle formats like Val600Ala -> V600A 415 | if "Val" in aa_change or "Ala" in aa_change: 416 | # Try to extract the short form 417 | match = re.search(r"[A-Z]\d+[A-Z]", aa_change) 418 | if match: 419 | aa_change = match.group() 420 | 421 | # Try CADD data 422 | if ( 423 | not aa_change 424 | and (cadd := variant_data.get("cadd")) 425 | and (gene_info := cadd.get("gene")) 426 | and (prot := gene_info.get("prot")) 427 | ): 428 | protpos = prot.get("protpos") 429 | if protpos and cadd.get("oaa") and cadd.get("naa"): 430 | aa_change = f"{cadd['oaa']}{protpos}{cadd['naa']}" 431 | 432 | if gene_name and aa_change: 433 | result = f"{gene_name} {aa_change}" 434 | logger.info(f"Extracted gene/AA change: {result}") 435 | return result 436 | 437 | logger.warning( 438 | f"Failed to extract gene/AA change: gene_name={gene_name}, aa_change={aa_change}" 439 | ) 440 | return None 441 | except ( 442 | KeyError, 443 | ValueError, 444 | TypeError, 445 | AttributeError, 446 | re.error, 447 | ) as e: 448 | # Log the error for debugging while gracefully handling data extraction issues 449 | # KeyError: Missing expected fields in variant data 450 | # ValueError: Invalid data format 
or conversion issues 451 | # TypeError: Unexpected data types in variant data 452 | # AttributeError: Missing attributes on data objects 453 | # re.error: Regular expression matching errors 454 | logger.warning( 455 | f"Failed to extract gene/AA change from variant data: {type(e).__name__}: {e}" 456 | ) 457 | return None 458 | 459 | async def get_enhanced_annotations( 460 | self, 461 | variant_id: str, 462 | include_tcga: bool = True, 463 | include_1000g: bool = True, 464 | include_cbioportal: bool = True, 465 | variant_data: dict[str, Any] | None = None, 466 | ) -> EnhancedVariantAnnotation: 467 | """Fetch and aggregate variant annotations from external sources. 468 | 469 | Args: 470 | variant_id: The variant identifier (rsID or HGVS) 471 | include_tcga: Whether to include TCGA data 472 | include_1000g: Whether to include 1000 Genomes data 473 | include_cbioportal: Whether to include cBioPortal data 474 | variant_data: Optional variant data from MyVariant.info to extract gene/protein info 475 | """ 476 | logger.info( 477 | f"get_enhanced_annotations called for {variant_id}, include_cbioportal={include_cbioportal}" 478 | ) 479 | tasks: list[Any] = [] 480 | task_names = [] 481 | 482 | # Extract gene/AA change once for sources that need it 483 | gene_aa_change = None 484 | if variant_data: 485 | logger.info( 486 | f"Extracting gene/AA from variant_data keys: {list(variant_data.keys())}" 487 | ) 488 | gene_aa_change = self._extract_gene_aa_change(variant_data) 489 | else: 490 | logger.warning("No variant_data provided for gene/AA extraction") 491 | 492 | if include_tcga: 493 | # Try to extract gene and protein change from variant data for TCGA 494 | tcga_id = gene_aa_change if gene_aa_change else variant_id 495 | tasks.append(self.tcga_client.get_variant_data(tcga_id)) 496 | task_names.append("tcga") 497 | 498 | if include_1000g: 499 | tasks.append( 500 | self.thousand_genomes_client.get_variant_data(variant_id) 501 | ) 502 | task_names.append("thousand_genomes") 503 
| 504 | if include_cbioportal and gene_aa_change: 505 | # cBioPortal requires gene/AA format 506 | logger.info( 507 | f"Adding cBioPortal task with gene_aa_change: {gene_aa_change}" 508 | ) 509 | tasks.append( 510 | self.cbioportal_client.get_variant_data(gene_aa_change) 511 | ) 512 | task_names.append("cbioportal") 513 | elif include_cbioportal and not gene_aa_change: 514 | logger.warning( 515 | "Skipping cBioPortal: no gene/AA change could be extracted" 516 | ) 517 | 518 | # Run all queries in parallel 519 | results = await asyncio.gather(*tasks, return_exceptions=True) 520 | 521 | # Build the enhanced annotation 522 | annotation = EnhancedVariantAnnotation(variant_id=variant_id) 523 | 524 | for _i, (result, name) in enumerate( 525 | zip(results, task_names, strict=False) 526 | ): 527 | if isinstance(result, Exception): 528 | annotation.error_sources.append(name) 529 | elif result is not None: 530 | setattr(annotation, name, result) 531 | else: 532 | # No data found for this source 533 | pass 534 | 535 | return annotation 536 | 537 | 538 | def format_enhanced_annotations( 539 | annotation: EnhancedVariantAnnotation, 540 | ) -> dict[str, Any]: 541 | """Format enhanced annotations for display.""" 542 | formatted: dict[str, Any] = { 543 | "variant_id": annotation.variant_id, 544 | "external_annotations": {}, 545 | } 546 | 547 | external_annot = formatted["external_annotations"] 548 | 549 | if annotation.tcga: 550 | external_annot["tcga"] = { 551 | "tumor_types": annotation.tcga.tumor_types, 552 | "affected_cases": annotation.tcga.affected_cases, 553 | "cosmic_id": annotation.tcga.cosmic_id, 554 | "consequence": annotation.tcga.consequence_type, 555 | } 556 | 557 | if annotation.thousand_genomes: 558 | external_annot["1000_genomes"] = { 559 | "global_maf": annotation.thousand_genomes.global_maf, 560 | "population_frequencies": { 561 | "african": annotation.thousand_genomes.afr_maf, 562 | "american": annotation.thousand_genomes.amr_maf, 563 | "east_asian": 
annotation.thousand_genomes.eas_maf, 564 | "european": annotation.thousand_genomes.eur_maf, 565 | "south_asian": annotation.thousand_genomes.sas_maf, 566 | }, 567 | "ancestral_allele": annotation.thousand_genomes.ancestral_allele, 568 | "consequence": annotation.thousand_genomes.most_severe_consequence, 569 | } 570 | 571 | if annotation.cbioportal: 572 | cbio_data: dict[str, Any] = { 573 | "studies": annotation.cbioportal.studies, 574 | "total_cases": annotation.cbioportal.total_cases, 575 | } 576 | 577 | # Add cancer type distribution if available 578 | if annotation.cbioportal.cancer_type_distribution: 579 | cbio_data["cancer_types"] = ( 580 | annotation.cbioportal.cancer_type_distribution 581 | ) 582 | 583 | # Add mutation type distribution if available 584 | if annotation.cbioportal.mutation_types: 585 | cbio_data["mutation_types"] = annotation.cbioportal.mutation_types 586 | 587 | # Add hotspot count if > 0 588 | if annotation.cbioportal.hotspot_count > 0: 589 | cbio_data["hotspot_samples"] = annotation.cbioportal.hotspot_count 590 | 591 | # Add mean VAF if available 592 | if annotation.cbioportal.mean_vaf is not None: 593 | cbio_data["mean_vaf"] = annotation.cbioportal.mean_vaf 594 | 595 | # Add sample type distribution if available 596 | if annotation.cbioportal.sample_types: 597 | cbio_data["sample_types"] = annotation.cbioportal.sample_types 598 | 599 | external_annot["cbioportal"] = cbio_data 600 | 601 | if annotation.error_sources: 602 | external_annot["errors"] = annotation.error_sources 603 | 604 | return formatted 605 | ``` -------------------------------------------------------------------------------- /tests/tdd/trials/test_search.py: -------------------------------------------------------------------------------- ```python 1 | import pytest 2 | 3 | from biomcp.trials.search import ( 4 | CLOSED_STATUSES, 5 | AgeGroup, 6 | DateField, 7 | InterventionType, 8 | LineOfTherapy, 9 | PrimaryPurpose, 10 | RecruitingStatus, 11 | SortOrder, 12 | SponsorType, 
13 | StudyDesign, 14 | StudyType, 15 | TrialPhase, 16 | TrialQuery, 17 | _build_biomarker_expression_essie, 18 | _build_brain_mets_essie, 19 | _build_excluded_mutations_essie, 20 | _build_line_of_therapy_essie, 21 | _build_prior_therapy_essie, 22 | _build_progression_essie, 23 | _build_required_mutations_essie, 24 | _inject_ids, 25 | convert_query, 26 | ) 27 | 28 | 29 | @pytest.mark.asyncio 30 | async def test_convert_query_basic_parameters(): 31 | """Test basic parameter conversion from TrialQuery to API format.""" 32 | query = TrialQuery(conditions=["lung cancer"]) 33 | params = await convert_query(query) 34 | 35 | assert "markupFormat" in params 36 | assert params["markupFormat"] == ["markdown"] 37 | assert "query.cond" in params 38 | assert params["query.cond"] == ["lung cancer"] 39 | assert "filter.overallStatus" in params 40 | assert "RECRUITING" in params["filter.overallStatus"][0] 41 | 42 | 43 | @pytest.mark.asyncio 44 | async def test_convert_query_multiple_conditions(): 45 | """Test conversion of multiple conditions to API format.""" 46 | query = TrialQuery(conditions=["lung cancer", "metastatic"]) 47 | params = await convert_query(query) 48 | 49 | assert "query.cond" in params 50 | # The query should contain the original terms, but may have expanded synonyms 51 | cond_value = params["query.cond"][0] 52 | assert "lung cancer" in cond_value 53 | assert "metastatic" in cond_value 54 | assert cond_value.startswith("(") and cond_value.endswith(")") 55 | 56 | 57 | @pytest.mark.asyncio 58 | async def test_convert_query_terms_parameter(): 59 | """Test conversion of terms parameter to API format.""" 60 | query = TrialQuery(terms=["immunotherapy"]) 61 | params = await convert_query(query) 62 | 63 | assert "query.term" in params 64 | assert params["query.term"] == ["immunotherapy"] 65 | 66 | 67 | @pytest.mark.asyncio 68 | async def test_convert_query_interventions_parameter(): 69 | """Test conversion of interventions parameter to API format.""" 70 | query = 
TrialQuery(interventions=["pembrolizumab"]) 71 | params = await convert_query(query) 72 | 73 | assert "query.intr" in params 74 | assert params["query.intr"] == ["pembrolizumab"] 75 | 76 | 77 | @pytest.mark.asyncio 78 | async def test_convert_query_nct_ids(): 79 | """Test conversion of NCT IDs to API format.""" 80 | query = TrialQuery(nct_ids=["NCT04179552"]) 81 | params = await convert_query(query) 82 | 83 | assert "query.id" in params 84 | assert params["query.id"] == ["NCT04179552"] 85 | # Note: The implementation keeps filter.overallStatus when using nct_ids 86 | # So we don't assert its absence 87 | 88 | 89 | @pytest.mark.asyncio 90 | async def test_convert_query_recruiting_status(): 91 | """Test conversion of recruiting status to API format.""" 92 | # Test open status 93 | query = TrialQuery(recruiting_status=RecruitingStatus.OPEN) 94 | params = await convert_query(query) 95 | 96 | assert "filter.overallStatus" in params 97 | assert "RECRUITING" in params["filter.overallStatus"][0] 98 | 99 | # Test closed status 100 | query = TrialQuery(recruiting_status=RecruitingStatus.CLOSED) 101 | params = await convert_query(query) 102 | 103 | assert "filter.overallStatus" in params 104 | assert all( 105 | status in params["filter.overallStatus"][0] 106 | for status in CLOSED_STATUSES 107 | ) 108 | 109 | # Test any status 110 | query = TrialQuery(recruiting_status=RecruitingStatus.ANY) 111 | params = await convert_query(query) 112 | 113 | assert "filter.overallStatus" not in params 114 | 115 | 116 | @pytest.mark.asyncio 117 | async def test_convert_query_location_parameters(): 118 | """Test conversion of location parameters to API format.""" 119 | query = TrialQuery(lat=40.7128, long=-74.0060, distance=10) 120 | params = await convert_query(query) 121 | 122 | assert "filter.geo" in params 123 | assert params["filter.geo"] == ["distance(40.7128,-74.006,10mi)"] 124 | 125 | 126 | @pytest.mark.asyncio 127 | async def test_convert_query_study_type(): 128 | """Test conversion 
of study type to API format.""" 129 | query = TrialQuery(study_type=StudyType.INTERVENTIONAL) 130 | params = await convert_query(query) 131 | 132 | assert "filter.advanced" in params 133 | assert "AREA[StudyType]Interventional" in params["filter.advanced"][0] 134 | 135 | 136 | @pytest.mark.asyncio 137 | async def test_convert_query_phase(): 138 | """Test conversion of phase to API format.""" 139 | query = TrialQuery(phase=TrialPhase.PHASE3) 140 | params = await convert_query(query) 141 | 142 | assert "filter.advanced" in params 143 | assert "AREA[Phase]PHASE3" in params["filter.advanced"][0] 144 | 145 | 146 | @pytest.mark.asyncio 147 | async def test_convert_query_date_range(): 148 | """Test conversion of date range to API format.""" 149 | query = TrialQuery( 150 | min_date="2020-01-01", 151 | max_date="2020-12-31", 152 | date_field=DateField.LAST_UPDATE, 153 | ) 154 | params = await convert_query(query) 155 | 156 | assert "filter.advanced" in params 157 | assert ( 158 | "AREA[LastUpdatePostDate]RANGE[2020-01-01,2020-12-31]" 159 | in params["filter.advanced"][0] 160 | ) 161 | 162 | # Test min date only 163 | query = TrialQuery( 164 | min_date="2021-01-01", 165 | date_field=DateField.STUDY_START, 166 | ) 167 | params = await convert_query(query) 168 | 169 | assert "filter.advanced" in params 170 | assert ( 171 | "AREA[StartDate]RANGE[2021-01-01,MAX]" in params["filter.advanced"][0] 172 | ) 173 | 174 | 175 | @pytest.mark.asyncio 176 | async def test_convert_query_sort_order(): 177 | """Test conversion of sort order to API format.""" 178 | query = TrialQuery(sort=SortOrder.RELEVANCE) 179 | params = await convert_query(query) 180 | 181 | assert "sort" in params 182 | assert params["sort"] == ["@relevance"] 183 | 184 | query = TrialQuery(sort=SortOrder.LAST_UPDATE) 185 | params = await convert_query(query) 186 | 187 | assert "sort" in params 188 | assert params["sort"] == ["LastUpdatePostDate:desc"] 189 | 190 | 191 | @pytest.mark.asyncio 192 | async def 
test_convert_query_intervention_type(): 193 | """Test conversion of intervention type to API format.""" 194 | query = TrialQuery(intervention_type=InterventionType.DRUG) 195 | params = await convert_query(query) 196 | 197 | assert "filter.advanced" in params 198 | assert "AREA[InterventionType]Drug" in params["filter.advanced"][0] 199 | 200 | 201 | @pytest.mark.asyncio 202 | async def test_convert_query_sponsor_type(): 203 | """Test conversion of sponsor type to API format.""" 204 | query = TrialQuery(sponsor_type=SponsorType.ACADEMIC) 205 | params = await convert_query(query) 206 | 207 | assert "filter.advanced" in params 208 | assert "AREA[SponsorType]Academic" in params["filter.advanced"][0] 209 | 210 | 211 | @pytest.mark.asyncio 212 | async def test_convert_query_study_design(): 213 | """Test conversion of study design to API format.""" 214 | query = TrialQuery(study_design=StudyDesign.RANDOMIZED) 215 | params = await convert_query(query) 216 | 217 | assert "filter.advanced" in params 218 | assert "AREA[StudyDesign]Randomized" in params["filter.advanced"][0] 219 | 220 | 221 | @pytest.mark.asyncio 222 | async def test_convert_query_age_group(): 223 | """Test conversion of age group to API format.""" 224 | query = TrialQuery(age_group=AgeGroup.ADULT) 225 | params = await convert_query(query) 226 | 227 | assert "filter.advanced" in params 228 | assert "AREA[StdAge]Adult" in params["filter.advanced"][0] 229 | 230 | 231 | @pytest.mark.asyncio 232 | async def test_convert_query_primary_purpose(): 233 | """Test conversion of primary purpose to API format.""" 234 | query = TrialQuery(primary_purpose=PrimaryPurpose.TREATMENT) 235 | params = await convert_query(query) 236 | 237 | assert "filter.advanced" in params 238 | assert ( 239 | "AREA[DesignPrimaryPurpose]Treatment" in params["filter.advanced"][0] 240 | ) 241 | 242 | 243 | @pytest.mark.asyncio 244 | async def test_convert_query_next_page_hash(): 245 | """Test conversion of next_page_hash to API format.""" 246 | 
query = TrialQuery(next_page_hash="abc123") 247 | params = await convert_query(query) 248 | 249 | assert "pageToken" in params 250 | assert params["pageToken"] == ["abc123"] 251 | 252 | 253 | @pytest.mark.asyncio 254 | async def test_convert_query_complex_parameters(): 255 | """Test conversion of multiple parameters to API format.""" 256 | query = TrialQuery( 257 | conditions=["diabetes"], 258 | terms=["obesity"], 259 | interventions=["metformin"], 260 | primary_purpose=PrimaryPurpose.TREATMENT, 261 | study_type=StudyType.INTERVENTIONAL, 262 | intervention_type=InterventionType.DRUG, 263 | recruiting_status=RecruitingStatus.OPEN, 264 | phase=TrialPhase.PHASE3, 265 | age_group=AgeGroup.ADULT, 266 | sort=SortOrder.RELEVANCE, 267 | ) 268 | params = await convert_query(query) 269 | 270 | assert "query.cond" in params 271 | # Disease synonym expansion may add synonyms to diabetes 272 | assert "diabetes" in params["query.cond"][0] 273 | assert "query.term" in params 274 | assert params["query.term"] == ["obesity"] 275 | assert "query.intr" in params 276 | assert params["query.intr"] == ["metformin"] 277 | assert "filter.advanced" in params 278 | assert ( 279 | "AREA[DesignPrimaryPurpose]Treatment" in params["filter.advanced"][0] 280 | ) 281 | assert "AREA[StudyType]Interventional" in params["filter.advanced"][0] 282 | assert "AREA[InterventionType]Drug" in params["filter.advanced"][0] 283 | assert "AREA[Phase]PHASE3" in params["filter.advanced"][0] 284 | assert "AREA[StdAge]Adult" in params["filter.advanced"][0] 285 | assert "filter.overallStatus" in params 286 | assert "RECRUITING" in params["filter.overallStatus"][0] 287 | assert "sort" in params 288 | assert params["sort"] == ["@relevance"] 289 | 290 | 291 | # Test TrialQuery field validation for CLI input processing 292 | # noinspection PyTypeChecker 293 | def test_trial_query_field_validation_basic(): 294 | """Test basic field validation for TrialQuery.""" 295 | # Test list fields conversion 296 | query = 
TrialQuery(conditions="diabetes") 297 | assert query.conditions == ["diabetes"] 298 | 299 | query = TrialQuery(interventions="metformin") 300 | assert query.interventions == ["metformin"] 301 | 302 | query = TrialQuery(terms="blood glucose") 303 | assert query.terms == ["blood glucose"] 304 | 305 | query = TrialQuery(nct_ids="NCT01234567") 306 | assert query.nct_ids == ["NCT01234567"] 307 | 308 | 309 | # noinspection PyTypeChecker 310 | def test_trial_query_field_validation_recruiting_status(): 311 | """Test recruiting status field validation.""" 312 | # Exact match uppercase 313 | query = TrialQuery(recruiting_status="OPEN") 314 | assert query.recruiting_status == RecruitingStatus.OPEN 315 | 316 | # Exact match lowercase 317 | query = TrialQuery(recruiting_status="closed") 318 | assert query.recruiting_status == RecruitingStatus.CLOSED 319 | 320 | # Invalid value 321 | with pytest.raises(ValueError) as excinfo: 322 | TrialQuery(recruiting_status="invalid") 323 | assert "validation error for TrialQuery" in str(excinfo.value) 324 | 325 | 326 | # noinspection PyTypeChecker 327 | @pytest.mark.asyncio 328 | async def test_trial_query_field_validation_combined(): 329 | """Test combined parameters validation.""" 330 | query = TrialQuery( 331 | conditions=["diabetes", "obesity"], 332 | interventions="metformin", 333 | recruiting_status="open", 334 | study_type="interventional", 335 | lat=40.7128, 336 | long=-74.0060, 337 | distance=10, 338 | ) 339 | 340 | assert query.conditions == ["diabetes", "obesity"] 341 | assert query.interventions == ["metformin"] 342 | assert query.recruiting_status == RecruitingStatus.OPEN 343 | assert query.study_type == StudyType.INTERVENTIONAL 344 | assert query.lat == 40.7128 345 | assert query.long == -74.0060 346 | assert query.distance == 10 347 | 348 | # Check that the query can be converted to parameters properly 349 | params = await convert_query(query) 350 | assert "query.cond" in params 351 | # The query should contain the original 
terms, but may have expanded synonyms 352 | cond_value = params["query.cond"][0] 353 | assert "diabetes" in cond_value 354 | assert "obesity" in cond_value 355 | assert cond_value.startswith("(") and cond_value.endswith(")") 356 | assert "query.intr" in params 357 | assert "metformin" in params["query.intr"][0] 358 | assert "filter.geo" in params 359 | assert "distance(40.7128,-74.006,10mi)" in params["filter.geo"][0] 360 | 361 | 362 | # noinspection PyTypeChecker 363 | @pytest.mark.asyncio 364 | async def test_trial_query_field_validation_terms(): 365 | """Test terms parameter validation.""" 366 | # Single term as string 367 | query = TrialQuery(terms="cancer") 368 | assert query.terms == ["cancer"] 369 | 370 | # Multiple terms as list 371 | query = TrialQuery(terms=["cancer", "therapy"]) 372 | assert query.terms == ["cancer", "therapy"] 373 | 374 | # Check parameter generation 375 | params = await convert_query(query) 376 | assert "query.term" in params 377 | assert "(cancer OR therapy)" in params["query.term"][0] 378 | 379 | 380 | # noinspection PyTypeChecker 381 | @pytest.mark.asyncio 382 | async def test_trial_query_field_validation_nct_ids(): 383 | """Test NCT IDs parameter validation.""" 384 | # Single NCT ID 385 | query = TrialQuery(nct_ids="NCT01234567") 386 | assert query.nct_ids == ["NCT01234567"] 387 | 388 | # Multiple NCT IDs 389 | query = TrialQuery(nct_ids=["NCT01234567", "NCT89012345"]) 390 | assert query.nct_ids == ["NCT01234567", "NCT89012345"] 391 | 392 | # Check parameter generation 393 | params = await convert_query(query) 394 | assert "query.id" in params 395 | assert "NCT01234567,NCT89012345" in params["query.id"][0] 396 | 397 | 398 | # noinspection PyTypeChecker 399 | @pytest.mark.asyncio 400 | async def test_trial_query_field_validation_date_range(): 401 | """Test date range parameters validation.""" 402 | # Min date only with date field 403 | query = TrialQuery(min_date="2020-01-01", date_field=DateField.STUDY_START) 404 | assert 
query.min_date == "2020-01-01" 405 | assert query.date_field == DateField.STUDY_START 406 | 407 | # Min and max date with date field using lazy mapping 408 | query = TrialQuery( 409 | min_date="2020-01-01", 410 | max_date="2021-12-31", 411 | date_field="last update", # space not underscore. 412 | ) 413 | assert query.min_date == "2020-01-01" 414 | assert query.max_date == "2021-12-31" 415 | assert query.date_field == DateField.LAST_UPDATE 416 | 417 | # Check parameter generation 418 | params = await convert_query(query) 419 | assert "filter.advanced" in params 420 | assert ( 421 | "AREA[LastUpdatePostDate]RANGE[2020-01-01,2021-12-31]" 422 | in params["filter.advanced"][0] 423 | ) 424 | 425 | 426 | # noinspection PyTypeChecker 427 | def test_trial_query_field_validation_primary_purpose(): 428 | """Test primary purpose parameter validation.""" 429 | # Exact match uppercase 430 | query = TrialQuery(primary_purpose=PrimaryPurpose.TREATMENT) 431 | assert query.primary_purpose == PrimaryPurpose.TREATMENT 432 | 433 | # Exact match lowercase 434 | query = TrialQuery(primary_purpose=PrimaryPurpose.PREVENTION) 435 | assert query.primary_purpose == PrimaryPurpose.PREVENTION 436 | 437 | # Case-insensitive 438 | query = TrialQuery(primary_purpose="ScReeNING") 439 | assert query.primary_purpose == PrimaryPurpose.SCREENING 440 | 441 | # Invalid 442 | with pytest.raises(ValueError): 443 | TrialQuery(primary_purpose="invalid") 444 | 445 | 446 | def test_inject_ids_with_many_ids_and_condition(): 447 | """Test _inject_ids function with 300 IDs and a condition to ensure filter.ids is used.""" 448 | # Create a params dict with a condition (indicating other filters present) 449 | params = { 450 | "query.cond": ["melanoma"], 451 | "format": ["json"], 452 | "markupFormat": ["markdown"], 453 | } 454 | 455 | # Generate 300 NCT IDs 456 | nct_ids = [f"NCT{str(i).zfill(8)}" for i in range(1, 301)] 457 | 458 | # Call _inject_ids with has_other_filters=True 459 | _inject_ids(params, nct_ids, 
has_other_filters=True) 460 | 461 | # Assert that filter.ids is used (not query.id) 462 | assert "filter.ids" in params 463 | assert "query.id" not in params 464 | 465 | # Verify the IDs are properly formatted 466 | ids_param = params["filter.ids"][0] 467 | assert ids_param.startswith("NCT") 468 | assert "NCT00000001" in ids_param 469 | assert "NCT00000300" in ids_param 470 | 471 | # Verify it's a comma-separated list 472 | assert "," in ids_param 473 | assert ids_param.count(",") == 299 # 300 IDs = 299 commas 474 | 475 | 476 | def test_inject_ids_without_other_filters(): 477 | """Test _inject_ids function with only NCT IDs (no other filters).""" 478 | # Create a minimal params dict 479 | params = { 480 | "format": ["json"], 481 | "markupFormat": ["markdown"], 482 | } 483 | 484 | # Use a small number of NCT IDs 485 | nct_ids = ["NCT00000001", "NCT00000002", "NCT00000003"] 486 | 487 | # Call _inject_ids with has_other_filters=False 488 | _inject_ids(params, nct_ids, has_other_filters=False) 489 | 490 | # Assert that query.id is used (not filter.ids) for small lists 491 | assert "query.id" in params 492 | assert "filter.ids" not in params 493 | 494 | # Verify the format 495 | assert params["query.id"][0] == "NCT00000001,NCT00000002,NCT00000003" 496 | 497 | 498 | def test_inject_ids_large_list_without_filters(): 499 | """Test _inject_ids with a large ID list but no other filters.""" 500 | params = { 501 | "format": ["json"], 502 | "markupFormat": ["markdown"], 503 | } 504 | 505 | # Generate enough IDs to exceed 1800 character limit 506 | nct_ids = [f"NCT{str(i).zfill(8)}" for i in range(1, 201)] # ~2200 chars 507 | 508 | # Call _inject_ids with has_other_filters=False 509 | _inject_ids(params, nct_ids, has_other_filters=False) 510 | 511 | # Assert that filter.ids is used for large lists even without other filters 512 | assert "filter.ids" in params 513 | assert "query.id" not in params 514 | 515 | 516 | # Tests for new Essie builder functions 517 | def 
test_build_prior_therapy_essie(): 518 | """Test building Essie fragments for prior therapies.""" 519 | # Single therapy 520 | fragments = _build_prior_therapy_essie(["osimertinib"]) 521 | assert len(fragments) == 1 522 | assert ( 523 | fragments[0] 524 | == 'AREA[EligibilityCriteria]("osimertinib" AND (prior OR previous OR received))' 525 | ) 526 | 527 | # Multiple therapies 528 | fragments = _build_prior_therapy_essie(["osimertinib", "erlotinib"]) 529 | assert len(fragments) == 2 530 | assert ( 531 | fragments[0] 532 | == 'AREA[EligibilityCriteria]("osimertinib" AND (prior OR previous OR received))' 533 | ) 534 | assert ( 535 | fragments[1] 536 | == 'AREA[EligibilityCriteria]("erlotinib" AND (prior OR previous OR received))' 537 | ) 538 | 539 | # Empty strings are filtered out 540 | fragments = _build_prior_therapy_essie(["osimertinib", "", "erlotinib"]) 541 | assert len(fragments) == 2 542 | 543 | 544 | def test_build_progression_essie(): 545 | """Test building Essie fragments for progression on therapy.""" 546 | fragments = _build_progression_essie(["pembrolizumab"]) 547 | assert len(fragments) == 1 548 | assert ( 549 | fragments[0] 550 | == 'AREA[EligibilityCriteria]("pembrolizumab" AND (progression OR resistant OR refractory))' 551 | ) 552 | 553 | 554 | def test_build_required_mutations_essie(): 555 | """Test building Essie fragments for required mutations.""" 556 | fragments = _build_required_mutations_essie(["EGFR L858R", "T790M"]) 557 | assert len(fragments) == 2 558 | assert fragments[0] == 'AREA[EligibilityCriteria]("EGFR L858R")' 559 | assert fragments[1] == 'AREA[EligibilityCriteria]("T790M")' 560 | 561 | 562 | def test_build_excluded_mutations_essie(): 563 | """Test building Essie fragments for excluded mutations.""" 564 | fragments = _build_excluded_mutations_essie(["KRAS G12C"]) 565 | assert len(fragments) == 1 566 | assert fragments[0] == 'AREA[EligibilityCriteria](NOT "KRAS G12C")' 567 | 568 | 569 | def test_build_biomarker_expression_essie(): 570 
| """Test building Essie fragments for biomarker expression.""" 571 | biomarkers = {"PD-L1": "≥50%", "TMB": "≥10 mut/Mb"} 572 | fragments = _build_biomarker_expression_essie(biomarkers) 573 | assert len(fragments) == 2 574 | assert 'AREA[EligibilityCriteria]("PD-L1" AND "≥50%")' in fragments 575 | assert 'AREA[EligibilityCriteria]("TMB" AND "≥10 mut/Mb")' in fragments 576 | 577 | # Empty values are filtered out 578 | biomarkers = {"PD-L1": "≥50%", "TMB": "", "HER2": "positive"} 579 | fragments = _build_biomarker_expression_essie(biomarkers) 580 | assert len(fragments) == 2 581 | 582 | 583 | def test_build_line_of_therapy_essie(): 584 | """Test building Essie fragment for line of therapy.""" 585 | # First line 586 | fragment = _build_line_of_therapy_essie(LineOfTherapy.FIRST_LINE) 587 | assert ( 588 | fragment 589 | == 'AREA[EligibilityCriteria]("first line" OR "first-line" OR "1st line" OR "frontline" OR "treatment naive" OR "previously untreated")' 590 | ) 591 | 592 | # Second line 593 | fragment = _build_line_of_therapy_essie(LineOfTherapy.SECOND_LINE) 594 | assert ( 595 | fragment 596 | == 'AREA[EligibilityCriteria]("second line" OR "second-line" OR "2nd line" OR "one prior line" OR "1 prior line")' 597 | ) 598 | 599 | # Third line plus 600 | fragment = _build_line_of_therapy_essie(LineOfTherapy.THIRD_LINE_PLUS) 601 | assert ( 602 | fragment 603 | == 'AREA[EligibilityCriteria]("third line" OR "third-line" OR "3rd line" OR "≥2 prior" OR "at least 2 prior" OR "heavily pretreated")' 604 | ) 605 | 606 | 607 | def test_build_brain_mets_essie(): 608 | """Test building Essie fragment for brain metastases filter.""" 609 | # Allow brain mets (no filter) 610 | fragment = _build_brain_mets_essie(True) 611 | assert fragment == "" 612 | 613 | # Exclude brain mets 614 | fragment = _build_brain_mets_essie(False) 615 | assert fragment == 'AREA[EligibilityCriteria](NOT "brain metastases")' 616 | 617 | 618 | @pytest.mark.asyncio 619 | async def 
test_convert_query_with_eligibility_fields(): 620 | """Test conversion of query with new eligibility-focused fields.""" 621 | query = TrialQuery( 622 | conditions=["lung cancer"], 623 | prior_therapies=["osimertinib"], 624 | progression_on=["erlotinib"], 625 | required_mutations=["EGFR L858R"], 626 | excluded_mutations=["T790M"], 627 | biomarker_expression={"PD-L1": "≥50%"}, 628 | line_of_therapy=LineOfTherapy.SECOND_LINE, 629 | allow_brain_mets=False, 630 | ) 631 | params = await convert_query(query) 632 | 633 | # Check that query.term contains all the Essie fragments 634 | assert "query.term" in params 635 | term = params["query.term"][0] 636 | 637 | # Prior therapy 638 | assert ( 639 | 'AREA[EligibilityCriteria]("osimertinib" AND (prior OR previous OR received))' 640 | in term 641 | ) 642 | 643 | # Progression 644 | assert ( 645 | 'AREA[EligibilityCriteria]("erlotinib" AND (progression OR resistant OR refractory))' 646 | in term 647 | ) 648 | 649 | # Required mutation 650 | assert 'AREA[EligibilityCriteria]("EGFR L858R")' in term 651 | 652 | # Excluded mutation 653 | assert 'AREA[EligibilityCriteria](NOT "T790M")' in term 654 | 655 | # Biomarker expression 656 | assert 'AREA[EligibilityCriteria]("PD-L1" AND "≥50%")' in term 657 | 658 | # Line of therapy 659 | assert 'AREA[EligibilityCriteria]("second line" OR "second-line"' in term 660 | 661 | # Brain mets exclusion 662 | assert 'AREA[EligibilityCriteria](NOT "brain metastases")' in term 663 | 664 | # All fragments should be combined with AND 665 | assert " AND " in term 666 | 667 | 668 | @pytest.mark.asyncio 669 | async def test_convert_query_with_custom_fields_and_page_size(): 670 | """Test conversion of query with custom return fields and page size.""" 671 | query = TrialQuery( 672 | conditions=["diabetes"], 673 | return_fields=["NCTId", "BriefTitle", "OverallStatus"], 674 | page_size=100, 675 | ) 676 | params = await convert_query(query) 677 | 678 | assert "fields" in params 679 | assert params["fields"] == 
["NCTId,BriefTitle,OverallStatus"] 680 | 681 | assert "pageSize" in params 682 | assert params["pageSize"] == ["100"] 683 | 684 | 685 | @pytest.mark.asyncio 686 | async def test_convert_query_eligibility_with_existing_terms(): 687 | """Test that eligibility Essie fragments are properly combined with existing terms.""" 688 | query = TrialQuery( 689 | terms=["immunotherapy"], 690 | prior_therapies=["chemotherapy"], 691 | ) 692 | params = await convert_query(query) 693 | 694 | assert "query.term" in params 695 | term = params["query.term"][0] 696 | 697 | # Should contain both the original term and the new Essie fragment 698 | assert "immunotherapy" in term 699 | assert ( 700 | 'AREA[EligibilityCriteria]("chemotherapy" AND (prior OR previous OR received))' 701 | in term 702 | ) 703 | # Should be combined with AND 704 | assert "immunotherapy AND AREA[EligibilityCriteria]" in term 705 | ``` -------------------------------------------------------------------------------- /tests/data/pubtator/pubtator3_paper.txt: -------------------------------------------------------------------------------- ``` 1 | Nucleic Acids Research, 2024, 52, W540–W546 2 | https://doi.org/10.1093/nar/gkae235 3 | Advance access publication date: 4 April 2024 4 | Web Server issue 5 | 6 | PubTator 3.0: an AI-powered literature resource for 7 | unlocking biomedical knowledge 8 | Chih-Hsuan Wei † , Alexis Allot † , Po-Ting Lai , Robert Leaman , Shubo Tian , Ling Luo , 9 | Qiao Jin , Zhizheng Wang , Qingyu Chen and Zhiyong Lu * 10 | National Center for Biotechnology Information (NCBI), National Library of Medicine (NLM), National Institutes of Health (NIH), 11 | Bethesda, MD 20894, USA 12 | To whom correspondence should be addressed. Tel: +1 301 594 7089; Email: [email protected] 13 | The first two authors should be regarded as Joint First Authors. 14 | Present addresses: 15 | Alexis Allot, The Neuro (Montreal Neurological Institute-Hospital), McGill University, Montreal, Quebec H3A 2B4, Canada. 
16 | Ling Luo, School of Computer Science and Technology, Dalian University of Technology, 116024 Dalian, China. 17 | Qingyu Chen, Biomedical Informatics and Data Science, Yale School of Medicine, New Haven, CT 06510, USA. 18 | † 19 | 20 | Abstract 21 | PubTator 3.0 (https://www.ncbi.nlm.nih.gov/research/pubtator3/) is a biomedical literature resource using state-of-the-art AI techniques to offer 22 | semantic and relation searches for key concepts like proteins, genetic variants, diseases and chemicals. It currently provides over one billion 23 | entity and relation annotations across approximately 36 million PubMed abstracts and 6 million full-text articles from the PMC open access 24 | subset, updated weekly. PubTator 3.0’s online interface and API utilize these precomputed entity relations and synonyms to provide advanced 25 | search capabilities and enable large-scale analyses, streamlining many complex information needs. We showcase the retrieval quality of PubTator 26 | 3.0 using a series of entity pair queries, demonstrating that PubTator 3.0 retrieves a greater number of articles than either PubMed or Google 27 | Scholar, with higher precision in the top 20 results. We further show that integrating ChatGPT (GPT-4) with PubTator APIs dramatically improves 28 | the factuality and verifiability of its responses. In summary, PubTator 3.0 offers a comprehensive set of features and tools that allow researchers 29 | to navigate the ever-expanding wealth of biomedical literature, expediting research and unlocking valuable insights for scientific discovery. 30 | 31 | Graphical abstract 32 | 33 | Introduction 34 | The biomedical literature is a primary resource to address information needs across the biological and clinical sciences (1), 35 | however the requirements for literature search vary widely. 
36 | Activities such as formulating a research hypothesis require 37 | an exploratory approach, whereas tasks like interpreting the 38 | clinical significance of genetic variants are more focused. 39 | Traditional keyword-based search methods have long 40 | formed the foundation of biomedical literature search (2). 41 | While generally effective for basic search, these methods also 42 | have significant limitations, such as missing relevant articles 43 | 44 | due to differing terminology or including irrelevant articles because surface-level term matches cannot adequately represent 45 | the required association between query terms. These limitations cost time and risk information needs remaining unmet. 46 | Natural language processing (NLP) methods provide substantial value for creating bioinformatics resources (3–5), and 47 | may improve literature search by enabling semantic and relation search (6). In semantic search, users indicate specific 48 | concepts of interest (entities) for which the system has precomputed matches regardless of the terminology used. Relation search increases precision by allowing users to specify the 49 | 50 | Received: January 18, 2024. Revised: March 2, 2024. Editorial Decision: March 16, 2024. Accepted: March 21, 2024 51 | Published by Oxford University Press on behalf of Nucleic Acids Research 2024. 52 | This work is written by (a) US Government employee(s) and is in the public domain in the US. 53 | 54 | Downloaded from https://academic.oup.com/nar/article/52/W1/W540/7640526 by guest on 10 March 2025 55 | 56 | * 57 | 58 | W541 59 | 60 | Nucleic Acids Research, 2024, Vol. 52, Web Server issue 61 | 62 | type of relationship desired between entities, such as whether 63 | a chemical enhances or reduces expression of a gene. In this regard, we present PubTator 3.0, a novel resource engineered to 64 | support semantic and relation search in the biomedical literature. 
Its search capabilities allow users to explore automated 65 | entity annotations for six key biomedical entities: genes, diseases, chemicals, genetic variants, species, and cell lines. PubTator 3.0 also identifies and makes searchable 12 common 66 | types of relations between entities, enhancing its utility for 67 | both targeted and exploratory searches. Focusing on relations 68 | and entity types of interest across the biomedical sciences allows PubTator 3.0 to retrieve information precisely while providing broad utility (see detailed comparisons with its predecessor in Supplementary Table S1). 69 | 70 | The PubTator 3.0 online interface, illustrated in Figure 1 71 | and Supplementary Figure S1, is designed for interactive literature exploration, supporting semantic, relation, keyword, 72 | and Boolean queries. An auto-complete function provides semantic search suggestions to assist users with query formulation. For example, it automatically suggests replacing either ‘COVID-19’ or ‘SARS-CoV-2 infection’ with the semantic term ‘@DISEASE_COVID_19’. Relation queries – new to 73 | PubTator 3.0 – provide increased precision, allowing users 74 | to target articles which discuss specific relationships between 75 | entities.
matches within the title receive higher priority). Users can further refine results by 84 | employing filters, narrowing articles returned to specific publication types, journals, or article sections. 85 | PubTator 3.0 is supported by an NLP pipeline, depicted in 86 | Figure 2A. This pipeline, run weekly, first identifies articles 87 | newly added to PubMed and PMC-OA. Articles are then processed through three major steps: (i) named entity recognition, 88 | provided by the recently developed deep-learning transformer 89 | model AIONER (8), (ii) identifier mapping and (iii) relation 90 | extraction, performed by BioREx (9) of 12 common types of 91 | relations (described in Supplementary Table S2). 92 | In total, PubTator 3.0 contains over 1.6 billion entity annotations (4.6 million unique identifiers) and 33 million relations 93 | (8.8 million unique pairs). It provides enhanced entity recognition and normalization performance over its previous version, 94 | PubTator 2 (10), also known as PubTator Central (Figure 2B 95 | and Supplementary Table S3). We show the relation extraction performance of PubTator 3.0 in Figure 2C and its comparison results to the previous state-of-the-art systems (11–13) 96 | on the BioCreative V Chemical-Disease Relation (14) corpus, 97 | finding that PubTator 3.0 provided substantially higher accuracy. Moreover, when evaluating a randomized sample of 98 | entity pair queries compared to PubMed and Google Scholar, 99 | 100 | Materials and methods 101 | Data sources and article processing 102 | PubTator 3.0 downloads new articles weekly from the BioC 103 | PubMed API (https://www.ncbi.nlm.nih.gov/research/bionlp/ 104 | APIs/BioC-PubMed/) and the BioC PMC API (https://www. 105 | ncbi.nlm.nih.gov/research/bionlp/APIs/BioC-PMC/) in BioCXML format (16). Local abbreviations are identified using 106 | Ab3P (17). 
Article text and extracted data are stored internally using MongoDB and indexed for search with Solr, ensuring robust and scalable accessibility unconstrained by external 107 | dependencies such as the NCBI eUtils API. 108 | 109 | Entity recognition and normalization/linking 110 | PubTator 3.0 uses AIONER (8), a recently developed named 111 | entity recognition (NER) model, to recognize entities of six 112 | types: genes/proteins, chemicals, diseases, species, genetic 113 | variants, and cell lines. AIONER utilizes a flexible tagging 114 | scheme to integrate training data created separately into a 115 | single resource. These training datasets include NLM-Gene 116 | (18), NLM-Chem (19), NCBI-Disease (20), BC5CDR (14), 117 | tmVar3 (21), Species-800 (22), BioID (23) and BioRED (15). 118 | This consolidation creates a larger training set, improving 119 | the model’s ability to generalize to unseen data. Furthermore, 120 | it enables recognizing multiple entity types simultaneously, 121 | enhancing efficiency and simplifying the challenge of distinguishing boundaries between entities that reference others, 122 | such as the disorder ‘Alpha-1 antitrypsin deficiency’ and the 123 | protein ‘Alpha-1 antitrypsin’. We previously evaluated the performance of AIONER on 14 benchmark datasets (8), including the test sets for the aforementioned training sets. This evaluation demonstrated that AIONER’s performance surpasses 124 | or matches previous state-of-the-art methods. 125 | Entity mentions found by AIONER are normalized (linked) 126 | to a unique identifier in an appropriate entity database. Normalization is performed by a module designed for (or adapted 127 | to) each entity type, using the latest version. The recentlyupgraded GNorm2 system (24) normalizes genes to NCBI 128 | Gene identifiers and species mentions to NCBI Taxonomy. 
129 | tmVar3 (21), also recently upgraded, normalizes genetic variants; it uses dbSNP identifiers for variants listed in dbSNP 130 | and HGVS format otherwise. Chemicals are normalized by 131 | the NLM-Chem tagger (19) to MeSH identifiers (25). TaggerOne (26) normalizes diseases to MeSH and cell lines to 132 | Cellosaurus (27) using a new normalization-only mode. This 133 | mode only applies the normalization model, which converts 134 | both mentions and lexicon names into high-dimensional TF-IDF vectors and learns a mapping, as before. However, it 135 | now augments the training data by mapping each lexicon 136 | name to itself, resulting in a large performance improvement for names present in the lexicon but not in the annotated training data. These enhancements provide a significant overall improvement in entity normalization performance (Supplementary Table S3). 137 | 138 | Relation extraction 139 | Relations for PubTator 3.0 are extracted by the unified relation extraction model BioREx (9), designed to simulta- 140 | 141 | Downloaded from https://academic.oup.com/nar/article/52/W1/W540/7640526 by guest on 10 March 2025 142 | 143 | System overview 144 | 145 | PubTator 3.0 consistently returns a greater number of articles with higher precision in the top 20 results (Figure 2D and 146 | Supplementary Table S4). 147 | 148 | W542 149 | 150 | Nucleic Acids Research, 2024, Vol. 52, Web Server issue 151 | 152 | neously extract 12 types of relations across eight entity 153 | type pairs: chemical–chemical, chemical–disease, chemical– 154 | gene, chemical–variant, disease–gene, disease–variant, gene– 155 | gene and variant–variant. Detailed definitions of these relation types and their corresponding entity pairs are presented in 156 | Supplementary Table S2. Deep-learning methods for relation 157 | extraction, such as BioREx, require ample training data.
However, training data for relation extraction is fragmented into 158 | many datasets, often tailored to specific entity pairs. BioREx 159 | overcomes this limitation with a data-centric approach, reconciling discrepancies between disparate training datasets to 160 | construct a comprehensive, unified dataset. 161 | We evaluated the relations extracted by BioREx using performance on manually annotated relation extraction datasets 162 | as well as a comparative analysis between BioREx and notable 163 | comparable systems. BioREx established a new performance 164 | benchmark on the BioRED corpus test set (15), elevating the 165 | performance from 74.4% (F-score) to 79.6%, and demonstrating higher performance than alternative models such as 166 | transfer learning (TL), multi-task learning (MTL), and state-of-the-art models trained on isolated datasets (9). For PubTator 3.0, we replaced its deep learning module, PubMedBERT 167 | (28), with LinkBERT (29), further increasing the performance 168 | to 82.0%. Furthermore, we conducted a comparative analysis between BioREx and SemRep (11), a widely used rule- 169 | 170 | based method for extracting diverse relations, the CD-REST 171 | (13) system, and the previous state-of-the-art system (12), using the BioCreative V Chemical Disease Relation corpus test 172 | set (14). Our evaluation demonstrated that PubTator 3.0 provided substantially higher F-score than previous methods. 173 | 174 | Programmatic access and data formats 175 | PubTator 3.0 offers programmatic access through its 176 | API and bulk download. The API (https://www.ncbi. 177 | nlm.nih.gov/research/pubtator3/) supports keyword, entity and relation search, and also supports exporting 178 | annotations in XML and JSON-based BioC (16) formats and tab-delimited free text. The PubTator 3.0 FTP 179 | site (https://ftp.ncbi.nlm.nih.gov/pub/lu/PubTator3) provides bulk downloads of annotated articles and extraction 180 | summaries for entities and relations.
Programmatic access supports more flexible query options; for example, 181 | the information need ‘what chemicals reduce expression 182 | of JAK1?’ can be answered directly via API (e.g. https: 183 | //www.ncbi.nlm.nih.gov/research/pubtator3-api/relations? 184 | e1=@GENE_JAK1&type=negative_correlate&e2=Chemical) 185 | or by filtering the bulk relations file. Additionally, the PubTator 3.0 API supports annotation of user-defined free 186 | text. 187 | 188 | Downloaded from https://academic.oup.com/nar/article/52/W1/W540/7640526 by guest on 10 March 2025 189 | 190 | Figure 1. PubTator 3.0 system overview and search results page: 1. Query auto-complete enhances search accuracy and synonym matching. 2. Natural 191 | language processing (NLP)-enhanced relevance: Search results are prioritized according to the strength of the relationship between the entities queried. 192 | 3. Users can further refine results with facet filters—section, journal and type. 4. Search results include highlighted entity snippets explaining relevance. 193 | 5. Histogram visualizes number of results by publication year. 6. Entity highlighting can be switched on or off according to user preference. 194 | 195 | Nucleic Acids Research, 2024, Vol. 52, Web Server issue 196 | 197 | W543 198 | 199 | Case study I: entity relation queries 200 | We analyzed the retrieval quality of PubTator 3.0 by preparing a series of 12 entity pairs to serve as case studies for 201 | comparison between PubTator 3.0, PubMed and Google 202 | Scholar. To provide an equal comparison, we filtered about 203 | 30% of the Google Scholar results for articles not present 204 | in PubMed. To ensure that the number of results would 205 | remain low enough to allow filtering Google Scholar results for articles not in PubMed, we identified entity pairs 206 | first discussed together in the literature in 2022 or later. 
We 207 | then randomly selected two entity pairs of each of the following types: disease/gene, chemical/disease, chemical/gene, 208 | chemical/chemical, gene/gene and disease/variant. None of 209 | 210 | the relation pairs selected appears in the training set. The 211 | comparison was performed with respect to a snapshot of the 212 | search results returned by all search engines on 19 May 2023. 213 | We manually evaluated the top 20 results for each system and 214 | each query; articles were judged to be relevant if they mentioned both entities in the query and supported a relationship 215 | between them. Two curators independently judged each article, and discrepancies were discussed until agreement. The 216 | curators were not blinded to the retrieval method but were 217 | required to record the text supporting the relationship, if relevant. This experiment evaluated the relevance of the top 20 218 | results for each retrieval method, regardless of whether the 219 | article appeared in PubMed. 220 | 221 | Downloaded from https://academic.oup.com/nar/article/52/W1/W540/7640526 by guest on 10 March 2025 222 | 223 | Figure 2. (A) The PubTator 3.0 processing pipeline: AIONER (8) identifies six types of entities in PubMed abstracts and PMC-OA full-text articles. Entity 224 | annotations are associated with database identifiers by specialized mappers and BioREx (9) identifies relations between entities. Extracted data is 225 | stored in MongoDB and made searchable using Solr. (B) Entity recognition performance for each entity type compared with PubTator2 (also known as 226 | PubTatorCentral) (13) on the BioRED corpus (15). (C) Relation extraction performance compared with SemRep (11) and notable previous best systems 227 | (12,13) on the BioCreative V Chemical-Disease Relation (14) corpus. (D) Comparison of information retrieval for PubTator 3.0, PubMed, and Google 228 | Scholar for entity pair queries, with respect to total article count and top-20 article precision. 
229 | 230 | W544 231 | 232 | Case study II: retrieval-augmented generation 233 | In the era of large language models (LLMs), PubTator 3.0 can 234 | also enhance their factual accuracy via retrieval augmented 235 | generation. Despite their strong language ability, LLMs are 236 | prone to generating incorrect assertions, sometimes known 237 | as hallucinations (30,31). For example, when requested to 238 | cite sources for questions such as ‘which diseases can doxorubicin treat’, GPT-4 frequently provides seemingly plausible but nonexistent references. Augmenting GPT-4 with PubTator 3.0 APIs can anchor the model’s response to verifiable 239 | references via the extracted relations, significantly reducing 240 | hallucinations. 241 | We assessed the citation accuracy of responses from three 242 | GPT-4 variations: PubTator-augmented GPT-4, PubMedaugmented GPT-4 and standard GPT-4. We performed a qualitative evaluation based on eight questions selected as follows. We identified entities mentioned in the PubMed query 243 | logs and randomly selected from entities searched both frequently and rarely. We then identified the common queries for 244 | each entity that request relational information and adapted 245 | one into a natural language question. Each question is therefore grounded on common information needs of real PubMed 246 | users. For example, the questions ‘What can be caused by 247 | tocilizumab?’ and ‘What can be treated by doxorubicin?’ 248 | are adapted from the user queries ‘tocilizumab side effects’ 249 | and ‘doxorubicin treatment’ respectively. Such questions typically require extracting information from multiple articles 250 | and an understanding of biomedical entities and relationship descriptions. Supplementary Table S5 lists the questions 251 | chosen. 252 | We augmented the GPT-4 large language model (LLM) with 253 | PubTator 3.0 via the function calling mechanism of the OpenAI ChatCompletion API. 
This integration involved prompt- 254 | 255 | ing GPT-4 with descriptions of three PubTator APIs: (i) find 256 | entity ID, which retrieves PubTator entity identifiers; (ii) find 257 | related entities, which identifies related entities based on an 258 | input entity and specified relations and (iii) export relevant 259 | search results, which returns PubMed article identifiers containing textual evidence for specific entity relationships. Our 260 | instructions prompted GPT-4 to decompose user questions 261 | into sub-questions addressable by these APIs, execute the 262 | function calls, and synthesize the responses into a coherent final answer. Our prompt promoted a summarized response by 263 | instructing GPT-4 to start its message with ‘Summary:’ and requested the response include citations to the articles providing 264 | evidence. The PubMed augmentation experiments provided 265 | GPT-4 with access to PubMed database search via the National Center for Biotechnology Information (NCBI) E-utils 266 | APIs (32). We used Azure OpenAI Services (version 2023-0701-preview) and GPT-4 (version 2023-06-13) and set the decoding temperature to zero to obtain deterministic outputs. 267 | The full prompts are provided in Supplementary Table S6. 268 | PubTator-augmented GPT-4 generally processed the questions in three steps: (i) finding the standard entity identifiers, (ii) finding its related entity identifiers and (iii) searching PubMed articles. For example, to answer ‘What drugs can 269 | treat breast cancer?’, GPT-4 first found the PubTator entity 270 | identifier for breast cancer (@DISEASE_Breast_Cancer) using 271 | the Find Entity ID API. It then used the Find Related Entities 272 | API to identify entities related to @DISEASE_Breast_Cancer 273 | through a ‘treat’ relation. For demonstration purposes, we 274 | limited the maximum number of output entities to five. 
Finally, 275 | GPT-4 called the Export Relevant Search Results API for the 276 | PubMed article identifiers containing evidence for these relationships. The raw responses to each prompt for each method 277 | are provided in Supplementary Table S6. 278 | We manually evaluated the accuracy of the citations in 279 | the responses by reviewing each PubMed article and verifying whether each PubMed article cited supported the 280 | stated relationship (e.g. Tamoxifen treating breast cancer). 281 | Supplementary Table S5 reports the proportion of the cited 282 | articles with valid supporting evidence for each method. GPT4 frequently generated fabricated citations, widely known 283 | as the hallucination issue. While PubMed-augmented GPT-4 284 | showed a higher proportion of accurate citations, some articles cited did not support the relation claims. This is likely 285 | because PubMed is based on keyword and Boolean search and 286 | does not support queries for specific relationships. Responses 287 | generated by PubTator-augmented GPT-4 demonstrated the 288 | highest level of citation accuracy, underscoring the potential of PubTator 3.0 as a high-quality knowledge source for 289 | addressing biomedical information needs through retrievalaugmented generation with LLMs such as GPT-4. In our experiment, using Azure for ChatGPT, the cost was approximately $1 for two questions with GPT-4-Turbo, or 40 questions when downgraded to GPT-3.5-Turbo, including the cost 290 | of input/output tokens. 291 | 292 | Discussion 293 | Previous versions of PubTator have fulfilled over one billion 294 | API requests since 2015, supporting a wide range of research 295 | applications. 
Numerous studies have harnessed PubTator annotations for disease-specific gene research, including efforts 296 | to prioritize candidate genes (33), determine gene–phenotype 297 | associations (34), and identify the genetic underpinnings of 298 | 299 | Downloaded from https://academic.oup.com/nar/article/52/W1/W540/7640526 by guest on 10 March 2025 300 | 301 | Our analysis is summarized in Figure 2D, and 302 | Supplementary Table S4 presents a detailed comparison 303 | of the quality of retrieved results between PubTator 3.0, 304 | PubMed and Google Scholar. Our results demonstrate that 305 | PubTator 3.0 retrieves a greater number of articles than the 306 | comparison systems and its precision is higher for the top 307 | 20 results. For instance, PubTator 3.0 returned 346 articles 308 | for the query ‘GLPG0634 + ulcerative colitis’, and manual 309 | review of the top 20 articles showed that all contained 310 | statements about an association between GLPG0634 and 311 | ulcerative colitis. In contrast, PubMed only returned a total 312 | of 18 articles, with only 12 mentioning an association. Moreover, when searching for ‘COVID19 + PON1’, PubTator 3.0 313 | returns 212 articles in PubMed, surpassing the 43 articles 314 | obtained from Google Scholar, only 29 of which are sourced 315 | from PubMed. These disparities can be attributed to several 316 | factors: (i) PubTator 3.0’s search includes full texts available 317 | in PMC-OA, resulting in significantly broader coverage of 318 | articles, (ii) entity normalization improves recall, for example, 319 | by matching ‘paraoxonase 1’ to ‘PON1’, (iii) PubTator 3.0 320 | prioritizes articles containing relations between the query 321 | entities, (iv) Pubtator 3.0 prioritizes articles where the entities 322 | appear nearby, rather than distant paragraphs. 
Across the 12 323 | information retrieval case studies, PubTator 3.0 demonstrated 324 | an overall precision of 90.0% for the top 20 articles (216 out 325 | of 240), which is significantly higher than PubMed’s precision 326 | of 81.6% (84 out of 103) and Google Scholar’s precision of 327 | 48.5% (98 out of 202). 328 | 329 | Nucleic Acids Research, 2024, Vol. 52, Web Server issue 330 | 331 | W545 332 | 333 | Nucleic Acids Research, 2024, Vol. 52, Web Server issue 334 | 335 | Conclusion 336 | PubTator 3.0 offers a comprehensive set of features and tools 337 | that allow researchers to navigate the ever-expanding wealth 338 | of biomedical literature, expediting research and unlocking 339 | valuable insights for scientific discovery. The PubTator 3.0 interface, API, and bulk file downloads are available at https: 340 | //www.ncbi.nlm.nih.gov/research/pubtator3/. 341 | 342 | Data availability 343 | Data is available through the online interface at https:// 344 | www.ncbi.nlm.nih.gov/research/pubtator3/, through the API 345 | at https://www.ncbi.nlm.nih.gov/research/pubtator3/api or 346 | bulk FTP download at https://ftp.ncbi.nlm.nih.gov/pub/lu/ 347 | PubTator3/. 348 | The source code for each component of PubTator 3.0 349 | is openly accessible. The AIONER named entity recognizer 350 | is available at https://github.com/ncbi/AIONER. GNorm2, 351 | for gene name normalization, is available at https://github. 352 | com/ncbi/GNorm2. The tmVar3 variant name normalizer 353 | is available at https://github.com/ncbi/tmVar3. The NLMChem Tagger, for chemical name normalization, is available 354 | at https://ftp.ncbi.nlm.nih.gov/pub/lu/NLMChem. The TaggerOne system, for disease and cell line normalization, is available at https://www.ncbi.nlm.nih.gov/research/bionlp/Tools/ 355 | taggerone. The BioREx relation extraction system is available 356 | at https://github.com/ncbi/BioREx. 
The code for customizing 357 | ChatGPT with the PubTator 3.0 API is available at https: 358 | //github.com/ncbi-nlp/pubtator-gpt. The details of the applications, performance, evaluation data, and citations for each 359 | tool are shown in Supplementary Table S7. All source code is 360 | also available at https://doi.org/10.5281/zenodo.10839630. 361 | 362 | Supplementary data 363 | Supplementary Data are available at NAR Online. 364 | 365 | Funding 366 | Intramural Research Program of the National Library of 367 | Medicine (NLM), National Institutes of Health; ODSS Support of the Exploration of Cloud in NIH Intramural Research. 368 | Funding for open access charge: Intramural Research Program 369 | of the National Library of Medicine, National Institutes of 370 | Health. 371 | 372 | Conflict of interest statement 373 | None declared. 374 | ``` -------------------------------------------------------------------------------- /src/biomcp/utils/endpoint_registry.py: -------------------------------------------------------------------------------- ```python 1 | """Registry for tracking all external HTTP endpoints used by BioMCP.""" 2 | 3 | from dataclasses import dataclass, field 4 | from enum import Enum 5 | from pathlib import Path 6 | from typing import Any 7 | from urllib.parse import urlparse 8 | 9 | 10 | class EndpointCategory(str, Enum): 11 | """Categories of external endpoints.""" 12 | 13 | BIOMEDICAL_LITERATURE = "biomedical_literature" 14 | CLINICAL_TRIALS = "clinical_trials" 15 | VARIANT_DATABASES = "variant_databases" 16 | CANCER_GENOMICS = "cancer_genomics" 17 | HEALTH_MONITORING = "health_monitoring" 18 | REGULATORY_DATA = "regulatory_data" 19 | 20 | 21 | class DataType(str, Enum): 22 | """Types of data accessed from endpoints.""" 23 | 24 | RESEARCH_ARTICLES = "research_articles" 25 | CLINICAL_TRIAL_DATA = "clinical_trial_data" 26 | GENETIC_VARIANTS = "genetic_variants" 27 | CANCER_MUTATIONS = "cancer_mutations" 28 | GENE_ANNOTATIONS = "gene_annotations" 29 
| SERVICE_STATUS = "service_status" 30 | ADVERSE_EVENTS = "adverse_events" 31 | DRUG_LABELS = "drug_labels" 32 | DEVICE_EVENTS = "device_events" 33 | 34 | 35 | @dataclass 36 | class EndpointInfo: 37 | """Information about an external endpoint.""" 38 | 39 | url: str 40 | category: EndpointCategory 41 | data_types: list[DataType] = field(default_factory=list) 42 | description: str = "" 43 | compliance_notes: str = "" 44 | rate_limit: str | None = None 45 | authentication: str | None = None 46 | 47 | @property 48 | def domain(self) -> str: 49 | """Extract domain from URL.""" 50 | parsed = urlparse(self.url) 51 | return parsed.netloc 52 | 53 | 54 | class EndpointRegistry: 55 | """Registry for tracking all external endpoints.""" 56 | 57 | def __init__(self): 58 | self._endpoints: dict[str, EndpointInfo] = {} 59 | self._initialize_known_endpoints() 60 | 61 | def _initialize_known_endpoints(self): 62 | """Initialize registry with known endpoints.""" 63 | # PubMed/PubTator3 64 | self.register( 65 | "pubtator3_search", 66 | EndpointInfo( 67 | url="https://www.ncbi.nlm.nih.gov/research/pubtator3-api/search/", 68 | category=EndpointCategory.BIOMEDICAL_LITERATURE, 69 | data_types=[DataType.RESEARCH_ARTICLES], 70 | description="PubTator3 API for searching biomedical literature with entity annotations", 71 | compliance_notes="Public NIH/NCBI service, no PII transmitted", 72 | rate_limit="20 requests/second", 73 | ), 74 | ) 75 | 76 | self.register( 77 | "pubtator3_export", 78 | EndpointInfo( 79 | url="https://www.ncbi.nlm.nih.gov/research/pubtator3-api/publications/export/biocjson", 80 | category=EndpointCategory.BIOMEDICAL_LITERATURE, 81 | data_types=[DataType.RESEARCH_ARTICLES], 82 | description="PubTator3 API for fetching full article annotations in BioC-JSON format", 83 | compliance_notes="Public NIH/NCBI service, no PII transmitted", 84 | rate_limit="20 requests/second", 85 | ), 86 | ) 87 | 88 | self.register( 89 | "pubtator3_autocomplete", 90 | EndpointInfo( 91 | 
url="https://www.ncbi.nlm.nih.gov/research/pubtator3-api/entity/autocomplete/", 92 | category=EndpointCategory.BIOMEDICAL_LITERATURE, 93 | data_types=[DataType.GENE_ANNOTATIONS], 94 | description="PubTator3 API for entity name autocomplete suggestions", 95 | compliance_notes="Public NIH/NCBI service, no PII transmitted", 96 | rate_limit="20 requests/second", 97 | ), 98 | ) 99 | 100 | # ClinicalTrials.gov 101 | self.register( 102 | "clinicaltrials_search", 103 | EndpointInfo( 104 | url="https://clinicaltrials.gov/api/v2/studies", 105 | category=EndpointCategory.CLINICAL_TRIALS, 106 | data_types=[DataType.CLINICAL_TRIAL_DATA], 107 | description="ClinicalTrials.gov API v2 for searching clinical trials", 108 | compliance_notes="Public NIH service, may contain trial participant criteria", 109 | rate_limit="10 requests/second", 110 | ), 111 | ) 112 | 113 | # MyVariant.info 114 | self.register( 115 | "myvariant_query", 116 | EndpointInfo( 117 | url="https://myvariant.info/v1/query", 118 | category=EndpointCategory.VARIANT_DATABASES, 119 | data_types=[DataType.GENETIC_VARIANTS], 120 | description="MyVariant.info API for querying genetic variants", 121 | compliance_notes="Public service aggregating variant databases, no patient data", 122 | rate_limit="1000 requests/hour (anonymous)", 123 | ), 124 | ) 125 | 126 | self.register( 127 | "myvariant_variant", 128 | EndpointInfo( 129 | url="https://myvariant.info/v1/variant", 130 | category=EndpointCategory.VARIANT_DATABASES, 131 | data_types=[DataType.GENETIC_VARIANTS], 132 | description="MyVariant.info API for fetching specific variant details", 133 | compliance_notes="Public service aggregating variant databases, no patient data", 134 | rate_limit="1000 requests/hour (anonymous)", 135 | ), 136 | ) 137 | 138 | # Preprint servers 139 | self.register( 140 | "biorxiv_api", 141 | EndpointInfo( 142 | url="https://api.biorxiv.org/details/biorxiv", 143 | category=EndpointCategory.BIOMEDICAL_LITERATURE, 144 | 
data_types=[DataType.RESEARCH_ARTICLES], 145 | description="bioRxiv API for searching biology preprints", 146 | compliance_notes="Public preprint server, no PII transmitted", 147 | rate_limit="Not specified", 148 | ), 149 | ) 150 | 151 | self.register( 152 | "medrxiv_api", 153 | EndpointInfo( 154 | url="https://api.biorxiv.org/details/medrxiv", 155 | category=EndpointCategory.BIOMEDICAL_LITERATURE, 156 | data_types=[DataType.RESEARCH_ARTICLES], 157 | description="medRxiv API for searching medical preprints", 158 | compliance_notes="Public preprint server, no PII transmitted", 159 | rate_limit="Not specified", 160 | ), 161 | ) 162 | 163 | self.register( 164 | "europe_pmc", 165 | EndpointInfo( 166 | url="https://www.ebi.ac.uk/europepmc/webservices/rest/search", 167 | category=EndpointCategory.BIOMEDICAL_LITERATURE, 168 | data_types=[DataType.RESEARCH_ARTICLES], 169 | description="Europe PMC REST API for searching biomedical literature", 170 | compliance_notes="Public EMBL-EBI service, no PII transmitted", 171 | rate_limit="Not specified", 172 | ), 173 | ) 174 | 175 | # External variant sources 176 | self.register( 177 | "gdc_ssms", 178 | EndpointInfo( 179 | url="https://api.gdc.cancer.gov/ssms", 180 | category=EndpointCategory.VARIANT_DATABASES, 181 | data_types=[DataType.CANCER_MUTATIONS], 182 | description="NCI GDC API for somatic mutations", 183 | compliance_notes="Public NCI service, aggregate cancer genomics data", 184 | rate_limit="Not specified", 185 | ), 186 | ) 187 | 188 | self.register( 189 | "gdc_ssm_occurrences", 190 | EndpointInfo( 191 | url="https://api.gdc.cancer.gov/ssm_occurrences", 192 | category=EndpointCategory.VARIANT_DATABASES, 193 | data_types=[DataType.CANCER_MUTATIONS], 194 | description="NCI GDC API for mutation occurrences in cancer samples", 195 | compliance_notes="Public NCI service, aggregate cancer genomics data", 196 | rate_limit="Not specified", 197 | ), 198 | ) 199 | 200 | self.register( 201 | "ensembl_variation", 202 | EndpointInfo( 
203 | url="https://rest.ensembl.org/variation/human", 204 | category=EndpointCategory.VARIANT_DATABASES, 205 | data_types=[DataType.GENETIC_VARIANTS], 206 | description="Ensembl REST API for human genetic variation data", 207 | compliance_notes="Public EMBL-EBI service, population genetics data", 208 | rate_limit="15 requests/second", 209 | ), 210 | ) 211 | 212 | self.register( 213 | "cbioportal_api", 214 | EndpointInfo( 215 | url="https://www.cbioportal.org/api", 216 | category=EndpointCategory.CANCER_GENOMICS, 217 | data_types=[ 218 | DataType.CANCER_MUTATIONS, 219 | DataType.CLINICAL_TRIAL_DATA, 220 | ], 221 | description="cBioPortal API for cancer genomics data", 222 | compliance_notes="Public MSKCC/Dana-Farber service, aggregate cancer genomics", 223 | rate_limit="5 requests/second", 224 | authentication="Optional API token for increased rate limits", 225 | ), 226 | ) 227 | 228 | # Specific cBioPortal endpoints 229 | self.register( 230 | "cbioportal_genes", 231 | EndpointInfo( 232 | url="https://www.cbioportal.org/api/genes", 233 | category=EndpointCategory.CANCER_GENOMICS, 234 | data_types=[DataType.GENE_ANNOTATIONS], 235 | description="cBioPortal API for gene information", 236 | compliance_notes="Public MSKCC/Dana-Farber service, gene metadata", 237 | rate_limit="5 requests/second", 238 | ), 239 | ) 240 | 241 | self.register( 242 | "cbioportal_cancer_types", 243 | EndpointInfo( 244 | url="https://www.cbioportal.org/api/cancer-types", 245 | category=EndpointCategory.CANCER_GENOMICS, 246 | data_types=[DataType.CANCER_MUTATIONS], 247 | description="cBioPortal API for cancer type hierarchy", 248 | compliance_notes="Public MSKCC/Dana-Farber service, cancer type metadata", 249 | rate_limit="5 requests/second", 250 | ), 251 | ) 252 | 253 | self.register( 254 | "cbioportal_molecular_profiles", 255 | EndpointInfo( 256 | url="https://www.cbioportal.org/api/molecular-profiles", 257 | category=EndpointCategory.CANCER_GENOMICS, 258 | 
data_types=[DataType.CANCER_MUTATIONS], 259 | description="cBioPortal API for molecular profiles", 260 | compliance_notes="Public MSKCC/Dana-Farber service, study metadata", 261 | rate_limit="5 requests/second", 262 | ), 263 | ) 264 | 265 | self.register( 266 | "cbioportal_mutations", 267 | EndpointInfo( 268 | url="https://www.cbioportal.org/api/mutations", 269 | category=EndpointCategory.CANCER_GENOMICS, 270 | data_types=[DataType.CANCER_MUTATIONS], 271 | description="cBioPortal API for mutation data", 272 | compliance_notes="Public MSKCC/Dana-Farber service, aggregate mutation data", 273 | rate_limit="5 requests/second", 274 | ), 275 | ) 276 | 277 | self.register( 278 | "cbioportal_studies", 279 | EndpointInfo( 280 | url="https://www.cbioportal.org/api/studies", 281 | category=EndpointCategory.CANCER_GENOMICS, 282 | data_types=[ 283 | DataType.CLINICAL_TRIAL_DATA, 284 | DataType.CANCER_MUTATIONS, 285 | ], 286 | description="cBioPortal API for cancer studies", 287 | compliance_notes="Public MSKCC/Dana-Farber service, study metadata", 288 | rate_limit="5 requests/second", 289 | ), 290 | ) 291 | 292 | # BioThings Suite APIs 293 | self.register( 294 | "mygene_query", 295 | EndpointInfo( 296 | url="https://mygene.info/v3/query", 297 | category=EndpointCategory.VARIANT_DATABASES, 298 | data_types=[DataType.GENE_ANNOTATIONS], 299 | description="MyGene.info API for querying gene information", 300 | compliance_notes="Public BioThings service, gene annotation data", 301 | rate_limit="10 requests/second", 302 | ), 303 | ) 304 | 305 | self.register( 306 | "mygene_gene", 307 | EndpointInfo( 308 | url="https://mygene.info/v3/gene", 309 | category=EndpointCategory.VARIANT_DATABASES, 310 | data_types=[DataType.GENE_ANNOTATIONS], 311 | description="MyGene.info API for fetching specific gene details", 312 | compliance_notes="Public BioThings service, gene annotation data", 313 | rate_limit="10 requests/second", 314 | ), 315 | ) 316 | 317 | self.register( 318 | "mydisease_query", 
319 | EndpointInfo( 320 | url="https://mydisease.info/v1/query", 321 | category=EndpointCategory.VARIANT_DATABASES, 322 | data_types=[DataType.GENE_ANNOTATIONS], 323 | description="MyDisease.info API for querying disease information", 324 | compliance_notes="Public BioThings service, disease ontology data", 325 | rate_limit="10 requests/second", 326 | ), 327 | ) 328 | 329 | self.register( 330 | "mydisease_disease", 331 | EndpointInfo( 332 | url="https://mydisease.info/v1/disease", 333 | category=EndpointCategory.VARIANT_DATABASES, 334 | data_types=[DataType.GENE_ANNOTATIONS], 335 | description="MyDisease.info API for fetching specific disease details", 336 | compliance_notes="Public BioThings service, disease ontology data", 337 | rate_limit="10 requests/second", 338 | ), 339 | ) 340 | 341 | self.register( 342 | "mychem_query", 343 | EndpointInfo( 344 | url="https://mychem.info/v1/query", 345 | category=EndpointCategory.VARIANT_DATABASES, 346 | data_types=[DataType.GENE_ANNOTATIONS], 347 | description="MyChem.info API for querying drug/chemical information", 348 | compliance_notes="Public BioThings service, drug/chemical annotation data", 349 | rate_limit="10 requests/second", 350 | ), 351 | ) 352 | 353 | self.register( 354 | "mychem_chem", 355 | EndpointInfo( 356 | url="https://mychem.info/v1/chem", 357 | category=EndpointCategory.VARIANT_DATABASES, 358 | data_types=[DataType.GENE_ANNOTATIONS], 359 | description="MyChem.info API for fetching specific drug/chemical details", 360 | compliance_notes="Public BioThings service, drug/chemical annotation data", 361 | rate_limit="10 requests/second", 362 | ), 363 | ) 364 | 365 | # NCI Clinical Trials Search API 366 | self.register( 367 | "nci_trials", 368 | EndpointInfo( 369 | url="https://clinicaltrialsapi.cancer.gov/api/v2/trials", 370 | category=EndpointCategory.CLINICAL_TRIALS, 371 | data_types=[DataType.CLINICAL_TRIAL_DATA], 372 | description="NCI Clinical Trials Search API for cancer trials", 373 | 
compliance_notes="Public NCI service, cancer trial data", 374 | rate_limit="Not specified", 375 | authentication="Optional NCI_API_KEY for increased access", 376 | ), 377 | ) 378 | 379 | self.register( 380 | "nci_organizations", 381 | EndpointInfo( 382 | url="https://clinicaltrialsapi.cancer.gov/api/v2/organizations", 383 | category=EndpointCategory.CLINICAL_TRIALS, 384 | data_types=[DataType.CLINICAL_TRIAL_DATA], 385 | description="NCI API for cancer research organizations", 386 | compliance_notes="Public NCI service, organization metadata", 387 | rate_limit="Not specified", 388 | authentication="Optional NCI_API_KEY for increased access", 389 | ), 390 | ) 391 | 392 | self.register( 393 | "nci_diseases", 394 | EndpointInfo( 395 | url="https://clinicaltrialsapi.cancer.gov/api/v2/diseases", 396 | category=EndpointCategory.CLINICAL_TRIALS, 397 | data_types=[DataType.CLINICAL_TRIAL_DATA], 398 | description="NCI API for cancer disease vocabulary", 399 | compliance_notes="Public NCI service, disease ontology", 400 | rate_limit="Not specified", 401 | authentication="Optional NCI_API_KEY for increased access", 402 | ), 403 | ) 404 | 405 | self.register( 406 | "nci_interventions", 407 | EndpointInfo( 408 | url="https://clinicaltrialsapi.cancer.gov/api/v2/interventions", 409 | category=EndpointCategory.CLINICAL_TRIALS, 410 | data_types=[DataType.CLINICAL_TRIAL_DATA], 411 | description="NCI API for cancer treatment interventions", 412 | compliance_notes="Public NCI service, intervention metadata", 413 | rate_limit="Not specified", 414 | authentication="Optional NCI_API_KEY for increased access", 415 | ), 416 | ) 417 | 418 | self.register( 419 | "nci_biomarkers", 420 | EndpointInfo( 421 | url="https://clinicaltrialsapi.cancer.gov/api/v2/biomarkers", 422 | category=EndpointCategory.CLINICAL_TRIALS, 423 | data_types=[DataType.CLINICAL_TRIAL_DATA], 424 | description="NCI API for biomarkers used in clinical trials", 425 | compliance_notes="Public NCI service, biomarker metadata", 
426 | rate_limit="Not specified", 427 | authentication="Optional NCI_API_KEY for increased access", 428 | ), 429 | ) 430 | 431 | # OpenFDA APIs 432 | self.register( 433 | "openfda_drug_events", 434 | EndpointInfo( 435 | url="https://api.fda.gov/drug/event.json", 436 | category=EndpointCategory.REGULATORY_DATA, 437 | data_types=[DataType.ADVERSE_EVENTS], 438 | description="FDA Adverse Event Reporting System (FAERS) for drug safety data", 439 | compliance_notes="Public FDA service, voluntary adverse event reports, no PII", 440 | rate_limit="40 requests/minute (240 with API key)", 441 | authentication="Optional OPENFDA_API_KEY for increased rate limits", 442 | ), 443 | ) 444 | 445 | self.register( 446 | "openfda_drug_labels", 447 | EndpointInfo( 448 | url="https://api.fda.gov/drug/label.json", 449 | category=EndpointCategory.REGULATORY_DATA, 450 | data_types=[DataType.DRUG_LABELS], 451 | description="FDA Structured Product Labeling (SPL) for drug prescribing information", 452 | compliance_notes="Public FDA service, official drug labeling data", 453 | rate_limit="40 requests/minute (240 with API key)", 454 | authentication="Optional OPENFDA_API_KEY for increased rate limits", 455 | ), 456 | ) 457 | 458 | self.register( 459 | "openfda_device_events", 460 | EndpointInfo( 461 | url="https://api.fda.gov/device/event.json", 462 | category=EndpointCategory.REGULATORY_DATA, 463 | data_types=[DataType.DEVICE_EVENTS], 464 | description="FDA MAUDE database for medical device adverse events", 465 | compliance_notes="Public FDA service, device malfunction and adverse event reports", 466 | rate_limit="40 requests/minute (240 with API key)", 467 | authentication="Optional OPENFDA_API_KEY for increased rate limits", 468 | ), 469 | ) 470 | 471 | self.register( 472 | "openfda_drugsfda", 473 | EndpointInfo( 474 | url="https://api.fda.gov/drug/drugsfda.json", 475 | category=EndpointCategory.REGULATORY_DATA, 476 | data_types=[DataType.DRUG_LABELS], 477 | description="FDA Drugs@FDA 
database for drug approval information", 478 | compliance_notes="Public FDA service, official drug approval records", 479 | rate_limit="40 requests/minute (240 with API key)", 480 | authentication="Optional OPENFDA_API_KEY for increased rate limits", 481 | ), 482 | ) 483 | 484 | self.register( 485 | "openfda_drug_enforcement", 486 | EndpointInfo( 487 | url="https://api.fda.gov/drug/enforcement.json", 488 | category=EndpointCategory.REGULATORY_DATA, 489 | data_types=[DataType.ADVERSE_EVENTS], 490 | description="FDA Enforcement database for drug recall information", 491 | compliance_notes="Public FDA service, drug recall and enforcement actions", 492 | rate_limit="40 requests/minute (240 with API key)", 493 | authentication="Optional OPENFDA_API_KEY for increased rate limits", 494 | ), 495 | ) 496 | 497 | # Note: Drug shortage endpoint is not yet available via OpenFDA 498 | # Using placeholder for future migration when FDA provides official endpoint 499 | self.register( 500 | "fda_drug_shortages", 501 | EndpointInfo( 502 | url="https://www.fda.gov/media/169066/download", 503 | category=EndpointCategory.REGULATORY_DATA, 504 | data_types=[DataType.DRUG_LABELS], 505 | description="FDA Drug Shortages database (cached locally)", 506 | compliance_notes="Public FDA service, drug shortage status information", 507 | rate_limit="Cached with 24-hour TTL", 508 | authentication="None required", 509 | ), 510 | ) 511 | 512 | def register(self, key: str, endpoint: EndpointInfo): 513 | """Register an endpoint for tracking. 514 | 515 | Args: 516 | key: Unique identifier for the endpoint 517 | endpoint: Endpoint metadata including URL, description, and compliance notes 518 | """ 519 | self._endpoints[key] = endpoint 520 | 521 | def get_all_endpoints(self) -> dict[str, EndpointInfo]: 522 | """Get all registered endpoints. 
523 | 524 | Returns: 525 | Dictionary mapping endpoint keys to their metadata 526 | """ 527 | return self._endpoints.copy() 528 | 529 | def get_endpoints_by_category( 530 | self, category: EndpointCategory 531 | ) -> dict[str, EndpointInfo]: 532 | """Get endpoints filtered by category. 533 | 534 | Args: 535 | category: The category to filter by 536 | 537 | Returns: 538 | Dictionary of endpoints belonging to the specified category 539 | """ 540 | return { 541 | key: info 542 | for key, info in self._endpoints.items() 543 | if info.category == category 544 | } 545 | 546 | def get_unique_domains(self) -> set[str]: 547 | """Get all unique domains accessed by BioMCP. 548 | 549 | Returns: 550 | Set of unique domain names (e.g., 'api.ncbi.nlm.nih.gov') 551 | """ 552 | return {info.domain for info in self._endpoints.values()} 553 | 554 | def generate_markdown_report(self) -> str: 555 | """Generate markdown documentation of all endpoints.""" 556 | lines = [ 557 | "# Third-Party Endpoints Used by BioMCP", 558 | "", 559 | "_This file is auto-generated from the endpoint registry._", 560 | "", 561 | "## Overview", 562 | "", 563 | f"BioMCP connects to {len(self.get_unique_domains())} external domains across {len(self._endpoints)} endpoints.", 564 | "", 565 | "## Endpoints by Category", 566 | "", 567 | ] 568 | 569 | # Group by category 570 | for category in EndpointCategory: 571 | endpoints = self.get_endpoints_by_category(category) 572 | if not endpoints: 573 | continue 574 | 575 | lines.append(f"### {category.value.replace('_', ' ').title()}") 576 | lines.append("") 577 | 578 | for key, info in sorted(endpoints.items()): 579 | lines.append(f"#### {key}") 580 | lines.append("") 581 | lines.append(f"- **URL**: `{info.url}`") 582 | lines.append(f"- **Description**: {info.description}") 583 | lines.append( 584 | f"- **Data Types**: {', '.join(dt.value for dt in info.data_types)}" 585 | ) 586 | lines.append( 587 | f"- **Rate Limit**: {info.rate_limit or 'Not specified'}" 588 | ) 589 
| 590 | if info.authentication: 591 | lines.append( 592 | f"- **Authentication**: {info.authentication}" 593 | ) 594 | 595 | if info.compliance_notes: 596 | lines.append( 597 | f"- **Compliance Notes**: {info.compliance_notes}" 598 | ) 599 | 600 | lines.append("") 601 | 602 | # Add summary section 603 | lines.extend([ 604 | "## Domain Summary", 605 | "", 606 | "| Domain | Category | Endpoints |", 607 | "| -------------------- | --------------------- | --------- |", 608 | ]) 609 | 610 | domain_stats: dict[str, dict[str, Any]] = {} 611 | for info in self._endpoints.values(): 612 | domain = info.domain 613 | if domain not in domain_stats: 614 | domain_stats[domain] = { 615 | "category": info.category.value, 616 | "count": 0, 617 | } 618 | domain_stats[domain]["count"] = ( 619 | int(domain_stats[domain]["count"]) + 1 620 | ) 621 | 622 | for domain, stats in sorted(domain_stats.items()): 623 | lines.append( 624 | f"| {domain} | {stats['category']} | {stats['count']} |" 625 | ) 626 | 627 | lines.extend([ 628 | "", 629 | "## Compliance and Privacy", 630 | "", 631 | "All endpoints accessed by BioMCP:", 632 | "", 633 | "- Use publicly available APIs", 634 | "- Do not transmit personally identifiable information (PII)", 635 | "- Access only aggregate or de-identified data", 636 | "- Comply with respective terms of service", 637 | "", 638 | "## Network Control", 639 | "", 640 | "For air-gapped or restricted environments, BioMCP supports:", 641 | "", 642 | "- Offline mode via `BIOMCP_OFFLINE=true` environment variable", 643 | "- Custom proxy configuration via standard HTTP(S)\\_PROXY variables", 644 | "- SSL certificate pinning for enhanced security", 645 | "", 646 | ]) 647 | 648 | return "\n".join(lines) 649 | 650 | def save_markdown_report(self, output_path: Path | None = None): 651 | """Save markdown report to file.""" 652 | if output_path is None: 653 | output_path = ( 654 | Path(__file__).parent.parent.parent 655 | / "THIRD_PARTY_ENDPOINTS.md" 656 | ) 657 | 658 | 
output_path.write_text(self.generate_markdown_report()) 659 | return output_path 660 | 661 | 662 | # Global registry instance 663 | _registry = EndpointRegistry() 664 | 665 | 666 | def get_registry() -> EndpointRegistry: 667 | """Get the global endpoint registry.""" 668 | return _registry 669 | ``` -------------------------------------------------------------------------------- /tests/data/ct_gov/clinical_trials_api_v2.yaml: -------------------------------------------------------------------------------- ```yaml 1 | openapi: "3.0.3" 2 | info: 3 | title: "ClinicalTrials.gov REST API" 4 | description: 5 | "This API is made available to provide users meta data, statistics,\ 6 | \ and the most recent version of the clinical trials available on ClinicalTrials.gov." 7 | version: "2.0.3" 8 | tags: 9 | - name: "Studies" 10 | description: "Related to clinical trial studies" 11 | - name: "Stats" 12 | description: "Data statistics" 13 | - name: "Version" 14 | description: "Version info" 15 | servers: 16 | - url: "https://clinicaltrials.gov/api/v2" 17 | description: "This server" 18 | paths: 19 | /studies: 20 | get: 21 | summary: "Studies" 22 | description: 23 | "Returns data of studies matching query and filter parameters.\ 24 | \ The studies are returned page by page.\nIf response contains `nextPageToken`,\ 25 | \ use its value in `pageToken` to get next page.\nThe last page will not contain\ 26 | \ `nextPageToken`. 
A page may have empty `studies` array.\nRequest for each\ 27 | \ subsequent page **must** have the same parameters as for the first page,\ 28 | \ except\n`countTotal`, `pageSize`, and `pageToken` parameters.\n\nIf neither\ 29 | \ queries nor filters are set, all studies will be returned.\nIf any query\ 30 | \ parameter contains only NCT IDs (comma- and/or space-separated), filters\ 31 | \ are ignored.\n\n`query.*` parameters are in [Essie expression syntax](/find-studies/constructing-complex-search-queries).\n\ 32 | Those parameters affect ranking of studies, if sorted by relevance. See `sort`\ 33 | \ parameter for details.\n\n`filter.*` and `postFilter.*` parameters have\ 34 | \ same effect as there is no aggregation calculation. \nBoth are available\ 35 | \ just to simplify applying parameters from search request.\nBoth do not affect\ 36 | \ ranking of studies.\n\nNote: When trying JSON format in your browser, do\ 37 | \ not set too large `pageSize` parameter, if `fields` is\nunlimited. That\ 38 | \ may return too much data for the browser to parse and render." 
39 | tags: 40 | - "Studies" 41 | operationId: "listStudies" 42 | parameters: 43 | - name: "format" 44 | in: "query" 45 | description: 46 | "Must be one of the following:\n* `csv`- return CSV table with\ 47 | \ one page of study data; first page will contain header with column names;\ 48 | \ available fields are listed on [CSV Download](/data-api/about-api/csv-download)\ 49 | \ page\n* `json`- return JSON with one page of study data; every study object\ 50 | \ is placed in a separate line; `markup` type fields format depends on `markupFormat`\ 51 | \ parameter" 52 | required: false 53 | schema: 54 | type: "string" 55 | enum: 56 | - "csv" 57 | - "json" 58 | default: "json" 59 | - name: "markupFormat" 60 | in: "query" 61 | description: 62 | "Format of `markup` type fields:\n* `markdown`- [markdown](https://spec.commonmark.org/0.28/)\ 63 | \ format\n* `legacy`- compatible with classic PRS\n\nApplicable only to\ 64 | \ `json` format." 65 | required: false 66 | schema: 67 | type: "string" 68 | enum: 69 | - "markdown" 70 | - "legacy" 71 | default: "markdown" 72 | - name: "query.cond" 73 | in: "query" 74 | description: 75 | "\"Conditions or disease\" query in [Essie expression syntax](/find-studies/constructing-complex-search-queries).\ 76 | \ See \"ConditionSearch Area\" on [Search Areas](/data-api/about-api/search-areas#ConditionSearch)\ 77 | \ for more details." 78 | required: false 79 | schema: 80 | type: "string" 81 | examples: 82 | example1: 83 | value: "lung cancer" 84 | example2: 85 | value: "(head OR neck) AND pain" 86 | - name: "query.term" 87 | in: "query" 88 | description: 89 | "\"Other terms\" query in [Essie expression syntax](/find-studies/constructing-complex-search-queries).\ 90 | \ See \"BasicSearch Area\" on [Search Areas](/data-api/about-api/search-areas#BasicSearch)\ 91 | \ for more details." 
92 | required: false 93 | schema: 94 | type: "string" 95 | examples: 96 | example1: 97 | value: "AREA[LastUpdatePostDate]RANGE[2023-01-15,MAX]" 98 | - name: "query.locn" 99 | in: "query" 100 | description: 101 | "\"Location terms\" query in [Essie expression syntax](/find-studies/constructing-complex-search-queries).\ 102 | \ See \"LocationSearch Area\" on [Search Areas](/data-api/about-api/search-areas#LocationSearch)\ 103 | \ for more details." 104 | required: false 105 | schema: 106 | type: "string" 107 | - name: "query.titles" 108 | in: "query" 109 | description: 110 | "\"Title / acronym\" query in [Essie expression syntax](/find-studies/constructing-complex-search-queries).\ 111 | \ See \"TitleSearch Area\" on [Search Areas](/data-api/about-api/search-areas#TitleSearch)\ 112 | \ for more details." 113 | required: false 114 | schema: 115 | type: "string" 116 | - name: "query.intr" 117 | in: "query" 118 | description: 119 | "\"Intervention / treatment\" query in [Essie expression syntax](/find-studies/constructing-complex-search-queries).\ 120 | \ See \"InterventionSearch Area\" on [Search Areas](/data-api/about-api/search-areas#InterventionSearch)\ 121 | \ for more details." 122 | required: false 123 | schema: 124 | type: "string" 125 | - name: "query.outc" 126 | in: "query" 127 | description: 128 | "\"Outcome measure\" query in [Essie expression syntax](/find-studies/constructing-complex-search-queries).\ 129 | \ See \"OutcomeSearch Area\" on [Search Areas](/data-api/about-api/search-areas#OutcomeSearch)\ 130 | \ for more details." 131 | required: false 132 | schema: 133 | type: "string" 134 | - name: "query.spons" 135 | in: "query" 136 | description: 137 | "\"Sponsor / collaborator\" query in [Essie expression syntax](/find-studies/constructing-complex-search-queries).\ 138 | \ See \"SponsorSearch Area\" on [Search Areas](/data-api/about-api/search-areas#SponsorSearch)\ 139 | \ for more details." 
140 | required: false 141 | schema: 142 | type: "string" 143 | - name: "query.lead" 144 | in: "query" 145 | description: 146 | "Searches in \"LeadSponsorName\" field. See [Study Data Structure](/data-api/about-api/study-data-structure#LeadSponsorName)\ 147 | \ for more details. The query is in [Essie expression syntax](/find-studies/constructing-complex-search-queries)." 148 | required: false 149 | schema: 150 | type: "string" 151 | - name: "query.id" 152 | in: "query" 153 | description: 154 | "\"Study IDs\" query in [Essie expression syntax](/find-studies/constructing-complex-search-queries).\ 155 | \ See \"IdSearch Area\" on [Search Areas](/data-api/about-api/search-areas#IdSearch)\ 156 | \ for more details." 157 | required: false 158 | schema: 159 | type: "string" 160 | - name: "query.patient" 161 | in: "query" 162 | description: 163 | "See \"PatientSearch Area\" on [Search Areas](/data-api/about-api/search-areas#PatientSearch)\ 164 | \ for more details." 165 | required: false 166 | schema: 167 | type: "string" 168 | - name: "filter.overallStatus" 169 | in: "query" 170 | style: "pipeDelimited" 171 | explode: false 172 | description: "Filter by comma- or pipe-separated list of statuses" 173 | required: false 174 | schema: 175 | type: "array" 176 | items: 177 | $ref: "#/components/schemas/Status" 178 | examples: 179 | example1: 180 | value: 181 | - "NOT_YET_RECRUITING" 182 | - "RECRUITING" 183 | example2: 184 | value: 185 | - "COMPLETED" 186 | - name: "filter.geo" 187 | in: "query" 188 | description: 189 | "Filter by geo-function. 
Currently only distance function is\ 190 | \ supported.\nFormat: `distance(latitude,longitude,distance)`" 191 | required: false 192 | schema: 193 | type: "string" 194 | pattern: 195 | "^distance\\(-?\\d+(\\.\\d+)?,-?\\d+(\\.\\d+)?,\\d+(\\.\\d+)?(km|mi)?\\\ 196 | )$" 197 | examples: 198 | example1: 199 | value: "distance(39.0035707,-77.1013313,50mi)" 200 | - name: "filter.ids" 201 | in: "query" 202 | style: "pipeDelimited" 203 | explode: false 204 | description: 205 | "Filter by comma- or pipe-separated list of NCT IDs (a.k.a. ClinicalTrials.gov\ 206 | \ identifiers).\nThe provided IDs will be searched in [NCTId](data-api/about-api/study-data-structure#NCTId)\ 207 | \ and\n[NCTIdAlias](data-api/about-api/study-data-structure#NCTIdAlias)\ 208 | \ fields." 209 | required: false 210 | schema: 211 | type: "array" 212 | items: 213 | type: "string" 214 | pattern: "^[Nn][Cc][Tt]0*[1-9]\\d{0,7}$" 215 | examples: 216 | example1: 217 | value: 218 | - "NCT04852770" 219 | - "NCT01728545" 220 | - "NCT02109302" 221 | - name: "filter.advanced" 222 | in: "query" 223 | description: "Filter by query in [Essie expression syntax](/find-studies/constructing-complex-search-queries)" 224 | required: false 225 | schema: 226 | type: "string" 227 | examples: 228 | example1: 229 | value: "AREA[StartDate]2022" 230 | example2: 231 | value: 232 | "AREA[MinimumAge]RANGE[MIN, 16 years] AND AREA[MaximumAge]RANGE[16\ 233 | \ years, MAX]" 234 | - name: "filter.synonyms" 235 | in: "query" 236 | style: "pipeDelimited" 237 | explode: false 238 | description: 239 | "Filter by comma- or pipe-separated list of `area`:`synonym_id`\ 240 | \ pairs" 241 | required: false 242 | schema: 243 | type: "array" 244 | items: 245 | type: "string" 246 | examples: 247 | example1: 248 | value: 249 | - "ConditionSearch:1651367" 250 | - "BasicSearch:2013558" 251 | - name: "postFilter.overallStatus" 252 | in: "query" 253 | style: "pipeDelimited" 254 | explode: false 255 | description: "Filter by comma- or pipe-separated list 
of statuses" 256 | required: false 257 | schema: 258 | type: "array" 259 | items: 260 | $ref: "#/components/schemas/Status" 261 | examples: 262 | example1: 263 | value: 264 | - "NOT_YET_RECRUITING" 265 | - "RECRUITING" 266 | example2: 267 | value: 268 | - "COMPLETED" 269 | - name: "postFilter.geo" 270 | in: "query" 271 | description: 272 | "Filter by geo-function. Currently only distance function is\ 273 | \ supported.\nFormat: `distance(latitude,longitude,distance)`" 274 | required: false 275 | schema: 276 | type: "string" 277 | pattern: 278 | "^distance\\(-?\\d+(\\.\\d+)?,-?\\d+(\\.\\d+)?,\\d+(\\.\\d+)?(km|mi)?\\\ 279 | )$" 280 | examples: 281 | example1: 282 | value: "distance(39.0035707,-77.1013313,50mi)" 283 | - name: "postFilter.ids" 284 | in: "query" 285 | style: "pipeDelimited" 286 | explode: false 287 | description: 288 | "Filter by comma- or pipe-separated list of NCT IDs (a.k.a. ClinicalTrials.gov\ 289 | \ identifiers).\nThe provided IDs will be searched in [NCTId](data-api/about-api/study-data-structure#NCTId)\ 290 | \ and\n[NCTIdAlias](data-api/about-api/study-data-structure#NCTIdAlias)\ 291 | \ fields." 
292 | required: false 293 | schema: 294 | type: "array" 295 | items: 296 | type: "string" 297 | pattern: "^[Nn][Cc][Tt]0*[1-9]\\d{0,7}$" 298 | examples: 299 | example1: 300 | value: 301 | - "NCT04852770" 302 | - "NCT01728545" 303 | - "NCT02109302" 304 | - name: "postFilter.advanced" 305 | in: "query" 306 | description: "Filter by query in [Essie expression syntax](/find-studies/constructing-complex-search-queries)" 307 | required: false 308 | schema: 309 | type: "string" 310 | examples: 311 | example1: 312 | value: "AREA[StartDate]2022" 313 | example2: 314 | value: 315 | "AREA[MinimumAge]RANGE[MIN, 16 years] AND AREA[MaximumAge]RANGE[16\ 316 | \ years, MAX]" 317 | - name: "postFilter.synonyms" 318 | in: "query" 319 | style: "pipeDelimited" 320 | explode: false 321 | description: 322 | "Filter by comma- or pipe-separated list of `area`:`synonym_id`\ 323 | \ pairs" 324 | required: false 325 | schema: 326 | type: "array" 327 | items: 328 | type: "string" 329 | examples: 330 | example1: 331 | value: 332 | - "ConditionSearch:1651367" 333 | - "BasicSearch:2013558" 334 | - name: "aggFilters" 335 | in: "query" 336 | description: 337 | "Apply aggregation filters, aggregation counts will not be provided.\n\ 338 | The value is comma- or pipe-separated list of pairs `filter_id`:`space-separated\ 339 | \ list of option keys` for the checked options." 340 | required: false 341 | schema: 342 | type: "string" 343 | examples: 344 | example1: 345 | value: "results:with,status:com" 346 | example2: 347 | value: "status:not rec,sex:f,healthy:y" 348 | - name: "geoDecay" 349 | in: "query" 350 | description: 351 | "Set proximity factor by distance from `filter.geo` location\ 352 | \ to the closest [LocationGeoPoint](/data-api/about-api/study-data-structure#LocationGeoPoint)\ 353 | \ of a study.\nIgnored, if `filter.geo` parameter is not set or response\ 354 | \ contains more than 10,000 studies." 
355 | required: false 356 | schema: 357 | type: "string" 358 | pattern: 359 | "^func:(gauss|exp|linear),scale:(\\d+(\\.\\d+)?(km|mi)),offset:(\\\ 360 | d+(\\.\\d+)?(km|mi)),decay:(\\d+(\\.\\d+)?)$" 361 | default: "func:exp,scale:300mi,offset:0mi,decay:0.5" 362 | examples: 363 | example1: 364 | value: "func:linear,scale:100km,offset:10km,decay:0.1" 365 | example2: 366 | value: "func:gauss,scale:500mi,offset:0mi,decay:0.3" 367 | - name: "fields" 368 | in: "query" 369 | style: "pipeDelimited" 370 | explode: false 371 | description: 372 | "If specified, must be non-empty comma- or pipe-separated list\ 373 | \ of fields to return. If unspecified, all fields will be returned.\nOrder\ 374 | \ of the fields does not matter.\n\nFor `csv` format, specify list of columns.\ 375 | \ The column names are available on [CSV Download](/data-api/about-api/csv-download).\n\ 376 | \nFor `json` format, every list item is either area name, piece name, field\ 377 | \ name, or special name.\nIf a piece or a field is a branch node, all descendant\ 378 | \ fields will be included.\nAll area names are available on [Search Areas](/data-api/about-api/search-areas),\n\ 379 | the piece and field names — on [Data Structure](/data-api/about-api/study-data-structure)\ 380 | \ and also can be retrieved at `/studies/metadata` endpoint.\nThere is a\ 381 | \ special name, `@query`, which expands to all fields queried by search." 382 | required: false 383 | schema: 384 | type: "array" 385 | minItems: 1 386 | items: 387 | type: "string" 388 | pattern: "^([a-zA-Z][a-zA-Z0-9\\-. 
]*)|(@query)$" 389 | examples: 390 | example1: 391 | value: 392 | - "NCTId" 393 | - "BriefTitle" 394 | - "OverallStatus" 395 | - "HasResults" 396 | example2: 397 | value: "ProtocolSection" 398 | - name: "sort" 399 | in: "query" 400 | style: "pipeDelimited" 401 | explode: false 402 | description: 403 | "Comma- or pipe-separated list of sorting options of the studies.\ 404 | \ The returning studies are not sorted by default for a performance reason.\n\ 405 | Every list item contains a field/piece name and an optional sort direction\ 406 | \ (`asc` for ascending or `desc` for descending)\nafter colon character.\n\ 407 | \nAll piece and field names can be found on [Data Structure](/data-api/about-api/study-data-structure)\ 408 | \ and also can be retrieved\nat `/studies/metadata` endpoint. Currently,\ 409 | \ only date and numeric fields are allowed for sorting.\nThere is a special\ 410 | \ \"field\" `@relevance` to sort by relevance to a search query.\n\nStudies\ 411 | \ missing sort field are always last. Default sort direction:\n* Date field\ 412 | \ - `desc`\n* Numeric field - `asc`\n* `@relevance` - `desc`" 413 | required: false 414 | schema: 415 | type: "array" 416 | maxItems: 2 417 | default: [] 418 | items: 419 | type: "string" 420 | pattern: "^(([a-zA-Z][a-zA-Z0-9\\-. ]*)|(@relevance))(:(asc|desc))?$" 421 | examples: 422 | example1: 423 | value: 424 | - "@relevance" 425 | example2: 426 | value: 427 | - "LastUpdatePostDate" 428 | example3: 429 | value: 430 | - "EnrollmentCount:desc" 431 | - "NumArmGroups" 432 | - name: "countTotal" 433 | in: "query" 434 | description: 435 | "Count total number of studies in all pages and return `totalCount`\ 436 | \ field with first page, if `true`.\nFor CSV, the result can be found in\ 437 | \ `x-total-count` response header.\nThe parameter is ignored for the subsequent\ 438 | \ pages." 
439 | required: false 440 | schema: 441 | type: "boolean" 442 | default: false 443 | - name: "pageSize" 444 | in: "query" 445 | description: 446 | "Page size is maximum number of studies to return in response.\ 447 | \ It does not have to be the same for every page.\nIf not specified or set\ 448 | \ to 0, the default value will be used. It will be coerced down to 1,000,\ 449 | \ if greater than that." 450 | required: false 451 | schema: 452 | type: "integer" 453 | format: "int32" 454 | minimum: 0 455 | default: 10 456 | examples: 457 | example1: 458 | value: 2 459 | example2: 460 | value: 100 461 | - name: "pageToken" 462 | in: "query" 463 | description: 464 | "Token to get next page. Set it to a `nextPageToken` value returned\ 465 | \ with the previous page in JSON format.\nFor CSV, it can be found in `x-next-page-token`\ 466 | \ response header.\nDo not specify it for first page." 467 | required: false 468 | schema: 469 | type: "string" 470 | responses: 471 | "200": 472 | description: "OK" 473 | content: 474 | application/json: 475 | schema: 476 | $ref: "#/components/schemas/PagedStudies" 477 | example: 478 | totalCount: 438897 479 | studies: 480 | - protocolSection: 481 | identificationModule: 482 | nctId: "NCT03540771" 483 | briefTitle: 484 | "Introducing Palliative Care (PC) Within the Treatment\ 485 | \ of End Stage Liver Disease (ESLD)" 486 | statusModule: 487 | overallStatus: "RECRUITING" 488 | hasResults: false 489 | - protocolSection: 490 | identificationModule: 491 | nctId: "NCT03630471" 492 | briefTitle: 493 | "Effectiveness of a Problem-solving Intervention\ 494 | \ for Common Adolescent Mental Health Problems in India" 495 | statusModule: 496 | overallStatus: "COMPLETED" 497 | hasResults: false 498 | - protocolSection: 499 | identificationModule: 500 | nctId: "NCT00587795" 501 | briefTitle: 502 | "Orthopedic Study of the Aircast StabilAir Wrist\ 503 | \ Fracture Brace" 504 | statusModule: 505 | overallStatus: "TERMINATED" 506 | hasResults: true 507 | 
nextPageToken: "abracadabra" 508 | "400": 509 | description: "Bad Request" 510 | content: 511 | text/plain: 512 | schema: 513 | $ref: "#/components/schemas/errorMessage" 514 | /studies/{nctId}: 515 | get: 516 | summary: "Single Study" 517 | description: "Returns data of a single study." 518 | tags: 519 | - "Studies" 520 | operationId: "fetchStudy" 521 | parameters: 522 | - name: "nctId" 523 | in: "path" 524 | description: 525 | "NCT Number of a study. If found in [NCTIdAlias](data-api/about-api/study-data-structure#NCTIdAlias)\ 526 | \ field,\n301 HTTP redirect to the actual study will be returned." 527 | required: true 528 | schema: 529 | type: "string" 530 | pattern: "^[Nn][Cc][Tt]0*[1-9]\\d{0,7}$" 531 | examples: 532 | example1: 533 | value: "NCT00841061" 534 | example2: 535 | value: "NCT04000165" 536 | - name: "format" 537 | in: "query" 538 | description: 539 | "Must be one of the following:\n* `csv`- return CSV table; available\ 540 | \ fields are listed on [CSV Download](/data-api/about-api/csv-download)\n\ 541 | * `json`- return JSON object; format of `markup` fields depends on `markupFormat`\ 542 | \ parameter\n* `json.zip`- put JSON object into a .json file and download\ 543 | \ it as zip archive; field values of type `markup` are in [markdown](https://spec.commonmark.org/0.28/)\ 544 | \ format\n* `fhir.json` - return FHIR JSON; fields are not customizable;\ 545 | \ see [Access Data in FHIR](/data-api/fhir)\n* `ris`- return RIS record;\ 546 | \ available tags are listed on [RIS Download](/data-api/about-api/ris-download)" 547 | required: false 548 | schema: 549 | type: "string" 550 | enum: 551 | - "csv" 552 | - "json" 553 | - "json.zip" 554 | - "fhir.json" 555 | - "ris" 556 | default: "json" 557 | - name: "markupFormat" 558 | in: "query" 559 | description: 560 | "Format of `markup` type fields:\n* `markdown`- [markdown](https://spec.commonmark.org/0.28/)\ 561 | \ format\n* `legacy`- compatible with classic PRS\n\nApplicable only to\ 562 | \ `json` format." 
563 | required: false 564 | schema: 565 | type: "string" 566 | enum: 567 | - "markdown" 568 | - "legacy" 569 | default: "markdown" 570 | - name: "fields" 571 | in: "query" 572 | style: "pipeDelimited" 573 | explode: false 574 | description: 575 | "If specified, must be non-empty comma- or pipe-separated list\ 576 | \ of fields to return. If unspecified, all fields will be returned.\nOrder\ 577 | \ of the fields does not matter.\n\nFor `csv` format, specify list of columns.\ 578 | \ The column names are available on [CSV Download](/data-api/about-api/csv-download).\n\ 579 | \nFor `json` and `json.zip` formats, every list item is either area name,\ 580 | \ piece name, or field name.\nIf a piece or a field is a branch node, all\ 581 | \ descendant fields will be included.\nAll area names are available on [Search\ 582 | \ Areas](/data-api/about-api/search-areas),\nthe piece and field names -\ 583 | \ on [Data Structure](/data-api/about-api/study-data-structure) and also\ 584 | \ can be retrieved at `/studies/metadata` endpoint.\n\nFor `fhir.json` format,\ 585 | \ all available fields are returned and this parameter must be unspecified.\n\ 586 | \nFor `ris` format, specify list of tags. The tag names are available on\ 587 | \ [RIS Download](/data-api/about-api/ris-download)." 588 | required: false 589 | schema: 590 | type: "array" 591 | minItems: 1 592 | items: 593 | type: "string" 594 | pattern: "^[a-zA-Z][a-zA-Z0-9\\-. 
]*$" 595 | examples: 596 | example1: 597 | value: 598 | - "NCTId" 599 | - "BriefTitle" 600 | - "Reference" 601 | example2: 602 | value: 603 | - "ConditionsModule" 604 | - "EligibilityModule" 605 | responses: 606 | "200": 607 | description: "OK" 608 | content: 609 | text/csv: 610 | schema: 611 | $ref: "#/components/schemas/StudiesCsv" 612 | application/json: 613 | schema: 614 | $ref: "#/components/schemas/Study" 615 | application/zip: 616 | schema: 617 | $ref: "#/components/schemas/StudiesZip" 618 | application/fhir+json: 619 | schema: 620 | $ref: "#/components/schemas/StudyFhir" 621 | "301": 622 | description: "Moved Permanently" 623 | content: {} 624 | "400": 625 | description: "Bad Request" 626 | content: 627 | text/plain: 628 | schema: 629 | $ref: "#/components/schemas/errorMessage" 630 | "404": 631 | description: "Not Found" 632 | content: 633 | text/plain: 634 | schema: 635 | $ref: "#/components/schemas/errorMessage" 636 | /studies/metadata: 637 | get: 638 | summary: "Data Model Fields" 639 | description: "Returns study data model fields." 640 | tags: 641 | - "Studies" 642 | operationId: "studiesMetadata" 643 | parameters: 644 | - name: "includeIndexedOnly" 645 | in: "query" 646 | description: "Include indexed-only fields, if `true`" 647 | required: false 648 | schema: 649 | type: "boolean" 650 | default: false 651 | - name: "includeHistoricOnly" 652 | in: "query" 653 | description: "Include fields available only in historic data, if `true`" 654 | required: false 655 | schema: 656 | type: "boolean" 657 | default: false 658 | responses: 659 | "200": 660 | description: "OK" 661 | content: 662 | application/json: 663 | schema: 664 | $ref: "#/components/schemas/FieldNodeList" 665 | "400": 666 | description: "Bad Request" 667 | content: 668 | text/plain: 669 | schema: 670 | $ref: "#/components/schemas/errorMessage" 671 | ```