This is page 7 of 19. Use http://codebase.md/genomoncology/biomcp?lines=true&page={x} to view the full context. # Directory Structure ``` ├── .github │ ├── actions │ │ └── setup-python-env │ │ └── action.yml │ ├── dependabot.yml │ └── workflows │ ├── ci.yml │ ├── deploy-docs.yml │ ├── main.yml.disabled │ ├── on-release-main.yml │ └── validate-codecov-config.yml ├── .gitignore ├── .pre-commit-config.yaml ├── BIOMCP_DATA_FLOW.md ├── CHANGELOG.md ├── CNAME ├── codecov.yaml ├── docker-compose.yml ├── Dockerfile ├── docs │ ├── apis │ │ ├── error-codes.md │ │ ├── overview.md │ │ └── python-sdk.md │ ├── assets │ │ ├── biomcp-cursor-locations.png │ │ ├── favicon.ico │ │ ├── icon.png │ │ ├── logo.png │ │ ├── mcp_architecture.txt │ │ └── remote-connection │ │ ├── 00_connectors.png │ │ ├── 01_add_custom_connector.png │ │ ├── 02_connector_enabled.png │ │ ├── 03_connect_to_biomcp.png │ │ ├── 04_select_google_oauth.png │ │ └── 05_success_connect.png │ ├── backend-services-reference │ │ ├── 01-overview.md │ │ ├── 02-biothings-suite.md │ │ ├── 03-cbioportal.md │ │ ├── 04-clinicaltrials-gov.md │ │ ├── 05-nci-cts-api.md │ │ ├── 06-pubtator3.md │ │ └── 07-alphagenome.md │ ├── blog │ │ ├── ai-assisted-clinical-trial-search-analysis.md │ │ ├── images │ │ │ ├── deep-researcher-video.png │ │ │ ├── researcher-announce.png │ │ │ ├── researcher-drop-down.png │ │ │ ├── researcher-prompt.png │ │ │ ├── trial-search-assistant.png │ │ │ └── what_is_biomcp_thumbnail.png │ │ └── researcher-persona-resource.md │ ├── changelog.md │ ├── CNAME │ ├── concepts │ │ ├── 01-what-is-biomcp.md │ │ ├── 02-the-deep-researcher-persona.md │ │ └── 03-sequential-thinking-with-the-think-tool.md │ ├── developer-guides │ │ ├── 01-server-deployment.md │ │ ├── 02-contributing-and-testing.md │ │ ├── 03-third-party-endpoints.md │ │ ├── 04-transport-protocol.md │ │ ├── 05-error-handling.md │ │ ├── 06-http-client-and-caching.md │ │ ├── 07-performance-optimizations.md │ │ └── generate_endpoints.py │ ├── faq-condensed.md │ 
├── FDA_SECURITY.md │ ├── genomoncology.md │ ├── getting-started │ │ ├── 01-quickstart-cli.md │ │ ├── 02-claude-desktop-integration.md │ │ └── 03-authentication-and-api-keys.md │ ├── how-to-guides │ │ ├── 01-find-articles-and-cbioportal-data.md │ │ ├── 02-find-trials-with-nci-and-biothings.md │ │ ├── 03-get-comprehensive-variant-annotations.md │ │ ├── 04-predict-variant-effects-with-alphagenome.md │ │ ├── 05-logging-and-monitoring-with-bigquery.md │ │ └── 06-search-nci-organizations-and-interventions.md │ ├── index.md │ ├── policies.md │ ├── reference │ │ ├── architecture-diagrams.md │ │ ├── quick-architecture.md │ │ ├── quick-reference.md │ │ └── visual-architecture.md │ ├── robots.txt │ ├── stylesheets │ │ ├── announcement.css │ │ └── extra.css │ ├── troubleshooting.md │ ├── tutorials │ │ ├── biothings-prompts.md │ │ ├── claude-code-biomcp-alphagenome.md │ │ ├── nci-prompts.md │ │ ├── openfda-integration.md │ │ ├── openfda-prompts.md │ │ ├── pydantic-ai-integration.md │ │ └── remote-connection.md │ ├── user-guides │ │ ├── 01-command-line-interface.md │ │ ├── 02-mcp-tools-reference.md │ │ └── 03-integrating-with-ides-and-clients.md │ └── workflows │ └── all-workflows.md ├── example_scripts │ ├── mcp_integration.py │ └── python_sdk.py ├── glama.json ├── LICENSE ├── lzyank.toml ├── Makefile ├── mkdocs.yml ├── package-lock.json ├── package.json ├── pyproject.toml ├── README.md ├── scripts │ ├── check_docs_in_mkdocs.py │ ├── check_http_imports.py │ └── generate_endpoints_doc.py ├── smithery.yaml ├── src │ └── biomcp │ ├── __init__.py │ ├── __main__.py │ ├── articles │ │ ├── __init__.py │ │ ├── autocomplete.py │ │ ├── fetch.py │ │ ├── preprints.py │ │ ├── search_optimized.py │ │ ├── search.py │ │ └── unified.py │ ├── biomarkers │ │ ├── __init__.py │ │ └── search.py │ ├── cbioportal_helper.py │ ├── circuit_breaker.py │ ├── cli │ │ ├── __init__.py │ │ ├── articles.py │ │ ├── biomarkers.py │ │ ├── diseases.py │ │ ├── health.py │ │ ├── interventions.py │ │ ├── main.py │ │ 
├── openfda.py │ │ ├── organizations.py │ │ ├── server.py │ │ ├── trials.py │ │ └── variants.py │ ├── connection_pool.py │ ├── constants.py │ ├── core.py │ ├── diseases │ │ ├── __init__.py │ │ ├── getter.py │ │ └── search.py │ ├── domain_handlers.py │ ├── drugs │ │ ├── __init__.py │ │ └── getter.py │ ├── exceptions.py │ ├── genes │ │ ├── __init__.py │ │ └── getter.py │ ├── http_client_simple.py │ ├── http_client.py │ ├── individual_tools.py │ ├── integrations │ │ ├── __init__.py │ │ ├── biothings_client.py │ │ └── cts_api.py │ ├── interventions │ │ ├── __init__.py │ │ ├── getter.py │ │ └── search.py │ ├── logging_filter.py │ ├── metrics_handler.py │ ├── metrics.py │ ├── openfda │ │ ├── __init__.py │ │ ├── adverse_events_helpers.py │ │ ├── adverse_events.py │ │ ├── cache.py │ │ ├── constants.py │ │ ├── device_events_helpers.py │ │ ├── device_events.py │ │ ├── drug_approvals.py │ │ ├── drug_labels_helpers.py │ │ ├── drug_labels.py │ │ ├── drug_recalls_helpers.py │ │ ├── drug_recalls.py │ │ ├── drug_shortages_detail_helpers.py │ │ ├── drug_shortages_helpers.py │ │ ├── drug_shortages.py │ │ ├── exceptions.py │ │ ├── input_validation.py │ │ ├── rate_limiter.py │ │ ├── utils.py │ │ └── validation.py │ ├── organizations │ │ ├── __init__.py │ │ ├── getter.py │ │ └── search.py │ ├── parameter_parser.py │ ├── prefetch.py │ ├── query_parser.py │ ├── query_router.py │ ├── rate_limiter.py │ ├── render.py │ ├── request_batcher.py │ ├── resources │ │ ├── __init__.py │ │ ├── getter.py │ │ ├── instructions.md │ │ └── researcher.md │ ├── retry.py │ ├── router_handlers.py │ ├── router.py │ ├── shared_context.py │ ├── thinking │ │ ├── __init__.py │ │ ├── sequential.py │ │ └── session.py │ ├── thinking_tool.py │ ├── thinking_tracker.py │ ├── trials │ │ ├── __init__.py │ │ ├── getter.py │ │ ├── nci_getter.py │ │ ├── nci_search.py │ │ └── search.py │ ├── utils │ │ ├── __init__.py │ │ ├── cancer_types_api.py │ │ ├── cbio_http_adapter.py │ │ ├── endpoint_registry.py │ │ ├── 
gene_validator.py │ │ ├── metrics.py │ │ ├── mutation_filter.py │ │ ├── query_utils.py │ │ ├── rate_limiter.py │ │ └── request_cache.py │ ├── variants │ │ ├── __init__.py │ │ ├── alphagenome.py │ │ ├── cancer_types.py │ │ ├── cbio_external_client.py │ │ ├── cbioportal_mutations.py │ │ ├── cbioportal_search_helpers.py │ │ ├── cbioportal_search.py │ │ ├── constants.py │ │ ├── external.py │ │ ├── filters.py │ │ ├── getter.py │ │ ├── links.py │ │ └── search.py │ └── workers │ ├── __init__.py │ ├── worker_entry_stytch.js │ ├── worker_entry.js │ └── worker.py ├── tests │ ├── bdd │ │ ├── cli_help │ │ │ ├── help.feature │ │ │ └── test_help.py │ │ ├── conftest.py │ │ ├── features │ │ │ └── alphagenome_integration.feature │ │ ├── fetch_articles │ │ │ ├── fetch.feature │ │ │ └── test_fetch.py │ │ ├── get_trials │ │ │ ├── get.feature │ │ │ └── test_get.py │ │ ├── get_variants │ │ │ ├── get.feature │ │ │ └── test_get.py │ │ ├── search_articles │ │ │ ├── autocomplete.feature │ │ │ ├── search.feature │ │ │ ├── test_autocomplete.py │ │ │ └── test_search.py │ │ ├── search_trials │ │ │ ├── search.feature │ │ │ └── test_search.py │ │ ├── search_variants │ │ │ ├── search.feature │ │ │ └── test_search.py │ │ └── steps │ │ └── test_alphagenome_steps.py │ ├── config │ │ └── test_smithery_config.py │ ├── conftest.py │ ├── data │ │ ├── ct_gov │ │ │ ├── clinical_trials_api_v2.yaml │ │ │ ├── trials_NCT04280705.json │ │ │ └── trials_NCT04280705.txt │ │ ├── myvariant │ │ │ ├── myvariant_api.yaml │ │ │ ├── myvariant_field_descriptions.csv │ │ │ ├── variants_full_braf_v600e.json │ │ │ ├── variants_full_braf_v600e.txt │ │ │ └── variants_part_braf_v600_multiple.json │ │ ├── openfda │ │ │ ├── drugsfda_detail.json │ │ │ ├── drugsfda_search.json │ │ │ ├── enforcement_detail.json │ │ │ └── enforcement_search.json │ │ └── pubtator │ │ ├── pubtator_autocomplete.json │ │ └── pubtator3_paper.txt │ ├── integration │ │ ├── test_openfda_integration.py │ │ ├── test_preprints_integration.py │ │ ├── 
test_simple.py │ │ └── test_variants_integration.py │ ├── tdd │ │ ├── articles │ │ │ ├── test_autocomplete.py │ │ │ ├── test_cbioportal_integration.py │ │ │ ├── test_fetch.py │ │ │ ├── test_preprints.py │ │ │ ├── test_search.py │ │ │ └── test_unified.py │ │ ├── conftest.py │ │ ├── drugs │ │ │ ├── __init__.py │ │ │ └── test_drug_getter.py │ │ ├── openfda │ │ │ ├── __init__.py │ │ │ ├── test_adverse_events.py │ │ │ ├── test_device_events.py │ │ │ ├── test_drug_approvals.py │ │ │ ├── test_drug_labels.py │ │ │ ├── test_drug_recalls.py │ │ │ ├── test_drug_shortages.py │ │ │ └── test_security.py │ │ ├── test_biothings_integration_real.py │ │ ├── test_biothings_integration.py │ │ ├── test_circuit_breaker.py │ │ ├── test_concurrent_requests.py │ │ ├── test_connection_pool.py │ │ ├── test_domain_handlers.py │ │ ├── test_drug_approvals.py │ │ ├── test_drug_recalls.py │ │ ├── test_drug_shortages.py │ │ ├── test_endpoint_documentation.py │ │ ├── test_error_scenarios.py │ │ ├── test_europe_pmc_fetch.py │ │ ├── test_mcp_integration.py │ │ ├── test_mcp_tools.py │ │ ├── test_metrics.py │ │ ├── test_nci_integration.py │ │ ├── test_nci_mcp_tools.py │ │ ├── test_network_policies.py │ │ ├── test_offline_mode.py │ │ ├── test_openfda_unified.py │ │ ├── test_pten_r173_search.py │ │ ├── test_render.py │ │ ├── test_request_batcher.py.disabled │ │ ├── test_retry.py │ │ ├── test_router.py │ │ ├── test_shared_context.py.disabled │ │ ├── test_unified_biothings.py │ │ ├── thinking │ │ │ ├── __init__.py │ │ │ └── test_sequential.py │ │ ├── trials │ │ │ ├── test_backward_compatibility.py │ │ │ ├── test_getter.py │ │ │ └── test_search.py │ │ ├── utils │ │ │ ├── test_gene_validator.py │ │ │ ├── test_mutation_filter.py │ │ │ ├── test_rate_limiter.py │ │ │ └── test_request_cache.py │ │ ├── variants │ │ │ ├── constants.py │ │ │ ├── test_alphagenome_api_key.py │ │ │ ├── test_alphagenome_comprehensive.py │ │ │ ├── test_alphagenome.py │ │ │ ├── test_cbioportal_mutations.py │ │ │ ├── 
test_cbioportal_search.py │ │ │ ├── test_external_integration.py │ │ │ ├── test_external.py │ │ │ ├── test_extract_gene_aa_change.py │ │ │ ├── test_filters.py │ │ │ ├── test_getter.py │ │ │ ├── test_links.py │ │ │ └── test_search.py │ │ └── workers │ │ └── test_worker_sanitization.js │ └── test_pydantic_ai_integration.py ├── THIRD_PARTY_ENDPOINTS.md ├── tox.ini ├── uv.lock └── wrangler.toml ``` # Files -------------------------------------------------------------------------------- /src/biomcp/articles/search.py: -------------------------------------------------------------------------------- ```python 1 | import asyncio 2 | import json 3 | from collections.abc import Generator 4 | from typing import Annotated, Any, get_args 5 | 6 | from pydantic import BaseModel, Field, computed_field 7 | 8 | from .. import http_client, render 9 | from ..constants import PUBTATOR3_SEARCH_URL, SYSTEM_PAGE_SIZE 10 | from ..core import PublicationState 11 | from .autocomplete import Concept, EntityRequest, autocomplete 12 | from .fetch import call_pubtator_api 13 | 14 | concepts: list[Concept] = sorted(get_args(Concept)) 15 | fields: list[str] = [concept + "s" for concept in concepts] 16 | 17 | 18 | class PubmedRequest(BaseModel): 19 | chemicals: list[str] = Field( 20 | default_factory=list, 21 | description="List of chemicals for filtering results.", 22 | ) 23 | diseases: list[str] = Field( 24 | default_factory=list, 25 | description="Diseases such as Hypertension, Lung Adenocarcinoma, etc.", 26 | ) 27 | genes: list[str] = Field( 28 | default_factory=list, 29 | description="List of genes for filtering results.", 30 | ) 31 | keywords: list[str] = Field( 32 | default_factory=list, 33 | description="List of other keywords for filtering results.", 34 | ) 35 | variants: list[str] = Field( 36 | default_factory=list, 37 | description="List of variants for filtering results.", 38 | ) 39 | 40 | def iter_concepts(self) -> Generator[tuple[Concept, str], None, None]: 41 | for concept in 
concepts: 42 | field = concept + "s" 43 | values = getattr(self, field, []) or [] 44 | for value in values: 45 | yield concept, value 46 | 47 | 48 | class PubtatorRequest(BaseModel): 49 | text: str 50 | size: int = 50 51 | 52 | 53 | class ResultItem(BaseModel): 54 | pmid: int | None = None 55 | pmcid: str | None = None 56 | title: str | None = None 57 | journal: str | None = None 58 | authors: list[str] | None = None 59 | date: str | None = None 60 | doi: str | None = None 61 | abstract: str | None = None 62 | publication_state: PublicationState = PublicationState.PEER_REVIEWED 63 | source: str | None = Field( 64 | None, description="Source database (e.g., PubMed, bioRxiv, Europe PMC)" 65 | ) 66 | 67 | @computed_field 68 | def pubmed_url(self) -> str | None: 69 | url = None 70 | if self.pmid: 71 | url = f"https://pubmed.ncbi.nlm.nih.gov/{self.pmid}/" 72 | return url 73 | 74 | @computed_field 75 | def pmc_url(self) -> str | None: 76 | """Generates the PMC URL if PMCID exists.""" 77 | url = None 78 | if self.pmcid: 79 | url = f"https://www.ncbi.nlm.nih.gov/pmc/articles/{self.pmcid}/" 80 | return url 81 | 82 | @computed_field 83 | def doi_url(self) -> str | None: 84 | """Generates the DOI URL if DOI exists.""" 85 | url = None 86 | if self.doi: 87 | url = f"https://doi.org/{self.doi}" 88 | return url 89 | 90 | 91 | class SearchResponse(BaseModel): 92 | results: list[ResultItem] 93 | page_size: int 94 | current: int 95 | count: int 96 | total_pages: int 97 | 98 | 99 | async def convert_request(request: PubmedRequest) -> PubtatorRequest: 100 | query_parts = [] 101 | 102 | # Process keywords with OR logic support 103 | for keyword in request.keywords: 104 | if "|" in keyword: 105 | # Handle OR within a keyword (e.g., "R173|Arg173|p.R173") 106 | or_terms = [term.strip() for term in keyword.split("|")] 107 | or_query = "(" + " OR ".join(or_terms) + ")" 108 | query_parts.append(or_query) 109 | else: 110 | query_parts.append(keyword) 111 | 112 | # Create all autocomplete 
tasks in parallel 113 | autocomplete_tasks = [] 114 | concept_values = [] 115 | 116 | for concept, value in request.iter_concepts(): 117 | task = autocomplete( 118 | request=EntityRequest(concept=concept, query=value), 119 | ) 120 | autocomplete_tasks.append(task) 121 | concept_values.append((concept, value)) 122 | 123 | # Execute all autocomplete calls in parallel 124 | if autocomplete_tasks: 125 | entities = await asyncio.gather(*autocomplete_tasks) 126 | 127 | # Process results 128 | for (_concept, value), entity in zip( 129 | concept_values, entities, strict=False 130 | ): 131 | if entity: 132 | query_parts.append(entity.entity_id) 133 | else: 134 | query_parts.append(value) 135 | 136 | query_text = " AND ".join(query_parts) 137 | 138 | return PubtatorRequest(text=query_text, size=SYSTEM_PAGE_SIZE) 139 | 140 | 141 | async def add_abstracts(response: SearchResponse) -> None: 142 | pmids = [pr.pmid for pr in response.results if pr.pmid] 143 | abstract_response, _ = await call_pubtator_api(pmids, full=False) 144 | 145 | if abstract_response: 146 | for result in response.results: 147 | result.abstract = abstract_response.get_abstract(result.pmid) 148 | 149 | 150 | def clean_authors(record): 151 | """Keep only the first and last author if > 4 authors.""" 152 | authors = record.get("authors") 153 | if authors and len(authors) > 4: 154 | record["authors"] = [authors[0], "...", authors[-1]] 155 | return record 156 | 157 | 158 | async def search_articles( 159 | request: PubmedRequest, 160 | output_json: bool = False, 161 | ) -> str: 162 | pubtator_request = await convert_request(request) 163 | 164 | # Start the search request 165 | search_task = http_client.request_api( 166 | url=PUBTATOR3_SEARCH_URL, 167 | request=pubtator_request, 168 | response_model_type=SearchResponse, 169 | domain="article", 170 | ) 171 | 172 | # Execute search first 173 | response, error = await search_task 174 | 175 | if response: 176 | # Now fetch abstracts (still sequential but could be 
parallelized with other operations) 177 | await add_abstracts(response) 178 | # Add source field to PubMed results 179 | for result in response.results: 180 | result.source = "PubMed" 181 | 182 | # noinspection DuplicatedCode 183 | if error: 184 | data: list[dict[str, Any]] = [ 185 | {"error": f"Error {error.code}: {error.message}"} 186 | ] 187 | else: 188 | data = list( 189 | map( 190 | clean_authors, 191 | [ 192 | result.model_dump(mode="json", exclude_none=True) 193 | for result in (response.results if response else []) 194 | ], 195 | ) 196 | ) 197 | 198 | if data and not output_json: 199 | return render.to_markdown(data) 200 | else: 201 | return json.dumps(data, indent=2) 202 | 203 | 204 | async def _article_searcher( 205 | call_benefit: Annotated[ 206 | str, 207 | "Define and summarize why this function is being called and the intended benefit", 208 | ], 209 | chemicals: Annotated[ 210 | list[str] | str | None, "List of chemicals for filtering results" 211 | ] = None, 212 | diseases: Annotated[ 213 | list[str] | str | None, 214 | "Diseases such as Hypertension, Lung Adenocarcinoma, etc.", 215 | ] = None, 216 | genes: Annotated[ 217 | list[str] | str | None, "List of genes for filtering results" 218 | ] = None, 219 | keywords: Annotated[ 220 | list[str] | str | None, "List of other keywords for filtering results" 221 | ] = None, 222 | variants: Annotated[ 223 | list[str] | str | None, "List of variants for filtering results" 224 | ] = None, 225 | include_preprints: Annotated[ 226 | bool, "Include preprint articles from bioRxiv/medRxiv and Europe PMC" 227 | ] = True, 228 | include_cbioportal: Annotated[ 229 | bool, 230 | "Include cBioPortal cancer genomics summary when searching by gene", 231 | ] = True, 232 | ) -> str: 233 | """ 234 | Searches for articles across PubMed and preprint servers. 
235 | 236 | Parameters: 237 | - call_benefit: Define and summarize why this function is being called and the intended benefit 238 | - chemicals: List of chemicals for filtering results 239 | - diseases: Diseases such as Hypertension, Lung Adenocarcinoma, etc. 240 | - genes: List of genes for filtering results 241 | - keywords: List of other keywords for filtering results 242 | - variants: List of variants for filtering results 243 | - include_preprints: Include results from preprint servers (default: True) 244 | - include_cbioportal: Include cBioPortal summaries for gene searches (default: True) 245 | 246 | Notes: 247 | - Use full terms ("Non-small cell lung carcinoma") over abbreviations ("NSCLC") 248 | - Use keywords to specify terms that don't fit in disease, gene ("EGFR"), 249 | chemical ("Cisplatin"), or variant ("BRAF V600E") categories 250 | - Parameters can be provided as lists or comma-separated strings 251 | - Results include both peer-reviewed and preprint articles by default 252 | - Keywords support OR logic using the pipe (|) separator: 253 | - Example: "R173|Arg173|p.R173" finds articles with any of these notations 254 | - Multiple keywords are still combined with AND logic 255 | 256 | Returns: 257 | Markdown formatted list of matching articles, sorted by date (newest first), 258 | with peer-reviewed articles listed before preprints. 259 | Limited to max 20 results (10 from each source) by default to optimize token usage. 
260 | """ 261 | # Import here to avoid circular dependency 262 | from .search_optimized import article_searcher_optimized 263 | 264 | # Use the optimized version with caching 265 | return await article_searcher_optimized( 266 | call_benefit=call_benefit, 267 | chemicals=chemicals, 268 | diseases=diseases, 269 | genes=genes, 270 | keywords=keywords, 271 | variants=variants, 272 | include_preprints=include_preprints, 273 | include_cbioportal=include_cbioportal, 274 | ) 275 | ``` -------------------------------------------------------------------------------- /docs/FDA_SECURITY.md: -------------------------------------------------------------------------------- ```markdown 1 | # FDA Integration Security Documentation 2 | 3 | ## Overview 4 | 5 | This document outlines the security measures implemented in the BioMCP FDA integration to ensure safe handling of medical data and protection against common vulnerabilities. 6 | 7 | ## Security Features 8 | 9 | ### 1. Input Validation & Sanitization 10 | 11 | All user inputs are validated and sanitized before being sent to the FDA API: 12 | 13 | - **Injection Prevention**: Removes characters that could be used for SQL injection, XSS, or command injection (`<>\"';&|\\`) 14 | - **Length Limits**: Enforces maximum lengths on all input fields 15 | - **Type Validation**: Ensures parameters match expected types (dates, numbers, etc.) 16 | - **Format Validation**: Validates specific formats (e.g., YYYY-MM-DD for dates) 17 | 18 | **Implementation**: `src/biomcp/openfda/input_validation.py` 19 | 20 | ```python 21 | # Example usage 22 | from biomcp.openfda.input_validation import sanitize_input, validate_drug_name 23 | 24 | safe_drug = validate_drug_name("Aspirin<script>") # Returns "Aspirin" 25 | safe_input = sanitize_input("'; DROP TABLE;") # SQL injection blocked 26 | ``` 27 | 28 | ### 2. 
API Key Protection 29 | 30 | API keys are protected at multiple levels: 31 | 32 | - **Cache Key Exclusion**: API keys are removed before generating cache keys 33 | - **No Logging**: API keys are never logged, even in debug mode 34 | - **Environment Variables**: Keys stored in environment variables, not in code 35 | - **Validation**: API key format is validated before use 36 | 37 | **Implementation**: `src/biomcp/openfda/cache.py`, `src/biomcp/openfda/utils.py` 38 | 39 | ### 3. Rate Limiting 40 | 41 | Client-side rate limiting prevents API quota exhaustion: 42 | 43 | - **Token Bucket Algorithm**: Allows bursts while maintaining average rate 44 | - **Configurable Limits**: 40 requests/minute without key, 240 with key 45 | - **Concurrent Request Limiting**: Maximum 10 concurrent requests via semaphore 46 | - **Automatic Backoff**: Delays requests when approaching limits 47 | 48 | **Implementation**: `src/biomcp/openfda/rate_limiter.py` 49 | 50 | ### 4. Circuit Breaker Pattern 51 | 52 | Prevents cascading failures when FDA API is unavailable: 53 | 54 | - **Failure Threshold**: Opens after 5 consecutive failures 55 | - **Recovery Timeout**: Waits 60 seconds before retry attempts 56 | - **Half-Open State**: Tests recovery with limited requests 57 | - **Automatic Recovery**: Returns to normal operation when API recovers 58 | 59 | **States**: 60 | 61 | - **CLOSED**: Normal operation 62 | - **OPEN**: Blocking all requests (API is down) 63 | - **HALF_OPEN**: Testing if API has recovered 64 | 65 | ### 5. 
Memory Protection 66 | 67 | Prevents memory exhaustion from large responses: 68 | 69 | - **Response Size Limits**: Maximum 1MB per cached response 70 | - **Cache Size Limits**: Maximum 100 entries in cache 71 | - **FIFO Eviction**: Oldest entries removed when cache is full 72 | - **Size Validation**: Large responses rejected before caching 73 | 74 | **Configuration**: 75 | 76 | ```bash 77 | export BIOMCP_FDA_MAX_RESPONSE_SIZE=1048576 # 1MB 78 | export BIOMCP_FDA_MAX_CACHE_SIZE=100 79 | ``` 80 | 81 | ### 6. File Operation Security 82 | 83 | Secure handling of cache files: 84 | 85 | - **File Locking**: Uses `fcntl` for exclusive/shared locks 86 | - **Atomic Operations**: Writes to temp files then renames 87 | - **Race Condition Prevention**: Locks prevent concurrent modifications 88 | - **Permission Control**: Files created without world-write permissions 89 | 90 | **Implementation**: `src/biomcp/openfda/drug_shortages.py` 91 | 92 | ## Security Best Practices 93 | 94 | ### For Developers 95 | 96 | 1. **Never Log Sensitive Data** 97 | 98 | ```python 99 | # BAD 100 | logger.debug(f"API key: {api_key}") 101 | 102 | # GOOD 103 | logger.debug("API key configured" if api_key else "No API key") 104 | ``` 105 | 106 | 2. **Always Validate Input** 107 | 108 | ```python 109 | from biomcp.openfda.input_validation import validate_drug_name 110 | 111 | # Always validate before using 112 | safe_drug = validate_drug_name(user_input) 113 | if safe_drug: 114 | # Use safe_drug, not user_input 115 | await search_adverse_events(drug=safe_drug) 116 | ``` 117 | 118 | 3. **Use Rate Limiting** 119 | 120 | ```python 121 | from biomcp.openfda.rate_limiter import rate_limited_request 122 | 123 | # Wrap API calls with rate limiting 124 | result = await rate_limited_request(make_api_call, params) 125 | ``` 126 | 127 | ### For System Administrators 128 | 129 | 1. 
**API Key Management** 130 | 131 | - Store API keys in environment variables 132 | - Rotate keys regularly (recommended: every 90 days) 133 | - Use different keys for dev/staging/production 134 | - Monitor key usage for anomalies 135 | 136 | 2. **Monitoring** 137 | 138 | - Set up alerts for circuit breaker state changes 139 | - Monitor rate limit consumption 140 | - Track cache hit/miss ratios 141 | - Log validation failures (potential attacks) 142 | 143 | 3. **Resource Limits** 144 | ```bash 145 | # Configure limits based on your environment 146 | export BIOMCP_FDA_CACHE_TTL=15 # Minutes 147 | export BIOMCP_FDA_MAX_CACHE_SIZE=100 148 | export BIOMCP_FDA_MAX_RESPONSE_SIZE=1048576 # 1MB 149 | ``` 150 | 151 | ## Threat Model 152 | 153 | ### Threats Addressed 154 | 155 | | Threat | Mitigation | Implementation | 156 | | ------------------- | --------------------------- | ---------------------- | 157 | | SQL Injection | Input sanitization | `input_validation.py` | 158 | | XSS Attacks | HTML/JS character removal | `sanitize_input()` | 159 | | Command Injection | Shell metacharacter removal | `sanitize_input()` | 160 | | API Key Exposure | Exclusion from logs/cache | `cache.py`, `utils.py` | 161 | | DoS via Rate Limits | Client-side rate limiting | `rate_limiter.py` | 162 | | Cascading Failures | Circuit breaker pattern | `CircuitBreaker` class | 163 | | Memory Exhaustion | Response size limits | `MAX_RESPONSE_SIZE` | 164 | | Race Conditions | File locking | `fcntl` usage | 165 | | Cache Poisoning | Input validation | `build_safe_query()` | 166 | 167 | ### Residual Risks 168 | 169 | 1. **API Key Compromise**: If environment is compromised, keys are accessible 170 | 171 | - **Mitigation**: Use secret management systems in production 172 | 173 | 2. **Zero-Day FDA API Vulnerabilities**: Unknown vulnerabilities in FDA API 174 | 175 | - **Mitigation**: Monitor FDA security advisories 176 | 177 | 3. 
**Distributed DoS**: Multiple clients could still overwhelm the FDA API 178 | - **Mitigation**: Implement global rate limiting at the gateway level 179 | 180 | ## Compliance Considerations 181 | 182 | ### HIPAA (If Applicable) 183 | 184 | While the FDA's public APIs don't contain PHI, the following controls apply if this integration is extended to include patient data: 185 | 186 | 1. **Encryption**: Use TLS for all API communications 187 | 2. **Audit Logging**: Log all data access (but not the data itself) 188 | 3. **Access Controls**: Implement user authentication/authorization 189 | 4. **Data Retention**: Define and enforce retention policies 190 | 191 | ### FDA Data Usage 192 | 193 | 1. **Attribution**: Always include FDA disclaimers in responses 194 | 2. **Data Currency**: Warn users that data may not be real-time 195 | 3. **Medical Decisions**: Explicitly state data is not for clinical decisions 196 | 4. **Rate Limits**: Respect FDA's terms of service 197 | 198 | ## Security Testing 199 | 200 | ### Automated Tests 201 | 202 | Run security tests with: 203 | 204 | ```bash 205 | pytest tests/tdd/openfda/test_security.py -v 206 | ``` 207 | 208 | Tests cover: 209 | 210 | - Input validation 211 | - Cache key security 212 | - Rate limiting 213 | - Circuit breaker 214 | - File operations 215 | 216 | ### Manual Security Review 217 | 218 | Checklist for security review: 219 | 220 | - [ ] No sensitive data in logs 221 | - [ ] All inputs validated 222 | - [ ] Rate limiting functional 223 | - [ ] Circuit breaker triggers correctly 224 | - [ ] Cache size limited 225 | - [ ] File operations are atomic 226 | - [ ] API keys not in cache keys 227 | - [ ] Error messages don't leak information 228 | 229 | ## Incident Response 230 | 231 | ### If API Key is Compromised 232 | 233 | 1. **Immediate**: Revoke compromised key at FDA portal 234 | 2. **Generate**: Create new API key 235 | 3. **Update**: Update environment variables 236 | 4. **Restart**: Restart services to load new key 237 | 5. 
**Audit**: Review logs for unauthorized usage 238 | 239 | ### If Rate Limits Exceeded 240 | 241 | 1. **Check**: Verify circuit breaker state 242 | 2. **Wait**: Allow circuit breaker recovery timeout 243 | 3. **Reduce**: Lower request rate if needed 244 | 4. **Monitor**: Check for abnormal usage patterns 245 | 246 | ### If Security Vulnerability Found 247 | 248 | 1. **Assess**: Determine severity and exploitability 249 | 2. **Patch**: Develop and test fix 250 | 3. **Deploy**: Roll out fix with monitoring 251 | 4. **Document**: Update this security documentation 252 | 5. **Notify**: Inform users if data was at risk 253 | 254 | ## Configuration Reference 255 | 256 | ### Environment Variables 257 | 258 | | Variable | Default | Description | 259 | | ------------------------------ | ------- | ---------------------------------- | 260 | | `OPENFDA_API_KEY` | None | FDA API key for higher rate limits | 261 | | `BIOMCP_FDA_CACHE_TTL` | 15 | Cache TTL in minutes | 262 | | `BIOMCP_FDA_MAX_CACHE_SIZE` | 100 | Maximum cache entries | 263 | | `BIOMCP_FDA_MAX_RESPONSE_SIZE` | 1048576 | Maximum response size in bytes | 264 | | `BIOMCP_SHORTAGE_CACHE_TTL` | 24 | Drug shortage cache TTL in hours | 265 | 266 | ### Security Headers 267 | 268 | When deploying as a web service, add these headers: 269 | 270 | ```python 271 | headers = { 272 | "X-Content-Type-Options": "nosniff", 273 | "X-Frame-Options": "DENY", 274 | "X-XSS-Protection": "1; mode=block", 275 | "Strict-Transport-Security": "max-age=31536000; includeSubDomains", 276 | "Content-Security-Policy": "default-src 'self'" 277 | } 278 | ``` 279 | 280 | ## Contact 281 | 282 | For security issues, contact: [email protected] (create this address) 283 | 284 | For FDA API issues, see: https://open.fda.gov/apis/ 285 | 286 | --- 287 | 288 | _Last Updated: 2025-08-07_ 289 | _Version: 1.0_ 290 | ``` -------------------------------------------------------------------------------- /tests/tdd/variants/test_cbioportal_search.py: 
class TestCBioPortalSearch:
    """Tests for the cBioPortal gene-level search summary.

    Most tests here call the live cBioPortal API and are marked as
    integration tests. Because live data changes, they check structure and
    plausibility rather than exact counts.
    """

    @staticmethod
    async def _fetch_summary_or_skip(gene: str, max_studies: int):
        """Fetch a fresh summary for ``gene``, retrying once on empty data.

        The request cache is cleared first so the call is not served from a
        cached response. If the first response holds neither mutations nor
        tested samples, the call is repeated after
        ``API_RETRY_DELAY_SECONDS``; if that also yields nothing the calling
        test is skipped instead of failed, since the cause is most likely a
        transient API problem.

        Returns:
            The summary object, guaranteed to report at least one mutation.
        """
        # Function-scope import, as in the original tests.
        from biomcp.utils.request_cache import clear_cache

        client = CBioPortalSearchClient()
        await clear_cache()

        summary = await client.get_gene_search_summary(
            gene, max_studies=max_studies
        )
        assert summary is not None, f"Failed to get summary for {gene}"
        assert summary.gene == gene

        if summary.total_mutations == 0 and summary.total_samples_tested == 0:
            await asyncio.sleep(API_RETRY_DELAY_SECONDS)
            summary = await client.get_gene_search_summary(
                gene, max_studies=max_studies
            )
            # Guard against None as well: the retry may fail outright.
            if summary is None or summary.total_mutations == 0:
                pytest.skip(
                    f"cBioPortal returned no mutation data for {gene}"
                    " - possible API issue"
                )

        # Samples were tested but no mutation was counted: a real failure.
        assert (
            summary.total_mutations > 0
        ), f"{gene} should have mutations. Got: {summary}"
        return summary

    @pytest.mark.asyncio
    @pytest.mark.integration
    async def test_gene_search_summary(self):
        """Test getting gene search summary from cBioPortal."""
        client = CBioPortalSearchClient()

        summary = await client.get_gene_search_summary("BRAF", max_studies=5)

        assert summary is not None
        assert summary.gene == "BRAF"

        if summary.total_mutations == 0:
            # The API may be down or returning empty results; accept that,
            # but require the summary to be empty consistently.
            assert summary.total_samples_tested == 0
            assert summary.mutation_frequency == 0.0
            assert len(summary.hotspots) == 0
            assert len(summary.cancer_distribution) == 0
            return

        # Normal case - data is available
        assert summary.total_samples_tested > 0
        assert summary.mutation_frequency > 0
        assert len(summary.hotspots) > 0

        assert any(
            "V600E" in hs.amino_acid_change for hs in summary.hotspots
        ), "BRAF V600E should be a top hotspot"

        assert len(summary.cancer_distribution) > 0
        assert any(
            "melanoma" in cancer.lower()
            for cancer in summary.cancer_distribution
        ), "BRAF should be found in melanoma"

    def test_format_search_summary(self):
        """Test formatting of a hand-built search summary.

        This test builds its own summary object and awaits nothing, so it
        is a plain synchronous test without the integration marker.
        """
        summary = CBioPortalSearchSummary(
            gene="BRAF",
            total_mutations=1000,
            total_samples_tested=10000,
            mutation_frequency=0.1,
            hotspots=[
                {
                    "position": 600,
                    "amino_acid_change": "V600E",
                    "count": 800,
                    "frequency": 0.8,
                    "cancer_types": ["Melanoma", "Colorectal Cancer"],
                }
            ],
            cancer_distribution={"Melanoma": 600, "Colorectal Cancer": 200},
            study_coverage={
                "total_studies": 50,
                "queried_studies": 10,
                "studies_with_data": 8,
            },
        )

        formatted = format_cbioportal_search_summary(summary)

        assert "BRAF" in formatted
        assert "10.0%" in formatted  # Mutation frequency
        assert "V600E" in formatted
        assert "Melanoma" in formatted
        assert "600 mutations" in formatted

    @pytest.mark.asyncio
    @pytest.mark.integration
    async def test_search_with_cbioportal_summary(self):
        """Test variant search with cBioPortal summary included."""
        query = VariantQuery(gene="BRAF", size=5)

        result = await search_variants(query, include_cbioportal=True)

        # Should include cBioPortal summary section
        assert "cBioPortal Summary for BRAF" in result
        assert "Mutation Frequency" in result
        # "Top Hotspots" only appears when mutations are found, so accept
        # the zero-mutation message as the alternative.
        assert "Top Hotspots" in result or "0 mutations" in result

        # Should still include variant results
        assert "# Record" in result or "No variants found" in result

    @pytest.mark.asyncio
    @pytest.mark.integration
    async def test_search_without_gene(self):
        """Test that no cBioPortal summary is added without a gene."""
        query = VariantQuery(rsid="rs113488022", size=5)

        result = await search_variants(query, include_cbioportal=True)

        assert "cBioPortal Summary" not in result

    @pytest.mark.asyncio
    @pytest.mark.integration
    async def test_tp53_search_summary(self):
        """Test TP53 gene search summary."""
        summary = await self._fetch_summary_or_skip("TP53", max_studies=5)

        # Hotspot contents change with cBioPortal releases, so only the
        # structure of the leading entries is verified.
        if summary.hotspots:
            for hotspot in summary.hotspots[:3]:
                assert hasattr(hotspot, "amino_acid_change")
                assert hasattr(hotspot, "count")
            hotspot_changes = [hs.amino_acid_change for hs in summary.hotspots]
            print(f"TP53 hotspots found: {hotspot_changes[:5]}")

    @pytest.mark.asyncio
    @pytest.mark.integration
    async def test_kras_search_summary(self):
        """Test KRAS gene search summary.

        This test verifies basic functionality rather than specific hotspots,
        which can change as cBioPortal data is updated.
        """
        summary = await self._fetch_summary_or_skip(
            "KRAS", max_studies=DEFAULT_MAX_STUDIES
        )

        if summary.hotspots:
            for hotspot in summary.hotspots[:3]:
                assert hasattr(hotspot, "amino_acid_change")
                assert hasattr(hotspot, "count")
            top_changes = [hs.amino_acid_change for hs in summary.hotspots[:5]]
            print(f"Top KRAS hotspots: {top_changes}")

        # The helper guarantees mutations exist, so a distribution must too.
        assert (
            len(summary.cancer_distribution) > 0
        ), "Should have cancer type distribution"

    @pytest.mark.asyncio
    @pytest.mark.integration
    async def test_invalid_gene(self):
        """Test handling of invalid gene name."""
        client = CBioPortalSearchClient()

        summary = await client.get_gene_search_summary("INVALID_GENE")

        assert summary is None

    @pytest.mark.asyncio
    @pytest.mark.integration
    async def test_json_output_with_cbioportal(self):
        """Test JSON output includes cBioPortal summary."""
        # Function-scope import, as in the original test.
        import json

        query = VariantQuery(gene="BRAF", size=2)

        result = await search_variants(
            query, output_json=True, include_cbioportal=True
        )

        data = json.loads(result)

        # Should have both summary and variants
        assert "cbioportal_summary" in data
        assert "variants" in data
        assert "BRAF" in data["cbioportal_summary"]
class TestUnifiedSearch:
    """Unit tests for the unified (PubMed + preprint) article search.

    The article sources are replaced with mocks in every test. Tests that
    search by gene also replace the cBioPortal client through the
    ``no_cbioportal_summary`` fixture, so they stay offline.
    """

    @pytest.fixture
    def pubmed_results(self):
        """Sample PubMed results in JSON format."""
        return json.dumps([
            {
                "pmid": 12345,
                "title": "BRAF mutations in cancer",
                "doi": "10.1234/test1",
                "date": "2024-01-15",
                "publication_state": "peer_reviewed",
            },
            {
                "pmid": 12346,
                "title": "Another cancer study",
                "doi": "10.1234/test2",
                "date": "2024-01-10",
                "publication_state": "peer_reviewed",
            },
        ])

    @pytest.fixture
    def preprint_results(self):
        """Sample preprint results in JSON format."""
        return json.dumps([
            {
                "title": "BRAF preprint study",
                "doi": "10.1101/2024.01.20.123456",
                "date": "2024-01-20",
                "publication_state": "preprint",
                "source": "bioRxiv",
            },
            {
                "title": "Duplicate study",
                "doi": "10.1234/test1",  # Same DOI as PubMed result
                "date": "2024-01-14",
                "publication_state": "preprint",
                "source": "Europe PMC",
            },
        ])

    @pytest.fixture
    def no_cbioportal_summary(self):
        """Patch the cBioPortal client so it returns no summary.

        Gene-based requests use this fixture so the summary lookup hits a
        mock (returning ``None``) instead of the real client.
        """
        with patch(
            "biomcp.variants.cbioportal_search.CBioPortalSearchClient"
        ) as mock_cbio:
            mock_cbio.return_value.get_gene_search_summary = AsyncMock(
                return_value=None
            )
            yield mock_cbio

    @pytest.mark.asyncio
    async def test_search_articles_unified_both_sources(
        self, pubmed_results, preprint_results, no_cbioportal_summary
    ):
        """Test searching with both PubMed and preprints enabled."""
        request = PubmedRequest(genes=["BRAF"])

        mock_pubmed = AsyncMock(return_value=pubmed_results)
        mock_preprints = AsyncMock(return_value=preprint_results)

        with (
            patch("biomcp.articles.unified.search_articles", mock_pubmed),
            patch("biomcp.articles.unified.search_preprints", mock_preprints),
        ):
            result = await search_articles_unified(
                request,
                include_pubmed=True,
                include_preprints=True,
                output_json=True,
            )

        data = json.loads(result)

        # With a gene but no cBioPortal data the payload may be either a
        # bare list or a dict wrapping the list under "articles".
        if isinstance(data, dict):
            articles = data.get("articles", data)
        else:
            articles = data

        # Should have 3 articles (one duplicate removed)
        assert len(articles) == 3

        # The preprint carries the newest date, so exact ordering is not
        # asserted; only the mix of publication states is.
        states = [a["publication_state"] for a in articles]
        assert states.count("peer_reviewed") == 2
        assert states.count("preprint") == 1

        # Check deduplication worked
        dois = [a.get("doi") for a in articles if a.get("doi")]
        assert len(dois) == len(set(dois))  # No duplicate DOIs

    @pytest.mark.asyncio
    async def test_search_articles_unified_pubmed_only(self, pubmed_results):
        """Test searching with only PubMed enabled."""
        request = PubmedRequest(
            keywords=["cancer"]
        )  # No gene, so no cBioPortal

        with (
            patch("biomcp.articles.unified.search_articles") as mock_pubmed,
            patch(
                "biomcp.articles.unified.search_preprints"
            ) as mock_preprints,
        ):
            mock_pubmed.return_value = pubmed_results

            result = await search_articles_unified(
                request,
                include_pubmed=True,
                include_preprints=False,
                output_json=True,
            )

            # Preprints should not be called
            mock_preprints.assert_not_called()

        articles = json.loads(result)
        assert len(articles) == 2
        assert all(
            a["publication_state"] == "peer_reviewed" for a in articles
        )

    @pytest.mark.asyncio
    async def test_search_articles_unified_preprints_only(
        self, preprint_results
    ):
        """Test searching with only preprints enabled."""
        request = PubmedRequest(
            keywords=["cancer"]
        )  # No gene, so no cBioPortal

        with (
            patch("biomcp.articles.unified.search_articles") as mock_pubmed,
            patch(
                "biomcp.articles.unified.search_preprints"
            ) as mock_preprints,
        ):
            mock_preprints.return_value = preprint_results

            result = await search_articles_unified(
                request,
                include_pubmed=False,
                include_preprints=True,
                output_json=True,
            )

            # PubMed should not be called
            mock_pubmed.assert_not_called()

        articles = json.loads(result)
        assert len(articles) == 2
        assert all(a["publication_state"] == "preprint" for a in articles)

    @pytest.mark.asyncio
    async def test_search_articles_unified_error_handling(self):
        """Test error handling when one source fails."""
        request = PubmedRequest(
            keywords=["cancer"]
        )  # No gene, so no cBioPortal

        with (
            patch("biomcp.articles.unified.search_articles") as mock_pubmed,
            patch(
                "biomcp.articles.unified.search_preprints"
            ) as mock_preprints,
        ):
            # PubMed succeeds
            mock_pubmed.return_value = json.dumps([{"title": "Success"}])
            # Preprints fails
            mock_preprints.side_effect = Exception("API Error")

            result = await search_articles_unified(
                request,
                include_pubmed=True,
                include_preprints=True,
                output_json=True,
            )

        # Should still get PubMed results
        articles = json.loads(result)
        assert len(articles) == 1
        assert articles[0]["title"] == "Success"

    @pytest.mark.asyncio
    async def test_search_articles_unified_markdown_output(
        self, pubmed_results, no_cbioportal_summary
    ):
        """Test markdown output format."""
        request = PubmedRequest(genes=["BRAF"])

        mock_pubmed = AsyncMock(return_value=pubmed_results)

        with patch("biomcp.articles.unified.search_articles", mock_pubmed):
            result = await search_articles_unified(
                request,
                include_pubmed=True,
                include_preprints=False,
                output_json=False,
            )

        # Should return markdown
        assert isinstance(result, str)
        assert "BRAF mutations in cancer" in result
        assert "# Record" in result  # Markdown headers

    def test_deduplicate_articles(self):
        """Test article deduplication logic."""
        articles = [
            {"title": "Article 1", "doi": "10.1234/test1"},
            {"title": "Article 2", "doi": "10.1234/test2"},
            {"title": "Duplicate of 1", "doi": "10.1234/test1"},
            {"title": "No DOI article"},
            {"title": "Another no DOI"},
        ]

        deduped = _deduplicate_articles(articles)

        # Should have 4 articles (one duplicate removed)
        assert len(deduped) == 4

        # Check DOIs are unique
        dois = [a.get("doi") for a in deduped if a.get("doi")]
        assert len(dois) == len(set(dois))

        # Articles without DOI should be preserved
        no_doi_count = sum(1 for a in deduped if not a.get("doi"))
        assert no_doi_count == 2

    def test_parse_search_results(self):
        """Test parsing of search results from multiple sources."""
        results = [
            json.dumps([{"title": "Article 1"}, {"title": "Article 2"}]),
            json.dumps([{"title": "Article 3"}]),
            Exception("Failed source"),  # Should be skipped
            "[invalid json",  # Should be skipped
        ]

        parsed = _parse_search_results(results)

        # Should have 3 articles (2 + 1, skipping errors)
        assert len(parsed) == 3
        assert parsed[0]["title"] == "Article 1"
        assert parsed[1]["title"] == "Article 2"
        assert parsed[2]["title"] == "Article 3"

    def test_parse_search_results_empty(self):
        """Test parsing with all empty/failed results."""
        results = [
            Exception("Failed"),
            "[invalid",
            json.dumps([]),  # Empty list
        ]

        parsed = _parse_search_results(results)
        assert parsed == []
"""Test parsing with all empty/failed results.""" 271 | results = [ 272 | Exception("Failed"), 273 | "[invalid", 274 | json.dumps([]), # Empty list 275 | ] 276 | 277 | parsed = _parse_search_results(results) 278 | assert parsed == [] 279 | ``` -------------------------------------------------------------------------------- /src/biomcp/openfda/device_events.py: -------------------------------------------------------------------------------- ```python 1 | """ 2 | OpenFDA Device Adverse Events (MAUDE) integration. 3 | 4 | Focus on genomic/diagnostic devices relevant to precision oncology. 5 | """ 6 | 7 | import logging 8 | 9 | from .constants import ( 10 | GENOMIC_DEVICE_PRODUCT_CODES, 11 | OPENFDA_DEFAULT_LIMIT, 12 | OPENFDA_DEVICE_EVENTS_URL, 13 | OPENFDA_DISCLAIMER, 14 | OPENFDA_MAX_LIMIT, 15 | ) 16 | from .device_events_helpers import ( 17 | analyze_device_problems, 18 | format_detailed_device_info, 19 | format_device_detail_header, 20 | format_device_distribution, 21 | format_device_report_summary, 22 | format_patient_details, 23 | format_top_problems, 24 | ) 25 | from .utils import clean_text, format_count, make_openfda_request 26 | 27 | logger = logging.getLogger(__name__) 28 | 29 | 30 | def _build_device_search_query( 31 | device: str | None, 32 | manufacturer: str | None, 33 | problem: str | None, 34 | product_code: str | None, 35 | genomics_only: bool, 36 | ) -> str: 37 | """Build the search query for device events.""" 38 | search_parts = [] 39 | 40 | if device: 41 | # Build flexible search queries 42 | device_queries = [] 43 | 44 | # First try exact match 45 | device_queries.extend([ 46 | f'device.brand_name:"{device}"', 47 | f'device.generic_name:"{device}"', 48 | f'device.openfda.device_name:"{device}"', 49 | ]) 50 | 51 | # For multi-word terms, also search for key words with wildcards 52 | # This helps match "FoundationOne CDx" to "F1CDX" or similar variations 53 | words = device.split() 54 | 55 | # If it's a multi-word query, add wildcard searches for 
significant words 56 | for word in words: 57 | # Skip common words and very short ones 58 | if len(word) > 3 and word.lower() not in [ 59 | "test", 60 | "system", 61 | "device", 62 | ]: 63 | # Use prefix wildcard for better performance 64 | device_queries.append(f"device.brand_name:{word}*") 65 | device_queries.append(f"device.generic_name:{word}*") 66 | 67 | # Also try searching by removing spaces (e.g., "Foundation One" -> "FoundationOne") 68 | if len(words) > 1: 69 | combined = "".join(words) 70 | device_queries.append(f'device.brand_name:"{combined}"') 71 | device_queries.append(f'device.generic_name:"{combined}"') 72 | 73 | search_parts.append(f"({' OR '.join(device_queries)})") 74 | 75 | if manufacturer: 76 | # Search manufacturer field with both exact and wildcard matching 77 | mfr_queries = [ 78 | f'device.manufacturer_d_name:"{manufacturer}"', 79 | f"device.manufacturer_d_name:*{manufacturer}*", 80 | ] 81 | search_parts.append(f"({' OR '.join(mfr_queries)})") 82 | 83 | if problem: 84 | search_parts.append(f'device.device_problem_text:"{problem}"') 85 | 86 | if product_code: 87 | search_parts.append(f'device.openfda.product_code:"{product_code}"') 88 | elif ( 89 | genomics_only and not device 90 | ): # Only apply genomics filter if no specific device is named 91 | # Filter to genomic device product codes 92 | code_parts = [ 93 | f'device.openfda.product_code:"{code}"' 94 | for code in GENOMIC_DEVICE_PRODUCT_CODES 95 | ] 96 | if code_parts: 97 | search_parts.append(f"({' OR '.join(code_parts)})") 98 | 99 | return " AND ".join(search_parts) 100 | 101 | 102 | def _format_search_summary( 103 | device: str | None, 104 | manufacturer: str | None, 105 | problem: str | None, 106 | genomics_only: bool, 107 | total: int, 108 | ) -> list[str]: 109 | """Format the search summary section.""" 110 | output = [] 111 | 112 | search_desc = [] 113 | if device: 114 | search_desc.append(f"**Device**: {device}") 115 | if manufacturer: 116 | search_desc.append(f"**Manufacturer**: 
def _format_search_summary(
    device: str | None,
    manufacturer: str | None,
    problem: str | None,
    genomics_only: bool,
    total: int,
    product_code: str | None = None,
) -> list[str]:
    """Format the search summary section.

    Args:
        device: Device name that was searched, if any.
        manufacturer: Manufacturer that was searched, if any.
        problem: Problem text that was searched, if any.
        genomics_only: Whether to show the genomic/diagnostic "Type" label.
            Callers should pass true only when that filter was part of the
            query.
        total: Total number of matching reports.
        product_code: FDA product code that was searched, if any.

    Returns:
        Markdown lines: the criteria joined by ``" | "`` (when there are
        any) followed by the total report count.
    """
    search_desc = []
    if device:
        search_desc.append(f"**Device**: {device}")
    if manufacturer:
        search_desc.append(f"**Manufacturer**: {manufacturer}")
    if problem:
        search_desc.append(f"**Problem**: {problem}")
    if product_code:
        search_desc.append(f"**Product Code**: {product_code}")
    if genomics_only:
        search_desc.append("**Type**: Genomic/Diagnostic Devices")

    output = []
    if search_desc:
        output.append(" | ".join(search_desc))
    output.append(
        f"**Total Reports Found**: {format_count(total, 'report')}\n"
    )

    return output


async def search_device_events(
    device: str | None = None,
    manufacturer: str | None = None,
    problem: str | None = None,
    product_code: str | None = None,
    genomics_only: bool = True,
    limit: int = OPENFDA_DEFAULT_LIMIT,
    skip: int = 0,
    api_key: str | None = None,
) -> str:
    """
    Search FDA device adverse event reports (MAUDE).

    Args:
        device: Device name to search for
        manufacturer: Manufacturer name
        problem: Device problem description
        product_code: FDA product code
        genomics_only: Filter to genomic/diagnostic devices only. The query
            builder only applies this filter when neither a device name nor
            a product code is given, and the output only mentions the
            filter in that case.
        limit: Maximum number of results (clamped to
            1..OPENFDA_MAX_LIMIT)
        skip: Number of results to skip (negative values are treated as 0)
        api_key: Optional OpenFDA API key (overrides OPENFDA_API_KEY env var)

    Returns:
        Formatted string with device event information
    """
    # Blank or whitespace-only criteria are treated as not supplied, so they
    # cannot slip past the guard below and produce an empty phrase query.
    device, manufacturer, problem, product_code = (
        (value.strip() or None) if isinstance(value, str) else value
        for value in (device, manufacturer, problem, product_code)
    )

    if not (device or manufacturer or product_code or problem):
        return (
            "⚠️ Please specify a device name, manufacturer, or problem to search.\n\n"
            "Examples:\n"
            "- Search by device: --device 'FoundationOne'\n"
            "- Search by manufacturer: --manufacturer 'Illumina'\n"
            "- Search by problem: --problem 'false positive'"
        )

    # Mirrors the condition under which _build_device_search_query adds the
    # genomic product-code clause; used so the output never claims a filter
    # that was not part of the query.
    genomics_filter_applied = bool(
        genomics_only and not device and not product_code
    )

    # Build and execute search
    search_query = _build_device_search_query(
        device, manufacturer, problem, product_code, genomics_only
    )
    params = {
        "search": search_query,
        "limit": max(1, min(limit, OPENFDA_MAX_LIMIT)),
        "skip": max(0, skip),
    }

    response, error = await make_openfda_request(
        OPENFDA_DEVICE_EVENTS_URL, params, "openfda_device_events", api_key
    )

    if error:
        return f"⚠️ Error searching device events: {error}"

    if not response or not response.get("results"):
        return _format_no_results(
            device,
            manufacturer,
            problem,
            genomics_filter_applied,
            product_code,
        )

    results = response["results"]
    # Fall back to the page size when the response carries no total.
    total = (
        response.get("meta", {}).get("results", {}).get("total", len(results))
    )

    # Build output
    output = ["## FDA Device Adverse Event Reports\n"]
    output.extend(
        _format_search_summary(
            device,
            manufacturer,
            problem,
            genomics_filter_applied,
            total,
            product_code,
        )
    )

    # Analyze and format problems
    all_problems, all_device_names, _ = analyze_device_problems(results)
    output.extend(format_top_problems(all_problems, results))

    # Show device distribution if searching by problem
    if problem:
        output.extend(format_device_distribution(all_device_names, results))

    # Display sample reports (at most three)
    output.append(
        f"### Sample Reports (showing {min(len(results), 3)} of {total}):\n"
    )
    for i, result in enumerate(results[:3], 1):
        output.extend(format_device_report_summary(result, i))

    # Add tips
    if genomics_filter_applied:
        output.append(
            "\n💡 **Note**: Results filtered to genomic/diagnostic devices. "
            "Use --no-genomics-only to search all medical devices."
        )

    output.append(f"\n{OPENFDA_DISCLAIMER}")
    return "\n".join(output)


def _format_no_results(
    device: str | None,
    manufacturer: str | None,
    problem: str | None,
    genomics_only: bool,
    product_code: str | None = None,
) -> str:
    """Format no results message.

    Args:
        device: Device name that was searched, if any.
        manufacturer: Manufacturer that was searched, if any.
        problem: Problem text that was searched, if any.
        genomics_only: Whether to append the genomic/diagnostic filter note.
        product_code: FDA product code that was searched, if any.

    Returns:
        A one-line message naming the criteria that produced no reports.
    """
    search_desc = []
    if device:
        search_desc.append(f"device '{device}'")
    if manufacturer:
        search_desc.append(f"manufacturer '{manufacturer}'")
    if problem:
        search_desc.append(f"problem '{problem}'")
    if product_code:
        search_desc.append(f"product code '{product_code}'")

    # Without a fallback the message would read "... found for ."
    desc = " and ".join(search_desc) or "the given search criteria"
    if genomics_only:
        desc += " (filtered to genomic/diagnostic devices)"

    return f"No device adverse event reports found for {desc}."


async def get_device_event(
    mdr_report_key: str, api_key: str | None = None
) -> str:
    """
    Get detailed information for a specific device event report.

    Args:
        mdr_report_key: MDR report key
        api_key: Optional OpenFDA API key (overrides OPENFDA_API_KEY env var)

    Returns:
        Formatted string with detailed report information
    """
    # Escape characters that would end the quoted phrase early.
    quoted_key = str(mdr_report_key).replace("\\", "\\\\").replace('"', '\\"')
    params = {
        "search": f'mdr_report_key:"{quoted_key}"',
        "limit": 1,
    }

    response, error = await make_openfda_request(
        OPENFDA_DEVICE_EVENTS_URL,
        params,
        "openfda_device_event_detail",
        api_key,
    )

    if error:
        return f"⚠️ Error retrieving device event report: {error}"

    if not response or not response.get("results"):
        return f"Device event report '{mdr_report_key}' not found."

    result = response["results"][0]

    # Build detailed output
    output = format_device_detail_header(result, mdr_report_key)

    # Device details
    if devices := result.get("device", []):
        output.extend(format_detailed_device_info(devices))

    # Event narrative
    if event_desc := result.get("event_description"):
        output.append("### Event Description")
        output.append(clean_text(event_desc))
        output.append("")

    # Manufacturer narrative
    if mfr_narrative := result.get("manufacturer_narrative"):
        output.append("### Manufacturer's Analysis")
        output.append(clean_text(mfr_narrative))
        output.append("")

    # Patient information
    if patient := result.get("patient", []):
        output.extend(format_patient_details(patient))

    # Remedial action: coerce to text so the final join cannot fail on
    # non-string values.
    if remedial := result.get("remedial_action"):
        output.append("### Remedial Action")
        if isinstance(remedial, list):
            output.append(", ".join(str(action) for action in remedial))
        else:
            output.append(str(remedial))
        output.append("")

    output.append(f"\n{OPENFDA_DISCLAIMER}")
    return "\n".join(output)
4 | 5 | ## Installation Issues 6 | 7 | ### Prerequisites Not Met 8 | 9 | **macOS:** 10 | 11 | ```bash 12 | # Install uv (recommended) 13 | brew install uv 14 | 15 | # Or using the official installer 16 | curl -LsSf https://astral.sh/uv/install.sh | sh 17 | 18 | # Install Node.js for npx (if needed) 19 | brew install node 20 | ``` 21 | 22 | **Linux:** 23 | 24 | ```bash 25 | # Install uv 26 | curl -LsSf https://astral.sh/uv/install.sh | sh 27 | 28 | # Install Node.js 29 | curl -fsSL https://deb.nodesource.com/setup_lts.x | sudo -E bash - 30 | sudo apt-get install -y nodejs 31 | ``` 32 | 33 | **Windows:** 34 | 35 | ```powershell 36 | # Install uv 37 | powershell -c "irm https://astral.sh/uv/install.ps1 | iex" 38 | 39 | # Install Node.js from https://nodejs.org 40 | ``` 41 | 42 | ### "Command not found" Error 43 | 44 | After installing BioMCP, if you get "command not found": 45 | 46 | 1. **Restart your terminal** - PATH updates require a new session 47 | 48 | 2. **Check installation location:** 49 | 50 | ```bash 51 | # For uv tool install 52 | ls ~/.local/bin/biomcp 53 | 54 | # For pip install 55 | which biomcp 56 | ``` 57 | 58 | 3. **Add to PATH manually:** 59 | 60 | ```bash 61 | # Add to ~/.bashrc or ~/.zshrc 62 | export PATH="$HOME/.local/bin:$PATH" 63 | ``` 64 | 65 | 4. **Reinstall with force:** 66 | 67 | ```bash 68 | uv tool install biomcp --force 69 | ``` 70 | 71 | 5. 
**Use full path:** 72 | ```bash 73 | ~/.local/bin/biomcp --version 74 | ``` 75 | 76 | ### Python Version Issues 77 | 78 | BioMCP requires Python 3.10 or higher: 79 | 80 | ```bash 81 | # Check Python version 82 | python --version 83 | 84 | # If too old, install newer version 85 | # macOS 86 | brew install python@3.11 87 | 88 | # Linux 89 | sudo apt update 90 | sudo apt install python3.11 91 | 92 | # Use pyenv for version management 93 | pyenv install 3.11.8 94 | pyenv local 3.11.8 95 | ``` 96 | 97 | ## Configuration Issues 98 | 99 | ### API Key Not Working 100 | 101 | **Environment Variable Not Set:** 102 | 103 | ```bash 104 | # Check if set 105 | echo $NCI_API_KEY 106 | 107 | # Set temporarily 108 | export NCI_API_KEY="your-key-here" 109 | 110 | # Set permanently in ~/.bashrc or ~/.zshrc 111 | echo 'export NCI_API_KEY="your-key-here"' >> ~/.bashrc 112 | source ~/.bashrc 113 | ``` 114 | 115 | **Wrong API Key Format:** 116 | 117 | - NCI keys: Should be 36 characters (UUID format) 118 | - AlphaGenome: Alphanumeric string 119 | - cBioPortal: JWT token format 120 | 121 | **API Key Permissions:** 122 | 123 | ```bash 124 | # Test NCI API key 125 | biomcp health check --verbose 126 | 127 | # Test specific API 128 | curl -H "X-API-KEY: $NCI_API_KEY" \ 129 | "https://cts.nlm.nih.gov/api/v2/trials?size=1" 130 | ``` 131 | 132 | ### SSL Certificate Errors 133 | 134 | **Update certificates:** 135 | 136 | ```bash 137 | # Python certificates 138 | pip install --upgrade certifi 139 | 140 | # System certificates (macOS) 141 | brew install ca-certificates 142 | 143 | # System certificates (Linux) 144 | sudo apt-get update 145 | sudo apt-get install ca-certificates 146 | ``` 147 | 148 | **Corporate proxy issues:** 149 | 150 | ```bash 151 | # Set proxy environment variables 152 | export HTTP_PROXY="http://proxy.company.com:8080" 153 | export HTTPS_PROXY="http://proxy.company.com:8080" 154 | export NO_PROXY="localhost,127.0.0.1" 155 | 156 | # Configure pip for proxy 157 | pip
config set global.proxy http://proxy.company.com:8080 158 | ``` 159 | 160 | ## Search Issues 161 | 162 | ### No Results Found 163 | 164 | **1. Check gene symbol:** 165 | 166 | ```bash 167 | # Wrong: common names 168 | biomcp article search --gene HER2 # ❌ 169 | 170 | # Correct: official HGNC symbol 171 | biomcp article search --gene ERBB2 # ✅ 172 | 173 | # Find correct symbol 174 | biomcp gene get HER2 # Will suggest ERBB2 175 | ``` 176 | 177 | **2. Too restrictive filters:** 178 | 179 | ```bash 180 | # Too specific - may return nothing 181 | biomcp article search --gene BRAF --disease "stage IV melanoma" \ 182 | --chemical "dabrafenib and trametinib combination" 183 | 184 | # Better - broader search 185 | biomcp article search --gene BRAF --disease melanoma \ 186 | --keyword "dabrafenib trametinib" 187 | ``` 188 | 189 | **3. Check data availability:** 190 | 191 | ```bash 192 | # Test if gene exists in database 193 | biomcp gene get YOUR_GENE 194 | 195 | # Test if disease term is recognized 196 | biomcp disease get "your disease term" 197 | ``` 198 | 199 | ### Location Search Not Working 200 | 201 | Location searches require coordinates: 202 | 203 | ```bash 204 | # Wrong - city name only 205 | biomcp trial search --condition cancer --city "New York" # ❌ 206 | 207 | # Correct - with coordinates 208 | biomcp trial search --condition cancer \ 209 | --latitude 40.7128 --longitude -74.0060 --distance 50 # ✅ 210 | ``` 211 | 212 | Common coordinates: 213 | 214 | - New York: 40.7128, -74.0060 215 | - Los Angeles: 34.0522, -118.2437 216 | - Chicago: 41.8781, -87.6298 217 | - Houston: 29.7604, -95.3698 218 | - Boston: 42.3601, -71.0589 219 | 220 | ### Preprint Search Issues 221 | 222 | **Preprints not appearing:** 223 | 224 | ```bash 225 | # Check if preprints are being excluded 226 | biomcp article search --gene BRAF --no-preprints # Excludes preprints 227 | 228 | # Include preprints (default) 229 | biomcp article search --gene BRAF # Includes preprints 230 | ``` 231 | 232 
| **DOI not found:** 233 | 234 | ```bash 235 | # Ensure correct DOI format 236 | biomcp article get "10.1101/2024.01.20.23288905" # bioRxiv format 237 | 238 | # Not all preprints are indexed immediately 239 | # Try searching by title/keywords instead 240 | ``` 241 | 242 | ## Performance Issues 243 | 244 | ### Slow Searches 245 | 246 | **1. Reduce result count:** 247 | 248 | ```bash 249 | # Default may be too high 250 | biomcp article search --gene TP53 --limit 100 # Slow 251 | 252 | # Reduce for faster results 253 | biomcp article search --gene TP53 --limit 10 # Fast 254 | ``` 255 | 256 | **2. Use specific filters:** 257 | 258 | ```bash 259 | # Broad search - slow 260 | biomcp trial search --condition cancer 261 | 262 | # Specific search - faster 263 | biomcp trial search --condition "melanoma" --phase PHASE3 \ 264 | --status RECRUITING --country "United States" 265 | ``` 266 | 267 | **3. Check API health:** 268 | 269 | ```bash 270 | # See which APIs are slow 271 | biomcp health check --verbose 272 | 273 | # Check specific API 274 | biomcp health check --apis-only 275 | ``` 276 | 277 | ### Timeout Errors 278 | 279 | **Increase timeout for slow networks:** 280 | 281 | ```bash 282 | # Set environment variable 283 | export BIOMCP_TIMEOUT=300 # 5 minutes 284 | 285 | # Or use configuration file 286 | echo "timeout: 300" > ~/.biomcp/config.yml 287 | ``` 288 | 289 | **For specific operations:** 290 | 291 | ```python 292 | # In Python scripts 293 | import asyncio 294 | asyncio.set_event_loop_policy(asyncio.WindowsSelectorEventLoopPolicy()) 295 | ``` 296 | 297 | ### Memory Issues 298 | 299 | **Large result sets:** 300 | 301 | ```bash 302 | # Process in batches 303 | for i in {1..10}; do 304 | biomcp article search --gene BRCA1 --page $i --limit 100 305 | done 306 | 307 | # Use streaming where available 308 | biomcp article search --gene TP53 --format jsonl | \ 309 | while read line; do 310 | echo "$line" | jq '.pmid' 311 | done 312 | ``` 313 | 314 | ## MCP Server Issues 315 
| 316 | ### Testing Server Connectivity 317 | 318 | **1. Test with MCP Inspector:** 319 | 320 | ```bash 321 | npx @modelcontextprotocol/inspector uv run --with biomcp-python biomcp run 322 | ``` 323 | 324 | Open http://127.0.0.1:6274 and verify: 325 | 326 | - Tools list loads 327 | - Can invoke a simple tool like `gene_getter` 328 | 329 | **2. Test with curl (HTTP mode):** 330 | 331 | ```bash 332 | # Start server in HTTP mode 333 | biomcp run --mode http --port 8000 334 | 335 | # Test health endpoint 336 | curl http://localhost:8000/health 337 | 338 | # Test MCP endpoint 339 | curl -X POST http://localhost:8000/mcp \ 340 | -H "Content-Type: application/json" \ 341 | -d '{"method": "tools/list"}' 342 | ``` 343 | 344 | ### Claude Desktop Integration Issues 345 | 346 | **Server not appearing:** 347 | 348 | 1. Check configuration file location: 349 | 350 | - macOS: `~/Library/Application Support/Claude/claude_desktop_config.json` 351 | - Windows: `%APPDATA%\Claude\claude_desktop_config.json` 352 | 353 | 2. Validate JSON syntax: 354 | 355 | ```bash 356 | # macOS 357 | cat ~/Library/Application\ Support/Claude/claude_desktop_config.json | jq . 358 | ``` 359 | 360 | 3. 
Check server starts correctly: 361 | ```bash 362 | # Test the exact command from config 363 | uv run --with biomcp-python biomcp run 364 | ``` 365 | 366 | **Server crashes:** 367 | Check logs: 368 | 369 | ```bash 370 | # Enable debug logging 371 | export BIOMCP_LOG_LEVEL=DEBUG 372 | uv run --with biomcp-python biomcp run 373 | ``` 374 | 375 | Common fixes: 376 | 377 | - Update to latest version: `uv tool install biomcp --force` 378 | - Clear cache: `rm -rf ~/.biomcp/cache` 379 | - Check port conflicts: `lsof -i :8000` 380 | 381 | ## Data Quality Issues 382 | 383 | ### Outdated Results 384 | 385 | **Check data freshness:** 386 | 387 | ```bash 388 | # See when databases were last updated 389 | biomcp health check --verbose | grep "Last updated" 390 | ``` 391 | 392 | **Clear cache if needed:** 393 | 394 | ```bash 395 | # Remove cached results 396 | rm -rf ~/.biomcp/cache 397 | 398 | # Or set cache TTL 399 | export BIOMCP_CACHE_TTL=900 # 15 minutes 400 | ``` 401 | 402 | ### Missing Annotations 403 | 404 | **PubTator3 annotations missing:** 405 | 406 | - Some newer articles may not be fully annotated yet 407 | - Try searching by PMID directly 408 | - Check if article is indexed: search by title 409 | 410 | **Variant annotations incomplete:** 411 | 412 | - Not all variants have all annotation types 413 | - Rare variants may lack population frequencies 414 | - Novel variants won't have ClinVar data 415 | 416 | ## Error Messages 417 | 418 | ### Common Error Codes 419 | 420 | **HTTP 429 - Rate Limit Exceeded:** 421 | 422 | ```bash 423 | # Add delay between requests 424 | biomcp article search --gene BRAF --delay 1000 # 1 second 425 | 426 | # Or reduce parallel requests 427 | export BIOMCP_MAX_CONCURRENT=2 428 | ``` 429 | 430 | **HTTP 404 - Not Found:** 431 | 432 | - Check identifier format (PMID, NCT ID, etc.) 
433 | - Verify record exists in source database 434 | - Try alternative identifiers 435 | 436 | **HTTP 500 - Server Error:** 437 | 438 | - External API may be down 439 | - Check status: `biomcp health check` 440 | - Try again later 441 | 442 | ### Debugging 443 | 444 | **Enable verbose logging:** 445 | 446 | ```bash 447 | # Set log level 448 | export BIOMCP_LOG_LEVEL=DEBUG 449 | 450 | # Run with verbose output 451 | biomcp article search --gene BRAF --verbose 452 | 453 | # Check log files 454 | tail -f ~/.biomcp/logs/biomcp.log 455 | ``` 456 | 457 | **Report bugs:** 458 | Include when reporting issues: 459 | 460 | 1. BioMCP version: `biomcp --version` 461 | 2. Full error message and stack trace 462 | 3. Command that caused the error 463 | 4. Operating system and Python version 464 | 5. Relevant environment variables 465 | 466 | Report at: https://github.com/genomoncology/biomcp/issues 467 | 468 | ## Getting Help 469 | 470 | ### Quick Checks 471 | 472 | 1. **Check FAQ first**: [Frequently Asked Questions](faq-condensed.md) 473 | 2. **Search existing issues**: [GitHub Issues](https://github.com/genomoncology/biomcp/issues) 474 | 3. **Check examples**: [How-to Guides](how-to-guides/01-find-articles-and-cbioportal-data.md) 475 | 476 | ### Community Support 477 | 478 | - Issue Tracker: Report bugs, request features 479 | - Documentation: PRs welcome for improvements 480 | 481 | ### Professional Support 482 | 483 | For commercial support, contact: [email protected] 484 | 485 | --- 486 | 487 | _Still having issues? 
[Open a GitHub issue](https://github.com/genomoncology/biomcp/issues/new) with details._ 488 | ``` -------------------------------------------------------------------------------- /tests/tdd/variants/test_external_integration.py: -------------------------------------------------------------------------------- ```python 1 | """Integration tests for external variant data sources with real API calls.""" 2 | 3 | import pytest 4 | 5 | from biomcp.variants.cbio_external_client import CBioPortalExternalClient 6 | from biomcp.variants.external import ( 7 | ExternalVariantAggregator, 8 | TCGAClient, 9 | ThousandGenomesClient, 10 | ) 11 | 12 | 13 | class TestTCGAIntegration: 14 | """Integration tests for TCGA/GDC API.""" 15 | 16 | @pytest.mark.asyncio 17 | @pytest.mark.integration 18 | async def test_braf_v600e_variant(self): 19 | """Test fetching BRAF V600E data from TCGA.""" 20 | client = TCGAClient() 21 | 22 | # Try different formats 23 | variants_to_test = [ 24 | "BRAF V600E", # Gene AA change format that TCGA supports 25 | "chr7:g.140453136A>T", 26 | "7:g.140453136A>T", 27 | ] 28 | 29 | found_data = False 30 | for variant in variants_to_test: 31 | result = await client.get_variant_data(variant) 32 | if result: 33 | found_data = True 34 | # BRAF V600E is common in melanoma and thyroid cancer 35 | assert result.tumor_types is not None 36 | assert len(result.tumor_types) > 0 37 | # Should have affected cases if data found 38 | if result.affected_cases: 39 | assert result.affected_cases > 0 40 | break 41 | 42 | # Note: TCGA might not have data for all variants 43 | if not found_data: 44 | pytest.skip("TCGA API did not return data for BRAF V600E variants") 45 | 46 | @pytest.mark.asyncio 47 | @pytest.mark.integration 48 | async def test_tp53_variant(self): 49 | """Test fetching TP53 variant data from TCGA.""" 50 | client = TCGAClient() 51 | 52 | # TP53 R273H - common tumor suppressor mutation 53 | result = await client.get_variant_data("chr17:g.7577120G>A") 54 | 55 | # TP53 
mutations are very common in cancer 56 | if result: 57 | assert result.tumor_types is not None 58 | assert len(result.tumor_types) > 0 59 | 60 | @pytest.mark.asyncio 61 | @pytest.mark.integration 62 | async def test_nonexistent_variant(self): 63 | """Test TCGA response for non-existent variant.""" 64 | client = TCGAClient() 65 | 66 | # Made-up variant that shouldn't exist 67 | result = await client.get_variant_data("chr99:g.999999999A>T") 68 | 69 | assert result is None 70 | 71 | 72 | class TestThousandGenomesIntegration: 73 | """Integration tests for 1000 Genomes via Ensembl REST API.""" 74 | 75 | @pytest.mark.asyncio 76 | @pytest.mark.integration 77 | async def test_common_variant_with_rsid(self): 78 | """Test fetching common variant data by rsID.""" 79 | client = ThousandGenomesClient() 80 | 81 | # rs113488022 is BRAF V600E 82 | result = await client.get_variant_data("rs113488022") 83 | 84 | if result: 85 | # This is a rare variant, so MAF should be low or None 86 | if result.global_maf is not None: 87 | assert result.global_maf < 0.01 # Less than 1% 88 | 89 | # Consequence information might not be available for all variants 90 | # Just verify the data structure is correct 91 | assert hasattr(result, "most_severe_consequence") 92 | 93 | @pytest.mark.asyncio 94 | @pytest.mark.integration 95 | async def test_variant_population_frequencies(self): 96 | """Test population frequency data retrieval.""" 97 | client = ThousandGenomesClient() 98 | 99 | # Use a more common variant for testing population frequencies 100 | # rs1800734 - common variant in MLH1 promoter 101 | result = await client.get_variant_data("rs1800734") 102 | 103 | if result: 104 | # Should have at least global MAF 105 | assert result.global_maf is not None 106 | assert 0 <= result.global_maf <= 1 107 | 108 | # Check that we get population-specific frequencies 109 | pop_freqs = [ 110 | result.afr_maf, 111 | result.amr_maf, 112 | result.eas_maf, 113 | result.eur_maf, 114 | result.sas_maf, 115 | ] 116 | 
117 | # At least some populations should have data 118 | non_null_freqs = [f for f in pop_freqs if f is not None] 119 | assert len(non_null_freqs) > 0 120 | 121 | @pytest.mark.asyncio 122 | @pytest.mark.integration 123 | async def test_invalid_variant_id(self): 124 | """Test 1000 Genomes response for invalid variant.""" 125 | client = ThousandGenomesClient() 126 | 127 | # Invalid rsID 128 | result = await client.get_variant_data("rs999999999999") 129 | 130 | assert result is None 131 | 132 | 133 | class TestCBioPortalIntegration: 134 | """Integration tests for cBioPortal API.""" 135 | 136 | @pytest.mark.asyncio 137 | @pytest.mark.integration 138 | async def test_braf_v600e_variant(self): 139 | """Test fetching BRAF V600E data from cBioPortal.""" 140 | client = CBioPortalExternalClient() 141 | 142 | result = await client.get_variant_data("BRAF V600E") 143 | 144 | if result: 145 | # BRAF V600E is common in melanoma and other cancers 146 | assert result.total_cases is not None 147 | assert result.total_cases > 0 148 | assert len(result.studies) > 0 149 | # Should have data from various studies 150 | print( 151 | f"Found {result.total_cases} cases in {len(result.studies)} studies: {result.studies}" 152 | ) 153 | 154 | # Check enhanced fields 155 | assert result.cancer_type_distribution is not None 156 | assert len(result.cancer_type_distribution) > 0 157 | print( 158 | f"Cancer types: {list(result.cancer_type_distribution.keys())}" 159 | ) 160 | 161 | assert result.mutation_types is not None 162 | assert "Missense_Mutation" in result.mutation_types 163 | 164 | assert result.mean_vaf is not None 165 | print(f"Mean VAF: {result.mean_vaf}") 166 | else: 167 | pytest.skip("cBioPortal API did not return data for BRAF V600E") 168 | 169 | @pytest.mark.asyncio 170 | @pytest.mark.integration 171 | async def test_kras_g12d_variant(self): 172 | """Test fetching KRAS G12D data from cBioPortal.""" 173 | client = CBioPortalExternalClient() 174 | 175 | result = await 
client.get_variant_data("KRAS G12D") 176 | 177 | if result: 178 | # KRAS G12D is a common mutation in multiple cancer types 179 | assert result.total_cases is not None 180 | assert result.total_cases > 0 181 | assert len(result.studies) > 0 182 | else: 183 | pytest.skip("cBioPortal API did not return data for KRAS G12D") 184 | 185 | @pytest.mark.asyncio 186 | @pytest.mark.integration 187 | async def test_invalid_variant(self): 188 | """Test cBioPortal response for invalid variant.""" 189 | client = CBioPortalExternalClient() 190 | 191 | # Invalid gene name 192 | result = await client.get_variant_data("FAKEGENE V600E") 193 | 194 | assert result is None 195 | 196 | 197 | class TestExternalVariantAggregatorIntegration: 198 | """Integration tests for the external variant aggregator.""" 199 | 200 | @pytest.mark.asyncio 201 | @pytest.mark.integration 202 | async def test_aggregate_all_sources(self): 203 | """Test aggregating data from all available sources.""" 204 | aggregator = ExternalVariantAggregator() 205 | 206 | # Use rs1045642 which is a common variant that should have 1000 Genomes data 207 | # Also provide variant data for cBioPortal 208 | variant_data = { 209 | "cadd": {"gene": {"genename": "ABCB1"}}, 210 | "docm": {"aa_change": "p.I1145I"}, 211 | } 212 | 213 | result = await aggregator.get_enhanced_annotations( 214 | "rs1045642", 215 | include_tcga=True, 216 | include_1000g=True, 217 | include_cbioportal=True, 218 | variant_data=variant_data, 219 | ) 220 | 221 | assert result.variant_id == "rs1045642" 222 | 223 | # Check which sources returned data 224 | sources_with_data = [] 225 | if result.tcga: 226 | sources_with_data.append("tcga") 227 | if result.thousand_genomes: 228 | sources_with_data.append("1000g") 229 | if result.cbioportal: 230 | sources_with_data.append("cbioportal") 231 | 232 | # This common variant should have at least 1000 Genomes data 233 | assert len(sources_with_data) > 0 234 | # Specifically, it should have 1000 Genomes data 235 | assert 
result.thousand_genomes is not None 236 | 237 | # No errors should be reported for successful queries 238 | # (though some sources might not have data, which is different from errors) 239 | assert len(result.error_sources) == 0 240 | 241 | @pytest.mark.asyncio 242 | @pytest.mark.integration 243 | async def test_selective_source_inclusion(self): 244 | """Test including only specific sources.""" 245 | aggregator = ExternalVariantAggregator() 246 | 247 | # Only request 1000 Genomes data 248 | result = await aggregator.get_enhanced_annotations( 249 | "rs1800734", # Common variant 250 | include_tcga=False, 251 | include_1000g=True, 252 | ) 253 | 254 | # Should only attempt to fetch 1000 Genomes data 255 | assert result.tcga is None 256 | # 1000 Genomes might have data for this common variant 257 | # (but it's okay if it doesn't) 258 | 259 | @pytest.mark.asyncio 260 | @pytest.mark.integration 261 | async def test_error_handling_resilience(self): 262 | """Test that aggregator handles individual source failures gracefully.""" 263 | aggregator = ExternalVariantAggregator() 264 | 265 | # Use an invalid variant format that might cause errors 266 | result = await aggregator.get_enhanced_annotations( 267 | "INVALID_VARIANT_FORMAT_12345", 268 | include_tcga=True, 269 | include_1000g=True, 270 | ) 271 | 272 | # Should still return a result even if all sources fail 273 | assert result is not None 274 | assert result.variant_id == "INVALID_VARIANT_FORMAT_12345" 275 | 276 | # Sources should return None or be in error_sources 277 | assert result.tcga is None 278 | assert result.thousand_genomes is None 279 | ``` -------------------------------------------------------------------------------- /docs/tutorials/biothings-prompts.md: -------------------------------------------------------------------------------- ```markdown 1 | # BioThings Integration Example Prompts 2 | 3 | This guide provides example prompts for AI assistants to effectively use the BioThings suite integration in 
BioMCP. 4 | 5 | ## Overview of BioThings Suite 6 | 7 | BioMCP integrates with the complete BioThings suite of APIs: 8 | 9 | - **MyGene.info** - Gene information and annotations 10 | - **MyDisease.info** - Disease ontology and synonyms 11 | - **MyVariant.info** - Genetic variant annotations (pre-existing integration, enhanced with BioThings client) 12 | - **MyChem.info** - Drug/chemical information and annotations 13 | 14 | All four services share common infrastructure through the BioThings client module, providing consistent error handling, rate limiting, and response parsing. 15 | 16 | ## Gene Information Retrieval 17 | 18 | ### Basic Gene Lookup 19 | 20 | ``` 21 | "What is the TP53 gene?" 22 | "Tell me about BRAF" 23 | "Get information on the EGFR gene" 24 | "What does the BRCA1 gene do?" 25 | ``` 26 | 27 | **Expected tool usage**: `gene_getter("TP53")` → Returns official name, summary, aliases 28 | 29 | ### Gene by ID 30 | 31 | ``` 32 | "Look up gene with Entrez ID 7157" 33 | "What is gene 673?" 34 | ``` 35 | 36 | **Expected tool usage**: `gene_getter("7157")` → Returns TP53 information 37 | 38 | ### Gene Context for Research 39 | 40 | ``` 41 | "I need to understand the KRAS gene before searching for mutations" 42 | "What type of protein does BRAF encode?" 43 | "Give me the official name and aliases for MYC" 44 | ``` 45 | 46 | ## Disease Information Retrieval 47 | 48 | ### Basic Disease Lookup 49 | 50 | ``` 51 | "What is GIST?" 52 | "Tell me about melanoma" 53 | "Define non-small cell lung cancer" 54 | "What is Erdheim-Chester disease?" 55 | ``` 56 | 57 | **Expected tool usage**: `disease_getter("GIST")` → Returns definition, synonyms, ontology IDs 58 | 59 | ### Disease by Ontology ID 60 | 61 | ``` 62 | "Look up disease MONDO:0018076" 63 | "What is DOID:1909?" 
64 | ``` 65 | 66 | **Expected tool usage**: `disease_getter("MONDO:0018076")` → Returns disease information 67 | 68 | ### Disease Synonyms for Research 69 | 70 | ``` 71 | "What are all the names for gastrointestinal stromal tumor?" 72 | "Find synonyms for NSCLC" 73 | "What other terms are used for melanoma?" 74 | ``` 75 | 76 | ## Variant Information Retrieval (MyVariant.info) 77 | 78 | MyVariant.info is part of the BioThings suite and provides comprehensive variant annotations. BioMCP has extensive integration with specialized features: 79 | 80 | ### Basic Variant Lookup 81 | 82 | ``` 83 | "Get information about rs7412" 84 | "What is the BRAF V600E variant?" 85 | "Look up variant chr7:140453136-140453136" 86 | ``` 87 | 88 | **Expected tool usage**: `variant_getter("rs7412")` → Returns variant annotations with external database links 89 | 90 | ### Variant Search with Filters 91 | 92 | ``` 93 | "Find pathogenic variants in TP53" 94 | "Search for BRCA1 variants with high impact" 95 | "Get all loss-of-function variants in KRAS" 96 | ``` 97 | 98 | **Expected tool usage**: `variant_searcher(gene="TP53", significance="pathogenic")` → Returns filtered variant list 99 | 100 | ### Variant with Cancer Context 101 | 102 | ``` 103 | "What cancer types have BRAF V600E mutations?" 104 | "Get TCGA data for TP53 R273H" 105 | ``` 106 | 107 | **Expected tool usage**: Variant tools automatically integrate cBioPortal, TCGA, and 1000 Genomes data when available 108 | 109 | ## Drug Information Retrieval (MyChem.info) 110 | 111 | MyChem.info is part of the BioThings suite and provides comprehensive drug/chemical information. 112 | 113 | ### Basic Drug Lookup 114 | 115 | ``` 116 | "What is imatinib?" 117 | "Tell me about aspirin" 118 | "Get information on pembrolizumab" 119 | "What does metformin do?" 
120 | ``` 121 | 122 | **Expected tool usage**: `drug_getter("imatinib")` → Returns drug information with database links 123 | 124 | ### Drug by ID 125 | 126 | ``` 127 | "Look up DrugBank ID DB00619" 128 | "What is CHEMBL941?" 129 | "Get details for PubChem CID 5291" 130 | ``` 131 | 132 | **Expected tool usage**: `drug_getter("DB00619")` → Returns drug details by identifier 133 | 134 | ### Drug Properties and Mechanism 135 | 136 | ``` 137 | "What is the mechanism of action of imatinib?" 138 | "Find the chemical formula for aspirin" 139 | "What are the trade names for adalimumab?" 140 | "How does pembrolizumab work?" 141 | ``` 142 | 143 | **Expected tool usage**: `drug_getter("pembrolizumab")` → Returns mechanism, indications, and properties 144 | 145 | ## Integrated Research Workflows 146 | 147 | ### Variant Analysis with Gene Context 148 | 149 | ``` 150 | "Analyze the BRAF V600E mutation - first tell me about the gene, then find pathogenic variants" 151 | ``` 152 | 153 | **Expected tool sequence**: 154 | 155 | 1. `think(thought="Analyzing BRAF V600E mutation", thoughtNumber=1)` 156 | 2. `gene_getter("BRAF")` → Gene context 157 | 3. `variant_searcher(gene="BRAF", hgvsp="V600E", significance="pathogenic")` → Variant details 158 | 159 | ### Clinical Trial Search with Disease Expansion 160 | 161 | ``` 162 | "Find clinical trials for GIST patients" 163 | "Search for trials treating gastrointestinal stromal tumors" 164 | ``` 165 | 166 | **Expected tool usage**: 167 | 168 | - `trial_searcher(conditions=["GIST"], expand_synonyms=True)` 169 | - Automatically searches for: GIST OR "gastrointestinal stromal tumor" OR "GI stromal tumor" 170 | 171 | ### Comprehensive Gene-Disease Research 172 | 173 | ``` 174 | "I'm researching EGFR mutations in lung cancer. Start with the gene, then the disease, then find relevant trials" 175 | ``` 176 | 177 | **Expected tool sequence**: 178 | 179 | 1. `think(thought="Researching EGFR in lung cancer", thoughtNumber=1)` 180 | 2. 
`gene_getter("EGFR")` → Gene information 181 | 3. `disease_getter("lung cancer")` → Disease context and synonyms 182 | 4. `trial_searcher(conditions=["lung cancer"], interventions=["EGFR inhibitor"])` → Trials with synonym expansion 183 | 184 | ### Multi-Gene Analysis 185 | 186 | ``` 187 | "Compare TP53, BRAF, and KRAS genes" 188 | "Tell me about the RAS family genes: KRAS, NRAS, HRAS" 189 | ``` 190 | 191 | **Expected tool usage**: Multiple `gene_getter()` calls for each gene 192 | 193 | ## Advanced Use Cases 194 | 195 | ### Gene Alias Resolution 196 | 197 | ``` 198 | "What is the official name for the p53 gene?" 199 | "Is TRP53 the same as TP53?" 200 | ``` 201 | 202 | **Expected tool usage**: `gene_getter("p53")` → Will resolve to TP53 203 | 204 | ### Disease Name Disambiguation 205 | 206 | ``` 207 | "Is GIST the same as gastrointestinal stromal tumor?" 208 | "What's the MONDO ID for melanoma?" 209 | ``` 210 | 211 | **Expected tool usage**: `disease_getter("GIST")` → Shows all synonyms and IDs 212 | 213 | ### Trial Search Without Synonym Expansion 214 | 215 | ``` 216 | "Find trials specifically mentioning 'GIST' not other names" 217 | ``` 218 | 219 | **Expected tool usage**: `trial_searcher(conditions=["GIST"], expand_synonyms=False)` 220 | 221 | ### Integrated Literature and Gene Search 222 | 223 | ``` 224 | "Find recent papers about TP53 mutations - first tell me about the gene" 225 | ``` 226 | 227 | **Expected tool sequence**: 228 | 229 | 1. `gene_getter("TP53")` → Gene context 230 | 2. `article_searcher(genes=["TP53"], keywords=["mutation"])` → Literature 231 | 232 | ### Drug-Target Research 233 | 234 | ``` 235 | "I'm researching imatinib for CML treatment. Get drug info, then find trials" 236 | "What targets does pembrolizumab hit? Then find related articles" 237 | ``` 238 | 239 | **Expected tool sequence**: 240 | 241 | 1. `think(thought="Researching imatinib for CML", thoughtNumber=1)` 242 | 2. 
`drug_getter("imatinib")` → Drug information and mechanism 243 | 3. `trial_searcher(interventions=["imatinib"], conditions=["chronic myeloid leukemia"])` 244 | 245 | ## Tips for AI Assistants 246 | 247 | 1. **Always use think() first** for complex biomedical queries 248 | 2. **Gene context helps interpretation**: Get gene info before analyzing variants 249 | 3. **Disease synonyms improve search**: Use expand_synonyms=True (default) for comprehensive results 250 | 4. **Drug mechanisms matter**: Get drug info before searching trials to understand targets 251 | 5. **Real-time data**: All BioThings data is fetched live, ensuring current information 252 | 6. **Combine tools**: Gene + disease + variant + drug tools work together for comprehensive analysis 253 | 254 | ## Common Patterns 255 | 256 | ### Pattern 1: Gene → Variant → Clinical Impact 257 | 258 | ``` 259 | gene_getter("BRAF") → 260 | variant_searcher(gene="BRAF", significance="pathogenic") → 261 | article_searcher(genes=["BRAF"], diseases=["melanoma"]) 262 | ``` 263 | 264 | ### Pattern 2: Disease → Trials → Locations 265 | 266 | ``` 267 | disease_getter("NSCLC") → 268 | trial_searcher(conditions=["NSCLC"], expand_synonyms=True) → 269 | trial_locations_getter(nct_id="NCT...") 270 | ``` 271 | 272 | ### Pattern 3: Multi-Gene Pathway Analysis 273 | 274 | ``` 275 | gene_getter("EGFR") → 276 | gene_getter("KRAS") → 277 | gene_getter("BRAF") → 278 | article_searcher(genes=["EGFR", "KRAS", "BRAF"], keywords=["pathway"]) 279 | ``` 280 | 281 | ## Unified Search with BioThings Domains 282 | 283 | BioMCP's unified search now supports gene, drug, and disease domains alongside articles, trials, and variants: 284 | 285 | ### Domain-Specific Search 286 | 287 | ``` 288 | "Search for BRAF in the gene domain" 289 | "Find imatinib in drugs" 290 | "Look up melanoma in diseases" 291 | ``` 292 | 293 | **Expected tool usage**: 294 | 295 | - `search(domain="gene", keywords=["BRAF"])` 296 | - `search(domain="drug", 
keywords=["imatinib"])` 297 | - `search(domain="disease", keywords=["melanoma"])` 298 | 299 | ### Unified Query Language with BioThings 300 | 301 | ``` 302 | "genes.symbol:BRAF AND genes.type:protein-coding" 303 | "drugs.tradename:gleevec" 304 | "diseases.name:melanoma OR diseases.synonym:malignant melanoma" 305 | ``` 306 | 307 | **Expected tool usage**: Query parser automatically routes to appropriate domains 308 | 309 | ### Cross-Domain Gene Searches 310 | 311 | ``` 312 | "gene:BRAF" # Searches articles, variants, genes, and trials 313 | "Search everything about TP53" 314 | ``` 315 | 316 | **Expected behavior**: 317 | 318 | - Gene queries trigger searches across multiple domains 319 | - Results include gene info, variants, articles, and related trials 320 | 321 | ### Cross-Domain Disease Searches 322 | 323 | ``` 324 | "disease:melanoma" # Searches articles, trials, and diseases 325 | "Find all information about NSCLC" 326 | ``` 327 | 328 | **Expected behavior**: 329 | 330 | - Disease queries search articles, trials, and disease databases 331 | - Disease synonyms are automatically expanded in trial searches 332 | 333 | ### Combined Domain Queries 334 | 335 | ``` 336 | "gene:BRAF AND disease:melanoma" 337 | "drugs.indication:leukemia AND trials.phase:3" 338 | "genes.symbol:EGFR AND articles.year:>2023" 339 | ``` 340 | 341 | ### Unified Fetch 342 | 343 | ``` 344 | "Fetch BRAF from gene domain" 345 | "Get imatinib details from drugs" 346 | "Retrieve melanoma information from diseases" 347 | ``` 348 | 349 | **Expected tool usage**: 350 | 351 | - `fetch(id="BRAF", domain="gene")` 352 | - `fetch(id="imatinib", domain="drug")` 353 | - `fetch(id="melanoma", domain="disease")` 354 | 355 | ## Error Handling 356 | 357 | If a gene/disease is not found: 358 | 359 | - Check for typos or alternative names 360 | - Try searching with partial names 361 | - Use official symbols for genes (e.g., "TP53" not "p53 gene") 362 | - For diseases, try both common and medical names 363 | ``` 
-------------------------------------------------------------------------------- /src/biomcp/constants.py: -------------------------------------------------------------------------------- ```python 1 | """ 2 | Central constants file for BioMCP. 3 | 4 | This module contains all constants used throughout the BioMCP codebase, 5 | including API URLs, default values, limits, and domain configurations. 6 | """ 7 | 8 | # ============================================================================ 9 | # API Base URLs 10 | # ============================================================================ 11 | 12 | # PubTator3 API 13 | # https://www.ncbi.nlm.nih.gov/research/pubtator3/api 14 | PUBTATOR3_BASE_URL = "https://www.ncbi.nlm.nih.gov/research/pubtator3-api" 15 | PUBTATOR3_SEARCH_URL = f"{PUBTATOR3_BASE_URL}/search/" 16 | PUBTATOR3_FULLTEXT_URL = f"{PUBTATOR3_BASE_URL}/publications/export/biocjson" 17 | PUBTATOR3_AUTOCOMPLETE_URL = f"{PUBTATOR3_BASE_URL}/entity/autocomplete/" 18 | 19 | # ClinicalTrials.gov API 20 | # https://clinicaltrials.gov/data-api/api 21 | CLINICAL_TRIALS_BASE_URL = "https://clinicaltrials.gov/api/v2/studies" 22 | CLINICAL_TRIALS_STUDY_URL = "https://clinicaltrials.gov/study/" 23 | 24 | # NCI Clinical Trials Search API 25 | # https://clinicaltrialsapi.cancer.gov/api/v2 26 | NCI_CTS_BASE_URL = "https://clinicaltrialsapi.cancer.gov/api/v2" 27 | NCI_TRIALS_URL = f"{NCI_CTS_BASE_URL}/trials" 28 | NCI_ORGANIZATIONS_URL = f"{NCI_CTS_BASE_URL}/organizations" 29 | NCI_DISEASES_URL = f"{NCI_CTS_BASE_URL}/diseases" 30 | NCI_INTERVENTIONS_URL = f"{NCI_CTS_BASE_URL}/interventions" 31 | NCI_BIOMARKERS_URL = f"{NCI_CTS_BASE_URL}/biomarkers" 32 | NCI_API_KEY_ENV = "NCI_API_KEY" 33 | 34 | # MyVariant.info API 35 | # https://docs.myvariant.info/ 36 | MYVARIANT_BASE_URL = "https://myvariant.info/v1" 37 | MYVARIANT_QUERY_URL = f"{MYVARIANT_BASE_URL}/query" 38 | MYVARIANT_GET_URL = f"{MYVARIANT_BASE_URL}/variant" 39 | 40 | # Preprint Server APIs 41 | BIORXIV_BASE_URL 
= "https://api.biorxiv.org/details/biorxiv" 42 | MEDRXIV_BASE_URL = "https://api.biorxiv.org/details/medrxiv" 43 | EUROPE_PMC_BASE_URL = "https://www.ebi.ac.uk/europepmc/webservices/rest/search" 44 | 45 | # External Variant APIs 46 | GDC_BASE_URL = "https://api.gdc.cancer.gov" 47 | GDC_SSMS_ENDPOINT_URL = f"{GDC_BASE_URL}/ssms" # Simple Somatic Mutations 48 | GDC_SSM_OCCURRENCES_URL = f"{GDC_BASE_URL}/ssm_occurrences" 49 | ENSEMBL_REST_BASE_URL = "https://rest.ensembl.org" 50 | ENSEMBL_VARIATION_URL = f"{ENSEMBL_REST_BASE_URL}/variation/human" 51 | CBIOPORTAL_BASE_URL = "https://www.cbioportal.org/api" 52 | 53 | # External Resource URLs 54 | PUBMED_BASE_URL = "https://pubmed.ncbi.nlm.nih.gov/" 55 | PMC_BASE_URL = "https://www.ncbi.nlm.nih.gov/pmc/articles/" 56 | DOI_BASE_URL = "https://doi.org/" 57 | DBSNP_BASE_URL = "https://www.ncbi.nlm.nih.gov/snp/" 58 | CLINVAR_BASE_URL = "https://www.ncbi.nlm.nih.gov/clinvar/variation/" 59 | COSMIC_BASE_URL = "https://cancer.sanger.ac.uk/cosmic/mutation/overview?id=" 60 | CIVIC_BASE_URL = "https://civicdb.org/variants/" 61 | ENSEMBL_VARIANT_BASE_URL = ( 62 | "https://ensembl.org/Homo_sapiens/Variation/Explore?v=" 63 | ) 64 | GENENAMES_BASE_URL = ( 65 | "https://www.genenames.org/data/gene-symbol-report/#!/symbol/" 66 | ) 67 | UCSC_GENOME_BROWSER_URL = "https://genome.ucsc.edu/cgi-bin/hgTracks?db=hg19&" 68 | 69 | # ============================================================================ 70 | # Default Values and Limits 71 | # ============================================================================ 72 | 73 | # Caching 74 | DEFAULT_CACHE_TIMEOUT = 60 * 60 * 24 * 7 # 1 week in seconds 75 | 76 | # Pagination 77 | SYSTEM_PAGE_SIZE = ( 78 | 10 # Default page size for all searches (reduced for token efficiency) 79 | ) 80 | DEFAULT_PAGE_SIZE = 10 # Default page size for unified search 81 | MIN_PAGE_SIZE = 1 82 | MAX_PAGE_SIZE = 100 83 | DEFAULT_PAGE_NUMBER = 1 84 | 85 | # Search limits 86 | MAX_RESULTS_PER_DOMAIN_DEFAULT = ( 87 
| 10 # Default max results per domain in unified search 88 | ) 89 | ESTIMATED_ADDITIONAL_RESULTS = ( 90 | 100 # Estimate for additional results when full page returned 91 | ) 92 | DEFAULT_AUTOCOMPLETE_LIMIT = 1 93 | MAX_AUTOCOMPLETE_LIMIT = 100 94 | 95 | # Text display 96 | MAX_WIDTH = 72 # Maximum width for text wrapping in console output 97 | SNIPPET_LENGTH = 200 # Maximum length for text snippets in search results 98 | 99 | # Genome Assembly 100 | DEFAULT_ASSEMBLY = "hg19" # Default genome assembly for MyVariant.info API 101 | 102 | # Rate Limiting 103 | DEFAULT_RATE_LIMIT_PER_SECOND = 10.0 104 | DEFAULT_BURST_SIZE = 20 105 | SLIDING_WINDOW_MINUTE_LIMIT = 60 106 | SLIDING_WINDOW_HOUR_LIMIT = 1000 107 | 108 | # Retry Configuration 109 | DEFAULT_MAX_RETRY_ATTEMPTS = 3 110 | DEFAULT_INITIAL_RETRY_DELAY = 1.0 111 | DEFAULT_MAX_RETRY_DELAY = 60.0 112 | DEFAULT_EXPONENTIAL_BASE = 2.0 113 | AGGRESSIVE_MAX_RETRY_ATTEMPTS = 5 114 | AGGRESSIVE_INITIAL_RETRY_DELAY = 2.0 115 | AGGRESSIVE_MAX_RETRY_DELAY = 30.0 116 | 117 | # Circuit Breaker Configuration 118 | DEFAULT_FAILURE_THRESHOLD = 10 119 | DEFAULT_RECOVERY_TIMEOUT = 30.0 120 | DEFAULT_SUCCESS_THRESHOLD = 3 121 | 122 | # Metrics Configuration 123 | MAX_METRIC_SAMPLES = 1000 124 | METRIC_PERCENTILE_50 = 0.50 125 | METRIC_PERCENTILE_95 = 0.95 126 | METRIC_PERCENTILE_99 = 0.99 127 | METRIC_JITTER_RANGE = 0.1 # 10% jitter 128 | 129 | # HTTP Client Configuration 130 | HTTP_TIMEOUT_SECONDS = 120.0 131 | HTTP_ERROR_CODE_NETWORK = 599 132 | HTTP_ERROR_CODE_UNSUPPORTED_METHOD = 405 133 | 134 | # Batching and Pagination Configuration 135 | DEFAULT_BATCH_SIZE = 10 136 | DEFAULT_BATCH_TIMEOUT = 0.1 137 | CBIOPORTAL_BATCH_SIZE = 5 138 | EUROPE_PMC_PAGE_SIZE = 25 139 | BIORXIV_MAX_PAGES = 3 140 | BIORXIV_RESULTS_PER_PAGE = 30 141 | BIORXIV_DEFAULT_DAYS_BACK = 365 142 | 143 | # Prefetching Configuration 144 | PREFETCH_TOP_GENES = 5 145 | PREFETCH_TOP_DISEASES = 3 146 | PREFETCH_TOP_CHEMICALS = 3 147 | PREFETCH_TIMEOUT = 2.0 148 | 149 
| # Cache Configuration 150 | REQUEST_CACHE_MAX_SIZE = 1000 151 | CACHE_KEY_SAMPLE_SIZE = 100 152 | 153 | # Connection Pool Configuration 154 | CONNECTION_POOL_MAX_KEEPALIVE = 20 155 | CONNECTION_POOL_MAX_CONNECTIONS = 100 156 | CONNECTION_POOL_KEEPALIVE_EXPIRY = 30 157 | 158 | # ============================================================================ 159 | # Domain Configuration 160 | # ============================================================================ 161 | 162 | # Valid domains for search 163 | VALID_DOMAINS = [ 164 | "article", 165 | "trial", 166 | "variant", 167 | "gene", 168 | "drug", 169 | "disease", 170 | "nci_organization", 171 | "nci_intervention", 172 | "nci_biomarker", 173 | "nci_disease", 174 | # OpenFDA domains 175 | "fda_adverse", 176 | "fda_label", 177 | "fda_device", 178 | "fda_approval", 179 | "fda_recall", 180 | "fda_shortage", 181 | ] 182 | VALID_DOMAINS_PLURAL = [ 183 | "articles", 184 | "trials", 185 | "variants", 186 | "genes", 187 | "drugs", 188 | "diseases", 189 | "nci_organizations", 190 | "nci_interventions", 191 | "nci_biomarkers", 192 | "nci_diseases", 193 | # OpenFDA domains 194 | "fda_adverse_events", 195 | "fda_labels", 196 | "fda_device_events", 197 | "fda_approvals", 198 | "fda_recalls", 199 | "fda_shortages", 200 | ] 201 | 202 | # Domain mappings for unified search 203 | DOMAIN_TO_PLURAL = { 204 | "article": "articles", 205 | "trial": "trials", 206 | "variant": "variants", 207 | "gene": "genes", 208 | "drug": "drugs", 209 | "disease": "diseases", 210 | "nci_organization": "nci_organizations", 211 | "nci_intervention": "nci_interventions", 212 | "nci_biomarker": "nci_biomarkers", 213 | "nci_disease": "nci_diseases", 214 | # OpenFDA domains 215 | "fda_adverse": "fda_adverse_events", 216 | "fda_label": "fda_labels", 217 | "fda_device": "fda_device_events", 218 | "fda_approval": "fda_approvals", 219 | "fda_recall": "fda_recalls", 220 | "fda_shortage": "fda_shortages", 221 | } 222 | 223 | PLURAL_TO_DOMAIN = { 224 | 
"articles": "article", 225 | "trials": "trial", 226 | "variants": "variant", 227 | "genes": "gene", 228 | "drugs": "drug", 229 | "diseases": "disease", 230 | "nci_organizations": "nci_organization", 231 | "nci_interventions": "nci_intervention", 232 | "nci_biomarkers": "nci_biomarker", 233 | "nci_diseases": "nci_disease", 234 | # OpenFDA domains 235 | "fda_adverse_events": "fda_adverse", 236 | "fda_labels": "fda_label", 237 | "fda_device_events": "fda_device", 238 | "fda_approvals": "fda_approval", 239 | "fda_recalls": "fda_recall", 240 | "fda_shortages": "fda_shortage", 241 | } 242 | 243 | # Trial detail sections 244 | TRIAL_DETAIL_SECTIONS = [ 245 | "protocol", 246 | "locations", 247 | "outcomes", 248 | "references", 249 | "all", 250 | "full", 251 | ] 252 | 253 | # ============================================================================ 254 | # Field Names and Enums 255 | # ============================================================================ 256 | 257 | # Autocomplete concept types 258 | AUTOCOMPLETE_CONCEPTS = ["variant", "chemical", "disease", "gene"] 259 | 260 | # HTTP methods 261 | VALID_HTTP_METHODS = ["GET", "POST"] 262 | 263 | # Trial search defaults 264 | DEFAULT_TRIAL_FORMAT = "csv" 265 | DEFAULT_TRIAL_MARKUP = "markdown" 266 | 267 | # ============================================================================ 268 | # Error Messages 269 | # ============================================================================ 270 | 271 | ERROR_THOUGHT_NUMBER_MIN = "Error: thoughtNumber must be >= 1" 272 | ERROR_TOTAL_THOUGHTS_MIN = "Error: totalThoughts must be >= 1" 273 | ERROR_DOMAIN_REQUIRED = "Either 'query' or 'domain' parameter must be provided" 274 | ERROR_THOUGHT_REQUIRED = ( 275 | "'thought' parameter is required when domain='thinking'" 276 | ) 277 | ERROR_THOUGHT_NUMBER_REQUIRED = ( 278 | "'thoughtNumber' parameter is required when domain='thinking'" 279 | ) 280 | ERROR_TOTAL_THOUGHTS_REQUIRED = ( 281 | "'totalThoughts' parameter is required 
when domain='thinking'" 282 | ) 283 | ERROR_NEXT_THOUGHT_REQUIRED = ( 284 | "'nextThoughtNeeded' parameter is required when domain='thinking'" 285 | ) 286 | 287 | # ============================================================================ 288 | # API Response Formatting 289 | # ============================================================================ 290 | 291 | # Default values for missing data 292 | DEFAULT_TITLE = "Untitled" 293 | DEFAULT_GENE = "Unknown" 294 | DEFAULT_SIGNIFICANCE = "Unknown" 295 | 296 | # Metadata field names 297 | METADATA_YEAR = "year" 298 | METADATA_JOURNAL = "journal" 299 | METADATA_AUTHORS = "authors" 300 | METADATA_STATUS = "status" 301 | METADATA_PHASE = "phase" 302 | METADATA_START_DATE = "start_date" 303 | METADATA_COMPLETION_DATE = "primary_completion_date" 304 | METADATA_GENE = "gene" 305 | METADATA_RSID = "rsid" 306 | METADATA_SIGNIFICANCE = "clinical_significance" 307 | METADATA_CONSEQUENCE = "consequence" 308 | METADATA_SOURCE = "source" 309 | 310 | # Result field names 311 | RESULT_ID = "id" 312 | RESULT_TITLE = "title" 313 | RESULT_SNIPPET = "snippet" # Internal use for domain handlers 314 | RESULT_TEXT = "text" # OpenAI MCP compliant field name 315 | RESULT_URL = "url" 316 | RESULT_METADATA = "metadata" 317 | RESULT_DATA = "data" 318 | RESULT_PAGE = "page" 319 | RESULT_PAGE_SIZE = "page_size" 320 | RESULT_TOTAL = "total" 321 | RESULT_NEXT_PAGE = "next_page" 322 | ``` -------------------------------------------------------------------------------- /docs/backend-services-reference/05-nci-cts-api.md: -------------------------------------------------------------------------------- ```markdown 1 | # NCI Clinical Trials Search API Reference 2 | 3 | The National Cancer Institute's Clinical Trials Search (CTS) API provides advanced search capabilities for cancer clinical trials with enhanced filtering options beyond ClinicalTrials.gov. 
4 | 5 | ## Overview 6 | 7 | The NCI CTS API offers: 8 | 9 | - Advanced biomarker and mutation filtering 10 | - Comprehensive organization database 11 | - Intervention and drug vocabularies 12 | - Disease terminology with NCI Thesaurus integration 13 | - Prior therapy and eligibility criteria 14 | 15 | **Base URL:** `https://clinicaltrialsapi.cancer.gov/api/v2/` 16 | 17 | ## Authentication 18 | 19 | An API key is required for all endpoints. 20 | 21 | ### Obtaining an API Key 22 | 23 | 1. Visit [https://clinicaltrialsapi.cancer.gov/](https://clinicaltrialsapi.cancer.gov/) 24 | 2. Click "Get API Key" 25 | 3. Complete registration 26 | 4. Key is emailed immediately 27 | 28 | ### Using the API Key 29 | 30 | Include in request headers: 31 | 32 | ``` 33 | X-API-KEY: your-api-key-here 34 | ``` 35 | 36 | Or as query parameter: 37 | 38 | ``` 39 | ?api_key=your-api-key-here 40 | ``` 41 | 42 | ## Core Endpoints 43 | 44 | ### 1. Trial Search 45 | 46 | ``` 47 | GET /trials 48 | ``` 49 | 50 | Search for clinical trials with advanced filtering. 
51 | 52 | #### Parameters 53 | 54 | **Basic Search:** 55 | 56 | - `keyword`: General text search 57 | - `nct_id`: Specific NCT identifiers 58 | - `diseases`: Disease/condition names 59 | - `interventions`: Treatment names 60 | 61 | **Advanced Filters:** 62 | 63 | - `biomarkers`: Required biomarkers/mutations 64 | - `prior_therapy_required`: true/false 65 | - `accepts_brain_mets`: true/false 66 | - `min_age`: Minimum age in years 67 | - `max_age`: Maximum age in years 68 | 69 | **Pagination:** 70 | 71 | - `size`: Results per page (max 50) 72 | - `from`: Starting index (offset) 73 | 74 | #### Example Request 75 | 76 | ```bash 77 | curl -G "https://clinicaltrialsapi.cancer.gov/api/v2/trials" \ 78 | -H "X-API-KEY: your-key" \ 79 | --data-urlencode "diseases=melanoma" \ 80 | --data-urlencode "biomarkers=BRAF V600E" \ 81 | --data-urlencode "accepts_brain_mets=true" \ 82 | --data-urlencode "size=10" 83 | ``` 84 | 85 | #### Response Format 86 | 87 | ```json 88 | { 89 | "total": 42, 90 | "trials": [ 91 | { 92 | "nct_id": "NCT04280705", 93 | "brief_title": "BRAF/MEK Inhibitor Combination", 94 | "current_trial_status": "Active", 95 | "phase": "Phase II", 96 | "biomarker_eligibility": [ 97 | { 98 | "gene": "BRAF", 99 | "variant": "V600E", 100 | "required": true 101 | } 102 | ], 103 | "sites": [...] 104 | } 105 | ] 106 | } 107 | ``` 108 | 109 | ### 2. Trial Details 110 | 111 | ``` 112 | GET /trials/{nct_id} 113 | ``` 114 | 115 | Get comprehensive information about a specific trial. 116 | 117 | #### Example Request 118 | 119 | ```bash 120 | curl -X GET "https://clinicaltrialsapi.cancer.gov/api/v2/trials/NCT04280705" \ 121 | -H "X-API-KEY: your-key" 122 | ``` 123 | 124 | ### 3. Organization Search 125 | 126 | ``` 127 | GET /organizations 128 | ``` 129 | 130 | Search for cancer research organizations and treatment centers. 
131 | 132 | #### Parameters 133 | 134 | - `name`: Organization name 135 | - `org_city`: City location 136 | - `org_state_or_province`: State/province 137 | - `org_country`: Country 138 | - `org_type`: Type (e.g., "NCI-designated", "academic") 139 | 140 | **Important:** Always use city AND state together to avoid Elasticsearch errors. 141 | 142 | #### Example Request 143 | 144 | ```bash 145 | curl -G "https://clinicaltrialsapi.cancer.gov/api/v2/organizations" \ 146 | -H "X-API-KEY: your-key" \ 147 | --data-urlencode "org_city=Houston" \ 148 | --data-urlencode "org_state_or_province=TX" 149 | ``` 150 | 151 | ### 4. Organization Details 152 | 153 | ``` 154 | GET /organizations/{org_id} 155 | ``` 156 | 157 | Get details about a specific organization. 158 | 159 | ### 5. Intervention Search 160 | 161 | ``` 162 | GET /interventions 163 | ``` 164 | 165 | Search for drugs, devices, and procedures used in trials. 166 | 167 | #### Parameters 168 | 169 | - `name`: Intervention name 170 | - `type`: Drug, Device, Procedure, etc. 171 | - `synonyms`: Include synonym matches (default: true) 172 | 173 | #### Example Request 174 | 175 | ```bash 176 | curl -G "https://clinicaltrialsapi.cancer.gov/api/v2/interventions" \ 177 | -H "X-API-KEY: your-key" \ 178 | --data-urlencode "name=pembrolizumab" \ 179 | --data-urlencode "type=Drug" 180 | ``` 181 | 182 | ### 6. Intervention Details 183 | 184 | ``` 185 | GET /interventions/{intervention_id} 186 | ``` 187 | 188 | ### 7. Biomarker Search 189 | 190 | ``` 191 | GET /biomarkers 192 | ``` 193 | 194 | Search for biomarkers used in trial eligibility criteria. 195 | 196 | #### Parameters 197 | 198 | - `name`: Biomarker name 199 | - `type`: mutation, expression, etc. 200 | - `gene`: Associated gene symbol 201 | 202 | ### 8. Disease Search 203 | 204 | ``` 205 | GET /diseases 206 | ``` 207 | 208 | Search NCI's controlled vocabulary of cancer conditions. 
209 | 210 | #### Parameters 211 | 212 | - `name`: Disease name 213 | - `include_synonyms`: Include synonym matches 214 | - `category`: Disease category 215 | 216 | ## Advanced Features 217 | 218 | ### Biomarker-Based Trial Search 219 | 220 | Find trials requiring specific mutations: 221 | 222 | ```python 223 | params = { 224 | "diseases": "non-small cell lung cancer", 225 | "biomarkers": ["EGFR L858R", "EGFR exon 19 deletion"], 226 | "prior_therapy_required": False, 227 | "accepts_brain_mets": True 228 | } 229 | 230 | response = requests.get( 231 | "https://clinicaltrialsapi.cancer.gov/api/v2/trials", 232 | headers={"X-API-KEY": api_key}, 233 | params=params 234 | ) 235 | ``` 236 | 237 | ### Complex Eligibility Queries 238 | 239 | ```python 240 | # Find trials with specific eligibility 241 | params = { 242 | "diseases": "melanoma", 243 | "biomarkers": "BRAF V600E", 244 | "min_age": 18, 245 | "max_age": 75, 246 | "prior_therapy": "vemurafenib", # Exclude if prior vemurafenib 247 | "performance_status": "0-1" # ECOG 0 or 1 248 | } 249 | ``` 250 | 251 | ### Organization Network Analysis 252 | 253 | ```python 254 | # Find all NCI-designated centers in a region 255 | params = { 256 | "org_type": "NCI-designated", 257 | "org_state_or_province": ["CA", "OR", "WA"] # West Coast 258 | } 259 | 260 | orgs = requests.get( 261 | "https://clinicaltrialsapi.cancer.gov/api/v2/organizations", 262 | headers={"X-API-KEY": api_key}, 263 | params=params 264 | ) 265 | 266 | # Get trials at each center 267 | for org in orgs.json()["organizations"]: 268 | trials = requests.get( 269 | f"https://clinicaltrialsapi.cancer.gov/api/v2/trials", 270 | headers={"X-API-KEY": api_key}, 271 | params={"site_org_id": org["id"]} 272 | ) 273 | ``` 274 | 275 | ## Data Models 276 | 277 | ### Trial Object 278 | 279 | ```json 280 | { 281 | "nct_id": "NCT04280705", 282 | "brief_title": "Study Title", 283 | "official_title": "Full Protocol Title", 284 | "current_trial_status": "Active", 285 | "phase": "Phase 
II", 286 | "study_type": "Interventional", 287 | "primary_purpose": "Treatment", 288 | "diseases": [ 289 | { 290 | "name": "Melanoma", 291 | "nci_thesaurus_id": "C0025202" 292 | } 293 | ], 294 | "biomarker_eligibility": [ 295 | { 296 | "gene": "BRAF", 297 | "variant": "V600E", 298 | "required": true, 299 | "inclusion": true 300 | } 301 | ], 302 | "arms": [...], 303 | "sites": [...] 304 | } 305 | ``` 306 | 307 | ### Organization Object 308 | 309 | ```json 310 | { 311 | "org_id": "NCI-2021-00123", 312 | "name": "MD Anderson Cancer Center", 313 | "type": "NCI-designated", 314 | "address": { 315 | "city": "Houston", 316 | "state": "TX", 317 | "country": "United States", 318 | "postal_code": "77030" 319 | }, 320 | "contact": { 321 | "name": "Clinical Trials Office", 322 | "phone": "1-800-392-1611", 323 | "email": "[email protected]" 324 | }, 325 | "active_trials_count": 1250 326 | } 327 | ``` 328 | 329 | ## Error Handling 330 | 331 | ### Common Errors 332 | 333 | #### 401 Unauthorized 334 | 335 | ```json 336 | { 337 | "error": "Invalid or missing API key" 338 | } 339 | ``` 340 | 341 | #### 400 Bad Request 342 | 343 | ```json 344 | { 345 | "error": "Invalid parameter combination", 346 | "details": "Must specify both city AND state for location search" 347 | } 348 | ``` 349 | 350 | #### 429 Rate Limited 351 | 352 | ```json 353 | { 354 | "error": "Rate limit exceeded", 355 | "retry_after": 3600 356 | } 357 | ``` 358 | 359 | ### Best Practices 360 | 361 | 1. **Always use city AND state together** for location searches 362 | 2. **Handle missing totals** - the API may not return total counts with size parameter 363 | 3. **Use specific searches** - broad queries may timeout 364 | 4. 
**Implement retry logic** for rate limits 365 | 366 | ## Rate Limits 367 | 368 | - **With API Key**: 1,000 requests/day 369 | - **Burst Rate**: 10 requests/second 370 | - **Without Key**: Not supported 371 | 372 | ## Differences from ClinicalTrials.gov 373 | 374 | ### Enhanced Features 375 | 376 | - **Biomarker search**: Mutation-specific queries 377 | - **Prior therapy**: Exclude based on previous treatments 378 | - **Brain metastases**: Specific acceptance criteria 379 | - **Performance status**: ECOG/Karnofsky filtering 380 | 381 | ### Limitations 382 | 383 | - **Cancer trials only**: Limited to oncology studies 384 | - **No offset pagination**: Must use size parameter carefully 385 | - **Location parameters**: Different naming (org\_ prefix) 386 | 387 | ## Integration Examples 388 | 389 | ### Example 1: Precision Medicine Search 390 | 391 | ```python 392 | async def find_precision_trials(mutation, cancer_type, location): 393 | """Find trials for specific mutation in cancer type near location""" 394 | 395 | # Search for trials 396 | trial_params = { 397 | "diseases": cancer_type, 398 | "biomarkers": mutation, 399 | "accepts_brain_mets": True, 400 | "size": 50 401 | } 402 | 403 | trials = await fetch_nci_api("trials", trial_params) 404 | 405 | # Filter by location if provided 406 | if location: 407 | nearby_trials = [] 408 | for trial in trials["trials"]: 409 | for site in trial.get("sites", []): 410 | distance = calculate_distance(location, site["coordinates"]) 411 | if distance < 100: # 100 miles 412 | nearby_trials.append(trial) 413 | break 414 | 415 | return nearby_trials 416 | 417 | return trials["trials"] 418 | ``` 419 | 420 | ### Example 2: Biomarker-Driven Pipeline 421 | 422 | ```python 423 | def biomarker_trial_pipeline(gene, variant): 424 | """Complete pipeline from variant to trials""" 425 | 426 | # 1. 
Search biomarkers 427 | biomarkers = requests.get( 428 | "https://clinicaltrialsapi.cancer.gov/api/v2/biomarkers", 429 | headers={"X-API-KEY": api_key}, 430 | params={"gene": gene, "name": variant} 431 | ).json() 432 | 433 | # 2. Get associated trials 434 | all_trials = [] 435 | for biomarker in biomarkers.get("biomarkers", []): 436 | trials = requests.get( 437 | "https://clinicaltrialsapi.cancer.gov/api/v2/trials", 438 | headers={"X-API-KEY": api_key}, 439 | params={"biomarker_id": biomarker["id"]} 440 | ).json() 441 | all_trials.extend(trials.get("trials", [])) 442 | 443 | # 3. Deduplicate and sort by phase 444 | unique_trials = {t["nct_id"]: t for t in all_trials}.values() 445 | return sorted(unique_trials, key=lambda x: x.get("phase", "")) 446 | ``` 447 | 448 | ## Support Resources 449 | 450 | - **API Documentation**: [https://clinicaltrialsapi.cancer.gov/](https://clinicaltrialsapi.cancer.gov/) 451 | - **Support Email**: [email protected] 452 | - **Status Page**: [https://status.cancer.gov/](https://status.cancer.gov/) 453 | - **Terms of Use**: [https://clinicaltrialsapi.cancer.gov/terms](https://clinicaltrialsapi.cancer.gov/terms) 454 | ``` -------------------------------------------------------------------------------- /src/biomcp/openfda/drug_approvals.py: -------------------------------------------------------------------------------- ```python 1 | """ 2 | OpenFDA drug approvals (Drugs@FDA) integration. 
import logging
from typing import Any

from .constants import (
    OPENFDA_DEFAULT_LIMIT,
    OPENFDA_DISCLAIMER,
    OPENFDA_DRUGSFDA_URL,
)
from .utils import (
    format_count,
    make_openfda_request,
)

logger = logging.getLogger(__name__)

# Largest page size this module will request in a single search call.
_MAX_PAGE_SIZE = 100


def _build_approval_query(
    drug: str | None,
    application_number: str | None,
    approval_year: str | None,
) -> str | None:
    """Combine the supplied filters into one openFDA ``search`` expression.

    Every filter that is provided contributes one clause and the clauses are
    joined with ``AND``. With a single filter the result is exactly that
    filter's clause. Returns ``None`` when no filter was given.
    """
    clauses: list[str] = []

    if drug:
        # Match the name against brand, generic and substance fields; the
        # parentheses keep the OR group intact once it is AND-ed with others.
        clauses.append(
            f'(openfda.brand_name:"{drug}" OR '
            f'openfda.generic_name:"{drug}" OR '
            f'openfda.substance_name:"{drug}")'
        )
    if application_number:
        clauses.append(f'application_number:"{application_number}"')
    if approval_year:
        # Date range covering the whole calendar year.
        clauses.append(
            f"products.marketing_status_date:[{approval_year}-01-01 TO {approval_year}-12-31]"
        )

    return " AND ".join(clauses) if clauses else None


async def search_drug_approvals(
    drug: str | None = None,
    application_number: str | None = None,
    approval_year: str | None = None,
    limit: int = OPENFDA_DEFAULT_LIMIT,
    skip: int = 0,
    api_key: str | None = None,
) -> str:
    """
    Search FDA drug approval records from Drugs@FDA.

    All filters that are supplied are applied together (logical AND), so the
    filters echoed in the report header are the filters actually used.

    Args:
        drug: Drug name (brand or generic) to search for
        application_number: NDA or BLA application number
        approval_year: Year of approval (YYYY format)
        limit: Maximum number of results to return (clamped to 1-100)
        skip: Number of results to skip (for pagination; negatives become 0)
        api_key: Optional OpenFDA API key (overrides OPENFDA_API_KEY env var)

    Returns:
        Markdown-formatted string with drug approval information, or a
        human-readable message when the request fails or nothing matches.
    """
    search_params: dict[str, str] = {}

    query = _build_approval_query(drug, application_number, approval_year)
    if query is not None:
        search_params["search"] = query

    # Pagination: keep both values inside the range the API can accept.
    search_params["limit"] = str(max(1, min(limit, _MAX_PAGE_SIZE)))
    search_params["skip"] = str(max(0, skip))

    # Sort by submission date (most recent first)
    search_params["sort"] = "submissions.submission_status_date:desc"

    response, error = await make_openfda_request(
        OPENFDA_DRUGSFDA_URL, search_params, "openfda_approvals", api_key
    )

    if error:
        return f"⚠️ Error searching drug approvals: {error}"

    if not response or not response.get("results"):
        return "No drug approval records found matching your criteria."

    results = response["results"]
    # Fall back to the page length when the response carries no total.
    total = (
        response.get("meta", {}).get("results", {}).get("total", len(results))
    )

    output = ["## FDA Drug Approval Records\n"]

    if drug:
        output.append(f"**Drug**: {drug}")
    if application_number:
        output.append(f"**Application**: {application_number}")
    if approval_year:
        output.append(f"**Approval Year**: {approval_year}")

    output.append(
        f"**Total Records Found**: {format_count(total, 'record')}\n"
    )

    output.append(f"### Results (showing {len(results)} of {total}):\n")

    for i, record in enumerate(results, 1):
        output.extend(_format_approval_summary(record, i))

    output.append(f"\n{OPENFDA_DISCLAIMER}")

    return "\n".join(output)


async def get_drug_approval(
    application_number: str,
    api_key: str | None = None,
) -> str:
    """
    Get detailed drug approval information for a specific application.

    Args:
        application_number: NDA or BLA application number
        api_key: Optional OpenFDA API key (overrides OPENFDA_API_KEY env var)

    Returns:
        Markdown-formatted string with detailed approval information, or a
        human-readable message when the request fails or nothing matches.
    """
    # Exact-match lookup; only the first hit is used. The limit is sent as a
    # string, matching how search_drug_approvals builds its parameters.
    search_params = {
        "search": f'application_number:"{application_number}"',
        "limit": "1",
    }

    response, error = await make_openfda_request(
        OPENFDA_DRUGSFDA_URL, search_params, "openfda_approvals", api_key
    )

    if error:
        return f"⚠️ Error retrieving drug approval: {error}"

    if not response or not response.get("results"):
        return f"No approval record found for application {application_number}"

    record = response["results"][0]

    output = [f"## Drug Approval Details: {application_number}\n"]

    # Basic information is always shown; the remaining sections only appear
    # when the record actually contains them.
    output.extend(_format_approval_header(record))

    if products := record.get("products"):
        output.extend(_format_products(products))

    if submissions := record.get("submissions"):
        output.extend(_format_submissions(submissions))

    if openfda := record.get("openfda"):
        output.extend(_format_openfda_metadata(openfda))

    output.append(f"\n{OPENFDA_DISCLAIMER}")

    return "\n".join(output)
Application {record.get('application_number', 'Unknown')}" 170 | ] 171 | 172 | # Get sponsor/applicant 173 | if sponsor := record.get("sponsor_name"): 174 | output.append(f"**Sponsor**: {sponsor}") 175 | 176 | # Get drug names from OpenFDA data 177 | openfda = record.get("openfda", {}) 178 | if brand_names := openfda.get("brand_name"): 179 | output.append(f"**Brand Name(s)**: {', '.join(brand_names[:3])}") 180 | if generic_names := openfda.get("generic_name"): 181 | output.append(f"**Generic Name(s)**: {', '.join(generic_names[:3])}") 182 | 183 | # Get products and their approval dates 184 | if products := record.get("products"): 185 | output.append("\n**Products**:") 186 | for prod in products[:3]: 187 | prod_num = prod.get("product_number", "?") 188 | dosage = prod.get("dosage_form", "") 189 | strength = prod.get("strength", "") 190 | status = prod.get("marketing_status", "") 191 | 192 | prod_line = f"- Product {prod_num}: {dosage}" 193 | if strength: 194 | prod_line += f" ({strength})" 195 | if status: 196 | prod_line += f" - {status}" 197 | output.append(prod_line) 198 | 199 | # Get most recent submission 200 | if submissions := record.get("submissions"): 201 | # Sort by date to get most recent 202 | recent = submissions[0] 203 | sub_type = recent.get("submission_type", "") 204 | sub_status = recent.get("submission_status", "") 205 | sub_date = recent.get("submission_status_date", "") 206 | 207 | if sub_date: 208 | output.append( 209 | f"\n**Latest Activity**: {sub_type} - {sub_status} ({sub_date})" 210 | ) 211 | 212 | output.append("") 213 | return output 214 | 215 | 216 | def _format_approval_header(record: dict[str, Any]) -> list[str]: 217 | """Format the header section of detailed approval.""" 218 | output = ["### Application Information"] 219 | 220 | output.append( 221 | f"**Application Number**: {record.get('application_number', 'Unknown')}" 222 | ) 223 | 224 | if sponsor := record.get("sponsor_name"): 225 | output.append(f"**Sponsor**: {sponsor}") 226 | 
227 | # OpenFDA names 228 | openfda = record.get("openfda", {}) 229 | if brand_names := openfda.get("brand_name"): 230 | output.append(f"**Brand Names**: {', '.join(brand_names)}") 231 | if generic_names := openfda.get("generic_name"): 232 | output.append(f"**Generic Names**: {', '.join(generic_names)}") 233 | if substances := openfda.get("substance_name"): 234 | output.append(f"**Active Substances**: {', '.join(substances)}") 235 | 236 | output.append("") 237 | return output 238 | 239 | 240 | def _format_products(products: list[dict[str, Any]]) -> list[str]: 241 | """Format product information.""" 242 | output = ["### Products"] 243 | 244 | for prod in products: 245 | prod_num = prod.get("product_number", "Unknown") 246 | output.append(f"\n#### Product {prod_num}") 247 | 248 | if dosage := prod.get("dosage_form"): 249 | output.append(f"**Dosage Form**: {dosage}") 250 | if strength := prod.get("strength"): 251 | output.append(f"**Strength**: {strength}") 252 | if route := prod.get("route"): 253 | output.append(f"**Route**: {route}") 254 | if status := prod.get("marketing_status"): 255 | output.append(f"**Marketing Status**: {status}") 256 | if status_date := prod.get("marketing_status_date"): 257 | output.append(f"**Status Date**: {status_date}") 258 | if te_code := prod.get("te_code"): 259 | output.append(f"**Therapeutic Equivalence**: {te_code}") 260 | 261 | output.append("") 262 | return output 263 | 264 | 265 | def _format_submissions(submissions: list[dict[str, Any]]) -> list[str]: 266 | """Format submission history.""" 267 | output = ["### Submission History"] 268 | 269 | # Show most recent 5 submissions 270 | for sub in submissions[:5]: 271 | sub_num = sub.get("submission_number", "?") 272 | sub_type = sub.get("submission_type", "Unknown") 273 | sub_status = sub.get("submission_status", "") 274 | sub_date = sub.get("submission_status_date", "") 275 | 276 | output.append(f"\n**Submission {sub_num}**: {sub_type}") 277 | if sub_status: 278 | output.append(f"- 
Status: {sub_status}") 279 | if sub_date: 280 | output.append(f"- Date: {sub_date}") 281 | 282 | # Review priority if present 283 | if priority := sub.get("review_priority"): 284 | output.append(f"- Review Priority: {priority}") 285 | 286 | # Submission class if present 287 | if sub_class := sub.get("submission_class_code"): 288 | class_desc = sub.get("submission_class_code_description", "") 289 | output.append(f"- Class: {sub_class} - {class_desc}") 290 | 291 | output.append("") 292 | return output 293 | 294 | 295 | def _format_openfda_metadata(openfda: dict[str, Any]) -> list[str]: 296 | """Format OpenFDA metadata.""" 297 | output = ["### Additional Information"] 298 | 299 | if nui := openfda.get("nui"): 300 | output.append(f"**NUI Codes**: {', '.join(nui[:5])}") 301 | 302 | if pharm_class := openfda.get("pharm_class_epc"): 303 | output.append(f"**Pharmacologic Class**: {', '.join(pharm_class[:3])}") 304 | 305 | if moa := openfda.get("pharm_class_moa"): 306 | output.append(f"**Mechanism of Action**: {', '.join(moa[:3])}") 307 | 308 | if unii := openfda.get("unii"): 309 | output.append(f"**UNII Codes**: {', '.join(unii[:5])}") 310 | 311 | output.append("") 312 | return output 313 | ``` -------------------------------------------------------------------------------- /tests/tdd/variants/test_alphagenome_comprehensive.py: -------------------------------------------------------------------------------- ```python 1 | """Comprehensive tests for AlphaGenome integration.""" 2 | 3 | from unittest.mock import MagicMock, patch 4 | 5 | import pandas as pd 6 | import pytest 7 | 8 | from biomcp.variants.alphagenome import ( 9 | _validate_inputs, 10 | predict_variant_effects, 11 | ) 12 | 13 | 14 | class TestInputValidation: 15 | """Test input validation for AlphaGenome.""" 16 | 17 | def test_valid_chromosomes(self): 18 | """Test validation accepts valid chromosome formats.""" 19 | valid_chroms = ["chr1", "chr22", "chrX", "chrY", "chrM", "chrMT"] 20 | for chrom in valid_chroms: 
21 | # Should not raise 22 | _validate_inputs(chrom, 100, "A", "T") 23 | 24 | def test_invalid_chromosomes(self): 25 | """Test validation rejects invalid chromosome formats.""" 26 | invalid_chroms = ["1", "chr23", "chrZ", "chromosome1", "Chr1", ""] 27 | for chrom in invalid_chroms: 28 | with pytest.raises(ValueError, match="Invalid chromosome format"): 29 | _validate_inputs(chrom, 100, "A", "T") 30 | 31 | def test_invalid_position(self): 32 | """Test validation rejects invalid positions.""" 33 | with pytest.raises(ValueError, match="Position must be >= 1"): 34 | _validate_inputs("chr1", 0, "A", "T") 35 | with pytest.raises(ValueError, match="Position must be >= 1"): 36 | _validate_inputs("chr1", -10, "A", "T") 37 | 38 | def test_valid_nucleotides(self): 39 | """Test validation accepts valid nucleotides.""" 40 | valid_cases = [ 41 | ("A", "T"), 42 | ("C", "G"), 43 | ("ACGT", "TGCA"), 44 | ("a", "t"), 45 | ("acgt", "tgca"), # lowercase should work 46 | ] 47 | for ref, alt in valid_cases: 48 | # Should not raise 49 | _validate_inputs("chr1", 100, ref, alt) 50 | 51 | def test_invalid_nucleotides(self): 52 | """Test validation rejects invalid nucleotides.""" 53 | invalid_cases = [("N", "A"), ("A", "U"), ("AXG", "T"), ("A", "123")] 54 | for ref, alt in invalid_cases: 55 | with pytest.raises(ValueError, match="Invalid nucleotides"): 56 | _validate_inputs("chr1", 100, ref, alt) 57 | 58 | def test_empty_alleles(self): 59 | """Test validation rejects empty alleles.""" 60 | with pytest.raises( 61 | ValueError, match="Reference allele cannot be empty" 62 | ): 63 | _validate_inputs("chr1", 100, "", "A") 64 | with pytest.raises( 65 | ValueError, match="Alternate allele cannot be empty" 66 | ): 67 | _validate_inputs("chr1", 100, "A", "") 68 | 69 | 70 | class TestIntervalSizeCalculation: 71 | """Test interval size selection logic.""" 72 | 73 | @pytest.mark.asyncio 74 | async def test_interval_size_edge_cases(self): 75 | """Test interval size selection for edge cases.""" 76 | with 
def _chr1_snv(**extra):
    """Return keyword arguments for the chr1:100 A>T variant used by these tests.

    Any ``extra`` keyword arguments are merged in and override the defaults.
    """
    return {
        "chromosome": "chr1",
        "position": 100,
        "reference": "A",
        "alternate": "T",
        **extra,
    }


class TestCaching:
    """Exercise the ``skip_cache`` argument."""

    @pytest.mark.asyncio
    async def test_skip_cache_parameter(self):
        """Two consecutive ``skip_cache=True`` calls both report the missing key."""
        with patch.dict("os.environ", {}, clear=True):
            first = await predict_variant_effects(
                **_chr1_snv(skip_cache=True)
            )
            second = await predict_variant_effects(
                **_chr1_snv(skip_cache=True)
            )

        # With no API key in the environment both calls end the same way.
        for outcome in (first, second):
            assert "AlphaGenome API key required" in outcome


class TestErrorHandling:
    """Exercise error reporting and input validation."""

    @pytest.mark.asyncio
    async def test_error_context_with_api_key(self):
        """A failed prediction reports the variant and tissue context."""
        with patch.dict("os.environ", {"ALPHAGENOME_API_KEY": "test-key"}):
            outcome = await predict_variant_effects(
                **_chr1_snv(
                    tissue_types=["UBERON:0002367"],
                    skip_cache=True,
                )
            )

        # Should either get import error or API error with context; the
        # context is only checked for the "prediction failed" outcome.
        if "AlphaGenome prediction failed" in outcome:
            for fragment in ("Context:", "chr1:100 A>T", "Tissue types:"):
                assert fragment in outcome

    @pytest.mark.asyncio
    async def test_input_validation_errors(self):
        """Malformed chromosome or nucleotide input raises ``ValueError``."""
        rejected_inputs = (
            ({"chromosome": "invalid"}, "Invalid chromosome format"),
            ({"reference": "X"}, "Invalid nucleotides"),
        )
        with patch.dict("os.environ", {"ALPHAGENOME_API_KEY": "test-key"}):
            for bad_field, message in rejected_inputs:
                with pytest.raises(ValueError, match=message):
                    await predict_variant_effects(**_chr1_snv(**bad_field))


class TestThresholdParameter:
    """Exercise the ``significance_threshold`` argument."""

    @pytest.mark.asyncio
    async def test_custom_threshold(self):
        """An explicit threshold is accepted as a parameter."""
        with patch.dict("os.environ", {}, clear=True):
            outcome = await predict_variant_effects(
                **_chr1_snv(significance_threshold=0.8)
            )

        # Reaching the API key message means the parameter itself was fine.
        assert "AlphaGenome API key required" in outcome

    @pytest.mark.asyncio
    async def test_default_threshold(self):
        """Omitting the threshold is accepted as well."""
        with patch.dict("os.environ", {}, clear=True):
            outcome = await predict_variant_effects(**_chr1_snv())

        assert "AlphaGenome API key required" in outcome


class TestIntegration:
    """Integration tests with mocked AlphaGenome."""

    @pytest.mark.asyncio
    async def test_successful_prediction_mock(self):
        """Test successful prediction with mocked AlphaGenome."""
        # Stand-ins for the three alphagenome modules that get patched in.
        genome_stub = MagicMock()
        client_stub = MagicMock()
        scorers_stub = MagicMock()

        # dna_client.create() hands back the model; score_variant() returns
        # a single opaque score object.
        model_stub = MagicMock()
        client_stub.create.return_value = model_stub
        model_stub.score_variant.return_value = [MagicMock()]

        # One recommended scorer and a one-row tidy score table.
        scorers_stub.get_recommended_scorers.return_value = ["scorer1"]
        scorers_stub.tidy_scores.return_value = pd.DataFrame({
            "output_type": ["RNA_SEQ"],
            "raw_score": [1.0],
            "gene_name": ["GENE1"],
            "track_name": ["tissue1"],
        })

        fake_modules = {
            "alphagenome.data.genome": genome_stub,
            "alphagenome.models.dna_client": client_stub,
            "alphagenome.models.variant_scorers": scorers_stub,
            "alphagenome.data": MagicMock(genome=genome_stub),
            "alphagenome.models": MagicMock(
                dna_client=client_stub,
                variant_scorers=scorers_stub,
            ),
        }

        with patch.dict("os.environ", {"ALPHAGENOME_API_KEY": "test-key"}):
            with patch.dict("sys.modules", fake_modules):
                result = await predict_variant_effects(
                    chromosome="chr7",
                    position=140753336,
                    reference="A",
                    alternate="T",
                    interval_size=131072,
                    skip_cache=True,
                )

            # Check model was created with API key
            client_stub.create.assert_called_once_with("test-key")

            # Check interval was created correctly
            genome_stub.Interval.assert_called_once()
            interval_kwargs = genome_stub.Interval.call_args[1]
            expected_start = 140753336 - 65536 - 1  # 0-based
            assert interval_kwargs["start"] == expected_start
            assert interval_kwargs["end"] == interval_kwargs["start"] + 131072

            # Check variant was created
            genome_stub.Variant.assert_called_once_with(
                chromosome="chr7",
                position=140753336,
                reference_bases="A",
                alternate_bases="T",
            )

            # Check result contains expected formatting
            for fragment in (
                "AlphaGenome Variant Effect Predictions",
                "Gene Expression",
                "GENE1",
            ):
                assert fragment in result
# ----------------------------------------------------------------------
# /src/biomcp/trials/getter.py
# ----------------------------------------------------------------------
import json
import logging
from ssl import TLSVersion
from typing import Annotated, Any
from urllib.parse import quote

from .. import StrEnum, http_client, render
from ..constants import CLINICAL_TRIALS_BASE_URL

logger = logging.getLogger(__name__)


class Module(StrEnum):
    """Named groups of ClinicalTrials.gov fields that can be requested."""

    PROTOCOL = "Protocol"
    LOCATIONS = "Locations"
    REFERENCES = "References"
    OUTCOMES = "Outcomes"
    ALL = "All"


# Field names sent in the "fields" query parameter for each Module.
modules: dict[Module, list[str]] = {
    Module.PROTOCOL: [
        "IdentificationModule",
        "StatusModule",
        "SponsorCollaboratorsModule",
        "OversightModule",
        "DescriptionModule",
        "ConditionsModule",
        "DesignModule",
        "ArmsInterventionsModule",
        "EligibilityModule",
    ],
    Module.LOCATIONS: ["ContactsLocationsModule"],
    Module.REFERENCES: ["ReferencesModule"],
    Module.OUTCOMES: ["OutcomesModule", "ResultsSection"],
}
# "All" is the concatenation of the groups above, in that order. Deriving it
# keeps the combined list from drifting when one of the groups changes.
modules[Module.ALL] = [
    field
    for group in (
        Module.PROTOCOL,
        Module.LOCATIONS,
        Module.REFERENCES,
        Module.OUTCOMES,
    )
    for field in modules[group]
]


def _extract_study(parsed_data: Any, nct_id: str) -> dict[str, Any]:
    """Turn a non-empty API payload into the dict that gets rendered.

    A payload with a "studies" key is unwrapped to its first study; any other
    dict is used as-is. In both cases a "URL" entry pointing at the study page
    is added to a copy, so the object handed back by ``http_client`` is never
    mutated. Anything else is reported as an error dict instead of raising.
    """
    study_url = f"https://clinicaltrials.gov/study/{nct_id}"

    if isinstance(parsed_data, dict) and "studies" in parsed_data:
        studies = parsed_data.get("studies") or []
        if (
            isinstance(studies, list)
            and studies
            and isinstance(studies[0], dict)
        ):
            return {**studies[0], "URL": study_url}

        logger.warning(f"No studies found in response for {nct_id}")
        return {
            "error": f"No studies found for {nct_id}",
            "details": "API returned empty studies array",
        }

    if isinstance(parsed_data, dict):
        # No "studies" wrapper: treat the payload itself as the study.
        logger.debug(
            f"Unexpected response format for {nct_id}: {type(parsed_data)}"
        )
        return {**parsed_data, "URL": study_url}

    # A list or scalar cannot take a "URL" key; report it rather than crash.
    logger.warning(
        f"Unexpected response format for {nct_id}: {type(parsed_data)}"
    )
    return {
        "error": f"Unexpected response format for {nct_id}",
        "details": (
            f"Expected a JSON object, got {type(parsed_data).__name__}"
        ),
    }


async def get_trial(
    nct_id: str,
    module: Module = Module.PROTOCOL,
    output_json: bool = False,
) -> str:
    """Get details of a clinical trial by module.

    Args:
        nct_id: Trial identifier, appended (percent-encoded) to
            ``CLINICAL_TRIALS_BASE_URL``.
        module: Which group of fields from ``modules`` to request.
        output_json: Return indented JSON when true, Markdown otherwise.

    Returns:
        The study (or an ``{"error": ..., "details": ...}`` dict when the
        request failed or returned nothing usable) serialized as JSON or
        rendered through ``render.to_markdown``.
    """
    fields = ",".join(modules[module])
    params = {"fields": fields}
    # Escape the identifier so characters such as "/" or "?" cannot change
    # the request path; well-formed NCT IDs are unaffected.
    url = f"{CLINICAL_TRIALS_BASE_URL}/{quote(str(nct_id), safe='')}"

    logger.debug(f"Fetching trial {nct_id} with module {module.value}")
    logger.debug(f"URL: {url}, Params: {params}")

    parsed_data: dict[str, Any] | None
    error_obj: http_client.RequestError | None
    parsed_data, error_obj = await http_client.request_api(
        url=url,
        request=params,
        method="GET",
        tls_version=TLSVersion.TLSv1_2,
        response_model_type=None,
        domain="clinicaltrials",
    )

    data_to_return: dict[str, Any]

    if error_obj:
        logger.error(
            f"API Error for {nct_id}: {error_obj.code} - {error_obj.message}"
        )
        data_to_return = {
            "error": f"API Error {error_obj.code}",
            "details": error_obj.message,
        }
    elif parsed_data:
        data_to_return = _extract_study(parsed_data, nct_id)
    else:
        logger.warning(
            f"No data received for {nct_id} with module {module.value}"
        )
        data_to_return = {
            "error": f"No data found for {nct_id} with module {module.value}",
            "details": "API returned no data",
        }

    if output_json:
        return json.dumps(data_to_return, indent=2)
    return render.to_markdown(data_to_return)


async def _trial_protocol(
    call_benefit: Annotated[
        str,
        "Define and summarize why this function is being called and the intended benefit",
    ],
    nct_id: str,
) -> str:
    """
    Retrieves core protocol information for a single clinical
    trial identified by its NCT ID.

    Parameters:
    - call_benefit: Define and summarize why this function is being called and the intended benefit
    - nct_id: A single NCT ID (string, e.g., "NCT04280705")

    Process: Fetches standard "Protocol" view modules (like ID,
             Status, Sponsor, Design, Eligibility) from the
             ClinicalTrials.gov v2 API.
    Output: A Markdown formatted string detailing title, status,
            sponsor, purpose, study design, phase, interventions,
            eligibility criteria, etc. Returns error if invalid.
    """
    return await get_trial(nct_id, Module.PROTOCOL)


async def _trial_locations(
    call_benefit: Annotated[
        str,
        "Define and summarize why this function is being called and the intended benefit",
    ],
    nct_id: str,
) -> str:
    """
    Retrieves contact and location details for a single
    clinical trial identified by its NCT ID.

    Parameters:
    - call_benefit: Define and summarize why this function is being called and the intended benefit
    - nct_id: A single NCT ID (string, e.g., "NCT04280705")

    Process: Fetches the `ContactsLocationsModule` from the
             ClinicalTrials.gov v2 API for the given NCT ID.
    Output: A Markdown formatted string detailing facility names,
            addresses (city, state, country), and contact info.
            Returns an error message if the NCT ID is invalid.
    """
    return await get_trial(nct_id, Module.LOCATIONS)


async def _trial_outcomes(
    call_benefit: Annotated[
        str,
        "Define and summarize why this function is being called and the intended benefit",
    ],
    nct_id: str,
) -> str:
    """
    Retrieves outcome measures, results (if available), and
    adverse event data for a single clinical trial.

    Parameters:
    - call_benefit: Define and summarize why this function is being called and the intended benefit
    - nct_id: A single NCT ID (string, e.g., "NCT04280705")

    Process: Fetches the `OutcomesModule` and `ResultsSection`
             from the ClinicalTrials.gov v2 API for the NCT ID.
    Output: A Markdown formatted string detailing primary/secondary
            outcomes, participant flow, results tables (if posted),
            and adverse event summaries. Returns an error if invalid.
    """
    return await get_trial(nct_id, Module.OUTCOMES)


async def _trial_references(
    call_benefit: Annotated[
        str,
        "Define and summarize why this function is being called and the intended benefit",
    ],
    nct_id: str,
) -> str:
    """
    Retrieves publications and other references associated with
    a single clinical trial identified by its NCT ID.

    Parameters:
    - call_benefit: Define and summarize why this function is being called and the intended benefit
    - nct_id: A single NCT ID (string, e.g., "NCT04280705")

    Process: Fetches the `ReferencesModule` from the
             ClinicalTrials.gov v2 API for the NCT ID.
    Output: A Markdown formatted string listing citations,
            associated PubMed IDs (PMIDs), and reference types
            (e.g., result publication). Returns error if invalid.
    """
    return await get_trial(nct_id, Module.REFERENCES)


async def get_trial_unified(
    nct_id: str,
    source: str = "clinicaltrials",
    api_key: str | None = None,
    sections: list[str] | None = None,
) -> str:
    """
    Get trial details from either ClinicalTrials.gov or NCI CTS API.

    Args:
        nct_id: NCT identifier (e.g., "NCT04280705")
        source: Data source - "clinicaltrials" (default) or "nci"
        api_key: API key for NCI (required if source="nci")
        sections: List of sections to include (for clinicaltrials.gov)
            Options: ["protocol", "locations", "outcomes", "references", "all"]
            Names are matched case-insensitively and duplicates are fetched
            once. Unknown names are logged and skipped; if nothing usable
            remains, the protocol section is returned.

    Returns:
        Formatted markdown string with trial details
    """
    if source == "nci":
        # Import here to avoid circular imports
        from .nci_getter import format_nci_trial_details, get_trial_nci

        trial_data = await get_trial_nci(nct_id, api_key)
        return await format_nci_trial_details(trial_data, api_key)

    # Default to ClinicalTrials.gov.
    if isinstance(sections, str):
        # A bare string would otherwise be iterated character by character.
        sections = [sections]
    requested = list(
        dict.fromkeys(str(item).strip().lower() for item in sections or [])
    )

    if "all" in requested:
        return await get_trial(nct_id, Module.ALL)

    # Built per call so the wrapper functions are looked up by name at call
    # time, exactly as the explicit if/elif chain did.
    fetchers = {
        "protocol": (
            _trial_protocol,
            f"Getting protocol information for trial {nct_id}",
        ),
        "locations": (
            _trial_locations,
            f"Getting locations for trial {nct_id}",
        ),
        "outcomes": (
            _trial_outcomes,
            f"Getting outcomes for trial {nct_id}",
        ),
        "references": (
            _trial_references,
            f"Getting references for trial {nct_id}",
        ),
    }

    rendered: list[str] = []
    for section in requested:
        entry = fetchers.get(section)
        if entry is None:
            logger.warning(
                f"Ignoring unknown trial section '{section}' for {nct_id}"
            )
            continue
        fetch, benefit = entry
        rendered.append(await fetch(call_benefit=benefit, nct_id=nct_id))

    if rendered:
        return "\n\n---\n\n".join(rendered)

    # Nothing (valid) requested: default to protocol only.
    return await _trial_protocol(
        call_benefit=f"Getting trial protocol details for {nct_id}",
        nct_id=nct_id,
    )


# ----------------------------------------------------------------------
# /src/biomcp/biomarkers/search.py
# ----------------------------------------------------------------------
"""Search functionality for biomarkers via NCI CTS API.

Note: Biomarker data availability may be limited in CTRP.
This module focuses on biomarkers used in trial eligibility criteria.
"""

import logging
from typing import Any

from ..constants import NCI_BIOMARKERS_URL
from ..integrations.cts_api import CTSAPIError, make_cts_request
from ..utils import parse_or_query

logger = logging.getLogger(__name__)


def _build_biomarker_params(
    name: str | None,
    eligibility_criterion: str | None,
    biomarker_type: str | None,
    codes: list[str] | None,
    assay_purpose: str | None,
    include: list[str] | None,
    sort: str | None,
    order: str | None,
    page_size: int,
) -> dict[str, Any]:
    """Build query parameters for biomarker search.

    Only truthy filters are included. ``codes`` and ``include`` are sent as
    comma-separated strings when given as lists, ``order`` is lower-cased,
    and ``page_size`` is always sent as "size".
    """
    params: dict[str, Any] = {"size": page_size}

    # Add search filters with correct API parameter names
    if name:
        params["name"] = name
    if eligibility_criterion:
        params["eligibility_criterion"] = eligibility_criterion
    if biomarker_type:
        params["type"] = biomarker_type
    if codes:
        params["codes"] = ",".join(codes) if isinstance(codes, list) else codes
    if assay_purpose:
        params["assay_purpose"] = assay_purpose
    if include:
        params["include"] = (
            ",".join(include) if isinstance(include, list) else include
        )
    if sort:
        params["sort"] = sort
    if order:
        params["order"] = order.lower()

    return params
52 | 53 | 54 | def _process_biomarker_response( 55 | response: dict[str, Any], 56 | page: int, 57 | page_size: int, 58 | ) -> dict[str, Any]: 59 | """Process biomarker API response.""" 60 | biomarkers = response.get("data", response.get("biomarkers", [])) 61 | total = response.get("total", len(biomarkers)) 62 | 63 | result = { 64 | "biomarkers": biomarkers, 65 | "total": total, 66 | "page": page, 67 | "page_size": page_size, 68 | } 69 | 70 | # Add note about data limitations if response indicates it 71 | if response.get("limited_data") or not biomarkers: 72 | result["note"] = ( 73 | "Biomarker data availability is limited in CTRP. " 74 | "Results show biomarkers referenced in trial eligibility criteria. " 75 | "For detailed variant annotations, use variant_searcher with MyVariant.info." 76 | ) 77 | 78 | return result 79 | 80 | 81 | async def search_biomarkers( 82 | name: str | None = None, 83 | eligibility_criterion: str | None = None, 84 | biomarker_type: str | None = None, 85 | codes: list[str] | None = None, 86 | assay_purpose: str | None = None, 87 | include: list[str] | None = None, 88 | sort: str | None = None, 89 | order: str | None = None, 90 | page_size: int = 20, 91 | page: int = 1, 92 | api_key: str | None = None, 93 | ) -> dict[str, Any]: 94 | """ 95 | Search for biomarkers in the NCI CTS database. 96 | 97 | Note: Biomarker data availability may be limited per CTRP documentation. 98 | Results focus on biomarkers used in clinical trial eligibility criteria. 
99 | 100 | Args: 101 | name: Biomarker name to search for (e.g., "PD-L1", "EGFR mutation") 102 | eligibility_criterion: Eligibility criterion text 103 | biomarker_type: Type of biomarker ("reference_gene" or "branch") 104 | codes: List of biomarker codes 105 | assay_purpose: Purpose of the assay 106 | include: Fields to include in response 107 | sort: Sort field 108 | order: Sort order ('asc' or 'desc') 109 | page_size: Number of results per page 110 | page: Page number 111 | api_key: Optional API key (if not provided, uses NCI_API_KEY env var) 112 | 113 | Returns: 114 | Dictionary with search results containing: 115 | - biomarkers: List of biomarker records 116 | - total: Total number of results 117 | - page: Current page 118 | - page_size: Results per page 119 | - note: Any limitations about the data 120 | 121 | Raises: 122 | CTSAPIError: If the API request fails 123 | """ 124 | # Build query parameters 125 | params = _build_biomarker_params( 126 | name, 127 | eligibility_criterion, 128 | biomarker_type, 129 | codes, 130 | assay_purpose, 131 | include, 132 | sort, 133 | order, 134 | page_size, 135 | ) 136 | 137 | try: 138 | # Make API request 139 | response = await make_cts_request( 140 | url=NCI_BIOMARKERS_URL, 141 | params=params, 142 | api_key=api_key, 143 | ) 144 | 145 | # Process response 146 | return _process_biomarker_response(response, page, page_size) 147 | 148 | except CTSAPIError: 149 | raise 150 | except Exception as e: 151 | logger.error(f"Failed to search biomarkers: {e}") 152 | raise CTSAPIError(f"Biomarker search failed: {e!s}") from e 153 | 154 | 155 | def _format_biomarker_header(total: int, note: str) -> list[str]: 156 | """Format the header section of biomarker results.""" 157 | lines = [ 158 | f"## Biomarker Search Results ({total} found)", 159 | "", 160 | ] 161 | 162 | if note: 163 | lines.extend([ 164 | f"*Note: {note}*", 165 | "", 166 | ]) 167 | 168 | return lines 169 | 170 | 171 | def _format_single_biomarker(biomarker: dict[str, Any]) -> 
list[str]: 172 | """Format a single biomarker record.""" 173 | bio_id = biomarker.get("id", biomarker.get("biomarker_id", "Unknown")) 174 | name = biomarker.get("name", "Unknown Biomarker") 175 | gene = biomarker.get("gene", biomarker.get("gene_symbol", "")) 176 | bio_type = biomarker.get("type", biomarker.get("category", "")) 177 | 178 | lines = [ 179 | f"### {name}", 180 | f"- **ID**: {bio_id}", 181 | ] 182 | 183 | if gene: 184 | lines.append(f"- **Gene**: {gene}") 185 | if bio_type: 186 | lines.append(f"- **Type**: {bio_type}") 187 | 188 | # Add assay information if available 189 | if biomarker.get("assay_type"): 190 | lines.append(f"- **Assay**: {biomarker['assay_type']}") 191 | 192 | # Add criteria examples if available 193 | if biomarker.get("criteria_examples"): 194 | examples = biomarker["criteria_examples"] 195 | if isinstance(examples, list) and examples: 196 | lines.append("- **Example Criteria**:") 197 | for ex in examples[:3]: # Show up to 3 examples 198 | lines.append(f" - {ex}") 199 | if len(examples) > 3: 200 | lines.append(f" *(and {len(examples) - 3} more)*") 201 | 202 | # Add trial count if available 203 | if biomarker.get("trial_count"): 204 | lines.append( 205 | f"- **Trials Using This Biomarker**: {biomarker['trial_count']}" 206 | ) 207 | 208 | lines.append("") 209 | return lines 210 | 211 | 212 | async def search_biomarkers_with_or( 213 | name_query: str, 214 | eligibility_criterion: str | None = None, 215 | biomarker_type: str | None = None, 216 | codes: list[str] | None = None, 217 | assay_purpose: str | None = None, 218 | include: list[str] | None = None, 219 | sort: str | None = None, 220 | order: str | None = None, 221 | page_size: int = 20, 222 | page: int = 1, 223 | api_key: str | None = None, 224 | ) -> dict[str, Any]: 225 | """ 226 | Search for biomarkers with OR query support. 227 | 228 | This function handles OR queries by making multiple API calls and combining results. 
229 | For example: "PD-L1 OR CD274 OR programmed death ligand 1" will search for each term. 230 | 231 | Args: 232 | name_query: Name query that may contain OR operators 233 | Other args same as search_biomarkers 234 | 235 | Returns: 236 | Combined results from all searches with duplicates removed 237 | """ 238 | # Check if this is an OR query 239 | if " OR " in name_query or " or " in name_query: 240 | search_terms = parse_or_query(name_query) 241 | logger.info(f"Parsed OR query into terms: {search_terms}") 242 | else: 243 | # Single term search 244 | search_terms = [name_query] 245 | 246 | # Collect all unique biomarkers 247 | all_biomarkers = {} 248 | total_found = 0 249 | 250 | # Search for each term 251 | for term in search_terms: 252 | logger.info(f"Searching biomarkers for term: {term}") 253 | try: 254 | results = await search_biomarkers( 255 | name=term, 256 | eligibility_criterion=eligibility_criterion, 257 | biomarker_type=biomarker_type, 258 | codes=codes, 259 | assay_purpose=assay_purpose, 260 | include=include, 261 | sort=sort, 262 | order=order, 263 | page_size=page_size, # Get full page size for each term 264 | page=page, 265 | api_key=api_key, 266 | ) 267 | 268 | # Add unique biomarkers (deduplicate by ID) 269 | for biomarker in results.get("biomarkers", []): 270 | bio_id = biomarker.get("id", biomarker.get("biomarker_id")) 271 | if bio_id and bio_id not in all_biomarkers: 272 | all_biomarkers[bio_id] = biomarker 273 | 274 | total_found += results.get("total", 0) 275 | 276 | except Exception as e: 277 | logger.warning(f"Failed to search for term '{term}': {e}") 278 | # Continue with other terms 279 | 280 | # Convert back to list and apply pagination 281 | unique_biomarkers = list(all_biomarkers.values()) 282 | 283 | # Sort if requested (by name by default for consistent results) 284 | if sort == "name" or sort is None: 285 | unique_biomarkers.sort(key=lambda x: x.get("name", "").lower()) 286 | 287 | # Apply pagination to combined results 288 | 
start_idx = (page - 1) * page_size 289 | end_idx = start_idx + page_size 290 | paginated_biomarkers = unique_biomarkers[start_idx:end_idx] 291 | 292 | return { 293 | "biomarkers": paginated_biomarkers, 294 | "total": len(unique_biomarkers), 295 | "page": page, 296 | "page_size": page_size, 297 | "search_terms": search_terms, # Include what we searched for 298 | "total_found_across_terms": total_found, # Total before deduplication 299 | } 300 | 301 | 302 | def format_biomarker_results(results: dict[str, Any]) -> str: 303 | """ 304 | Format biomarker search results as markdown. 305 | 306 | Args: 307 | results: Search results dictionary 308 | 309 | Returns: 310 | Formatted markdown string 311 | """ 312 | biomarkers = results.get("biomarkers", []) 313 | total = results.get("total", 0) 314 | note = results.get("note", "") 315 | 316 | if not biomarkers: 317 | msg = "No biomarkers found matching the search criteria." 318 | if note: 319 | msg += f"\n\n*Note: {note}*" 320 | return msg 321 | 322 | # Build markdown output 323 | lines = _format_biomarker_header(total, note) 324 | 325 | for biomarker in biomarkers: 326 | lines.extend(_format_single_biomarker(biomarker)) 327 | 328 | return "\n".join(lines) 329 | ```