genomoncology/biomcp # codebase.md

This is page 7 of 19. Use http://codebase.md/genomoncology/biomcp?lines=true&page={x} to view the full context.

# Directory Structure

```
├── .github
│   ├── actions
│   │   └── setup-python-env
│   │       └── action.yml
│   ├── dependabot.yml
│   └── workflows
│       ├── ci.yml
│       ├── deploy-docs.yml
│       ├── main.yml.disabled
│       ├── on-release-main.yml
│       └── validate-codecov-config.yml
├── .gitignore
├── .pre-commit-config.yaml
├── BIOMCP_DATA_FLOW.md
├── CHANGELOG.md
├── CNAME
├── codecov.yaml
├── docker-compose.yml
├── Dockerfile
├── docs
│   ├── apis
│   │   ├── error-codes.md
│   │   ├── overview.md
│   │   └── python-sdk.md
│   ├── assets
│   │   ├── biomcp-cursor-locations.png
│   │   ├── favicon.ico
│   │   ├── icon.png
│   │   ├── logo.png
│   │   ├── mcp_architecture.txt
│   │   └── remote-connection
│   │       ├── 00_connectors.png
│   │       ├── 01_add_custom_connector.png
│   │       ├── 02_connector_enabled.png
│   │       ├── 03_connect_to_biomcp.png
│   │       ├── 04_select_google_oauth.png
│   │       └── 05_success_connect.png
│   ├── backend-services-reference
│   │   ├── 01-overview.md
│   │   ├── 02-biothings-suite.md
│   │   ├── 03-cbioportal.md
│   │   ├── 04-clinicaltrials-gov.md
│   │   ├── 05-nci-cts-api.md
│   │   ├── 06-pubtator3.md
│   │   └── 07-alphagenome.md
│   ├── blog
│   │   ├── ai-assisted-clinical-trial-search-analysis.md
│   │   ├── images
│   │   │   ├── deep-researcher-video.png
│   │   │   ├── researcher-announce.png
│   │   │   ├── researcher-drop-down.png
│   │   │   ├── researcher-prompt.png
│   │   │   ├── trial-search-assistant.png
│   │   │   └── what_is_biomcp_thumbnail.png
│   │   └── researcher-persona-resource.md
│   ├── changelog.md
│   ├── CNAME
│   ├── concepts
│   │   ├── 01-what-is-biomcp.md
│   │   ├── 02-the-deep-researcher-persona.md
│   │   └── 03-sequential-thinking-with-the-think-tool.md
│   ├── developer-guides
│   │   ├── 01-server-deployment.md
│   │   ├── 02-contributing-and-testing.md
│   │   ├── 03-third-party-endpoints.md
│   │   ├── 04-transport-protocol.md
│   │   ├── 05-error-handling.md
│   │   ├── 06-http-client-and-caching.md
│   │   ├── 07-performance-optimizations.md
│   │   └── generate_endpoints.py
│   ├── faq-condensed.md
│   ├── FDA_SECURITY.md
│   ├── genomoncology.md
│   ├── getting-started
│   │   ├── 01-quickstart-cli.md
│   │   ├── 02-claude-desktop-integration.md
│   │   └── 03-authentication-and-api-keys.md
│   ├── how-to-guides
│   │   ├── 01-find-articles-and-cbioportal-data.md
│   │   ├── 02-find-trials-with-nci-and-biothings.md
│   │   ├── 03-get-comprehensive-variant-annotations.md
│   │   ├── 04-predict-variant-effects-with-alphagenome.md
│   │   ├── 05-logging-and-monitoring-with-bigquery.md
│   │   └── 06-search-nci-organizations-and-interventions.md
│   ├── index.md
│   ├── policies.md
│   ├── reference
│   │   ├── architecture-diagrams.md
│   │   ├── quick-architecture.md
│   │   ├── quick-reference.md
│   │   └── visual-architecture.md
│   ├── robots.txt
│   ├── stylesheets
│   │   ├── announcement.css
│   │   └── extra.css
│   ├── troubleshooting.md
│   ├── tutorials
│   │   ├── biothings-prompts.md
│   │   ├── claude-code-biomcp-alphagenome.md
│   │   ├── nci-prompts.md
│   │   ├── openfda-integration.md
│   │   ├── openfda-prompts.md
│   │   ├── pydantic-ai-integration.md
│   │   └── remote-connection.md
│   ├── user-guides
│   │   ├── 01-command-line-interface.md
│   │   ├── 02-mcp-tools-reference.md
│   │   └── 03-integrating-with-ides-and-clients.md
│   └── workflows
│       └── all-workflows.md
├── example_scripts
│   ├── mcp_integration.py
│   └── python_sdk.py
├── glama.json
├── LICENSE
├── lzyank.toml
├── Makefile
├── mkdocs.yml
├── package-lock.json
├── package.json
├── pyproject.toml
├── README.md
├── scripts
│   ├── check_docs_in_mkdocs.py
│   ├── check_http_imports.py
│   └── generate_endpoints_doc.py
├── smithery.yaml
├── src
│   └── biomcp
│       ├── __init__.py
│       ├── __main__.py
│       ├── articles
│       │   ├── __init__.py
│       │   ├── autocomplete.py
│       │   ├── fetch.py
│       │   ├── preprints.py
│       │   ├── search_optimized.py
│       │   ├── search.py
│       │   └── unified.py
│       ├── biomarkers
│       │   ├── __init__.py
│       │   └── search.py
│       ├── cbioportal_helper.py
│       ├── circuit_breaker.py
│       ├── cli
│       │   ├── __init__.py
│       │   ├── articles.py
│       │   ├── biomarkers.py
│       │   ├── diseases.py
│       │   ├── health.py
│       │   ├── interventions.py
│       │   ├── main.py
│       │   ├── openfda.py
│       │   ├── organizations.py
│       │   ├── server.py
│       │   ├── trials.py
│       │   └── variants.py
│       ├── connection_pool.py
│       ├── constants.py
│       ├── core.py
│       ├── diseases
│       │   ├── __init__.py
│       │   ├── getter.py
│       │   └── search.py
│       ├── domain_handlers.py
│       ├── drugs
│       │   ├── __init__.py
│       │   └── getter.py
│       ├── exceptions.py
│       ├── genes
│       │   ├── __init__.py
│       │   └── getter.py
│       ├── http_client_simple.py
│       ├── http_client.py
│       ├── individual_tools.py
│       ├── integrations
│       │   ├── __init__.py
│       │   ├── biothings_client.py
│       │   └── cts_api.py
│       ├── interventions
│       │   ├── __init__.py
│       │   ├── getter.py
│       │   └── search.py
│       ├── logging_filter.py
│       ├── metrics_handler.py
│       ├── metrics.py
│       ├── openfda
│       │   ├── __init__.py
│       │   ├── adverse_events_helpers.py
│       │   ├── adverse_events.py
│       │   ├── cache.py
│       │   ├── constants.py
│       │   ├── device_events_helpers.py
│       │   ├── device_events.py
│       │   ├── drug_approvals.py
│       │   ├── drug_labels_helpers.py
│       │   ├── drug_labels.py
│       │   ├── drug_recalls_helpers.py
│       │   ├── drug_recalls.py
│       │   ├── drug_shortages_detail_helpers.py
│       │   ├── drug_shortages_helpers.py
│       │   ├── drug_shortages.py
│       │   ├── exceptions.py
│       │   ├── input_validation.py
│       │   ├── rate_limiter.py
│       │   ├── utils.py
│       │   └── validation.py
│       ├── organizations
│       │   ├── __init__.py
│       │   ├── getter.py
│       │   └── search.py
│       ├── parameter_parser.py
│       ├── prefetch.py
│       ├── query_parser.py
│       ├── query_router.py
│       ├── rate_limiter.py
│       ├── render.py
│       ├── request_batcher.py
│       ├── resources
│       │   ├── __init__.py
│       │   ├── getter.py
│       │   ├── instructions.md
│       │   └── researcher.md
│       ├── retry.py
│       ├── router_handlers.py
│       ├── router.py
│       ├── shared_context.py
│       ├── thinking
│       │   ├── __init__.py
│       │   ├── sequential.py
│       │   └── session.py
│       ├── thinking_tool.py
│       ├── thinking_tracker.py
│       ├── trials
│       │   ├── __init__.py
│       │   ├── getter.py
│       │   ├── nci_getter.py
│       │   ├── nci_search.py
│       │   └── search.py
│       ├── utils
│       │   ├── __init__.py
│       │   ├── cancer_types_api.py
│       │   ├── cbio_http_adapter.py
│       │   ├── endpoint_registry.py
│       │   ├── gene_validator.py
│       │   ├── metrics.py
│       │   ├── mutation_filter.py
│       │   ├── query_utils.py
│       │   ├── rate_limiter.py
│       │   └── request_cache.py
│       ├── variants
│       │   ├── __init__.py
│       │   ├── alphagenome.py
│       │   ├── cancer_types.py
│       │   ├── cbio_external_client.py
│       │   ├── cbioportal_mutations.py
│       │   ├── cbioportal_search_helpers.py
│       │   ├── cbioportal_search.py
│       │   ├── constants.py
│       │   ├── external.py
│       │   ├── filters.py
│       │   ├── getter.py
│       │   ├── links.py
│       │   └── search.py
│       └── workers
│           ├── __init__.py
│           ├── worker_entry_stytch.js
│           ├── worker_entry.js
│           └── worker.py
├── tests
│   ├── bdd
│   │   ├── cli_help
│   │   │   ├── help.feature
│   │   │   └── test_help.py
│   │   ├── conftest.py
│   │   ├── features
│   │   │   └── alphagenome_integration.feature
│   │   ├── fetch_articles
│   │   │   ├── fetch.feature
│   │   │   └── test_fetch.py
│   │   ├── get_trials
│   │   │   ├── get.feature
│   │   │   └── test_get.py
│   │   ├── get_variants
│   │   │   ├── get.feature
│   │   │   └── test_get.py
│   │   ├── search_articles
│   │   │   ├── autocomplete.feature
│   │   │   ├── search.feature
│   │   │   ├── test_autocomplete.py
│   │   │   └── test_search.py
│   │   ├── search_trials
│   │   │   ├── search.feature
│   │   │   └── test_search.py
│   │   ├── search_variants
│   │   │   ├── search.feature
│   │   │   └── test_search.py
│   │   └── steps
│   │       └── test_alphagenome_steps.py
│   ├── config
│   │   └── test_smithery_config.py
│   ├── conftest.py
│   ├── data
│   │   ├── ct_gov
│   │   │   ├── clinical_trials_api_v2.yaml
│   │   │   ├── trials_NCT04280705.json
│   │   │   └── trials_NCT04280705.txt
│   │   ├── myvariant
│   │   │   ├── myvariant_api.yaml
│   │   │   ├── myvariant_field_descriptions.csv
│   │   │   ├── variants_full_braf_v600e.json
│   │   │   ├── variants_full_braf_v600e.txt
│   │   │   └── variants_part_braf_v600_multiple.json
│   │   ├── openfda
│   │   │   ├── drugsfda_detail.json
│   │   │   ├── drugsfda_search.json
│   │   │   ├── enforcement_detail.json
│   │   │   └── enforcement_search.json
│   │   └── pubtator
│   │       ├── pubtator_autocomplete.json
│   │       └── pubtator3_paper.txt
│   ├── integration
│   │   ├── test_openfda_integration.py
│   │   ├── test_preprints_integration.py
│   │   ├── test_simple.py
│   │   └── test_variants_integration.py
│   ├── tdd
│   │   ├── articles
│   │   │   ├── test_autocomplete.py
│   │   │   ├── test_cbioportal_integration.py
│   │   │   ├── test_fetch.py
│   │   │   ├── test_preprints.py
│   │   │   ├── test_search.py
│   │   │   └── test_unified.py
│   │   ├── conftest.py
│   │   ├── drugs
│   │   │   ├── __init__.py
│   │   │   └── test_drug_getter.py
│   │   ├── openfda
│   │   │   ├── __init__.py
│   │   │   ├── test_adverse_events.py
│   │   │   ├── test_device_events.py
│   │   │   ├── test_drug_approvals.py
│   │   │   ├── test_drug_labels.py
│   │   │   ├── test_drug_recalls.py
│   │   │   ├── test_drug_shortages.py
│   │   │   └── test_security.py
│   │   ├── test_biothings_integration_real.py
│   │   ├── test_biothings_integration.py
│   │   ├── test_circuit_breaker.py
│   │   ├── test_concurrent_requests.py
│   │   ├── test_connection_pool.py
│   │   ├── test_domain_handlers.py
│   │   ├── test_drug_approvals.py
│   │   ├── test_drug_recalls.py
│   │   ├── test_drug_shortages.py
│   │   ├── test_endpoint_documentation.py
│   │   ├── test_error_scenarios.py
│   │   ├── test_europe_pmc_fetch.py
│   │   ├── test_mcp_integration.py
│   │   ├── test_mcp_tools.py
│   │   ├── test_metrics.py
│   │   ├── test_nci_integration.py
│   │   ├── test_nci_mcp_tools.py
│   │   ├── test_network_policies.py
│   │   ├── test_offline_mode.py
│   │   ├── test_openfda_unified.py
│   │   ├── test_pten_r173_search.py
│   │   ├── test_render.py
│   │   ├── test_request_batcher.py.disabled
│   │   ├── test_retry.py
│   │   ├── test_router.py
│   │   ├── test_shared_context.py.disabled
│   │   ├── test_unified_biothings.py
│   │   ├── thinking
│   │   │   ├── __init__.py
│   │   │   └── test_sequential.py
│   │   ├── trials
│   │   │   ├── test_backward_compatibility.py
│   │   │   ├── test_getter.py
│   │   │   └── test_search.py
│   │   ├── utils
│   │   │   ├── test_gene_validator.py
│   │   │   ├── test_mutation_filter.py
│   │   │   ├── test_rate_limiter.py
│   │   │   └── test_request_cache.py
│   │   ├── variants
│   │   │   ├── constants.py
│   │   │   ├── test_alphagenome_api_key.py
│   │   │   ├── test_alphagenome_comprehensive.py
│   │   │   ├── test_alphagenome.py
│   │   │   ├── test_cbioportal_mutations.py
│   │   │   ├── test_cbioportal_search.py
│   │   │   ├── test_external_integration.py
│   │   │   ├── test_external.py
│   │   │   ├── test_extract_gene_aa_change.py
│   │   │   ├── test_filters.py
│   │   │   ├── test_getter.py
│   │   │   ├── test_links.py
│   │   │   └── test_search.py
│   │   └── workers
│   │       └── test_worker_sanitization.js
│   └── test_pydantic_ai_integration.py
├── THIRD_PARTY_ENDPOINTS.md
├── tox.ini
├── uv.lock
└── wrangler.toml
```

# Files

--------------------------------------------------------------------------------
/src/biomcp/articles/search.py:
--------------------------------------------------------------------------------

```python
  1 | import asyncio
  2 | import json
  3 | from collections.abc import Generator
  4 | from typing import Annotated, Any, get_args
  5 | 
  6 | from pydantic import BaseModel, Field, computed_field
  7 | 
  8 | from .. import http_client, render
  9 | from ..constants import PUBTATOR3_SEARCH_URL, SYSTEM_PAGE_SIZE
 10 | from ..core import PublicationState
 11 | from .autocomplete import Concept, EntityRequest, autocomplete
 12 | from .fetch import call_pubtator_api
 13 | 
 14 | concepts: list[Concept] = sorted(get_args(Concept))
 15 | fields: list[str] = [concept + "s" for concept in concepts]
 16 | 
 17 | 
 18 | class PubmedRequest(BaseModel):
 19 |     chemicals: list[str] = Field(
 20 |         default_factory=list,
 21 |         description="List of chemicals for filtering results.",
 22 |     )
 23 |     diseases: list[str] = Field(
 24 |         default_factory=list,
 25 |         description="Diseases such as Hypertension, Lung Adenocarcinoma, etc.",
 26 |     )
 27 |     genes: list[str] = Field(
 28 |         default_factory=list,
 29 |         description="List of genes for filtering results.",
 30 |     )
 31 |     keywords: list[str] = Field(
 32 |         default_factory=list,
 33 |         description="List of other keywords for filtering results.",
 34 |     )
 35 |     variants: list[str] = Field(
 36 |         default_factory=list,
 37 |         description="List of variants for filtering results.",
 38 |     )
 39 | 
 40 |     def iter_concepts(self) -> Generator[tuple[Concept, str], None, None]:
 41 |         for concept in concepts:
 42 |             field = concept + "s"
 43 |             values = getattr(self, field, []) or []
 44 |             for value in values:
 45 |                 yield concept, value
 46 | 
 47 | 
 48 | class PubtatorRequest(BaseModel):  # request model handed to the PubTator3 search call
 49 |     text: str  # query text; convert_request builds it by joining parts with " AND "
 50 |     size: int = 50  # NOTE(review): presumably the requested page size — confirm against the API
 51 | 
 52 | 
 53 | class ResultItem(BaseModel):
 54 |     pmid: int | None = None
 55 |     pmcid: str | None = None
 56 |     title: str | None = None
 57 |     journal: str | None = None
 58 |     authors: list[str] | None = None
 59 |     date: str | None = None
 60 |     doi: str | None = None
 61 |     abstract: str | None = None
 62 |     publication_state: PublicationState = PublicationState.PEER_REVIEWED
 63 |     source: str | None = Field(
 64 |         None, description="Source database (e.g., PubMed, bioRxiv, Europe PMC)"
 65 |     )
 66 | 
 67 |     @computed_field
 68 |     def pubmed_url(self) -> str | None:
 69 |         url = None
 70 |         if self.pmid:
 71 |             url = f"https://pubmed.ncbi.nlm.nih.gov/{self.pmid}/"
 72 |         return url
 73 | 
 74 |     @computed_field
 75 |     def pmc_url(self) -> str | None:
 76 |         """Generates the PMC URL if PMCID exists."""
 77 |         url = None
 78 |         if self.pmcid:
 79 |             url = f"https://www.ncbi.nlm.nih.gov/pmc/articles/{self.pmcid}/"
 80 |         return url
 81 | 
 82 |     @computed_field
 83 |     def doi_url(self) -> str | None:
 84 |         """Generates the DOI URL if DOI exists."""
 85 |         url = None
 86 |         if self.doi:
 87 |             url = f"https://doi.org/{self.doi}"
 88 |         return url
 89 | 
 90 | 
 91 | class SearchResponse(BaseModel):  # used as response_model_type for the PubTator3 search request
 92 |     results: list[ResultItem]  # article records; add_abstracts and search_articles mutate these in place
 93 |     page_size: int  # NOTE(review): paging fields presumably mirror the API payload — confirm
 94 |     current: int  # presumably the current page number — TODO confirm
 95 |     count: int  # presumably the total number of matches — TODO confirm
 96 |     total_pages: int
 97 | 
 98 | 
 99 | async def convert_request(request: PubmedRequest) -> PubtatorRequest:
100 |     query_parts = []
101 | 
102 |     # Process keywords with OR logic support
103 |     for keyword in request.keywords:
104 |         if "|" in keyword:
105 |             # Handle OR within a keyword (e.g., "R173|Arg173|p.R173")
106 |             or_terms = [term.strip() for term in keyword.split("|")]
107 |             or_query = "(" + " OR ".join(or_terms) + ")"
108 |             query_parts.append(or_query)
109 |         else:
110 |             query_parts.append(keyword)
111 | 
112 |     # Create all autocomplete tasks in parallel
113 |     autocomplete_tasks = []
114 |     concept_values = []
115 | 
116 |     for concept, value in request.iter_concepts():
117 |         task = autocomplete(
118 |             request=EntityRequest(concept=concept, query=value),
119 |         )
120 |         autocomplete_tasks.append(task)
121 |         concept_values.append((concept, value))
122 | 
123 |     # Execute all autocomplete calls in parallel
124 |     if autocomplete_tasks:
125 |         entities = await asyncio.gather(*autocomplete_tasks)
126 | 
127 |         # Process results
128 |         for (_concept, value), entity in zip(
129 |             concept_values, entities, strict=False
130 |         ):
131 |             if entity:
132 |                 query_parts.append(entity.entity_id)
133 |             else:
134 |                 query_parts.append(value)
135 | 
136 |     query_text = " AND ".join(query_parts)
137 | 
138 |     return PubtatorRequest(text=query_text, size=SYSTEM_PAGE_SIZE)
139 | 
140 | 
141 | async def add_abstracts(response: SearchResponse) -> None:
142 |     pmids = [pr.pmid for pr in response.results if pr.pmid]
143 |     abstract_response, _ = await call_pubtator_api(pmids, full=False)
144 | 
145 |     if abstract_response:
146 |         for result in response.results:
147 |             result.abstract = abstract_response.get_abstract(result.pmid)
148 | 
149 | 
150 | def clean_authors(record):
151 |     """Keep only the first and last author if > 4 authors."""
152 |     authors = record.get("authors")
153 |     if authors and len(authors) > 4:
154 |         record["authors"] = [authors[0], "...", authors[-1]]
155 |     return record
156 | 
157 | 
158 | async def search_articles(
159 |     request: PubmedRequest,
160 |     output_json: bool = False,
161 | ) -> str:
162 |     pubtator_request = await convert_request(request)
163 | 
164 |     # Start the search request
165 |     search_task = http_client.request_api(
166 |         url=PUBTATOR3_SEARCH_URL,
167 |         request=pubtator_request,
168 |         response_model_type=SearchResponse,
169 |         domain="article",
170 |     )
171 | 
172 |     # Execute search first
173 |     response, error = await search_task
174 | 
175 |     if response:
176 |         # Now fetch abstracts (still sequential but could be parallelized with other operations)
177 |         await add_abstracts(response)
178 |         # Add source field to PubMed results
179 |         for result in response.results:
180 |             result.source = "PubMed"
181 | 
182 |     # noinspection DuplicatedCode
183 |     if error:
184 |         data: list[dict[str, Any]] = [
185 |             {"error": f"Error {error.code}: {error.message}"}
186 |         ]
187 |     else:
188 |         data = list(
189 |             map(
190 |                 clean_authors,
191 |                 [
192 |                     result.model_dump(mode="json", exclude_none=True)
193 |                     for result in (response.results if response else [])
194 |                 ],
195 |             )
196 |         )
197 | 
198 |     if data and not output_json:
199 |         return render.to_markdown(data)
200 |     else:
201 |         return json.dumps(data, indent=2)
202 | 
203 | 
204 | async def _article_searcher(
205 |     call_benefit: Annotated[
206 |         str,
207 |         "Define and summarize why this function is being called and the intended benefit",
208 |     ],
209 |     chemicals: Annotated[
210 |         list[str] | str | None, "List of chemicals for filtering results"
211 |     ] = None,
212 |     diseases: Annotated[
213 |         list[str] | str | None,
214 |         "Diseases such as Hypertension, Lung Adenocarcinoma, etc.",
215 |     ] = None,
216 |     genes: Annotated[
217 |         list[str] | str | None, "List of genes for filtering results"
218 |     ] = None,
219 |     keywords: Annotated[
220 |         list[str] | str | None, "List of other keywords for filtering results"
221 |     ] = None,
222 |     variants: Annotated[
223 |         list[str] | str | None, "List of variants for filtering results"
224 |     ] = None,
225 |     include_preprints: Annotated[
226 |         bool, "Include preprint articles from bioRxiv/medRxiv and Europe PMC"
227 |     ] = True,
228 |     include_cbioportal: Annotated[
229 |         bool,
230 |         "Include cBioPortal cancer genomics summary when searching by gene",
231 |     ] = True,
232 | ) -> str:
233 |     """
234 |     Searches for articles across PubMed and preprint servers.
235 | 
236 |     Parameters:
237 |     - call_benefit: Define and summarize why this function is being called and the intended benefit
238 |     - chemicals: List of chemicals for filtering results
239 |     - diseases: Diseases such as Hypertension, Lung Adenocarcinoma, etc.
240 |     - genes: List of genes for filtering results
241 |     - keywords: List of other keywords for filtering results
242 |     - variants: List of variants for filtering results
243 |     - include_preprints: Include results from preprint servers (default: True)
244 |     - include_cbioportal: Include cBioPortal summaries for gene searches (default: True)
245 | 
246 |     Notes:
247 |     - Use full terms ("Non-small cell lung carcinoma") over abbreviations ("NSCLC")
248 |     - Use keywords to specify terms that don't fit in disease, gene ("EGFR"),
249 |       chemical ("Cisplatin"), or variant ("BRAF V600E") categories
250 |     - Parameters can be provided as lists or comma-separated strings
251 |     - Results include both peer-reviewed and preprint articles by default
252 |     - Keywords support OR logic using the pipe (|) separator:
253 |       - Example: "R173|Arg173|p.R173" finds articles with any of these notations
254 |       - Multiple keywords are still combined with AND logic
255 | 
256 |     Returns:
257 |     Markdown formatted list of matching articles, sorted by date (newest first),
258 |     with peer-reviewed articles listed before preprints.
259 |     Limited to max 20 results (10 from each source) by default to optimize token usage.
260 |     """
261 |     # Import here to avoid circular dependency
262 |     from .search_optimized import article_searcher_optimized
263 | 
264 |     # Use the optimized version with caching
265 |     return await article_searcher_optimized(
266 |         call_benefit=call_benefit,
267 |         chemicals=chemicals,
268 |         diseases=diseases,
269 |         genes=genes,
270 |         keywords=keywords,
271 |         variants=variants,
272 |         include_preprints=include_preprints,
273 |         include_cbioportal=include_cbioportal,
274 |     )
275 | 
```

--------------------------------------------------------------------------------
/docs/FDA_SECURITY.md:
--------------------------------------------------------------------------------

```markdown
  1 | # FDA Integration Security Documentation
  2 | 
  3 | ## Overview
  4 | 
  5 | This document outlines the security measures implemented in the BioMCP FDA integration to ensure safe handling of medical data and protection against common vulnerabilities.
  6 | 
  7 | ## Security Features
  8 | 
  9 | ### 1. Input Validation & Sanitization
 10 | 
 11 | All user inputs are validated and sanitized before being sent to the FDA API:
 12 | 
 13 | - **Injection Prevention**: Removes characters that could be used for SQL injection, XSS, or command injection (`<>\"';&|\\`)
 14 | - **Length Limits**: Enforces maximum lengths on all input fields
 15 | - **Type Validation**: Ensures parameters match expected types (dates, numbers, etc.)
 16 | - **Format Validation**: Validates specific formats (e.g., YYYY-MM-DD for dates)
 17 | 
 18 | **Implementation**: `src/biomcp/openfda/input_validation.py`
 19 | 
 20 | ```python
 21 | # Example usage
 22 | from biomcp.openfda.input_validation import sanitize_input, validate_drug_name
 23 | 
 24 | safe_drug = validate_drug_name("Aspirin<script>")  # Returns "Aspirin"
 25 | safe_input = sanitize_input("'; DROP TABLE;")  # SQL injection blocked
 26 | ```
 27 | 
 28 | ### 2. API Key Protection
 29 | 
 30 | API keys are protected at multiple levels:
 31 | 
 32 | - **Cache Key Exclusion**: API keys are removed before generating cache keys
 33 | - **No Logging**: API keys are never logged, even in debug mode
 34 | - **Environment Variables**: Keys stored in environment variables, not in code
 35 | - **Validation**: API key format is validated before use
 36 | 
 37 | **Implementation**: `src/biomcp/openfda/cache.py`, `src/biomcp/openfda/utils.py`
 38 | 
 39 | ### 3. Rate Limiting
 40 | 
 41 | Client-side rate limiting prevents API quota exhaustion:
 42 | 
 43 | - **Token Bucket Algorithm**: Allows bursts while maintaining average rate
 44 | - **Configurable Limits**: 40 requests/minute without key, 240 with key
 45 | - **Concurrent Request Limiting**: Maximum 10 concurrent requests via semaphore
 46 | - **Automatic Backoff**: Delays requests when approaching limits
 47 | 
 48 | **Implementation**: `src/biomcp/openfda/rate_limiter.py`
 49 | 
 50 | ### 4. Circuit Breaker Pattern
 51 | 
 52 | Prevents cascading failures when FDA API is unavailable:
 53 | 
 54 | - **Failure Threshold**: Opens after 5 consecutive failures
 55 | - **Recovery Timeout**: Waits 60 seconds before retry attempts
 56 | - **Half-Open State**: Tests recovery with limited requests
 57 | - **Automatic Recovery**: Returns to normal operation when API recovers
 58 | 
 59 | **States**:
 60 | 
 61 | - **CLOSED**: Normal operation
 62 | - **OPEN**: Blocking all requests (API is down)
 63 | - **HALF_OPEN**: Testing if API has recovered
 64 | 
 65 | ### 5. Memory Protection
 66 | 
 67 | Prevents memory exhaustion from large responses:
 68 | 
 69 | - **Response Size Limits**: Maximum 1MB per cached response
 70 | - **Cache Size Limits**: Maximum 100 entries in cache
 71 | - **FIFO Eviction**: Oldest entries removed when cache is full
 72 | - **Size Validation**: Large responses rejected before caching
 73 | 
 74 | **Configuration**:
 75 | 
 76 | ```bash
 77 | export BIOMCP_FDA_MAX_RESPONSE_SIZE=1048576  # 1MB
 78 | export BIOMCP_FDA_MAX_CACHE_SIZE=100
 79 | ```
 80 | 
 81 | ### 6. File Operation Security
 82 | 
 83 | Secure handling of cache files:
 84 | 
 85 | - **File Locking**: Uses `fcntl` for exclusive/shared locks
 86 | - **Atomic Operations**: Writes to temp files then renames
 87 | - **Race Condition Prevention**: Locks prevent concurrent modifications
 88 | - **Permission Control**: Files created without world-write permissions
 89 | 
 90 | **Implementation**: `src/biomcp/openfda/drug_shortages.py`
 91 | 
 92 | ## Security Best Practices
 93 | 
 94 | ### For Developers
 95 | 
 96 | 1. **Never Log Sensitive Data**
 97 | 
 98 |    ```python
 99 |    # BAD
100 |    logger.debug(f"API key: {api_key}")
101 | 
102 |    # GOOD
103 |    logger.debug("API key configured" if api_key else "No API key")
104 |    ```
105 | 
106 | 2. **Always Validate Input**
107 | 
108 |    ```python
109 |    from biomcp.openfda.input_validation import validate_drug_name
110 | 
111 |    # Always validate before using
112 |    safe_drug = validate_drug_name(user_input)
113 |    if safe_drug:
114 |        # Use safe_drug, not user_input
115 |        await search_adverse_events(drug=safe_drug)
116 |    ```
117 | 
118 | 3. **Use Rate Limiting**
119 | 
120 |    ```python
121 |    from biomcp.openfda.rate_limiter import rate_limited_request
122 | 
123 |    # Wrap API calls with rate limiting
124 |    result = await rate_limited_request(make_api_call, params)
125 |    ```
126 | 
127 | ### For System Administrators
128 | 
129 | 1. **API Key Management**
130 | 
131 |    - Store API keys in environment variables
132 |    - Rotate keys regularly (recommended: every 90 days)
133 |    - Use different keys for dev/staging/production
134 |    - Monitor key usage for anomalies
135 | 
136 | 2. **Monitoring**
137 | 
138 |    - Set up alerts for circuit breaker state changes
139 |    - Monitor rate limit consumption
140 |    - Track cache hit/miss ratios
141 |    - Log validation failures (potential attacks)
142 | 
143 | 3. **Resource Limits**
144 |    ```bash
145 |    # Configure limits based on your environment
146 |    export BIOMCP_FDA_CACHE_TTL=15  # Minutes
147 |    export BIOMCP_FDA_MAX_CACHE_SIZE=100
148 |    export BIOMCP_FDA_MAX_RESPONSE_SIZE=1048576  # 1MB
149 |    ```
150 | 
151 | ## Threat Model
152 | 
153 | ### Threats Addressed
154 | 
155 | | Threat              | Mitigation                  | Implementation         |
156 | | ------------------- | --------------------------- | ---------------------- |
157 | | SQL Injection       | Input sanitization          | `input_validation.py`  |
158 | | XSS Attacks         | HTML/JS character removal   | `sanitize_input()`     |
159 | | Command Injection   | Shell metacharacter removal | `sanitize_input()`     |
160 | | API Key Exposure    | Exclusion from logs/cache   | `cache.py`, `utils.py` |
161 | | DoS via Rate Limits | Client-side rate limiting   | `rate_limiter.py`      |
162 | | Cascading Failures  | Circuit breaker pattern     | `CircuitBreaker` class |
163 | | Memory Exhaustion   | Response size limits        | `MAX_RESPONSE_SIZE`    |
164 | | Race Conditions     | File locking                | `fcntl` usage          |
165 | | Cache Poisoning     | Input validation            | `build_safe_query()`   |
166 | 
167 | ### Residual Risks
168 | 
169 | 1. **API Key Compromise**: If environment is compromised, keys are accessible
170 | 
171 |    - **Mitigation**: Use secret management systems in production
172 | 
173 | 2. **Zero-Day FDA API Vulnerabilities**: Unknown vulnerabilities in FDA API
174 | 
175 |    - **Mitigation**: Monitor FDA security advisories
176 | 
177 | 3. **Distributed DoS**: Multiple clients could still overwhelm FDA API
178 |    - **Mitigation**: Implement global rate limiting at gateway level
179 | 
180 | ## Compliance Considerations
181 | 
182 | ### HIPAA (If Applicable)
183 | 
184 | FDA's public APIs don't contain PHI. If the integration is ever extended to handle patient data, apply the following:
185 | 
186 | 1. **Encryption**: Use TLS for all API communications
187 | 2. **Audit Logging**: Log all data access (but not the data itself)
188 | 3. **Access Controls**: Implement user authentication/authorization
189 | 4. **Data Retention**: Define and enforce retention policies
190 | 
191 | ### FDA Data Usage
192 | 
193 | 1. **Attribution**: Always include FDA disclaimers in responses
194 | 2. **Data Currency**: Warn users that data may not be real-time
195 | 3. **Medical Decisions**: Explicitly state data is not for clinical decisions
196 | 4. **Rate Limits**: Respect FDA's terms of service
197 | 
198 | ## Security Testing
199 | 
200 | ### Automated Tests
201 | 
202 | Run security tests with:
203 | 
204 | ```bash
205 | pytest tests/tdd/openfda/test_security.py -v
206 | ```
207 | 
208 | Tests cover:
209 | 
210 | - Input validation
211 | - Cache key security
212 | - Rate limiting
213 | - Circuit breaker
214 | - File operations
215 | 
216 | ### Manual Security Review
217 | 
218 | Checklist for security review:
219 | 
220 | - [ ] No sensitive data in logs
221 | - [ ] All inputs validated
222 | - [ ] Rate limiting functional
223 | - [ ] Circuit breaker triggers correctly
224 | - [ ] Cache size limited
225 | - [ ] File operations are atomic
226 | - [ ] API keys not in cache keys
227 | - [ ] Error messages don't leak information
228 | 
229 | ## Incident Response
230 | 
231 | ### If API Key is Compromised
232 | 
233 | 1. **Immediate**: Revoke compromised key at FDA portal
234 | 2. **Generate**: Create new API key
235 | 3. **Update**: Update environment variables
236 | 4. **Restart**: Restart services to load new key
237 | 5. **Audit**: Review logs for unauthorized usage
238 | 
239 | ### If Rate Limits Exceeded
240 | 
241 | 1. **Check**: Verify circuit breaker state
242 | 2. **Wait**: Allow circuit breaker recovery timeout
243 | 3. **Reduce**: Lower request rate if needed
244 | 4. **Monitor**: Check for abnormal usage patterns
245 | 
246 | ### If Security Vulnerability Found
247 | 
248 | 1. **Assess**: Determine severity and exploitability
249 | 2. **Patch**: Develop and test fix
250 | 3. **Deploy**: Roll out fix with monitoring
251 | 4. **Document**: Update this security documentation
252 | 5. **Notify**: Inform users if data was at risk
253 | 
254 | ## Configuration Reference
255 | 
256 | ### Environment Variables
257 | 
258 | | Variable                       | Default | Description                        |
259 | | ------------------------------ | ------- | ---------------------------------- |
260 | | `OPENFDA_API_KEY`              | None    | FDA API key for higher rate limits |
261 | | `BIOMCP_FDA_CACHE_TTL`         | 15      | Cache TTL in minutes               |
262 | | `BIOMCP_FDA_MAX_CACHE_SIZE`    | 100     | Maximum cache entries              |
263 | | `BIOMCP_FDA_MAX_RESPONSE_SIZE` | 1048576 | Maximum response size in bytes     |
264 | | `BIOMCP_SHORTAGE_CACHE_TTL`    | 24      | Drug shortage cache TTL in hours   |
265 | 
266 | ### Security Headers
267 | 
268 | When deploying as a web service, add these headers:
269 | 
270 | ```python
271 | headers = {
272 |     "X-Content-Type-Options": "nosniff",
273 |     "X-Frame-Options": "DENY",
274 |     "X-XSS-Protection": "1; mode=block",
275 |     "Strict-Transport-Security": "max-age=31536000; includeSubDomains",
276 |     "Content-Security-Policy": "default-src 'self'"
277 | }
278 | ```
279 | 
280 | ## Contact
281 | 
282 | For security issues, contact: [email protected] (create this address)
283 | 
284 | For FDA API issues, see: https://open.fda.gov/apis/
285 | 
286 | ---
287 | 
288 | _Last Updated: 2025-08-07_
289 | _Version: 1.0_
290 | 
```

--------------------------------------------------------------------------------
/tests/tdd/variants/test_cbioportal_search.py:
--------------------------------------------------------------------------------

```python
  1 | """Test cBioPortal search enhancements."""
  2 | 
  3 | import asyncio
  4 | 
  5 | import pytest
  6 | 
  7 | from biomcp.variants.cbioportal_search import (
  8 |     CBioPortalSearchClient,
  9 |     CBioPortalSearchSummary,
 10 |     format_cbioportal_search_summary,
 11 | )
 12 | from biomcp.variants.search import VariantQuery, search_variants
 13 | 
 14 | from .constants import API_RETRY_DELAY_SECONDS, DEFAULT_MAX_STUDIES
 15 | 
 16 | 
 17 | class TestCBioPortalSearch:
 18 |     """Test cBioPortal search functionality."""
 19 | 
 20 |     @pytest.mark.asyncio
 21 |     @pytest.mark.integration
 22 |     async def test_gene_search_summary(self):
 23 |         """Test getting gene search summary from cBioPortal."""
 24 |         client = CBioPortalSearchClient()
 25 | 
 26 |         # Test with BRAF
 27 |         summary = await client.get_gene_search_summary("BRAF", max_studies=5)
 28 | 
 29 |         assert summary is not None
 30 |         assert summary.gene == "BRAF"
 31 | 
 32 |         # Handle case where cBioPortal API returns empty data
 33 |         if summary.total_mutations == 0:
 34 |             # API might be down or returning empty results
 35 |             # This is acceptable for integration tests
 36 |             assert summary.total_mutations == 0
 37 |             assert summary.total_samples_tested == 0
 38 |             assert summary.mutation_frequency == 0.0
 39 |             assert len(summary.hotspots) == 0
 40 |         else:
 41 |             # Normal case - data is available
 42 |             assert summary.total_mutations > 0
 43 |             assert summary.total_samples_tested > 0
 44 |             assert summary.mutation_frequency > 0
 45 |             assert len(summary.hotspots) > 0
 46 | 
 47 |             # Check that V600E is a top hotspot
 48 |             v600e_found = any(
 49 |                 "V600E" in hs.amino_acid_change for hs in summary.hotspots
 50 |             )
 51 |             assert v600e_found, "BRAF V600E should be a top hotspot"
 52 | 
 53 |         # Check cancer distribution
 54 |         if summary.total_mutations > 0:
 55 |             assert len(summary.cancer_distribution) > 0
 56 |             assert any(
 57 |                 "melanoma" in cancer.lower()
 58 |                 for cancer in summary.cancer_distribution
 59 |             ), "BRAF should be found in melanoma"
 60 |         else:
 61 |             # When no mutations found, cancer distribution should be empty
 62 |             assert len(summary.cancer_distribution) == 0
 63 | 
 64 |     @pytest.mark.asyncio
 65 |     @pytest.mark.integration
 66 |     async def test_format_search_summary(self):
 67 |         """Test formatting of search summary."""
 68 |         # Create a mock summary
 69 |         summary = CBioPortalSearchSummary(
 70 |             gene="BRAF",
 71 |             total_mutations=1000,
 72 |             total_samples_tested=10000,
 73 |             mutation_frequency=0.1,
 74 |             hotspots=[
 75 |                 {
 76 |                     "position": 600,
 77 |                     "amino_acid_change": "V600E",
 78 |                     "count": 800,
 79 |                     "frequency": 0.8,
 80 |                     "cancer_types": ["Melanoma", "Colorectal Cancer"],
 81 |                 }
 82 |             ],
 83 |             cancer_distribution={"Melanoma": 600, "Colorectal Cancer": 200},
 84 |             study_coverage={
 85 |                 "total_studies": 50,
 86 |                 "queried_studies": 10,
 87 |                 "studies_with_data": 8,
 88 |             },
 89 |         )
 90 | 
 91 |         formatted = format_cbioportal_search_summary(summary)
 92 | 
 93 |         assert "BRAF" in formatted
 94 |         assert "10.0%" in formatted  # Mutation frequency
 95 |         assert "V600E" in formatted
 96 |         assert "Melanoma" in formatted
 97 |         assert "600 mutations" in formatted
 98 | 
 99 |     @pytest.mark.asyncio
100 |     @pytest.mark.integration
101 |     async def test_search_with_cbioportal_summary(self):
102 |         """Test variant search with cBioPortal summary included."""
103 |         query = VariantQuery(gene="BRAF", size=5)
104 | 
105 |         result = await search_variants(query, include_cbioportal=True)
106 | 
107 |         # Should include cBioPortal summary section
108 |         assert "cBioPortal Summary for BRAF" in result
109 |         assert "Mutation Frequency" in result
110 |         # Top Hotspots only appears when mutations are found
111 |         # Check for either Top Hotspots or 0 mutations message
112 |         assert "Top Hotspots" in result or "0 mutations" in result
113 | 
114 |         # Should still include variant results
115 |         assert "# Record" in result or "No variants found" in result
116 | 
117 |     @pytest.mark.asyncio
118 |     @pytest.mark.integration
119 |     async def test_search_without_gene(self):
120 |         """Test that cBioPortal summary is not included without gene parameter."""
121 |         query = VariantQuery(rsid="rs113488022", size=5)
122 | 
123 |         result = await search_variants(query, include_cbioportal=True)
124 | 
125 |         # Should not include cBioPortal summary
126 |         assert "cBioPortal Summary" not in result
127 | 
128 |     @pytest.mark.asyncio
129 |     @pytest.mark.integration
130 |     async def test_tp53_search_summary(self):
131 |         """Test TP53 gene search summary."""
132 |         client = CBioPortalSearchClient()
133 | 
134 |         # Clear any caches to ensure fresh data
135 |         from biomcp.utils.request_cache import clear_cache
136 | 
137 |         await clear_cache()
138 | 
139 |         summary = await client.get_gene_search_summary("TP53", max_studies=5)
140 | 
141 |         assert summary is not None
142 |         assert summary.gene == "TP53"
143 | 
144 |         # If we got no mutations, it might be a temporary API issue
145 |         if summary.total_mutations == 0 and summary.total_samples_tested == 0:
146 |             # Try one more time with a small delay
147 |             await asyncio.sleep(API_RETRY_DELAY_SECONDS)
148 |             summary = await client.get_gene_search_summary(
149 |                 "TP53", max_studies=5
150 |             )
151 | 
152 |             # If still no data, skip the test rather than fail
153 |             if summary.total_mutations == 0:
154 |                 pytest.skip(
155 |                     "cBioPortal returned no mutation data for TP53 - possible API issue"
156 |                 )
157 | 
158 |         # Basic checks that should pass when data is available
159 |         assert (
160 |             summary.total_mutations > 0
161 |         ), f"TP53 should have mutations. Got: {summary}"
162 | 
163 |         # More flexible checks
164 |         if summary.hotspots:
165 |             # Just verify structure if we have hotspots
166 |             hotspot_changes = [hs.amino_acid_change for hs in summary.hotspots]
167 |             print(f"TP53 hotspots found: {hotspot_changes[:5]}")
168 |             assert (
169 |                 len(hotspot_changes) >= 1
170 |             ), "Should find at least one TP53 hotspot"
171 | 
172 |     @pytest.mark.asyncio
173 |     @pytest.mark.integration
174 |     async def test_kras_search_summary(self):
175 |         """Test KRAS gene search summary.
176 | 
177 |         This test verifies basic functionality rather than specific hotspots,
178 |         which can change as cBioPortal data is updated.
179 |         """
180 |         client = CBioPortalSearchClient()
181 | 
182 |         # Clear any caches to ensure fresh data
183 |         from biomcp.utils.request_cache import clear_cache
184 | 
185 |         await clear_cache()
186 | 
187 |         summary = await client.get_gene_search_summary(
188 |             "KRAS", max_studies=DEFAULT_MAX_STUDIES
189 |         )
190 | 
191 |         assert summary is not None, "Failed to get summary for KRAS"
192 |         assert summary.gene == "KRAS"
193 | 
194 |         # If we got no mutations, it might be a temporary API issue
195 |         if summary.total_mutations == 0 and summary.total_samples_tested == 0:
196 |             # Try one more time with a small delay
197 |             await asyncio.sleep(API_RETRY_DELAY_SECONDS)
198 |             summary = await client.get_gene_search_summary(
199 |                 "KRAS", max_studies=DEFAULT_MAX_STUDIES
200 |             )
201 | 
202 |             # If still no data, skip the test rather than fail
203 |             if summary.total_mutations == 0:
204 |                 pytest.skip(
205 |                     "cBioPortal returned no mutation data for KRAS - possible API issue"
206 |                 )
207 | 
208 |         # Basic checks that should pass when data is available
209 |         assert (
210 |             summary.total_mutations > 0
211 |         ), f"KRAS should have mutations. Got: {summary}"
212 | 
213 |         # More flexible checks
214 |         if summary.hotspots:
215 |             # Just verify structure if we have hotspots
216 |             for hotspot in summary.hotspots[:3]:
217 |                 assert hasattr(hotspot, "amino_acid_change")
218 |                 assert hasattr(hotspot, "count")
219 |             print(
220 |                 f"Top KRAS hotspots: {[hs.amino_acid_change for hs in summary.hotspots[:5]]}"
221 |             )
222 | 
223 |         # Cancer distribution check - only if we have data
224 |         if summary.total_mutations > 0:
225 |             assert (
226 |                 len(summary.cancer_distribution) > 0
227 |             ), "Should have cancer type distribution"
228 | 
229 |     @pytest.mark.asyncio
230 |     @pytest.mark.integration
231 |     async def test_invalid_gene(self):
232 |         """Test handling of invalid gene name."""
233 |         client = CBioPortalSearchClient()
234 | 
235 |         summary = await client.get_gene_search_summary("INVALID_GENE")
236 | 
237 |         assert summary is None
238 | 
239 |     @pytest.mark.asyncio
240 |     @pytest.mark.integration
241 |     async def test_json_output_with_cbioportal(self):
242 |         """Test JSON output includes cBioPortal summary."""
243 |         query = VariantQuery(gene="BRAF", size=2)
244 | 
245 |         result = await search_variants(
246 |             query, output_json=True, include_cbioportal=True
247 |         )
248 | 
249 |         # Parse JSON
250 |         import json
251 | 
252 |         data = json.loads(result)
253 | 
254 |         # Should have both summary and variants
255 |         assert "cbioportal_summary" in data
256 |         assert "variants" in data
257 |         assert "BRAF" in data["cbioportal_summary"]
258 | 
```

--------------------------------------------------------------------------------
/tests/tdd/articles/test_unified.py:
--------------------------------------------------------------------------------

```python
  1 | """Tests for unified article search functionality."""
  2 | 
  3 | import json
  4 | from unittest.mock import AsyncMock, patch
  5 | 
  6 | import pytest
  7 | 
  8 | from biomcp.articles.search import PubmedRequest
  9 | from biomcp.articles.unified import (
 10 |     _deduplicate_articles,
 11 |     _parse_search_results,
 12 |     search_articles_unified,
 13 | )
 14 | 
 15 | 
 16 | class TestUnifiedSearch:
 17 |     """Test unified search functionality."""
 18 | 
 19 |     @pytest.fixture
 20 |     def pubmed_results(self):
 21 |         """Sample PubMed results in JSON format."""
 22 |         return json.dumps([
 23 |             {
 24 |                 "pmid": 12345,
 25 |                 "title": "BRAF mutations in cancer",
 26 |                 "doi": "10.1234/test1",
 27 |                 "date": "2024-01-15",
 28 |                 "publication_state": "peer_reviewed",
 29 |             },
 30 |             {
 31 |                 "pmid": 12346,
 32 |                 "title": "Another cancer study",
 33 |                 "doi": "10.1234/test2",
 34 |                 "date": "2024-01-10",
 35 |                 "publication_state": "peer_reviewed",
 36 |             },
 37 |         ])
 38 | 
 39 |     @pytest.fixture
 40 |     def preprint_results(self):
 41 |         """Sample preprint results in JSON format."""
 42 |         return json.dumps([
 43 |             {
 44 |                 "title": "BRAF preprint study",
 45 |                 "doi": "10.1101/2024.01.20.123456",
 46 |                 "date": "2024-01-20",
 47 |                 "publication_state": "preprint",
 48 |                 "source": "bioRxiv",
 49 |             },
 50 |             {
 51 |                 "title": "Duplicate study",
 52 |                 "doi": "10.1234/test1",  # Same DOI as PubMed result
 53 |                 "date": "2024-01-14",
 54 |                 "publication_state": "preprint",
 55 |                 "source": "Europe PMC",
 56 |             },
 57 |         ])
 58 | 
 59 |     @pytest.mark.asyncio
 60 |     async def test_search_articles_unified_both_sources(
 61 |         self, pubmed_results, preprint_results
 62 |     ):
 63 |         """Test searching with both PubMed and preprints enabled."""
 64 |         request = PubmedRequest(genes=["BRAF"])
 65 | 
 66 |         mock_pubmed = AsyncMock(return_value=pubmed_results)
 67 |         mock_preprints = AsyncMock(return_value=preprint_results)
 68 | 
 69 |         with (
 70 |             patch("biomcp.articles.unified.search_articles", mock_pubmed),
 71 |             patch("biomcp.articles.unified.search_preprints", mock_preprints),
 72 |             patch(
 73 |                 "biomcp.variants.cbioportal_search.CBioPortalSearchClient"
 74 |             ) as mock_cbio,
 75 |         ):
 76 |             # Mock cBioPortal client to return None (no summary)
 77 |             mock_cbio.return_value.get_gene_search_summary = AsyncMock(
 78 |                 return_value=None
 79 |             )
 80 | 
 81 |             result = await search_articles_unified(
 82 |                 request,
 83 |                 include_pubmed=True,
 84 |                 include_preprints=True,
 85 |                 output_json=True,
 86 |             )
 87 | 
 88 |             # Parse result
 89 |             data = json.loads(result)
 90 | 
 91 |             # When gene is specified but cBioPortal returns no data,
 92 |             # we should just get the articles list
 93 |             if isinstance(data, dict):
 94 |                 articles = data.get("articles", data)
 95 |             else:
 96 |                 articles = data
 97 | 
 98 |             # Should have 3 articles (one duplicate removed)
 99 |             assert len(articles) == 3
100 | 
101 |             # Check ordering - peer reviewed should come first
102 |             # Sort is by (publication_state priority, date DESC)
103 |             # The test data has preprint with newer date, so it might come first
104 |             # Let's just check we have the right mix
105 |             states = [a["publication_state"] for a in articles]
106 |             assert states.count("peer_reviewed") == 2
107 |             assert states.count("preprint") == 1
108 | 
109 |             # Check deduplication worked
110 |             dois = [a.get("doi") for a in articles if a.get("doi")]
111 |             assert len(dois) == len(set(dois))  # No duplicate DOIs
112 | 
113 |     @pytest.mark.asyncio
114 |     async def test_search_articles_unified_pubmed_only(self, pubmed_results):
115 |         """Test searching with only PubMed enabled."""
116 |         request = PubmedRequest(
117 |             keywords=["cancer"]
118 |         )  # No gene, so no cBioPortal
119 | 
120 |         with (
121 |             patch("biomcp.articles.unified.search_articles") as mock_pubmed,
122 |             patch(
123 |                 "biomcp.articles.unified.search_preprints"
124 |             ) as mock_preprints,
125 |         ):
126 |             mock_pubmed.return_value = pubmed_results
127 | 
128 |             result = await search_articles_unified(
129 |                 request,
130 |                 include_pubmed=True,
131 |                 include_preprints=False,
132 |                 output_json=True,
133 |             )
134 | 
135 |             # Preprints should not be called
136 |             mock_preprints.assert_not_called()
137 | 
138 |             # Parse result
139 |             articles = json.loads(result)
140 |             assert len(articles) == 2
141 |             assert all(
142 |                 a["publication_state"] == "peer_reviewed" for a in articles
143 |             )
144 | 
145 |     @pytest.mark.asyncio
146 |     async def test_search_articles_unified_preprints_only(
147 |         self, preprint_results
148 |     ):
149 |         """Test searching with only preprints enabled."""
150 |         request = PubmedRequest(
151 |             keywords=["cancer"]
152 |         )  # No gene, so no cBioPortal
153 | 
154 |         with (
155 |             patch("biomcp.articles.unified.search_articles") as mock_pubmed,
156 |             patch(
157 |                 "biomcp.articles.unified.search_preprints"
158 |             ) as mock_preprints,
159 |         ):
160 |             mock_preprints.return_value = preprint_results
161 | 
162 |             result = await search_articles_unified(
163 |                 request,
164 |                 include_pubmed=False,
165 |                 include_preprints=True,
166 |                 output_json=True,
167 |             )
168 | 
169 |             # PubMed should not be called
170 |             mock_pubmed.assert_not_called()
171 | 
172 |             # Parse result
173 |             articles = json.loads(result)
174 |             assert len(articles) == 2
175 |             assert all(a["publication_state"] == "preprint" for a in articles)
176 | 
177 |     @pytest.mark.asyncio
178 |     async def test_search_articles_unified_error_handling(self):
179 |         """Test error handling when one source fails."""
180 |         request = PubmedRequest(
181 |             keywords=["cancer"]
182 |         )  # No gene, so no cBioPortal
183 | 
184 |         with (
185 |             patch("biomcp.articles.unified.search_articles") as mock_pubmed,
186 |             patch(
187 |                 "biomcp.articles.unified.search_preprints"
188 |             ) as mock_preprints,
189 |         ):
190 |             # PubMed succeeds
191 |             mock_pubmed.return_value = json.dumps([{"title": "Success"}])
192 |             # Preprints fails
193 |             mock_preprints.side_effect = Exception("API Error")
194 | 
195 |             result = await search_articles_unified(
196 |                 request,
197 |                 include_pubmed=True,
198 |                 include_preprints=True,
199 |                 output_json=True,
200 |             )
201 | 
202 |             # Should still get PubMed results
203 |             articles = json.loads(result)
204 |             assert len(articles) == 1
205 |             assert articles[0]["title"] == "Success"
206 | 
207 |     @pytest.mark.asyncio
208 |     async def test_search_articles_unified_markdown_output(
209 |         self, pubmed_results
210 |     ):
211 |         """Test markdown output format."""
212 |         request = PubmedRequest(genes=["BRAF"])
213 | 
214 |         mock_pubmed = AsyncMock(return_value=pubmed_results)
215 | 
216 |         with patch("biomcp.articles.unified.search_articles", mock_pubmed):
217 |             result = await search_articles_unified(
218 |                 request,
219 |                 include_pubmed=True,
220 |                 include_preprints=False,
221 |                 output_json=False,
222 |             )
223 | 
224 |             # Should return markdown
225 |             assert isinstance(result, str)
226 |             assert "BRAF mutations in cancer" in result
227 |             assert "# Record" in result  # Markdown headers
228 | 
229 |     def test_deduplicate_articles(self):
230 |         """Test article deduplication logic."""
231 |         articles = [
232 |             {"title": "Article 1", "doi": "10.1234/test1"},
233 |             {"title": "Article 2", "doi": "10.1234/test2"},
234 |             {"title": "Duplicate of 1", "doi": "10.1234/test1"},
235 |             {"title": "No DOI article"},
236 |             {"title": "Another no DOI"},
237 |         ]
238 | 
239 |         deduped = _deduplicate_articles(articles)
240 | 
241 |         # Should have 4 articles (one duplicate removed)
242 |         assert len(deduped) == 4
243 | 
244 |         # Check DOIs are unique
245 |         dois = [a.get("doi") for a in deduped if a.get("doi")]
246 |         assert len(dois) == len(set(dois))
247 | 
248 |         # Articles without DOI should be preserved
249 |         no_doi_count = sum(1 for a in deduped if not a.get("doi"))
250 |         assert no_doi_count == 2
251 | 
252 |     def test_parse_search_results(self):
253 |         """Test parsing of search results from multiple sources."""
254 |         results = [
255 |             json.dumps([{"title": "Article 1"}, {"title": "Article 2"}]),
256 |             json.dumps([{"title": "Article 3"}]),
257 |             Exception("Failed source"),  # Should be skipped
258 |             "[invalid json",  # Should be skipped
259 |         ]
260 | 
261 |         parsed = _parse_search_results(results)
262 | 
263 |         # Should have 3 articles (2 + 1, skipping errors)
264 |         assert len(parsed) == 3
265 |         assert parsed[0]["title"] == "Article 1"
266 |         assert parsed[1]["title"] == "Article 2"
267 |         assert parsed[2]["title"] == "Article 3"
268 | 
269 |     def test_parse_search_results_empty(self):
270 |         """Test parsing with all empty/failed results."""
271 |         results = [
272 |             Exception("Failed"),
273 |             "[invalid",
274 |             json.dumps([]),  # Empty list
275 |         ]
276 | 
277 |         parsed = _parse_search_results(results)
278 |         assert parsed == []
279 | 
```

--------------------------------------------------------------------------------
/src/biomcp/openfda/device_events.py:
--------------------------------------------------------------------------------

```python
  1 | """
  2 | OpenFDA Device Adverse Events (MAUDE) integration.
  3 | 
  4 | Focus on genomic/diagnostic devices relevant to precision oncology.
  5 | """
  6 | 
  7 | import logging
  8 | 
  9 | from .constants import (
 10 |     GENOMIC_DEVICE_PRODUCT_CODES,
 11 |     OPENFDA_DEFAULT_LIMIT,
 12 |     OPENFDA_DEVICE_EVENTS_URL,
 13 |     OPENFDA_DISCLAIMER,
 14 |     OPENFDA_MAX_LIMIT,
 15 | )
 16 | from .device_events_helpers import (
 17 |     analyze_device_problems,
 18 |     format_detailed_device_info,
 19 |     format_device_detail_header,
 20 |     format_device_distribution,
 21 |     format_device_report_summary,
 22 |     format_patient_details,
 23 |     format_top_problems,
 24 | )
 25 | from .utils import clean_text, format_count, make_openfda_request
 26 | 
 27 | logger = logging.getLogger(__name__)
 28 | 
 29 | 
 30 | def _build_device_search_query(
 31 |     device: str | None,
 32 |     manufacturer: str | None,
 33 |     problem: str | None,
 34 |     product_code: str | None,
 35 |     genomics_only: bool,
 36 | ) -> str:
 37 |     """Build the search query for device events."""
 38 |     search_parts = []
 39 | 
 40 |     if device:
 41 |         # Build flexible search queries
 42 |         device_queries = []
 43 | 
 44 |         # First try exact match
 45 |         device_queries.extend([
 46 |             f'device.brand_name:"{device}"',
 47 |             f'device.generic_name:"{device}"',
 48 |             f'device.openfda.device_name:"{device}"',
 49 |         ])
 50 | 
 51 |         # For multi-word terms, also search for key words with wildcards
 52 |         # This helps match "FoundationOne CDx" to "F1CDX" or similar variations
 53 |         words = device.split()
 54 | 
 55 |         # If it's a multi-word query, add wildcard searches for significant words
 56 |         for word in words:
 57 |             # Skip common words and very short ones
 58 |             if len(word) > 3 and word.lower() not in [
 59 |                 "test",
 60 |                 "system",
 61 |                 "device",
 62 |             ]:
 63 |                 # Use prefix wildcard for better performance
 64 |                 device_queries.append(f"device.brand_name:{word}*")
 65 |                 device_queries.append(f"device.generic_name:{word}*")
 66 | 
 67 |         # Also try searching by removing spaces (e.g., "Foundation One" -> "FoundationOne")
 68 |         if len(words) > 1:
 69 |             combined = "".join(words)
 70 |             device_queries.append(f'device.brand_name:"{combined}"')
 71 |             device_queries.append(f'device.generic_name:"{combined}"')
 72 | 
 73 |         search_parts.append(f"({' OR '.join(device_queries)})")
 74 | 
 75 |     if manufacturer:
 76 |         # Search manufacturer field with both exact and wildcard matching
 77 |         mfr_queries = [
 78 |             f'device.manufacturer_d_name:"{manufacturer}"',
 79 |             f"device.manufacturer_d_name:*{manufacturer}*",
 80 |         ]
 81 |         search_parts.append(f"({' OR '.join(mfr_queries)})")
 82 | 
 83 |     if problem:
 84 |         search_parts.append(f'device.device_problem_text:"{problem}"')
 85 | 
 86 |     if product_code:
 87 |         search_parts.append(f'device.openfda.product_code:"{product_code}"')
 88 |     elif (
 89 |         genomics_only and not device
 90 |     ):  # Only apply genomics filter if no specific device is named
 91 |         # Filter to genomic device product codes
 92 |         code_parts = [
 93 |             f'device.openfda.product_code:"{code}"'
 94 |             for code in GENOMIC_DEVICE_PRODUCT_CODES
 95 |         ]
 96 |         if code_parts:
 97 |             search_parts.append(f"({' OR '.join(code_parts)})")
 98 | 
 99 |     return " AND ".join(search_parts)
100 | 
101 | 
102 | def _format_search_summary(
103 |     device: str | None,
104 |     manufacturer: str | None,
105 |     problem: str | None,
106 |     genomics_only: bool,
107 |     total: int,
108 | ) -> list[str]:
109 |     """Format the search summary section."""
110 |     output = []
111 | 
112 |     search_desc = []
113 |     if device:
114 |         search_desc.append(f"**Device**: {device}")
115 |     if manufacturer:
116 |         search_desc.append(f"**Manufacturer**: {manufacturer}")
117 |     if problem:
118 |         search_desc.append(f"**Problem**: {problem}")
119 |     if genomics_only:
120 |         search_desc.append("**Type**: Genomic/Diagnostic Devices")
121 | 
122 |     if search_desc:
123 |         output.append(" | ".join(search_desc))
124 |     output.append(
125 |         f"**Total Reports Found**: {format_count(total, 'report')}\n"
126 |     )
127 | 
128 |     return output
129 | 
130 | 
131 | async def search_device_events(
132 |     device: str | None = None,
133 |     manufacturer: str | None = None,
134 |     problem: str | None = None,
135 |     product_code: str | None = None,
136 |     genomics_only: bool = True,
137 |     limit: int = OPENFDA_DEFAULT_LIMIT,
138 |     skip: int = 0,
139 |     api_key: str | None = None,
140 | ) -> str:
141 |     """
142 |     Search FDA device adverse event reports (MAUDE).
143 | 
144 |     Args:
145 |         device: Device name to search for
146 |         manufacturer: Manufacturer name
147 |         problem: Device problem description
148 |         product_code: FDA product code
149 |         genomics_only: Filter to genomic/diagnostic devices only
150 |         limit: Maximum number of results
151 |         skip: Number of results to skip
152 |         api_key: Optional OpenFDA API key (overrides OPENFDA_API_KEY env var)
153 | 
154 |     Returns:
155 |         Formatted string with device event information
156 |     """
157 |     if not device and not manufacturer and not product_code and not problem:
158 |         return (
159 |             "⚠️ Please specify a device name, manufacturer, or problem to search.\n\n"
160 |             "Examples:\n"
161 |             "- Search by device: --device 'FoundationOne'\n"
162 |             "- Search by manufacturer: --manufacturer 'Illumina'\n"
163 |             "- Search by problem: --problem 'false positive'"
164 |         )
165 | 
166 |     # Build and execute search
167 |     search_query = _build_device_search_query(
168 |         device, manufacturer, problem, product_code, genomics_only
169 |     )
170 |     params = {
171 |         "search": search_query,
172 |         "limit": min(limit, OPENFDA_MAX_LIMIT),
173 |         "skip": skip,
174 |     }
175 | 
176 |     response, error = await make_openfda_request(
177 |         OPENFDA_DEVICE_EVENTS_URL, params, "openfda_device_events", api_key
178 |     )
179 | 
180 |     if error:
181 |         return f"⚠️ Error searching device events: {error}"
182 | 
183 |     if not response or not response.get("results"):
184 |         return _format_no_results(device, manufacturer, problem, genomics_only)
185 | 
186 |     results = response["results"]
187 |     total = (
188 |         response.get("meta", {}).get("results", {}).get("total", len(results))
189 |     )
190 | 
191 |     # Build output
192 |     output = ["## FDA Device Adverse Event Reports\n"]
193 |     output.extend(
194 |         _format_search_summary(
195 |             device, manufacturer, problem, genomics_only, total
196 |         )
197 |     )
198 | 
199 |     # Analyze and format problems
200 |     all_problems, all_device_names, _ = analyze_device_problems(results)
201 |     output.extend(format_top_problems(all_problems, results))
202 | 
203 |     # Show device distribution if searching by problem
204 |     if problem:
205 |         output.extend(format_device_distribution(all_device_names, results))
206 | 
207 |     # Display sample reports
208 |     output.append(
209 |         f"### Sample Reports (showing {min(len(results), 3)} of {total}):\n"
210 |     )
211 |     for i, result in enumerate(results[:3], 1):
212 |         output.extend(format_device_report_summary(result, i))
213 | 
214 |     # Add tips
215 |     if genomics_only:
216 |         output.append(
217 |             "\n💡 **Note**: Results filtered to genomic/diagnostic devices. "
218 |             "Use --no-genomics-only to search all medical devices."
219 |         )
220 | 
221 |     output.append(f"\n{OPENFDA_DISCLAIMER}")
222 |     return "\n".join(output)
223 | 
224 | 
225 | def _format_no_results(
226 |     device: str | None,
227 |     manufacturer: str | None,
228 |     problem: str | None,
229 |     genomics_only: bool,
230 | ) -> str:
231 |     """Format no results message."""
232 |     search_desc = []
233 |     if device:
234 |         search_desc.append(f"device '{device}'")
235 |     if manufacturer:
236 |         search_desc.append(f"manufacturer '{manufacturer}'")
237 |     if problem:
238 |         search_desc.append(f"problem '{problem}'")
239 | 
240 |     desc = " and ".join(search_desc)
241 |     if genomics_only:
242 |         desc += " (filtered to genomic/diagnostic devices)"
243 | 
244 |     return f"No device adverse event reports found for {desc}."
245 | 
246 | 
247 | async def get_device_event(
248 |     mdr_report_key: str, api_key: str | None = None
249 | ) -> str:
250 |     """
251 |     Get detailed information for a specific device event report.
252 | 
253 |     Args:
254 |         mdr_report_key: MDR report key
255 |         api_key: Optional OpenFDA API key (overrides OPENFDA_API_KEY env var)
256 | 
257 |     Returns:
258 |         Formatted string with detailed report information
259 |     """
260 |     params = {
261 |         "search": f'mdr_report_key:"{mdr_report_key}"',
262 |         "limit": 1,
263 |     }
264 | 
265 |     response, error = await make_openfda_request(
266 |         OPENFDA_DEVICE_EVENTS_URL,
267 |         params,
268 |         "openfda_device_event_detail",
269 |         api_key,
270 |     )
271 | 
272 |     if error:
273 |         return f"⚠️ Error retrieving device event report: {error}"
274 | 
275 |     if not response or not response.get("results"):
276 |         return f"Device event report '{mdr_report_key}' not found."
277 | 
278 |     result = response["results"][0]
279 | 
280 |     # Build detailed output
281 |     output = format_device_detail_header(result, mdr_report_key)
282 | 
283 |     # Device details
284 |     if devices := result.get("device", []):
285 |         output.extend(format_detailed_device_info(devices))
286 | 
287 |     # Event narrative
288 |     if event_desc := result.get("event_description"):
289 |         output.append("### Event Description")
290 |         output.append(clean_text(event_desc))
291 |         output.append("")
292 | 
293 |     # Manufacturer narrative
294 |     if mfr_narrative := result.get("manufacturer_narrative"):
295 |         output.append("### Manufacturer's Analysis")
296 |         output.append(clean_text(mfr_narrative))
297 |         output.append("")
298 | 
299 |     # Patient information
300 |     if patient := result.get("patient", []):
301 |         output.extend(format_patient_details(patient))
302 | 
303 |     # Remedial action
304 |     if remedial := result.get("remedial_action"):
305 |         output.append("### Remedial Action")
306 |         if isinstance(remedial, list):
307 |             output.append(", ".join(remedial))
308 |         else:
309 |             output.append(remedial)
310 |         output.append("")
311 | 
312 |     output.append(f"\n{OPENFDA_DISCLAIMER}")
313 |     return "\n".join(output)
314 | 
```

--------------------------------------------------------------------------------
/docs/troubleshooting.md:
--------------------------------------------------------------------------------

```markdown
  1 | # Troubleshooting Guide
  2 | 
  3 | This guide helps you resolve common issues with BioMCP installation, configuration, and usage.
  4 | 
  5 | ## Installation Issues
  6 | 
  7 | ### Prerequisites Not Met
  8 | 
  9 | **macOS:**
 10 | 
 11 | ```bash
 12 | # Install uv (recommended)
 13 | brew install uv
 14 | 
 15 | # Or using the official installer
 16 | curl -LsSf https://astral.sh/uv/install.sh | sh
 17 | 
 18 | # Install Node.js for npx (if needed)
 19 | brew install node
 20 | ```
 21 | 
 22 | **Linux:**
 23 | 
 24 | ```bash
 25 | # Install uv
 26 | curl -LsSf https://astral.sh/uv/install.sh | sh
 27 | 
 28 | # Install Node.js
 29 | curl -fsSL https://deb.nodesource.com/setup_lts.x | sudo -E bash -
 30 | sudo apt-get install -y nodejs
 31 | ```
 32 | 
 33 | **Windows:**
 34 | 
 35 | ```powershell
 36 | # Install uv
 37 | powershell -c "irm https://astral.sh/uv/install.ps1 | iex"
 38 | 
 39 | # Install Node.js from https://nodejs.org
 40 | ```
 41 | 
 42 | ### "Command not found" Error
 43 | 
 44 | After installing BioMCP, if you get "command not found":
 45 | 
 46 | 1. **Restart your terminal** - PATH updates require a new session
 47 | 
 48 | 2. **Check installation location:**
 49 | 
 50 |    ```bash
 51 |    # For uv tool install
 52 |    ls ~/.local/bin/biomcp
 53 | 
 54 |    # For pip install
 55 |    which biomcp
 56 |    ```
 57 | 
 58 | 3. **Add to PATH manually:**
 59 | 
 60 |    ```bash
 61 |    # Add to ~/.bashrc or ~/.zshrc
 62 |    export PATH="$HOME/.local/bin:$PATH"
 63 |    ```
 64 | 
 65 | 4. **Reinstall with force:**
 66 | 
 67 |    ```bash
 68 |    uv tool install biomcp-python --force
 69 |    ```
 70 | 
 71 | 5. **Use full path:**
 72 |    ```bash
 73 |    ~/.local/bin/biomcp --version
 74 |    ```
 75 | 
 76 | ### Python Version Issues
 77 | 
 78 | BioMCP requires Python 3.10 or higher:
 79 | 
 80 | ```bash
 81 | # Check Python version
 82 | python --version
 83 | 
 84 | # If too old, install newer version
 85 | # macOS
 86 | brew install python@3.11
 87 | 
 88 | # Linux
 89 | sudo apt update
 90 | sudo apt install python3.11
 91 | 
 92 | # Use pyenv for version management
 93 | pyenv install 3.11.8
 94 | pyenv local 3.11.8
 95 | ```
 96 | 
 97 | ## Configuration Issues
 98 | 
 99 | ### API Key Not Working
100 | 
101 | **Environment Variable Not Set:**
102 | 
103 | ```bash
104 | # Check if set
105 | echo $NCI_API_KEY
106 | 
107 | # Set temporarily
108 | export NCI_API_KEY="your-key-here"
109 | 
110 | # Set permanently in ~/.bashrc or ~/.zshrc
111 | echo 'export NCI_API_KEY="your-key-here"' >> ~/.bashrc
112 | source ~/.bashrc
113 | ```
114 | 
115 | **Wrong API Key Format:**
116 | 
117 | - NCI keys: Should be 36 characters (UUID format)
118 | - AlphaGenome: Alphanumeric string
119 | - cBioPortal: JWT token format
120 | 
121 | **API Key Permissions:**
122 | 
123 | ```bash
124 | # Test NCI API key
125 | biomcp health check --verbose
126 | 
127 | # Test specific API
128 | curl -H "X-API-KEY: $NCI_API_KEY" \
129 |   "https://clinicaltrialsapi.cancer.gov/api/v2/trials?size=1"
130 | ```
131 | 
132 | ### SSL Certificate Errors
133 | 
134 | **Update certificates:**
135 | 
136 | ```bash
137 | # Python certificates
138 | pip install --upgrade certifi
139 | 
140 | # System certificates (macOS)
141 | brew install ca-certificates
142 | 
143 | # System certificates (Linux)
144 | sudo apt-get update
145 | sudo apt-get install ca-certificates
146 | ```
147 | 
148 | **Corporate proxy issues:**
149 | 
150 | ```bash
151 | # Set proxy environment variables
152 | export HTTP_PROXY="http://proxy.company.com:8080"
153 | export HTTPS_PROXY="http://proxy.company.com:8080"
154 | export NO_PROXY="localhost,127.0.0.1"
155 | 
156 | # Configure pip for proxy
157 | pip config set global.proxy http://proxy.company.com:8080
158 | ```
159 | 
160 | ## Search Issues
161 | 
162 | ### No Results Found
163 | 
164 | **1. Check gene symbol:**
165 | 
166 | ```bash
167 | # Wrong: common names
168 | biomcp article search --gene HER2  # ❌
169 | 
170 | # Correct: official HGNC symbol
171 | biomcp article search --gene ERBB2  # ✅
172 | 
173 | # Find correct symbol
174 | biomcp gene get HER2  # Will suggest ERBB2
175 | ```
176 | 
177 | **2. Too restrictive filters:**
178 | 
179 | ```bash
180 | # Too specific - may return nothing
181 | biomcp article search --gene BRAF --disease "stage IV melanoma" \
182 |   --chemical "dabrafenib and trametinib combination"
183 | 
184 | # Better - broader search
185 | biomcp article search --gene BRAF --disease melanoma \
186 |   --keyword "dabrafenib trametinib"
187 | ```
188 | 
189 | **3. Check data availability:**
190 | 
191 | ```bash
192 | # Test if gene exists in database
193 | biomcp gene get YOUR_GENE
194 | 
195 | # Test if disease term is recognized
196 | biomcp disease get "your disease term"
197 | ```
198 | 
199 | ### Location Search Not Working
200 | 
201 | Location searches require coordinates:
202 | 
203 | ```bash
204 | # Wrong - city name only
205 | biomcp trial search --condition cancer --city "New York"  # ❌
206 | 
207 | # Correct - with coordinates
208 | biomcp trial search --condition cancer \
209 |   --latitude 40.7128 --longitude -74.0060 --distance 50  # ✅
210 | ```
211 | 
212 | Common coordinates:
213 | 
214 | - New York: 40.7128, -74.0060
215 | - Los Angeles: 34.0522, -118.2437
216 | - Chicago: 41.8781, -87.6298
217 | - Houston: 29.7604, -95.3698
218 | - Boston: 42.3601, -71.0589
219 | 
220 | ### Preprint Search Issues
221 | 
222 | **Preprints not appearing:**
223 | 
224 | ```bash
225 | # Check if preprints are being excluded
226 | biomcp article search --gene BRAF --no-preprints  # Excludes preprints
227 | 
228 | # Include preprints (default)
229 | biomcp article search --gene BRAF  # Includes preprints
230 | ```
231 | 
232 | **DOI not found:**
233 | 
234 | ```bash
235 | # Ensure correct DOI format
236 | biomcp article get "10.1101/2024.01.20.23288905"  # bioRxiv format
237 | 
238 | # Not all preprints are indexed immediately
239 | # Try searching by title/keywords instead
240 | ```
241 | 
242 | ## Performance Issues
243 | 
244 | ### Slow Searches
245 | 
246 | **1. Reduce result count:**
247 | 
248 | ```bash
249 | # Default may be too high
250 | biomcp article search --gene TP53 --limit 100  # Slow
251 | 
252 | # Reduce for faster results
253 | biomcp article search --gene TP53 --limit 10   # Fast
254 | ```
255 | 
256 | **2. Use specific filters:**
257 | 
258 | ```bash
259 | # Broad search - slow
260 | biomcp trial search --condition cancer
261 | 
262 | # Specific search - faster
263 | biomcp trial search --condition "melanoma" --phase PHASE3 \
264 |   --status RECRUITING --country "United States"
265 | ```
266 | 
267 | **3. Check API health:**
268 | 
269 | ```bash
270 | # See which APIs are slow
271 | biomcp health check --verbose
272 | 
273 | # Check specific API
274 | biomcp health check --apis-only
275 | ```
276 | 
277 | ### Timeout Errors
278 | 
279 | **Increase timeout for slow networks:**
280 | 
281 | ```bash
282 | # Set environment variable
283 | export BIOMCP_TIMEOUT=300  # 5 minutes
284 | 
285 | # Or use configuration file
286 | mkdir -p ~/.biomcp && echo "timeout: 300" >> ~/.biomcp/config.yml
287 | ```
288 | 
289 | **For Python scripts on Windows:**
290 | 
291 | ```python
292 | # Windows only: WindowsSelectorEventLoopPolicy is not defined on macOS/Linux
293 | import asyncio
294 | asyncio.set_event_loop_policy(asyncio.WindowsSelectorEventLoopPolicy())
295 | ```
296 | 
297 | ### Memory Issues
298 | 
299 | **Large result sets:**
300 | 
301 | ```bash
302 | # Process in batches
303 | for i in {1..10}; do
304 |   biomcp article search --gene BRCA1 --page $i --limit 100
305 | done
306 | 
307 | # Use streaming where available
308 | biomcp article search --gene TP53 --format jsonl | \
309 |   while read line; do
310 |     echo "$line" | jq '.pmid'
311 |   done
312 | ```
313 | 
314 | ## MCP Server Issues
315 | 
316 | ### Testing Server Connectivity
317 | 
318 | **1. Test with MCP Inspector:**
319 | 
320 | ```bash
321 | npx @modelcontextprotocol/inspector uv run --with biomcp-python biomcp run
322 | ```
323 | 
324 | Open http://127.0.0.1:6274 and verify:
325 | 
326 | - Tools list loads
327 | - Can invoke a simple tool like `gene_getter`
328 | 
329 | **2. Test with curl (HTTP mode):**
330 | 
331 | ```bash
332 | # Start server in HTTP mode
333 | biomcp run --mode http --port 8000
334 | 
335 | # Test health endpoint
336 | curl http://localhost:8000/health
337 | 
338 | # Test MCP endpoint
339 | curl -X POST http://localhost:8000/mcp \
340 |   -H "Content-Type: application/json" \
341 |   -d '{"method": "tools/list"}'
342 | ```
343 | 
344 | ### Claude Desktop Integration Issues
345 | 
346 | **Server not appearing:**
347 | 
348 | 1. Check configuration file location:
349 | 
350 |    - macOS: `~/Library/Application Support/Claude/claude_desktop_config.json`
351 |    - Windows: `%APPDATA%\Claude\claude_desktop_config.json`
352 | 
353 | 2. Validate JSON syntax:
354 | 
355 |    ```bash
356 |    # macOS
357 |    cat ~/Library/Application\ Support/Claude/claude_desktop_config.json | jq .
358 |    ```
359 | 
360 | 3. Check server starts correctly:
361 |    ```bash
362 |    # Test the exact command from config
363 |    uv run --with biomcp-python biomcp run
364 |    ```
365 | 
366 | **Server crashes:**
367 | Check logs:
368 | 
369 | ```bash
370 | # Enable debug logging
371 | export BIOMCP_LOG_LEVEL=DEBUG
372 | uv run --with biomcp-python biomcp run
373 | ```
374 | 
375 | Common fixes:
376 | 
377 | - Update to latest version: `uv tool install biomcp-python --force`
378 | - Clear cache: `rm -rf ~/.biomcp/cache`
379 | - Check port conflicts: `lsof -i :8000`
380 | 
381 | ## Data Quality Issues
382 | 
383 | ### Outdated Results
384 | 
385 | **Check data freshness:**
386 | 
387 | ```bash
388 | # See when databases were last updated
389 | biomcp health check --verbose | grep "Last updated"
390 | ```
391 | 
392 | **Clear cache if needed:**
393 | 
394 | ```bash
395 | # Remove cached results
396 | rm -rf ~/.biomcp/cache
397 | 
398 | # Or set cache TTL
399 | export BIOMCP_CACHE_TTL=900  # 15 minutes
400 | ```
401 | 
402 | ### Missing Annotations
403 | 
404 | **PubTator3 annotations missing:**
405 | 
406 | - Some newer articles may not be fully annotated yet
407 | - Try searching by PMID directly
408 | - Check if article is indexed: search by title
409 | 
410 | **Variant annotations incomplete:**
411 | 
412 | - Not all variants have all annotation types
413 | - Rare variants may lack population frequencies
414 | - Novel variants won't have ClinVar data
415 | 
416 | ## Error Messages
417 | 
418 | ### Common Error Codes
419 | 
420 | **HTTP 429 - Rate Limit Exceeded:**
421 | 
422 | ```bash
423 | # Add delay between requests
424 | biomcp article search --gene BRAF --delay 1000  # 1 second
425 | 
426 | # Or reduce parallel requests
427 | export BIOMCP_MAX_CONCURRENT=2
428 | ```
429 | 
430 | **HTTP 404 - Not Found:**
431 | 
432 | - Check identifier format (PMID, NCT ID, etc.)
433 | - Verify record exists in source database
434 | - Try alternative identifiers
435 | 
436 | **HTTP 500 - Server Error:**
437 | 
438 | - External API may be down
439 | - Check status: `biomcp health check`
440 | - Try again later
441 | 
442 | ### Debugging
443 | 
444 | **Enable verbose logging:**
445 | 
446 | ```bash
447 | # Set log level
448 | export BIOMCP_LOG_LEVEL=DEBUG
449 | 
450 | # Run with verbose output
451 | biomcp article search --gene BRAF --verbose
452 | 
453 | # Check log files
454 | tail -f ~/.biomcp/logs/biomcp.log
455 | ```
456 | 
457 | **Report bugs:**
458 | Include when reporting issues:
459 | 
460 | 1. BioMCP version: `biomcp --version`
461 | 2. Full error message and stack trace
462 | 3. Command that caused the error
463 | 4. Operating system and Python version
464 | 5. Relevant environment variables
465 | 
466 | Report at: https://github.com/genomoncology/biomcp/issues
467 | 
468 | ## Getting Help
469 | 
470 | ### Quick Checks
471 | 
472 | 1. **Check FAQ first**: [Frequently Asked Questions](faq-condensed.md)
473 | 2. **Search existing issues**: [GitHub Issues](https://github.com/genomoncology/biomcp/issues)
474 | 3. **Check examples**: [How-to Guides](how-to-guides/01-find-articles-and-cbioportal-data.md)
475 | 
476 | ### Community Support
477 | 
478 | - Issue Tracker: Report bugs, request features
479 | - Documentation: PRs welcome for improvements
480 | 
481 | ### Professional Support
482 | 
483 | For commercial support, contact: [email protected]
484 | 
485 | ---
486 | 
487 | _Still having issues? [Open a GitHub issue](https://github.com/genomoncology/biomcp/issues/new) with details._
488 | 
```

--------------------------------------------------------------------------------
/tests/tdd/variants/test_external_integration.py:
--------------------------------------------------------------------------------

```python
  1 | """Integration tests for external variant data sources with real API calls."""
  2 | 
  3 | import pytest
  4 | 
  5 | from biomcp.variants.cbio_external_client import CBioPortalExternalClient
  6 | from biomcp.variants.external import (
  7 |     ExternalVariantAggregator,
  8 |     TCGAClient,
  9 |     ThousandGenomesClient,
 10 | )
 11 | 
 12 | 
 13 | class TestTCGAIntegration:
 14 |     """Integration tests for TCGA/GDC API."""
 15 | 
 16 |     @pytest.mark.asyncio
 17 |     @pytest.mark.integration
 18 |     async def test_braf_v600e_variant(self):
 19 |         """Test fetching BRAF V600E data from TCGA."""
 20 |         client = TCGAClient()
 21 | 
 22 |         # Try different formats
 23 |         variants_to_test = [
 24 |             "BRAF V600E",  # Gene AA change format that TCGA supports
 25 |             "chr7:g.140453136A>T",
 26 |             "7:g.140453136A>T",
 27 |         ]
 28 | 
 29 |         found_data = False
 30 |         for variant in variants_to_test:
 31 |             result = await client.get_variant_data(variant)
 32 |             if result:
 33 |                 found_data = True
 34 |                 # BRAF V600E is common in melanoma and thyroid cancer
 35 |                 assert result.tumor_types is not None
 36 |                 assert len(result.tumor_types) > 0
 37 |                 # Should have affected cases if data found
 38 |                 if result.affected_cases:
 39 |                     assert result.affected_cases > 0
 40 |                 break
 41 | 
 42 |         # Note: TCGA might not have data for all variants
 43 |         if not found_data:
 44 |             pytest.skip("TCGA API did not return data for BRAF V600E variants")
 45 | 
 46 |     @pytest.mark.asyncio
 47 |     @pytest.mark.integration
 48 |     async def test_tp53_variant(self):
 49 |         """Test fetching TP53 variant data from TCGA."""
 50 |         client = TCGAClient()
 51 | 
 52 |         # TP53 R273H - common tumor suppressor mutation
 53 |         result = await client.get_variant_data("chr17:g.7577120G>A")
 54 | 
 55 |         # TP53 mutations are very common in cancer
 56 |         if result:
 57 |             assert result.tumor_types is not None
 58 |             assert len(result.tumor_types) > 0
 59 | 
 60 |     @pytest.mark.asyncio
 61 |     @pytest.mark.integration
 62 |     async def test_nonexistent_variant(self):
 63 |         """Test TCGA response for non-existent variant."""
 64 |         client = TCGAClient()
 65 | 
 66 |         # Made-up variant that shouldn't exist
 67 |         result = await client.get_variant_data("chr99:g.999999999A>T")
 68 | 
 69 |         assert result is None
 70 | 
 71 | 
 72 | class TestThousandGenomesIntegration:
 73 |     """Integration tests for 1000 Genomes via Ensembl REST API."""
 74 | 
 75 |     @pytest.mark.asyncio
 76 |     @pytest.mark.integration
 77 |     async def test_common_variant_with_rsid(self):
 78 |         """Test fetching common variant data by rsID."""
 79 |         client = ThousandGenomesClient()
 80 | 
 81 |         # rs113488022 is BRAF V600E
 82 |         result = await client.get_variant_data("rs113488022")
 83 | 
 84 |         if result:
 85 |             # This is a rare variant, so MAF should be low or None
 86 |             if result.global_maf is not None:
 87 |                 assert result.global_maf < 0.01  # Less than 1%
 88 | 
 89 |             # Consequence information might not be available for all variants
 90 |             # Just verify the data structure is correct
 91 |             assert hasattr(result, "most_severe_consequence")
 92 | 
 93 |     @pytest.mark.asyncio
 94 |     @pytest.mark.integration
 95 |     async def test_variant_population_frequencies(self):
 96 |         """Test population frequency data retrieval."""
 97 |         client = ThousandGenomesClient()
 98 | 
 99 |         # Use a more common variant for testing population frequencies
100 |         # rs1800734 - common variant in MLH1 promoter
101 |         result = await client.get_variant_data("rs1800734")
102 | 
103 |         if result:
104 |             # Should have at least global MAF
105 |             assert result.global_maf is not None
106 |             assert 0 <= result.global_maf <= 1
107 | 
108 |             # Check that we get population-specific frequencies
109 |             pop_freqs = [
110 |                 result.afr_maf,
111 |                 result.amr_maf,
112 |                 result.eas_maf,
113 |                 result.eur_maf,
114 |                 result.sas_maf,
115 |             ]
116 | 
117 |             # At least some populations should have data
118 |             non_null_freqs = [f for f in pop_freqs if f is not None]
119 |             assert len(non_null_freqs) > 0
120 | 
121 |     @pytest.mark.asyncio
122 |     @pytest.mark.integration
123 |     async def test_invalid_variant_id(self):
124 |         """Test 1000 Genomes response for invalid variant."""
125 |         client = ThousandGenomesClient()
126 | 
127 |         # Invalid rsID
128 |         result = await client.get_variant_data("rs999999999999")
129 | 
130 |         assert result is None
131 | 
132 | 
class TestCBioPortalIntegration:
    """Live-API checks for the cBioPortal external client."""

    @pytest.mark.asyncio
    @pytest.mark.integration
    async def test_braf_v600e_variant(self):
        """BRAF V600E should come back with case, study and enhanced data."""
        portal = CBioPortalExternalClient()
        data = await portal.get_variant_data("BRAF V600E")

        # Nothing returned by the live service: skip instead of asserting
        if not data:
            pytest.skip("cBioPortal API did not return data for BRAF V600E")

        # Case and study counts
        assert data.total_cases is not None
        assert data.total_cases > 0
        study_count = len(data.studies)
        assert study_count > 0
        print(
            f"Found {data.total_cases} cases in {len(data.studies)} studies: {data.studies}"
        )

        # Enhanced fields: cancer type distribution
        distribution = data.cancer_type_distribution
        assert distribution is not None
        assert len(distribution) > 0
        print(f"Cancer types: {list(data.cancer_type_distribution.keys())}")

        # Enhanced fields: mutation types
        assert data.mutation_types is not None
        assert "Missense_Mutation" in data.mutation_types

        # Enhanced fields: variant allele frequency
        assert data.mean_vaf is not None
        print(f"Mean VAF: {data.mean_vaf}")

    @pytest.mark.asyncio
    @pytest.mark.integration
    async def test_kras_g12d_variant(self):
        """KRAS G12D should come back with case and study counts."""
        portal = CBioPortalExternalClient()
        data = await portal.get_variant_data("KRAS G12D")

        if not data:
            pytest.skip("cBioPortal API did not return data for KRAS G12D")

        # KRAS G12D is a common mutation in multiple cancer types
        assert data.total_cases is not None
        assert data.total_cases > 0
        assert len(data.studies) > 0

    @pytest.mark.asyncio
    @pytest.mark.integration
    async def test_invalid_variant(self):
        """An unknown gene name should yield no result."""
        portal = CBioPortalExternalClient()

        # Gene name that does not exist
        data = await portal.get_variant_data("FAKEGENE V600E")

        assert data is None
195 | 
196 | 
class TestExternalVariantAggregatorIntegration:
    """Live-API checks for ExternalVariantAggregator."""

    @pytest.mark.asyncio
    @pytest.mark.integration
    async def test_aggregate_all_sources(self):
        """Request every source for rs1045642 and check what came back."""
        aggregator = ExternalVariantAggregator()

        # rs1045642 is a common variant that should have 1000 Genomes data;
        # the extra variant data is supplied for the cBioPortal lookup
        extra_context = {
            "cadd": {"gene": {"genename": "ABCB1"}},
            "docm": {"aa_change": "p.I1145I"},
        }

        annotations = await aggregator.get_enhanced_annotations(
            "rs1045642",
            include_tcga=True,
            include_1000g=True,
            include_cbioportal=True,
            variant_data=extra_context,
        )

        assert annotations.variant_id == "rs1045642"

        # Collect the labels of the sources that returned data
        labelled_payloads = (
            ("tcga", annotations.tcga),
            ("1000g", annotations.thousand_genomes),
            ("cbioportal", annotations.cbioportal),
        )
        sources_with_data = [
            label for label, payload in labelled_payloads if payload
        ]

        # This common variant should have at least 1000 Genomes data
        assert len(sources_with_data) > 0
        # Specifically, it should have 1000 Genomes data
        assert annotations.thousand_genomes is not None

        # No errors should be reported for successful queries
        # (a source having no data is different from a source erroring)
        assert len(annotations.error_sources) == 0

    @pytest.mark.asyncio
    @pytest.mark.integration
    async def test_selective_source_inclusion(self):
        """Disable TCGA and confirm no TCGA data is attached."""
        aggregator = ExternalVariantAggregator()

        # Only request 1000 Genomes data
        common_rsid = "rs1800734"  # Common variant
        annotations = await aggregator.get_enhanced_annotations(
            common_rsid,
            include_tcga=False,
            include_1000g=True,
        )

        # TCGA was excluded, so nothing should be attached for it
        assert annotations.tcga is None
        # 1000 Genomes might have data for this common variant
        # (but it's okay if it doesn't)

    @pytest.mark.asyncio
    @pytest.mark.integration
    async def test_error_handling_resilience(self):
        """A malformed variant ID still yields a result object."""
        aggregator = ExternalVariantAggregator()

        # Use an invalid variant format that might cause errors
        bad_id = "INVALID_VARIANT_FORMAT_12345"
        annotations = await aggregator.get_enhanced_annotations(
            bad_id,
            include_tcga=True,
            include_1000g=True,
        )

        # Should still return a result even if all sources fail
        assert annotations is not None
        assert annotations.variant_id == "INVALID_VARIANT_FORMAT_12345"

        # Sources should return None or be in error_sources
        assert annotations.tcga is None
        assert annotations.thousand_genomes is None
279 | 
```

--------------------------------------------------------------------------------
/docs/tutorials/biothings-prompts.md:
--------------------------------------------------------------------------------

```markdown
  1 | # BioThings Integration Example Prompts
  2 | 
  3 | This guide provides example prompts for AI assistants to effectively use the BioThings suite integration in BioMCP.
  4 | 
  5 | ## Overview of BioThings Suite
  6 | 
  7 | BioMCP integrates with the complete BioThings suite of APIs:
  8 | 
  9 | - **MyGene.info** - Gene information and annotations
 10 | - **MyDisease.info** - Disease ontology and synonyms
 11 | - **MyVariant.info** - Genetic variant annotations (pre-existing integration, enhanced with BioThings client)
 12 | - **MyChem.info** - Drug/chemical information and annotations
 13 | 
 14 | All four services share common infrastructure through the BioThings client module, providing consistent error handling, rate limiting, and response parsing.
 15 | 
 16 | ## Gene Information Retrieval
 17 | 
 18 | ### Basic Gene Lookup
 19 | 
 20 | ```
 21 | "What is the TP53 gene?"
 22 | "Tell me about BRAF"
 23 | "Get information on the EGFR gene"
 24 | "What does the BRCA1 gene do?"
 25 | ```
 26 | 
 27 | **Expected tool usage**: `gene_getter("TP53")` → Returns official name, summary, aliases
 28 | 
 29 | ### Gene by ID
 30 | 
 31 | ```
 32 | "Look up gene with Entrez ID 7157"
 33 | "What is gene 673?"
 34 | ```
 35 | 
 36 | **Expected tool usage**: `gene_getter("7157")` → Returns TP53 information
 37 | 
 38 | ### Gene Context for Research
 39 | 
 40 | ```
 41 | "I need to understand the KRAS gene before searching for mutations"
 42 | "What type of protein does BRAF encode?"
 43 | "Give me the official name and aliases for MYC"
 44 | ```
 45 | 
 46 | ## Disease Information Retrieval
 47 | 
 48 | ### Basic Disease Lookup
 49 | 
 50 | ```
 51 | "What is GIST?"
 52 | "Tell me about melanoma"
 53 | "Define non-small cell lung cancer"
 54 | "What is Erdheim-Chester disease?"
 55 | ```
 56 | 
 57 | **Expected tool usage**: `disease_getter("GIST")` → Returns definition, synonyms, ontology IDs
 58 | 
 59 | ### Disease by Ontology ID
 60 | 
 61 | ```
 62 | "Look up disease MONDO:0018076"
 63 | "What is DOID:1909?"
 64 | ```
 65 | 
 66 | **Expected tool usage**: `disease_getter("MONDO:0018076")` → Returns disease information
 67 | 
 68 | ### Disease Synonyms for Research
 69 | 
 70 | ```
 71 | "What are all the names for gastrointestinal stromal tumor?"
 72 | "Find synonyms for NSCLC"
 73 | "What other terms are used for melanoma?"
 74 | ```
 75 | 
 76 | ## Variant Information Retrieval (MyVariant.info)
 77 | 
 78 | MyVariant.info is part of the BioThings suite and provides comprehensive variant annotations. BioMCP has extensive integration with specialized features:
 79 | 
 80 | ### Basic Variant Lookup
 81 | 
 82 | ```
 83 | "Get information about rs7412"
 84 | "What is the BRAF V600E variant?"
 85 | "Look up variant chr7:140453136-140453136"
 86 | ```
 87 | 
 88 | **Expected tool usage**: `variant_getter("rs7412")` → Returns variant annotations with external database links
 89 | 
 90 | ### Variant Search with Filters
 91 | 
 92 | ```
 93 | "Find pathogenic variants in TP53"
 94 | "Search for BRCA1 variants with high impact"
 95 | "Get all loss-of-function variants in KRAS"
 96 | ```
 97 | 
 98 | **Expected tool usage**: `variant_searcher(gene="TP53", significance="pathogenic")` → Returns filtered variant list
 99 | 
100 | ### Variant with Cancer Context
101 | 
102 | ```
103 | "What cancer types have BRAF V600E mutations?"
104 | "Get TCGA data for TP53 R273H"
105 | ```
106 | 
107 | **Expected tool usage**: Variant tools automatically integrate cBioPortal, TCGA, and 1000 Genomes data when available
108 | 
109 | ## Drug Information Retrieval (MyChem.info)
110 | 
111 | MyChem.info is part of the BioThings suite and provides comprehensive drug/chemical information.
112 | 
113 | ### Basic Drug Lookup
114 | 
115 | ```
116 | "What is imatinib?"
117 | "Tell me about aspirin"
118 | "Get information on pembrolizumab"
119 | "What does metformin do?"
120 | ```
121 | 
122 | **Expected tool usage**: `drug_getter("imatinib")` → Returns drug information with database links
123 | 
124 | ### Drug by ID
125 | 
126 | ```
127 | "Look up DrugBank ID DB00619"
128 | "What is CHEMBL941?"
129 | "Get details for PubChem CID 5291"
130 | ```
131 | 
132 | **Expected tool usage**: `drug_getter("DB00619")` → Returns drug details by identifier
133 | 
134 | ### Drug Properties and Mechanism
135 | 
136 | ```
137 | "What is the mechanism of action of imatinib?"
138 | "Find the chemical formula for aspirin"
139 | "What are the trade names for adalimumab?"
140 | "How does pembrolizumab work?"
141 | ```
142 | 
143 | **Expected tool usage**: `drug_getter("pembrolizumab")` → Returns mechanism, indications, and properties
144 | 
145 | ## Integrated Research Workflows
146 | 
147 | ### Variant Analysis with Gene Context
148 | 
149 | ```
150 | "Analyze the BRAF V600E mutation - first tell me about the gene, then find pathogenic variants"
151 | ```
152 | 
153 | **Expected tool sequence**:
154 | 
155 | 1. `think(thought="Analyzing BRAF V600E mutation", thoughtNumber=1)`
156 | 2. `gene_getter("BRAF")` → Gene context
157 | 3. `variant_searcher(gene="BRAF", hgvsp="V600E", significance="pathogenic")` → Variant details
158 | 
159 | ### Clinical Trial Search with Disease Expansion
160 | 
161 | ```
162 | "Find clinical trials for GIST patients"
163 | "Search for trials treating gastrointestinal stromal tumors"
164 | ```
165 | 
166 | **Expected tool usage**:
167 | 
168 | - `trial_searcher(conditions=["GIST"], expand_synonyms=True)`
169 | - Automatically searches for: GIST OR "gastrointestinal stromal tumor" OR "GI stromal tumor"
170 | 
171 | ### Comprehensive Gene-Disease Research
172 | 
173 | ```
174 | "I'm researching EGFR mutations in lung cancer. Start with the gene, then the disease, then find relevant trials"
175 | ```
176 | 
177 | **Expected tool sequence**:
178 | 
179 | 1. `think(thought="Researching EGFR in lung cancer", thoughtNumber=1)`
180 | 2. `gene_getter("EGFR")` → Gene information
181 | 3. `disease_getter("lung cancer")` → Disease context and synonyms
182 | 4. `trial_searcher(conditions=["lung cancer"], interventions=["EGFR inhibitor"])` → Trials with synonym expansion
183 | 
184 | ### Multi-Gene Analysis
185 | 
186 | ```
187 | "Compare TP53, BRAF, and KRAS genes"
188 | "Tell me about the RAS family genes: KRAS, NRAS, HRAS"
189 | ```
190 | 
191 | **Expected tool usage**: Multiple `gene_getter()` calls for each gene
192 | 
193 | ## Advanced Use Cases
194 | 
195 | ### Gene Alias Resolution
196 | 
197 | ```
198 | "What is the official name for the p53 gene?"
199 | "Is TRP53 the same as TP53?"
200 | ```
201 | 
202 | **Expected tool usage**: `gene_getter("p53")` → Will resolve to TP53
203 | 
204 | ### Disease Name Disambiguation
205 | 
206 | ```
207 | "Is GIST the same as gastrointestinal stromal tumor?"
208 | "What's the MONDO ID for melanoma?"
209 | ```
210 | 
211 | **Expected tool usage**: `disease_getter("GIST")` → Shows all synonyms and IDs
212 | 
213 | ### Trial Search Without Synonym Expansion
214 | 
215 | ```
216 | "Find trials specifically mentioning 'GIST' not other names"
217 | ```
218 | 
219 | **Expected tool usage**: `trial_searcher(conditions=["GIST"], expand_synonyms=False)`
220 | 
221 | ### Integrated Literature and Gene Search
222 | 
223 | ```
224 | "Find recent papers about TP53 mutations - first tell me about the gene"
225 | ```
226 | 
227 | **Expected tool sequence**:
228 | 
229 | 1. `gene_getter("TP53")` → Gene context
230 | 2. `article_searcher(genes=["TP53"], keywords=["mutation"])` → Literature
231 | 
232 | ### Drug-Target Research
233 | 
234 | ```
235 | "I'm researching imatinib for CML treatment. Get drug info, then find trials"
236 | "What targets does pembrolizumab hit? Then find related articles"
237 | ```
238 | 
239 | **Expected tool sequence**:
240 | 
241 | 1. `think(thought="Researching imatinib for CML", thoughtNumber=1)`
242 | 2. `drug_getter("imatinib")` → Drug information and mechanism
243 | 3. `trial_searcher(interventions=["imatinib"], conditions=["chronic myeloid leukemia"])`
244 | 
245 | ## Tips for AI Assistants
246 | 
247 | 1. **Always use think() first** for complex biomedical queries
248 | 2. **Gene context helps interpretation**: Get gene info before analyzing variants
249 | 3. **Disease synonyms improve search**: Use expand_synonyms=True (default) for comprehensive results
250 | 4. **Drug mechanisms matter**: Get drug info before searching trials to understand targets
251 | 5. **Real-time data**: All BioThings data is fetched live, ensuring current information
252 | 6. **Combine tools**: Gene + disease + variant + drug tools work together for comprehensive analysis
253 | 
254 | ## Common Patterns
255 | 
256 | ### Pattern 1: Gene → Variant → Clinical Impact
257 | 
258 | ```
259 | gene_getter("BRAF") →
260 | variant_searcher(gene="BRAF", significance="pathogenic") →
261 | article_searcher(genes=["BRAF"], diseases=["melanoma"])
262 | ```
263 | 
264 | ### Pattern 2: Disease → Trials → Locations
265 | 
266 | ```
267 | disease_getter("NSCLC") →
268 | trial_searcher(conditions=["NSCLC"], expand_synonyms=True) →
269 | trial_locations_getter(nct_id="NCT...")
270 | ```
271 | 
272 | ### Pattern 3: Multi-Gene Pathway Analysis
273 | 
274 | ```
275 | gene_getter("EGFR") →
276 | gene_getter("KRAS") →
277 | gene_getter("BRAF") →
278 | article_searcher(genes=["EGFR", "KRAS", "BRAF"], keywords=["pathway"])
279 | ```
280 | 
281 | ## Unified Search with BioThings Domains
282 | 
283 | BioMCP's unified search now supports gene, drug, and disease domains alongside articles, trials, and variants:
284 | 
285 | ### Domain-Specific Search
286 | 
287 | ```
288 | "Search for BRAF in the gene domain"
289 | "Find imatinib in drugs"
290 | "Look up melanoma in diseases"
291 | ```
292 | 
293 | **Expected tool usage**:
294 | 
295 | - `search(domain="gene", keywords=["BRAF"])`
296 | - `search(domain="drug", keywords=["imatinib"])`
297 | - `search(domain="disease", keywords=["melanoma"])`
298 | 
299 | ### Unified Query Language with BioThings
300 | 
301 | ```
302 | "genes.symbol:BRAF AND genes.type:protein-coding"
303 | "drugs.tradename:gleevec"
304 | "diseases.name:melanoma OR diseases.synonym:malignant melanoma"
305 | ```
306 | 
307 | **Expected tool usage**: Query parser automatically routes to appropriate domains
308 | 
309 | ### Cross-Domain Gene Searches
310 | 
311 | ```
312 | "gene:BRAF"  # Searches articles, variants, genes, and trials
313 | "Search everything about TP53"
314 | ```
315 | 
316 | **Expected behavior**:
317 | 
318 | - Gene queries trigger searches across multiple domains
319 | - Results include gene info, variants, articles, and related trials
320 | 
321 | ### Cross-Domain Disease Searches
322 | 
323 | ```
324 | "disease:melanoma"  # Searches articles, trials, and diseases
325 | "Find all information about NSCLC"
326 | ```
327 | 
328 | **Expected behavior**:
329 | 
330 | - Disease queries search articles, trials, and disease databases
331 | - Disease synonyms are automatically expanded in trial searches
332 | 
333 | ### Combined Domain Queries
334 | 
335 | ```
336 | "gene:BRAF AND disease:melanoma"
337 | "drugs.indication:leukemia AND trials.phase:3"
338 | "genes.symbol:EGFR AND articles.year:>2023"
339 | ```
340 | 
341 | ### Unified Fetch
342 | 
343 | ```
344 | "Fetch BRAF from gene domain"
345 | "Get imatinib details from drugs"
346 | "Retrieve melanoma information from diseases"
347 | ```
348 | 
349 | **Expected tool usage**:
350 | 
351 | - `fetch(id="BRAF", domain="gene")`
352 | - `fetch(id="imatinib", domain="drug")`
353 | - `fetch(id="melanoma", domain="disease")`
354 | 
355 | ## Error Handling
356 | 
357 | If a gene/disease is not found:
358 | 
359 | - Check for typos or alternative names
360 | - Try searching with partial names
361 | - Use official symbols for genes (e.g., "TP53" not "p53 gene")
362 | - For diseases, try both common and medical names
363 | 
```

--------------------------------------------------------------------------------
/src/biomcp/constants.py:
--------------------------------------------------------------------------------

```python
  1 | """
  2 | Central constants file for BioMCP.
  3 | 
  4 | This module contains all constants used throughout the BioMCP codebase,
  5 | including API URLs, default values, limits, and domain configurations.
  6 | """
  7 | 
  8 | # ============================================================================
  9 | # API Base URLs
 10 | # ============================================================================
 11 | 
 12 | # PubTator3 API
 13 | # https://www.ncbi.nlm.nih.gov/research/pubtator3/api
 14 | PUBTATOR3_BASE_URL = "https://www.ncbi.nlm.nih.gov/research/pubtator3-api"
 15 | PUBTATOR3_SEARCH_URL = f"{PUBTATOR3_BASE_URL}/search/"
 16 | PUBTATOR3_FULLTEXT_URL = f"{PUBTATOR3_BASE_URL}/publications/export/biocjson"
 17 | PUBTATOR3_AUTOCOMPLETE_URL = f"{PUBTATOR3_BASE_URL}/entity/autocomplete/"
 18 | 
 19 | # ClinicalTrials.gov API
 20 | # https://clinicaltrials.gov/data-api/api
 21 | CLINICAL_TRIALS_BASE_URL = "https://clinicaltrials.gov/api/v2/studies"
 22 | CLINICAL_TRIALS_STUDY_URL = "https://clinicaltrials.gov/study/"
 23 | 
 24 | # NCI Clinical Trials Search API
 25 | # https://clinicaltrialsapi.cancer.gov/api/v2
 26 | NCI_CTS_BASE_URL = "https://clinicaltrialsapi.cancer.gov/api/v2"
 27 | NCI_TRIALS_URL = f"{NCI_CTS_BASE_URL}/trials"
 28 | NCI_ORGANIZATIONS_URL = f"{NCI_CTS_BASE_URL}/organizations"
 29 | NCI_DISEASES_URL = f"{NCI_CTS_BASE_URL}/diseases"
 30 | NCI_INTERVENTIONS_URL = f"{NCI_CTS_BASE_URL}/interventions"
 31 | NCI_BIOMARKERS_URL = f"{NCI_CTS_BASE_URL}/biomarkers"
 32 | NCI_API_KEY_ENV = "NCI_API_KEY"
 33 | 
 34 | # MyVariant.info API
 35 | # https://docs.myvariant.info/
 36 | MYVARIANT_BASE_URL = "https://myvariant.info/v1"
 37 | MYVARIANT_QUERY_URL = f"{MYVARIANT_BASE_URL}/query"
 38 | MYVARIANT_GET_URL = f"{MYVARIANT_BASE_URL}/variant"
 39 | 
 40 | # Preprint Server APIs
 41 | BIORXIV_BASE_URL = "https://api.biorxiv.org/details/biorxiv"
 42 | MEDRXIV_BASE_URL = "https://api.biorxiv.org/details/medrxiv"
 43 | EUROPE_PMC_BASE_URL = "https://www.ebi.ac.uk/europepmc/webservices/rest/search"
 44 | 
 45 | # External Variant APIs
 46 | GDC_BASE_URL = "https://api.gdc.cancer.gov"
 47 | GDC_SSMS_ENDPOINT_URL = f"{GDC_BASE_URL}/ssms"  # Simple Somatic Mutations
 48 | GDC_SSM_OCCURRENCES_URL = f"{GDC_BASE_URL}/ssm_occurrences"
 49 | ENSEMBL_REST_BASE_URL = "https://rest.ensembl.org"
 50 | ENSEMBL_VARIATION_URL = f"{ENSEMBL_REST_BASE_URL}/variation/human"
 51 | CBIOPORTAL_BASE_URL = "https://www.cbioportal.org/api"
 52 | 
 53 | # External Resource URLs
 54 | PUBMED_BASE_URL = "https://pubmed.ncbi.nlm.nih.gov/"
 55 | PMC_BASE_URL = "https://www.ncbi.nlm.nih.gov/pmc/articles/"
 56 | DOI_BASE_URL = "https://doi.org/"
 57 | DBSNP_BASE_URL = "https://www.ncbi.nlm.nih.gov/snp/"
 58 | CLINVAR_BASE_URL = "https://www.ncbi.nlm.nih.gov/clinvar/variation/"
 59 | COSMIC_BASE_URL = "https://cancer.sanger.ac.uk/cosmic/mutation/overview?id="
 60 | CIVIC_BASE_URL = "https://civicdb.org/variants/"
 61 | ENSEMBL_VARIANT_BASE_URL = (
 62 |     "https://ensembl.org/Homo_sapiens/Variation/Explore?v="
 63 | )
 64 | GENENAMES_BASE_URL = (
 65 |     "https://www.genenames.org/data/gene-symbol-report/#!/symbol/"
 66 | )
 67 | UCSC_GENOME_BROWSER_URL = "https://genome.ucsc.edu/cgi-bin/hgTracks?db=hg19&"
 68 | 
 69 | # ============================================================================
 70 | # Default Values and Limits
 71 | # ============================================================================
 72 | 
 73 | # Caching
 74 | DEFAULT_CACHE_TIMEOUT = 60 * 60 * 24 * 7  # 1 week in seconds
 75 | 
 76 | # Pagination
 77 | SYSTEM_PAGE_SIZE = (
 78 |     10  # Default page size for all searches (reduced for token efficiency)
 79 | )
 80 | DEFAULT_PAGE_SIZE = 10  # Default page size for unified search
 81 | MIN_PAGE_SIZE = 1
 82 | MAX_PAGE_SIZE = 100
 83 | DEFAULT_PAGE_NUMBER = 1
 84 | 
 85 | # Search limits
 86 | MAX_RESULTS_PER_DOMAIN_DEFAULT = (
 87 |     10  # Default max results per domain in unified search
 88 | )
 89 | ESTIMATED_ADDITIONAL_RESULTS = (
 90 |     100  # Estimate for additional results when full page returned
 91 | )
 92 | DEFAULT_AUTOCOMPLETE_LIMIT = 1
 93 | MAX_AUTOCOMPLETE_LIMIT = 100
 94 | 
 95 | # Text display
 96 | MAX_WIDTH = 72  # Maximum width for text wrapping in console output
 97 | SNIPPET_LENGTH = 200  # Maximum length for text snippets in search results
 98 | 
 99 | # Genome Assembly
100 | DEFAULT_ASSEMBLY = "hg19"  # Default genome assembly for MyVariant.info API
101 | 
102 | # Rate Limiting
103 | DEFAULT_RATE_LIMIT_PER_SECOND = 10.0
104 | DEFAULT_BURST_SIZE = 20
105 | SLIDING_WINDOW_MINUTE_LIMIT = 60
106 | SLIDING_WINDOW_HOUR_LIMIT = 1000
107 | 
108 | # Retry Configuration
109 | DEFAULT_MAX_RETRY_ATTEMPTS = 3
110 | DEFAULT_INITIAL_RETRY_DELAY = 1.0
111 | DEFAULT_MAX_RETRY_DELAY = 60.0
112 | DEFAULT_EXPONENTIAL_BASE = 2.0
113 | AGGRESSIVE_MAX_RETRY_ATTEMPTS = 5
114 | AGGRESSIVE_INITIAL_RETRY_DELAY = 2.0
115 | AGGRESSIVE_MAX_RETRY_DELAY = 30.0
116 | 
117 | # Circuit Breaker Configuration
118 | DEFAULT_FAILURE_THRESHOLD = 10
119 | DEFAULT_RECOVERY_TIMEOUT = 30.0
120 | DEFAULT_SUCCESS_THRESHOLD = 3
121 | 
122 | # Metrics Configuration
123 | MAX_METRIC_SAMPLES = 1000
124 | METRIC_PERCENTILE_50 = 0.50
125 | METRIC_PERCENTILE_95 = 0.95
126 | METRIC_PERCENTILE_99 = 0.99
127 | METRIC_JITTER_RANGE = 0.1  # 10% jitter
128 | 
129 | # HTTP Client Configuration
130 | HTTP_TIMEOUT_SECONDS = 120.0
131 | HTTP_ERROR_CODE_NETWORK = 599
132 | HTTP_ERROR_CODE_UNSUPPORTED_METHOD = 405
133 | 
134 | # Batching and Pagination Configuration
135 | DEFAULT_BATCH_SIZE = 10
136 | DEFAULT_BATCH_TIMEOUT = 0.1
137 | CBIOPORTAL_BATCH_SIZE = 5
138 | EUROPE_PMC_PAGE_SIZE = 25
139 | BIORXIV_MAX_PAGES = 3
140 | BIORXIV_RESULTS_PER_PAGE = 30
141 | BIORXIV_DEFAULT_DAYS_BACK = 365
142 | 
143 | # Prefetching Configuration
144 | PREFETCH_TOP_GENES = 5
145 | PREFETCH_TOP_DISEASES = 3
146 | PREFETCH_TOP_CHEMICALS = 3
147 | PREFETCH_TIMEOUT = 2.0
148 | 
149 | # Cache Configuration
150 | REQUEST_CACHE_MAX_SIZE = 1000
151 | CACHE_KEY_SAMPLE_SIZE = 100
152 | 
153 | # Connection Pool Configuration
154 | CONNECTION_POOL_MAX_KEEPALIVE = 20
155 | CONNECTION_POOL_MAX_CONNECTIONS = 100
156 | CONNECTION_POOL_KEEPALIVE_EXPIRY = 30
157 | 
158 | # ============================================================================
159 | # Domain Configuration
160 | # ============================================================================
161 | 
162 | # Valid domains for search
163 | VALID_DOMAINS = [
164 |     "article",
165 |     "trial",
166 |     "variant",
167 |     "gene",
168 |     "drug",
169 |     "disease",
170 |     "nci_organization",
171 |     "nci_intervention",
172 |     "nci_biomarker",
173 |     "nci_disease",
174 |     # OpenFDA domains
175 |     "fda_adverse",
176 |     "fda_label",
177 |     "fda_device",
178 |     "fda_approval",
179 |     "fda_recall",
180 |     "fda_shortage",
181 | ]
182 | VALID_DOMAINS_PLURAL = [
183 |     "articles",
184 |     "trials",
185 |     "variants",
186 |     "genes",
187 |     "drugs",
188 |     "diseases",
189 |     "nci_organizations",
190 |     "nci_interventions",
191 |     "nci_biomarkers",
192 |     "nci_diseases",
193 |     # OpenFDA domains
194 |     "fda_adverse_events",
195 |     "fda_labels",
196 |     "fda_device_events",
197 |     "fda_approvals",
198 |     "fda_recalls",
199 |     "fda_shortages",
200 | ]
201 | 
202 | # Domain mappings for unified search
203 | DOMAIN_TO_PLURAL = {
204 |     "article": "articles",
205 |     "trial": "trials",
206 |     "variant": "variants",
207 |     "gene": "genes",
208 |     "drug": "drugs",
209 |     "disease": "diseases",
210 |     "nci_organization": "nci_organizations",
211 |     "nci_intervention": "nci_interventions",
212 |     "nci_biomarker": "nci_biomarkers",
213 |     "nci_disease": "nci_diseases",
214 |     # OpenFDA domains
215 |     "fda_adverse": "fda_adverse_events",
216 |     "fda_label": "fda_labels",
217 |     "fda_device": "fda_device_events",
218 |     "fda_approval": "fda_approvals",
219 |     "fda_recall": "fda_recalls",
220 |     "fda_shortage": "fda_shortages",
221 | }
222 | 
223 | PLURAL_TO_DOMAIN = {
224 |     "articles": "article",
225 |     "trials": "trial",
226 |     "variants": "variant",
227 |     "genes": "gene",
228 |     "drugs": "drug",
229 |     "diseases": "disease",
230 |     "nci_organizations": "nci_organization",
231 |     "nci_interventions": "nci_intervention",
232 |     "nci_biomarkers": "nci_biomarker",
233 |     "nci_diseases": "nci_disease",
234 |     # OpenFDA domains
235 |     "fda_adverse_events": "fda_adverse",
236 |     "fda_labels": "fda_label",
237 |     "fda_device_events": "fda_device",
238 |     "fda_approvals": "fda_approval",
239 |     "fda_recalls": "fda_recall",
240 |     "fda_shortages": "fda_shortage",
241 | }
242 | 
243 | # Trial detail sections
244 | TRIAL_DETAIL_SECTIONS = [
245 |     "protocol",
246 |     "locations",
247 |     "outcomes",
248 |     "references",
249 |     "all",
250 |     "full",
251 | ]
252 | 
253 | # ============================================================================
254 | # Field Names and Enums
255 | # ============================================================================
256 | 
257 | # Autocomplete concept types
258 | AUTOCOMPLETE_CONCEPTS = ["variant", "chemical", "disease", "gene"]
259 | 
260 | # HTTP methods
261 | VALID_HTTP_METHODS = ["GET", "POST"]
262 | 
263 | # Trial search defaults
264 | DEFAULT_TRIAL_FORMAT = "csv"
265 | DEFAULT_TRIAL_MARKUP = "markdown"
266 | 
267 | # ============================================================================
268 | # Error Messages
269 | # ============================================================================
270 | 
271 | ERROR_THOUGHT_NUMBER_MIN = "Error: thoughtNumber must be >= 1"
272 | ERROR_TOTAL_THOUGHTS_MIN = "Error: totalThoughts must be >= 1"
273 | ERROR_DOMAIN_REQUIRED = "Either 'query' or 'domain' parameter must be provided"
274 | ERROR_THOUGHT_REQUIRED = (
275 |     "'thought' parameter is required when domain='thinking'"
276 | )
277 | ERROR_THOUGHT_NUMBER_REQUIRED = (
278 |     "'thoughtNumber' parameter is required when domain='thinking'"
279 | )
280 | ERROR_TOTAL_THOUGHTS_REQUIRED = (
281 |     "'totalThoughts' parameter is required when domain='thinking'"
282 | )
283 | ERROR_NEXT_THOUGHT_REQUIRED = (
284 |     "'nextThoughtNeeded' parameter is required when domain='thinking'"
285 | )
286 | 
287 | # ============================================================================
288 | # API Response Formatting
289 | # ============================================================================
290 | 
291 | # Default values for missing data
292 | DEFAULT_TITLE = "Untitled"
293 | DEFAULT_GENE = "Unknown"
294 | DEFAULT_SIGNIFICANCE = "Unknown"
295 | 
296 | # Metadata field names
297 | METADATA_YEAR = "year"
298 | METADATA_JOURNAL = "journal"
299 | METADATA_AUTHORS = "authors"
300 | METADATA_STATUS = "status"
301 | METADATA_PHASE = "phase"
302 | METADATA_START_DATE = "start_date"
303 | METADATA_COMPLETION_DATE = "primary_completion_date"
304 | METADATA_GENE = "gene"
305 | METADATA_RSID = "rsid"
306 | METADATA_SIGNIFICANCE = "clinical_significance"
307 | METADATA_CONSEQUENCE = "consequence"
308 | METADATA_SOURCE = "source"
309 | 
310 | # Result field names
311 | RESULT_ID = "id"
312 | RESULT_TITLE = "title"
313 | RESULT_SNIPPET = "snippet"  # Internal use for domain handlers
314 | RESULT_TEXT = "text"  # OpenAI MCP compliant field name
315 | RESULT_URL = "url"
316 | RESULT_METADATA = "metadata"
317 | RESULT_DATA = "data"
318 | RESULT_PAGE = "page"
319 | RESULT_PAGE_SIZE = "page_size"
320 | RESULT_TOTAL = "total"
321 | RESULT_NEXT_PAGE = "next_page"
322 | 
```

--------------------------------------------------------------------------------
/docs/backend-services-reference/05-nci-cts-api.md:
--------------------------------------------------------------------------------

```markdown
  1 | # NCI Clinical Trials Search API Reference
  2 | 
  3 | The National Cancer Institute's Clinical Trials Search (CTS) API provides advanced search capabilities for cancer clinical trials with enhanced filtering options beyond ClinicalTrials.gov.
  4 | 
  5 | ## Overview
  6 | 
  7 | The NCI CTS API offers:
  8 | 
  9 | - Advanced biomarker and mutation filtering
 10 | - Comprehensive organization database
 11 | - Intervention and drug vocabularies
 12 | - Disease terminology with NCI Thesaurus integration
 13 | - Prior therapy and eligibility criteria
 14 | 
 15 | **Base URL:** `https://clinicaltrialsapi.cancer.gov/api/v2/`
 16 | 
 17 | ## Authentication
 18 | 
 19 | An API key is required for all endpoints.
 20 | 
 21 | ### Obtaining an API Key
 22 | 
 23 | 1. Visit [https://clinicaltrialsapi.cancer.gov/](https://clinicaltrialsapi.cancer.gov/)
 24 | 2. Click "Get API Key"
 25 | 3. Complete registration
 26 | 4. Key is emailed immediately
 27 | 
 28 | ### Using the API Key
 29 | 
 30 | Include in request headers:
 31 | 
 32 | ```
 33 | X-API-KEY: your-api-key-here
 34 | ```
 35 | 
 36 | Or as query parameter:
 37 | 
 38 | ```
 39 | ?api_key=your-api-key-here
 40 | ```
 41 | 
 42 | ## Core Endpoints
 43 | 
 44 | ### 1. Trial Search
 45 | 
 46 | ```
 47 | GET /trials
 48 | ```
 49 | 
 50 | Search for clinical trials with advanced filtering.
 51 | 
 52 | #### Parameters
 53 | 
 54 | **Basic Search:**
 55 | 
 56 | - `keyword`: General text search
 57 | - `nct_id`: Specific NCT identifiers
 58 | - `diseases`: Disease/condition names
 59 | - `interventions`: Treatment names
 60 | 
 61 | **Advanced Filters:**
 62 | 
 63 | - `biomarkers`: Required biomarkers/mutations
 64 | - `prior_therapy_required`: true/false
 65 | - `accepts_brain_mets`: true/false
 66 | - `min_age`: Minimum age in years
 67 | - `max_age`: Maximum age in years
 68 | 
 69 | **Pagination:**
 70 | 
 71 | - `size`: Results per page (max 50)
 72 | - `from`: Starting index (offset)
 73 | 
 74 | #### Example Request
 75 | 
 76 | ```bash
 77 | curl -G "https://clinicaltrialsapi.cancer.gov/api/v2/trials" \
 78 |   -H "X-API-KEY: your-key" \
 79 |   -d "diseases=melanoma" \
 80 |   --data-urlencode "biomarkers=BRAF V600E" \
 81 |   -d "accepts_brain_mets=true" \
 82 |   -d "size=10"
 83 | ```
 84 | 
 85 | #### Response Format
 86 | 
 87 | ```json
 88 | {
 89 |   "total": 42,
 90 |   "trials": [
 91 |     {
 92 |       "nct_id": "NCT04280705",
 93 |       "brief_title": "BRAF/MEK Inhibitor Combination",
 94 |       "current_trial_status": "Active",
 95 |       "phase": "Phase II",
 96 |       "biomarker_eligibility": [
 97 |         {
 98 |           "gene": "BRAF",
 99 |           "variant": "V600E",
100 |           "required": true
101 |         }
102 |       ],
103 |       "sites": [...]
104 |     }
105 |   ]
106 | }
107 | ```
108 | 
109 | ### 2. Trial Details
110 | 
111 | ```
112 | GET /trials/{nct_id}
113 | ```
114 | 
115 | Get comprehensive information about a specific trial.
116 | 
117 | #### Example Request
118 | 
119 | ```bash
120 | curl -X GET "https://clinicaltrialsapi.cancer.gov/api/v2/trials/NCT04280705" \
121 |   -H "X-API-KEY: your-key"
122 | ```
123 | 
124 | ### 3. Organization Search
125 | 
126 | ```
127 | GET /organizations
128 | ```
129 | 
130 | Search for cancer research organizations and treatment centers.
131 | 
132 | #### Parameters
133 | 
134 | - `name`: Organization name
135 | - `org_city`: City location
136 | - `org_state_or_province`: State/province
137 | - `org_country`: Country
138 | - `org_type`: Type (e.g., "NCI-designated", "academic")
139 | 
140 | **Important:** Always use city AND state together to avoid Elasticsearch errors.
141 | 
142 | #### Example Request
143 | 
144 | ```bash
145 | curl -G "https://clinicaltrialsapi.cancer.gov/api/v2/organizations" \
146 |   -H "X-API-KEY: your-key" \
147 |   -d "org_city=Houston" \
148 |   -d "org_state_or_province=TX"
149 | ```
150 | 
151 | ### 4. Organization Details
152 | 
153 | ```
154 | GET /organizations/{org_id}
155 | ```
156 | 
157 | Get details about a specific organization.
158 | 
159 | ### 5. Intervention Search
160 | 
161 | ```
162 | GET /interventions
163 | ```
164 | 
165 | Search for drugs, devices, and procedures used in trials.
166 | 
167 | #### Parameters
168 | 
169 | - `name`: Intervention name
170 | - `type`: Drug, Device, Procedure, etc.
171 | - `synonyms`: Include synonym matches (default: true)
172 | 
173 | #### Example Request
174 | 
175 | ```bash
176 | curl -G "https://clinicaltrialsapi.cancer.gov/api/v2/interventions" \
177 |   -H "X-API-KEY: your-key" \
178 |   -d "name=pembrolizumab" \
179 |   -d "type=Drug"
180 | ```
181 | 
182 | ### 6. Intervention Details
183 | 
184 | ```
185 | GET /interventions/{intervention_id}
186 | ```
187 | 
188 | ### 7. Biomarker Search
189 | 
190 | ```
191 | GET /biomarkers
192 | ```
193 | 
194 | Search for biomarkers used in trial eligibility criteria.
195 | 
196 | #### Parameters
197 | 
198 | - `name`: Biomarker name
199 | - `type`: mutation, expression, etc.
200 | - `gene`: Associated gene symbol
201 | 
202 | ### 8. Disease Search
203 | 
204 | ```
205 | GET /diseases
206 | ```
207 | 
208 | Search NCI's controlled vocabulary of cancer conditions.
209 | 
210 | #### Parameters
211 | 
212 | - `name`: Disease name
213 | - `include_synonyms`: Include synonym matches
214 | - `category`: Disease category
215 | 
216 | ## Advanced Features
217 | 
218 | ### Biomarker-Based Trial Search
219 | 
220 | Find trials requiring specific mutations:
221 | 
222 | ```python
223 | params = {
224 |     "diseases": "non-small cell lung cancer",
225 |     "biomarkers": ["EGFR L858R", "EGFR exon 19 deletion"],
226 |     "prior_therapy_required": False,
227 |     "accepts_brain_mets": True
228 | }
229 | 
230 | response = requests.get(
231 |     "https://clinicaltrialsapi.cancer.gov/api/v2/trials",
232 |     headers={"X-API-KEY": api_key},
233 |     params=params
234 | )
235 | ```
236 | 
237 | ### Complex Eligibility Queries
238 | 
239 | ```python
240 | # Find trials with specific eligibility
241 | params = {
242 |     "diseases": "melanoma",
243 |     "biomarkers": "BRAF V600E",
244 |     "min_age": 18,
245 |     "max_age": 75,
246 |     "prior_therapy": "vemurafenib",  # Exclude if prior vemurafenib
247 |     "performance_status": "0-1"       # ECOG 0 or 1
248 | }
249 | ```
250 | 
251 | ### Organization Network Analysis
252 | 
253 | ```python
254 | # Find all NCI-designated centers in a region
255 | params = {
256 |     "org_type": "NCI-designated",
257 |     "org_state_or_province": ["CA", "OR", "WA"]  # West Coast
258 | }
259 | 
260 | orgs = requests.get(
261 |     "https://clinicaltrialsapi.cancer.gov/api/v2/organizations",
262 |     headers={"X-API-KEY": api_key},
263 |     params=params
264 | )
265 | 
266 | # Get trials at each center
267 | for org in orgs.json()["organizations"]:
268 |     trials = requests.get(
269 |         f"https://clinicaltrialsapi.cancer.gov/api/v2/trials",
270 |         headers={"X-API-KEY": api_key},
271 |         params={"site_org_id": org["id"]}
272 |     )
273 | ```
274 | 
275 | ## Data Models
276 | 
277 | ### Trial Object
278 | 
279 | ```json
280 | {
281 |   "nct_id": "NCT04280705",
282 |   "brief_title": "Study Title",
283 |   "official_title": "Full Protocol Title",
284 |   "current_trial_status": "Active",
285 |   "phase": "Phase II",
286 |   "study_type": "Interventional",
287 |   "primary_purpose": "Treatment",
288 |   "diseases": [
289 |     {
290 |       "name": "Melanoma",
291 |       "nci_thesaurus_id": "C3224"
292 |     }
293 |   ],
294 |   "biomarker_eligibility": [
295 |     {
296 |       "gene": "BRAF",
297 |       "variant": "V600E",
298 |       "required": true,
299 |       "inclusion": true
300 |     }
301 |   ],
302 |   "arms": [...],
303 |   "sites": [...]
304 | }
305 | ```
306 | 
307 | ### Organization Object
308 | 
309 | ```json
310 | {
311 |   "org_id": "NCI-2021-00123",
312 |   "name": "MD Anderson Cancer Center",
313 |   "type": "NCI-designated",
314 |   "address": {
315 |     "city": "Houston",
316 |     "state": "TX",
317 |     "country": "United States",
318 |     "postal_code": "77030"
319 |   },
320 |   "contact": {
321 |     "name": "Clinical Trials Office",
322 |     "phone": "1-800-392-1611",
323 |     "email": "clinical-trials@example.org"
324 |   },
325 |   "active_trials_count": 1250
326 | }
327 | ```
328 | 
329 | ## Error Handling
330 | 
331 | ### Common Errors
332 | 
333 | #### 401 Unauthorized
334 | 
335 | ```json
336 | {
337 |   "error": "Invalid or missing API key"
338 | }
339 | ```
340 | 
341 | #### 400 Bad Request
342 | 
343 | ```json
344 | {
345 |   "error": "Invalid parameter combination",
346 |   "details": "Must specify both city AND state for location search"
347 | }
348 | ```
349 | 
350 | #### 429 Rate Limited
351 | 
352 | ```json
353 | {
354 |   "error": "Rate limit exceeded",
355 |   "retry_after": 3600
356 | }
357 | ```
358 | 
359 | ### Best Practices
360 | 
361 | 1. **Always use city AND state together** for location searches
362 | 2. **Handle missing totals** - the API may omit the total count when the `size` parameter is set
363 | 3. **Use specific searches** - broad queries may timeout
364 | 4. **Implement retry logic** for rate limits
365 | 
366 | ## Rate Limits
367 | 
368 | - **With API Key**: 1,000 requests/day
369 | - **Burst Rate**: 10 requests/second
370 | - **Without Key**: Not supported
371 | 
372 | ## Differences from ClinicalTrials.gov
373 | 
374 | ### Enhanced Features
375 | 
376 | - **Biomarker search**: Mutation-specific queries
377 | - **Prior therapy**: Exclude based on previous treatments
378 | - **Brain metastases**: Specific acceptance criteria
379 | - **Performance status**: ECOG/Karnofsky filtering
380 | 
381 | ### Limitations
382 | 
383 | - **Cancer trials only**: Limited to oncology studies
384 | - **No offset pagination**: Must use size parameter carefully
385 | - **Location parameters**: Different naming (org\_ prefix)
386 | 
387 | ## Integration Examples
388 | 
389 | ### Example 1: Precision Medicine Search
390 | 
391 | ```python
392 | async def find_precision_trials(mutation, cancer_type, location):
393 |     """Find trials for specific mutation in cancer type near location"""
394 | 
395 |     # Search for trials
396 |     trial_params = {
397 |         "diseases": cancer_type,
398 |         "biomarkers": mutation,
399 |         "accepts_brain_mets": True,
400 |         "size": 50
401 |     }
402 | 
403 |     trials = await fetch_nci_api("trials", trial_params)
404 | 
405 |     # Filter by location if provided
406 |     if location:
407 |         nearby_trials = []
408 |         for trial in trials["trials"]:
409 |             for site in trial.get("sites", []):
410 |                 distance = calculate_distance(location, site["coordinates"])
411 |                 if distance < 100:  # 100 miles
412 |                     nearby_trials.append(trial)
413 |                     break
414 | 
415 |         return nearby_trials
416 | 
417 |     return trials["trials"]
418 | ```
419 | 
420 | ### Example 2: Biomarker-Driven Pipeline
421 | 
422 | ```python
def biomarker_trial_pipeline(gene, variant):
    """Go from a gene/variant pair to a phase-sorted list of unique trials.

    Steps: look up matching biomarkers, fetch the trials for each biomarker
    id, then deduplicate on ``nct_id`` and sort by the ``phase`` field.
    """
    # NOTE(review): `api_key` and `requests` are not defined inside this
    # snippet; they must be provided by the surrounding scope.

    # Step 1: biomarker lookup
    biomarker_payload = requests.get(
        "https://clinicaltrialsapi.cancer.gov/api/v2/biomarkers",
        headers={"X-API-KEY": api_key},
        params={"gene": gene, "name": variant},
    ).json()

    # Step 2: one trial query per biomarker
    collected = []
    for entry in biomarker_payload.get("biomarkers", []):
        trial_payload = requests.get(
            "https://clinicaltrialsapi.cancer.gov/api/v2/trials",
            headers={"X-API-KEY": api_key},
            params={"biomarker_id": entry["id"]},
        ).json()
        collected += trial_payload.get("trials", [])

    # Step 3: keep one entry per NCT id (later duplicates overwrite), sort by phase
    by_nct_id = {}
    for trial in collected:
        by_nct_id[trial["nct_id"]] = trial
    return sorted(by_nct_id.values(), key=lambda trial: trial.get("phase", ""))
446 | ```
447 | 
448 | ## Support Resources
449 | 
450 | - **API Documentation**: [https://clinicaltrialsapi.cancer.gov/](https://clinicaltrialsapi.cancer.gov/)
451 | - **Support Email**: [email protected]
452 | - **Status Page**: [https://status.cancer.gov/](https://status.cancer.gov/)
453 | - **Terms of Use**: [https://clinicaltrialsapi.cancer.gov/terms](https://clinicaltrialsapi.cancer.gov/terms)
454 | 
```

--------------------------------------------------------------------------------
/src/biomcp/openfda/drug_approvals.py:
--------------------------------------------------------------------------------

```python
  1 | """
  2 | OpenFDA drug approvals (Drugs@FDA) integration.
  3 | """
  4 | 
  5 | import logging
  6 | from typing import Any
  7 | 
  8 | from .constants import (
  9 |     OPENFDA_DEFAULT_LIMIT,
 10 |     OPENFDA_DISCLAIMER,
 11 |     OPENFDA_DRUGSFDA_URL,
 12 | )
 13 | from .utils import (
 14 |     format_count,
 15 |     make_openfda_request,
 16 | )
 17 | 
 18 | logger = logging.getLogger(__name__)
 19 | 
 20 | 
 21 | async def search_drug_approvals(
 22 |     drug: str | None = None,
 23 |     application_number: str | None = None,
 24 |     approval_year: str | None = None,
 25 |     limit: int = OPENFDA_DEFAULT_LIMIT,
 26 |     skip: int = 0,
 27 |     api_key: str | None = None,
 28 | ) -> str:
 29 |     """
 30 |     Search FDA drug approval records from Drugs@FDA.
 31 | 
 32 |     Args:
 33 |         drug: Drug name (brand or generic) to search for
 34 |         application_number: NDA or BLA application number
 35 |         approval_year: Year of approval (YYYY format)
 36 |         limit: Maximum number of results to return
 37 |         skip: Number of results to skip (for pagination)
 38 | 
 39 |         api_key: Optional OpenFDA API key (overrides OPENFDA_API_KEY env var)
 40 | 
 41 |     Returns:
 42 |         Formatted string with drug approval information
 43 |     """
 44 |     # Build search query
 45 |     search_params = {}
 46 | 
 47 |     if drug:
 48 |         # Search both brand and generic names
 49 |         search_params["search"] = (
 50 |             f'(openfda.brand_name:"{drug}" OR '
 51 |             f'openfda.generic_name:"{drug}" OR '
 52 |             f'openfda.substance_name:"{drug}")'
 53 |         )
 54 |     elif application_number:
 55 |         search_params["search"] = f'application_number:"{application_number}"'
 56 |     elif approval_year:
 57 |         # Search for approvals in a specific year
 58 |         search_params["search"] = (
 59 |             f"products.marketing_status_date:[{approval_year}-01-01 TO {approval_year}-12-31]"
 60 |         )
 61 | 
 62 |     # Add pagination
 63 |     search_params["limit"] = str(min(limit, 100))
 64 |     search_params["skip"] = str(skip)
 65 | 
 66 |     # Sort by submission date (most recent first)
 67 |     search_params["sort"] = "submissions.submission_status_date:desc"
 68 | 
 69 |     # Make the request
 70 |     response, error = await make_openfda_request(
 71 |         OPENFDA_DRUGSFDA_URL, search_params, "openfda_approvals", api_key
 72 |     )
 73 | 
 74 |     if error:
 75 |         return f"⚠️ Error searching drug approvals: {error}"
 76 | 
 77 |     if not response or not response.get("results"):
 78 |         return "No drug approval records found matching your criteria."
 79 | 
 80 |     # Format the results
 81 |     results = response["results"]
 82 |     total = (
 83 |         response.get("meta", {}).get("results", {}).get("total", len(results))
 84 |     )
 85 | 
 86 |     output = ["## FDA Drug Approval Records\n"]
 87 | 
 88 |     if drug:
 89 |         output.append(f"**Drug**: {drug}")
 90 |     if application_number:
 91 |         output.append(f"**Application**: {application_number}")
 92 |     if approval_year:
 93 |         output.append(f"**Approval Year**: {approval_year}")
 94 | 
 95 |     output.append(
 96 |         f"**Total Records Found**: {format_count(total, 'record')}\n"
 97 |     )
 98 | 
 99 |     # Show results
100 |     output.append(f"### Results (showing {len(results)} of {total}):\n")
101 | 
102 |     for i, record in enumerate(results, 1):
103 |         output.extend(_format_approval_summary(record, i))
104 | 
105 |     output.append(f"\n{OPENFDA_DISCLAIMER}")
106 | 
107 |     return "\n".join(output)
108 | 
109 | 
async def get_drug_approval(
    application_number: str,
    api_key: str | None = None,
) -> str:
    """
    Fetch one Drugs@FDA application and render it as a detailed report.

    Args:
        application_number: NDA or BLA application number
        api_key: Optional OpenFDA API key (overrides OPENFDA_API_KEY env var)

    Returns:
        Formatted string with detailed approval information, or a short
        message when the request fails or no record is found.
    """
    query = {
        "search": f'application_number:"{application_number}"',
        "limit": 1,
    }
    response, error = await make_openfda_request(
        OPENFDA_DRUGSFDA_URL, query, "openfda_approvals", api_key
    )

    if error:
        return f"⚠️ Error retrieving drug approval: {error}"

    matches = response.get("results") if response else None
    if not matches:
        return f"No approval record found for application {application_number}"

    record = matches[0]

    sections = [f"## Drug Approval Details: {application_number}\n"]
    sections += _format_approval_header(record)

    # Optional sections, rendered in this order when the record has them.
    optional_sections = (
        ("products", _format_products),
        ("submissions", _format_submissions),
        ("openfda", _format_openfda_metadata),
    )
    for key, formatter in optional_sections:
        payload = record.get(key)
        if payload:
            sections += formatter(payload)

    sections.append(f"\n{OPENFDA_DISCLAIMER}")
    return "\n".join(sections)
164 | 
165 | 
166 | def _format_approval_summary(record: dict[str, Any], num: int) -> list[str]:
167 |     """Format a single approval record summary."""
168 |     output = [
169 |         f"#### {num}. Application {record.get('application_number', 'Unknown')}"
170 |     ]
171 | 
172 |     # Get sponsor/applicant
173 |     if sponsor := record.get("sponsor_name"):
174 |         output.append(f"**Sponsor**: {sponsor}")
175 | 
176 |     # Get drug names from OpenFDA data
177 |     openfda = record.get("openfda", {})
178 |     if brand_names := openfda.get("brand_name"):
179 |         output.append(f"**Brand Name(s)**: {', '.join(brand_names[:3])}")
180 |     if generic_names := openfda.get("generic_name"):
181 |         output.append(f"**Generic Name(s)**: {', '.join(generic_names[:3])}")
182 | 
183 |     # Get products and their approval dates
184 |     if products := record.get("products"):
185 |         output.append("\n**Products**:")
186 |         for prod in products[:3]:
187 |             prod_num = prod.get("product_number", "?")
188 |             dosage = prod.get("dosage_form", "")
189 |             strength = prod.get("strength", "")
190 |             status = prod.get("marketing_status", "")
191 | 
192 |             prod_line = f"- Product {prod_num}: {dosage}"
193 |             if strength:
194 |                 prod_line += f" ({strength})"
195 |             if status:
196 |                 prod_line += f" - {status}"
197 |             output.append(prod_line)
198 | 
199 |     # Get most recent submission
200 |     if submissions := record.get("submissions"):
201 |         # Sort by date to get most recent
202 |         recent = submissions[0]
203 |         sub_type = recent.get("submission_type", "")
204 |         sub_status = recent.get("submission_status", "")
205 |         sub_date = recent.get("submission_status_date", "")
206 | 
207 |         if sub_date:
208 |             output.append(
209 |                 f"\n**Latest Activity**: {sub_type} - {sub_status} ({sub_date})"
210 |             )
211 | 
212 |     output.append("")
213 |     return output
214 | 
215 | 
216 | def _format_approval_header(record: dict[str, Any]) -> list[str]:
217 |     """Format the header section of detailed approval."""
218 |     output = ["### Application Information"]
219 | 
220 |     output.append(
221 |         f"**Application Number**: {record.get('application_number', 'Unknown')}"
222 |     )
223 | 
224 |     if sponsor := record.get("sponsor_name"):
225 |         output.append(f"**Sponsor**: {sponsor}")
226 | 
227 |     # OpenFDA names
228 |     openfda = record.get("openfda", {})
229 |     if brand_names := openfda.get("brand_name"):
230 |         output.append(f"**Brand Names**: {', '.join(brand_names)}")
231 |     if generic_names := openfda.get("generic_name"):
232 |         output.append(f"**Generic Names**: {', '.join(generic_names)}")
233 |     if substances := openfda.get("substance_name"):
234 |         output.append(f"**Active Substances**: {', '.join(substances)}")
235 | 
236 |     output.append("")
237 |     return output
238 | 
239 | 
240 | def _format_products(products: list[dict[str, Any]]) -> list[str]:
241 |     """Format product information."""
242 |     output = ["### Products"]
243 | 
244 |     for prod in products:
245 |         prod_num = prod.get("product_number", "Unknown")
246 |         output.append(f"\n#### Product {prod_num}")
247 | 
248 |         if dosage := prod.get("dosage_form"):
249 |             output.append(f"**Dosage Form**: {dosage}")
250 |         if strength := prod.get("strength"):
251 |             output.append(f"**Strength**: {strength}")
252 |         if route := prod.get("route"):
253 |             output.append(f"**Route**: {route}")
254 |         if status := prod.get("marketing_status"):
255 |             output.append(f"**Marketing Status**: {status}")
256 |         if status_date := prod.get("marketing_status_date"):
257 |             output.append(f"**Status Date**: {status_date}")
258 |         if te_code := prod.get("te_code"):
259 |             output.append(f"**Therapeutic Equivalence**: {te_code}")
260 | 
261 |     output.append("")
262 |     return output
263 | 
264 | 
265 | def _format_submissions(submissions: list[dict[str, Any]]) -> list[str]:
266 |     """Format submission history."""
267 |     output = ["### Submission History"]
268 | 
269 |     # Show most recent 5 submissions
270 |     for sub in submissions[:5]:
271 |         sub_num = sub.get("submission_number", "?")
272 |         sub_type = sub.get("submission_type", "Unknown")
273 |         sub_status = sub.get("submission_status", "")
274 |         sub_date = sub.get("submission_status_date", "")
275 | 
276 |         output.append(f"\n**Submission {sub_num}**: {sub_type}")
277 |         if sub_status:
278 |             output.append(f"- Status: {sub_status}")
279 |         if sub_date:
280 |             output.append(f"- Date: {sub_date}")
281 | 
282 |         # Review priority if present
283 |         if priority := sub.get("review_priority"):
284 |             output.append(f"- Review Priority: {priority}")
285 | 
286 |         # Submission class if present
287 |         if sub_class := sub.get("submission_class_code"):
288 |             class_desc = sub.get("submission_class_code_description", "")
289 |             output.append(f"- Class: {sub_class} - {class_desc}")
290 | 
291 |     output.append("")
292 |     return output
293 | 
294 | 
295 | def _format_openfda_metadata(openfda: dict[str, Any]) -> list[str]:
296 |     """Format OpenFDA metadata."""
297 |     output = ["### Additional Information"]
298 | 
299 |     if nui := openfda.get("nui"):
300 |         output.append(f"**NUI Codes**: {', '.join(nui[:5])}")
301 | 
302 |     if pharm_class := openfda.get("pharm_class_epc"):
303 |         output.append(f"**Pharmacologic Class**: {', '.join(pharm_class[:3])}")
304 | 
305 |     if moa := openfda.get("pharm_class_moa"):
306 |         output.append(f"**Mechanism of Action**: {', '.join(moa[:3])}")
307 | 
308 |     if unii := openfda.get("unii"):
309 |         output.append(f"**UNII Codes**: {', '.join(unii[:5])}")
310 | 
311 |     output.append("")
312 |     return output
313 | 
```

--------------------------------------------------------------------------------
/tests/tdd/variants/test_alphagenome_comprehensive.py:
--------------------------------------------------------------------------------

```python
  1 | """Comprehensive tests for AlphaGenome integration."""
  2 | 
  3 | from unittest.mock import MagicMock, patch
  4 | 
  5 | import pandas as pd
  6 | import pytest
  7 | 
  8 | from biomcp.variants.alphagenome import (
  9 |     _validate_inputs,
 10 |     predict_variant_effects,
 11 | )
 12 | 
 13 | 
 14 | class TestInputValidation:
 15 |     """Test input validation for AlphaGenome."""
 16 | 
 17 |     def test_valid_chromosomes(self):
 18 |         """Test validation accepts valid chromosome formats."""
 19 |         valid_chroms = ["chr1", "chr22", "chrX", "chrY", "chrM", "chrMT"]
 20 |         for chrom in valid_chroms:
 21 |             # Should not raise
 22 |             _validate_inputs(chrom, 100, "A", "T")
 23 | 
 24 |     def test_invalid_chromosomes(self):
 25 |         """Test validation rejects invalid chromosome formats."""
 26 |         invalid_chroms = ["1", "chr23", "chrZ", "chromosome1", "Chr1", ""]
 27 |         for chrom in invalid_chroms:
 28 |             with pytest.raises(ValueError, match="Invalid chromosome format"):
 29 |                 _validate_inputs(chrom, 100, "A", "T")
 30 | 
 31 |     def test_invalid_position(self):
 32 |         """Test validation rejects invalid positions."""
 33 |         with pytest.raises(ValueError, match="Position must be >= 1"):
 34 |             _validate_inputs("chr1", 0, "A", "T")
 35 |         with pytest.raises(ValueError, match="Position must be >= 1"):
 36 |             _validate_inputs("chr1", -10, "A", "T")
 37 | 
 38 |     def test_valid_nucleotides(self):
 39 |         """Test validation accepts valid nucleotides."""
 40 |         valid_cases = [
 41 |             ("A", "T"),
 42 |             ("C", "G"),
 43 |             ("ACGT", "TGCA"),
 44 |             ("a", "t"),
 45 |             ("acgt", "tgca"),  # lowercase should work
 46 |         ]
 47 |         for ref, alt in valid_cases:
 48 |             # Should not raise
 49 |             _validate_inputs("chr1", 100, ref, alt)
 50 | 
 51 |     def test_invalid_nucleotides(self):
 52 |         """Test validation rejects invalid nucleotides."""
 53 |         invalid_cases = [("N", "A"), ("A", "U"), ("AXG", "T"), ("A", "123")]
 54 |         for ref, alt in invalid_cases:
 55 |             with pytest.raises(ValueError, match="Invalid nucleotides"):
 56 |                 _validate_inputs("chr1", 100, ref, alt)
 57 | 
 58 |     def test_empty_alleles(self):
 59 |         """Test validation rejects empty alleles."""
 60 |         with pytest.raises(
 61 |             ValueError, match="Reference allele cannot be empty"
 62 |         ):
 63 |             _validate_inputs("chr1", 100, "", "A")
 64 |         with pytest.raises(
 65 |             ValueError, match="Alternate allele cannot be empty"
 66 |         ):
 67 |             _validate_inputs("chr1", 100, "A", "")
 68 | 
 69 | 
 70 | class TestIntervalSizeCalculation:
 71 |     """Test interval size selection logic."""
 72 | 
 73 |     @pytest.mark.asyncio
 74 |     async def test_interval_size_edge_cases(self):
 75 |         """Test interval size selection for edge cases."""
 76 |         with patch.dict("os.environ", {}, clear=True):
 77 |             # Without API key, we should get early return
 78 |             result = await predict_variant_effects(
 79 |                 chromosome="chr1",
 80 |                 position=100,
 81 |                 reference="A",
 82 |                 alternate="T",
 83 |                 interval_size=2000000,  # Larger than max
 84 |             )
 85 |             assert "AlphaGenome API key required" in result
 86 | 
 87 | 
 88 | class TestCaching:
 89 |     """Test caching behavior."""
 90 | 
 91 |     @pytest.mark.asyncio
 92 |     async def test_skip_cache_parameter(self):
 93 |         """Test that skip_cache parameter works."""
 94 |         with patch.dict("os.environ", {}, clear=True):
 95 |             # First call
 96 |             result1 = await predict_variant_effects(
 97 |                 chromosome="chr1",
 98 |                 position=100,
 99 |                 reference="A",
100 |                 alternate="T",
101 |                 skip_cache=True,
102 |             )
103 | 
104 |             # Second call with skip_cache
105 |             result2 = await predict_variant_effects(
106 |                 chromosome="chr1",
107 |                 position=100,
108 |                 reference="A",
109 |                 alternate="T",
110 |                 skip_cache=True,
111 |             )
112 | 
113 |             # Both should show API key error
114 |             assert "AlphaGenome API key required" in result1
115 |             assert "AlphaGenome API key required" in result2
116 | 
117 | 
class TestErrorHandling:
    """Error reporting and input-validation failures."""

    @pytest.mark.asyncio
    async def test_error_context_with_api_key(self):
        """A failed prediction message carries the request context."""
        fake_env = {"ALPHAGENOME_API_KEY": "test-key"}
        with patch.dict("os.environ", fake_env):
            outcome = await predict_variant_effects(
                chromosome="chr1",
                position=100,
                reference="A",
                alternate="T",
                tissue_types=["UBERON:0002367"],
                skip_cache=True,
            )

            # Only the "prediction failed" path is checked; any other
            # outcome (e.g. an import error message) passes unasserted.
            if "AlphaGenome prediction failed" in outcome:
                for fragment in ("Context:", "chr1:100 A>T", "Tissue types:"):
                    assert fragment in outcome

    @pytest.mark.asyncio
    async def test_input_validation_errors(self):
        """Invalid inputs raise ValueError from predict_variant_effects."""
        fake_env = {"ALPHAGENOME_API_KEY": "test-key"}
        with patch.dict("os.environ", fake_env):
            # Invalid chromosome
            with pytest.raises(ValueError, match="Invalid chromosome format"):
                await predict_variant_effects(
                    chromosome="invalid",
                    position=100,
                    reference="A",
                    alternate="T",
                )

            # Invalid nucleotides
            with pytest.raises(ValueError, match="Invalid nucleotides"):
                await predict_variant_effects(
                    chromosome="chr1",
                    position=100,
                    reference="X",
                    alternate="T",
                )
161 | 
162 | 
class TestThresholdParameter:
    """significance_threshold argument of predict_variant_effects."""

    @pytest.mark.asyncio
    async def test_custom_threshold(self):
        """An explicit threshold is accepted as a parameter."""
        with patch.dict("os.environ", {}, clear=True):
            outcome = await predict_variant_effects(
                chromosome="chr1",
                position=100,
                reference="A",
                alternate="T",
                significance_threshold=0.8,
            )

            # Reaching the API key error means the parameter was accepted
            assert "AlphaGenome API key required" in outcome

    @pytest.mark.asyncio
    async def test_default_threshold(self):
        """The call also works when no threshold is passed."""
        with patch.dict("os.environ", {}, clear=True):
            outcome = await predict_variant_effects(
                chromosome="chr1",
                position=100,
                reference="A",
                alternate="T",
            )

            # Same early return as with an explicit threshold
            assert "AlphaGenome API key required" in outcome
196 | 
197 | 
class TestIntegration:
    """Integration tests with mocked AlphaGenome."""

    @pytest.mark.asyncio
    async def test_successful_prediction_mock(self):
        """Test successful prediction with mocked AlphaGenome.

        The ``alphagenome`` submodules are replaced in ``sys.modules`` with
        mocks, then the test checks how the client, interval and variant
        were constructed and that the rendered report mentions the mocked
        gene.
        """
        with patch.dict("os.environ", {"ALPHAGENOME_API_KEY": "test-key"}):
            # Mock the AlphaGenome imports
            mock_genome = MagicMock()
            mock_dna_client = MagicMock()
            mock_variant_scorers = MagicMock()

            # Mock the model
            mock_model = MagicMock()
            mock_dna_client.create.return_value = mock_model

            # Mock scorers
            mock_variant_scorers.get_recommended_scorers.return_value = [
                "scorer1"
            ]

            # Mock scores DataFrame (a single RNA_SEQ row for GENE1)
            mock_df = pd.DataFrame({
                "output_type": ["RNA_SEQ"],
                "raw_score": [1.0],
                "gene_name": ["GENE1"],
                "track_name": ["tissue1"],
            })
            mock_variant_scorers.tidy_scores.return_value = mock_df

            # Mock score_variant to return mock scores
            mock_model.score_variant.return_value = [MagicMock()]

            # Patch the imports: both the leaf modules and their parent
            # packages are registered, with the parents exposing the same
            # mock objects as attributes.
            with patch.dict(
                "sys.modules",
                {
                    "alphagenome.data.genome": mock_genome,
                    "alphagenome.models.dna_client": mock_dna_client,
                    "alphagenome.models.variant_scorers": mock_variant_scorers,
                    "alphagenome.data": MagicMock(genome=mock_genome),
                    "alphagenome.models": MagicMock(
                        dna_client=mock_dna_client,
                        variant_scorers=mock_variant_scorers,
                    ),
                },
            ):
                result = await predict_variant_effects(
                    chromosome="chr7",
                    position=140753336,
                    reference="A",
                    alternate="T",
                    interval_size=131072,
                    skip_cache=True,
                )

                # Check model was created with API key
                mock_dna_client.create.assert_called_once_with("test-key")

                # Check interval was created correctly: 65536 is half of
                # the requested 131072, and the extra -1 converts the
                # position to 0-based.
                mock_genome.Interval.assert_called_once()
                call_args = mock_genome.Interval.call_args
                assert (
                    call_args[1]["start"] == 140753336 - 65536 - 1
                )  # 0-based
                assert call_args[1]["end"] == call_args[1]["start"] + 131072

                # Check variant was created
                mock_genome.Variant.assert_called_once_with(
                    chromosome="chr7",
                    position=140753336,
                    reference_bases="A",
                    alternate_bases="T",
                )

                # Check result contains expected formatting
                assert "AlphaGenome Variant Effect Predictions" in result
                assert "Gene Expression" in result
                assert "GENE1" in result
277 | 
```

--------------------------------------------------------------------------------
/src/biomcp/trials/getter.py:
--------------------------------------------------------------------------------

```python
  1 | import json
  2 | import logging
  3 | from ssl import TLSVersion
  4 | from typing import Annotated, Any
  5 | 
  6 | from .. import StrEnum, http_client, render
  7 | from ..constants import CLINICAL_TRIALS_BASE_URL
  8 | 
  9 | logger = logging.getLogger(__name__)
 10 | 
 11 | 
class Module(StrEnum):
    """Named views of a clinical trial record.

    Each member is a key of the ``modules`` mapping, which lists the API
    field groups requested for that view.
    """

    PROTOCOL = "Protocol"
    LOCATIONS = "Locations"
    REFERENCES = "References"
    OUTCOMES = "Outcomes"
    ALL = "All"
 18 | 
 19 | 
 20 | modules: dict[Module, list[str]] = {
 21 |     Module.PROTOCOL: [
 22 |         "IdentificationModule",
 23 |         "StatusModule",
 24 |         "SponsorCollaboratorsModule",
 25 |         "OversightModule",
 26 |         "DescriptionModule",
 27 |         "ConditionsModule",
 28 |         "DesignModule",
 29 |         "ArmsInterventionsModule",
 30 |         "EligibilityModule",
 31 |     ],
 32 |     Module.LOCATIONS: ["ContactsLocationsModule"],
 33 |     Module.REFERENCES: ["ReferencesModule"],
 34 |     Module.OUTCOMES: ["OutcomesModule", "ResultsSection"],
 35 |     Module.ALL: [
 36 |         "IdentificationModule",
 37 |         "StatusModule",
 38 |         "SponsorCollaboratorsModule",
 39 |         "OversightModule",
 40 |         "DescriptionModule",
 41 |         "ConditionsModule",
 42 |         "DesignModule",
 43 |         "ArmsInterventionsModule",
 44 |         "EligibilityModule",
 45 |         "ContactsLocationsModule",
 46 |         "ReferencesModule",
 47 |         "OutcomesModule",
 48 |         "ResultsSection",
 49 |     ],
 50 | }
 51 | 
 52 | 
 53 | async def get_trial(
 54 |     nct_id: str,
 55 |     module: Module = Module.PROTOCOL,
 56 |     output_json: bool = False,
 57 | ) -> str:
 58 |     """Get details of a clinical trial by module."""
 59 |     fields = ",".join(modules[module])
 60 |     params = {"fields": fields}
 61 |     url = f"{CLINICAL_TRIALS_BASE_URL}/{nct_id}"
 62 | 
 63 |     logger.debug(f"Fetching trial {nct_id} with module {module.value}")
 64 |     logger.debug(f"URL: {url}, Params: {params}")
 65 | 
 66 |     parsed_data: dict[str, Any] | None
 67 |     error_obj: http_client.RequestError | None
 68 |     parsed_data, error_obj = await http_client.request_api(
 69 |         url=url,
 70 |         request=params,
 71 |         method="GET",
 72 |         tls_version=TLSVersion.TLSv1_2,
 73 |         response_model_type=None,
 74 |         domain="clinicaltrials",
 75 |     )
 76 | 
 77 |     data_to_return: dict[str, Any]
 78 | 
 79 |     if error_obj:
 80 |         logger.error(
 81 |             f"API Error for {nct_id}: {error_obj.code} - {error_obj.message}"
 82 |         )
 83 |         data_to_return = {
 84 |             "error": f"API Error {error_obj.code}",
 85 |             "details": error_obj.message,
 86 |         }
 87 |     elif parsed_data:
 88 |         # ClinicalTrials.gov API returns data wrapped in a "studies" array
 89 |         # Extract the first study if it exists
 90 |         if isinstance(parsed_data, dict) and "studies" in parsed_data:
 91 |             studies = parsed_data.get("studies", [])
 92 |             if studies and len(studies) > 0:
 93 |                 data_to_return = studies[0]
 94 |                 data_to_return["URL"] = (
 95 |                     f"https://clinicaltrials.gov/study/{nct_id}"
 96 |                 )
 97 |             else:
 98 |                 logger.warning(f"No studies found in response for {nct_id}")
 99 |                 data_to_return = {
100 |                     "error": f"No studies found for {nct_id}",
101 |                     "details": "API returned empty studies array",
102 |                 }
103 |         else:
104 |             # Handle case where API returns data in unexpected format
105 |             logger.debug(
106 |                 f"Unexpected response format for {nct_id}: {type(parsed_data)}"
107 |             )
108 |             data_to_return = parsed_data
109 |             data_to_return["URL"] = (
110 |                 f"https://clinicaltrials.gov/study/{nct_id}"
111 |             )
112 |     else:
113 |         logger.warning(
114 |             f"No data received for {nct_id} with module {module.value}"
115 |         )
116 |         data_to_return = {
117 |             "error": f"No data found for {nct_id} with module {module.value}",
118 |             "details": "API returned no data",
119 |         }
120 | 
121 |     if output_json:
122 |         return json.dumps(data_to_return, indent=2)
123 |     else:
124 |         return render.to_markdown(data_to_return)
125 | 
126 | 
async def _trial_protocol(
    call_benefit: Annotated[
        str,
        "Define and summarize why this function is being called and the intended benefit",
    ],
    nct_id: str,
) -> str:
    """
    Retrieves core protocol information for a single clinical
    trial identified by its NCT ID.

    Parameters:
    - call_benefit: Define and summarize why this function is being called and the intended benefit
    - nct_id: A single NCT ID (string, e.g., "NCT04280705")

    Process: Fetches standard "Protocol" view modules (like ID,
             Status, Sponsor, Design, Eligibility) from the
             ClinicalTrials.gov v2 API.
    Output: A Markdown formatted string detailing title, status,
            sponsor, purpose, study design, phase, interventions,
            eligibility criteria, etc. Returns error if invalid.
    """
    # call_benefit is not referenced here; only nct_id is forwarded.
    return await get_trial(nct_id, Module.PROTOCOL)
150 | 
151 | 
async def _trial_locations(
    call_benefit: Annotated[
        str,
        "Define and summarize why this function is being called and the intended benefit",
    ],
    nct_id: str,
) -> str:
    """
    Retrieves contact and location details for a single
    clinical trial identified by its NCT ID.

    Parameters:
    - call_benefit: Define and summarize why this function is being called and the intended benefit
    - nct_id: A single NCT ID (string, e.g., "NCT04280705")

    Process: Fetches the `ContactsLocationsModule` from the
             ClinicalTrials.gov v2 API for the given NCT ID.
    Output: A Markdown formatted string detailing facility names,
            addresses (city, state, country), and contact info.
            Returns an error message if the NCT ID is invalid.
    """
    # call_benefit is not referenced here; only nct_id is forwarded.
    return await get_trial(nct_id, Module.LOCATIONS)
174 | 
175 | 
176 | async def _trial_outcomes(
177 |     call_benefit: Annotated[
178 |         str,
179 |         "Define and summarize why this function is being called and the intended benefit",
180 |     ],
181 |     nct_id: str,
182 | ) -> str:
183 |     """
184 |     Retrieves outcome measures, results (if available), and
185 |     adverse event data for a single clinical trial.
186 | 
187 |     Parameters:
188 |     - call_benefit: Define and summarize why this function is being called and the intended benefit
189 |     - nct_id: A single NCT ID (string, e.g., "NCT04280705")
190 | 
191 |     Process: Fetches the `OutcomesModule` and `ResultsSection`
192 |              from the ClinicalTrials.gov v2 API for the NCT ID.
193 |     Output: A Markdown formatted string detailing primary/secondary
194 |             outcomes, participant flow, results tables (if posted),
195 |             and adverse event summaries. Returns an error if invalid.
196 |     """
197 |     return await get_trial(nct_id, Module.OUTCOMES)
198 | 
199 | 
200 | async def _trial_references(
201 |     call_benefit: Annotated[
202 |         str,
203 |         "Define and summarize why this function is being called and the intended benefit",
204 |     ],
205 |     nct_id: str,
206 | ):
207 |     """
208 |     Retrieves publications and other references associated with
209 |     a single clinical trial identified by its NCT ID.
210 | 
211 |     Parameters:
212 |     - call_benefit: Define and summarize why this function is being called and the intended benefit
213 |     - nct_id: A single NCT ID (string, e.g., "NCT04280705")
214 | 
215 |     Process: Fetches the `ReferencesModule` from the
216 |              ClinicalTrials.gov v2 API for the NCT ID.
217 |     Output: A Markdown formatted string listing citations,
218 |             associated PubMed IDs (PMIDs), and reference types
219 |             (e.g., result publication). Returns error if invalid.
220 |     """
221 |     return await get_trial(nct_id, Module.REFERENCES)
222 | 
223 | 
224 | async def get_trial_unified(
225 |     nct_id: str,
226 |     source: str = "clinicaltrials",
227 |     api_key: str | None = None,
228 |     sections: list[str] | None = None,
229 | ) -> str:
230 |     """
231 |     Get trial details from either ClinicalTrials.gov or NCI CTS API.
232 | 
233 |     Args:
234 |         nct_id: NCT identifier (e.g., "NCT04280705")
235 |         source: Data source - "clinicaltrials" (default) or "nci"
236 |         api_key: API key for NCI (required if source="nci")
237 |         sections: List of sections to include (for clinicaltrials.gov)
238 |                  Options: ["protocol", "locations", "outcomes", "references", "all"]
239 | 
240 |     Returns:
241 |         Formatted markdown string with trial details
242 |     """
243 |     if source == "nci":
244 |         # Import here to avoid circular imports
245 |         from .nci_getter import format_nci_trial_details, get_trial_nci
246 | 
247 |         trial_data = await get_trial_nci(nct_id, api_key)
248 |         return await format_nci_trial_details(trial_data, api_key)
249 |     else:
250 |         # Default to ClinicalTrials.gov
251 |         if sections and "all" in sections:
252 |             return await get_trial(nct_id, Module.ALL)
253 |         elif sections:
254 |             # Get specific sections
255 |             results = []
256 |             for section in sections:
257 |                 if section == "protocol":
258 |                     results.append(
259 |                         await _trial_protocol(
260 |                             call_benefit=f"Getting protocol information for trial {nct_id}",
261 |                             nct_id=nct_id,
262 |                         )
263 |                     )
264 |                 elif section == "locations":
265 |                     results.append(
266 |                         await _trial_locations(
267 |                             call_benefit=f"Getting locations for trial {nct_id}",
268 |                             nct_id=nct_id,
269 |                         )
270 |                     )
271 |                 elif section == "outcomes":
272 |                     results.append(
273 |                         await _trial_outcomes(
274 |                             call_benefit=f"Getting outcomes for trial {nct_id}",
275 |                             nct_id=nct_id,
276 |                         )
277 |                     )
278 |                 elif section == "references":
279 |                     results.append(
280 |                         await _trial_references(
281 |                             call_benefit=f"Getting references for trial {nct_id}",
282 |                             nct_id=nct_id,
283 |                         )
284 |                     )
285 |             return "\n\n---\n\n".join(results)
286 |         else:
287 |             # Default to protocol only
288 |             return await _trial_protocol(
289 |                 call_benefit=f"Getting trial protocol details for {nct_id}",
290 |                 nct_id=nct_id,
291 |             )
292 | 
```

--------------------------------------------------------------------------------
/src/biomcp/biomarkers/search.py:
--------------------------------------------------------------------------------

```python
  1 | """Search functionality for biomarkers via NCI CTS API.
  2 | 
  3 | Note: Biomarker data availability may be limited in CTRP.
  4 | This module focuses on biomarkers used in trial eligibility criteria.
  5 | """
  6 | 
  7 | import logging
  8 | from typing import Any
  9 | 
 10 | from ..constants import NCI_BIOMARKERS_URL
 11 | from ..integrations.cts_api import CTSAPIError, make_cts_request
 12 | from ..utils import parse_or_query
 13 | 
 14 | logger = logging.getLogger(__name__)
 15 | 
 16 | 
 17 | def _build_biomarker_params(
 18 |     name: str | None,
 19 |     eligibility_criterion: str | None,
 20 |     biomarker_type: str | None,
 21 |     codes: list[str] | None,
 22 |     assay_purpose: str | None,
 23 |     include: list[str] | None,
 24 |     sort: str | None,
 25 |     order: str | None,
 26 |     page_size: int,
 27 | ) -> dict[str, Any]:
 28 |     """Build query parameters for biomarker search."""
 29 |     params: dict[str, Any] = {"size": page_size}
 30 | 
 31 |     # Add search filters with correct API parameter names
 32 |     if name:
 33 |         params["name"] = name
 34 |     if eligibility_criterion:
 35 |         params["eligibility_criterion"] = eligibility_criterion
 36 |     if biomarker_type:
 37 |         params["type"] = biomarker_type
 38 |     if codes:
 39 |         params["codes"] = ",".join(codes) if isinstance(codes, list) else codes
 40 |     if assay_purpose:
 41 |         params["assay_purpose"] = assay_purpose
 42 |     if include:
 43 |         params["include"] = (
 44 |             ",".join(include) if isinstance(include, list) else include
 45 |         )
 46 |     if sort:
 47 |         params["sort"] = sort
 48 |         if order:
 49 |             params["order"] = order.lower()
 50 | 
 51 |     return params
 52 | 
 53 | 
 54 | def _process_biomarker_response(
 55 |     response: dict[str, Any],
 56 |     page: int,
 57 |     page_size: int,
 58 | ) -> dict[str, Any]:
 59 |     """Process biomarker API response."""
 60 |     biomarkers = response.get("data", response.get("biomarkers", []))
 61 |     total = response.get("total", len(biomarkers))
 62 | 
 63 |     result = {
 64 |         "biomarkers": biomarkers,
 65 |         "total": total,
 66 |         "page": page,
 67 |         "page_size": page_size,
 68 |     }
 69 | 
 70 |     # Add note about data limitations if response indicates it
 71 |     if response.get("limited_data") or not biomarkers:
 72 |         result["note"] = (
 73 |             "Biomarker data availability is limited in CTRP. "
 74 |             "Results show biomarkers referenced in trial eligibility criteria. "
 75 |             "For detailed variant annotations, use variant_searcher with MyVariant.info."
 76 |         )
 77 | 
 78 |     return result
 79 | 
 80 | 
 81 | async def search_biomarkers(
 82 |     name: str | None = None,
 83 |     eligibility_criterion: str | None = None,
 84 |     biomarker_type: str | None = None,
 85 |     codes: list[str] | None = None,
 86 |     assay_purpose: str | None = None,
 87 |     include: list[str] | None = None,
 88 |     sort: str | None = None,
 89 |     order: str | None = None,
 90 |     page_size: int = 20,
 91 |     page: int = 1,
 92 |     api_key: str | None = None,
 93 | ) -> dict[str, Any]:
 94 |     """
 95 |     Search for biomarkers in the NCI CTS database.
 96 | 
 97 |     Note: Biomarker data availability may be limited per CTRP documentation.
 98 |     Results focus on biomarkers used in clinical trial eligibility criteria.
 99 | 
100 |     Args:
101 |         name: Biomarker name to search for (e.g., "PD-L1", "EGFR mutation")
102 |         eligibility_criterion: Eligibility criterion text
103 |         biomarker_type: Type of biomarker ("reference_gene" or "branch")
104 |         codes: List of biomarker codes
105 |         assay_purpose: Purpose of the assay
106 |         include: Fields to include in response
107 |         sort: Sort field
108 |         order: Sort order ('asc' or 'desc')
109 |         page_size: Number of results per page
110 |         page: Page number
111 |         api_key: Optional API key (if not provided, uses NCI_API_KEY env var)
112 | 
113 |     Returns:
114 |         Dictionary with search results containing:
115 |         - biomarkers: List of biomarker records
116 |         - total: Total number of results
117 |         - page: Current page
118 |         - page_size: Results per page
119 |         - note: Any limitations about the data
120 | 
121 |     Raises:
122 |         CTSAPIError: If the API request fails
123 |     """
124 |     # Build query parameters
125 |     params = _build_biomarker_params(
126 |         name,
127 |         eligibility_criterion,
128 |         biomarker_type,
129 |         codes,
130 |         assay_purpose,
131 |         include,
132 |         sort,
133 |         order,
134 |         page_size,
135 |     )
136 | 
137 |     try:
138 |         # Make API request
139 |         response = await make_cts_request(
140 |             url=NCI_BIOMARKERS_URL,
141 |             params=params,
142 |             api_key=api_key,
143 |         )
144 | 
145 |         # Process response
146 |         return _process_biomarker_response(response, page, page_size)
147 | 
148 |     except CTSAPIError:
149 |         raise
150 |     except Exception as e:
151 |         logger.error(f"Failed to search biomarkers: {e}")
152 |         raise CTSAPIError(f"Biomarker search failed: {e!s}") from e
153 | 
154 | 
155 | def _format_biomarker_header(total: int, note: str) -> list[str]:
156 |     """Format the header section of biomarker results."""
157 |     lines = [
158 |         f"## Biomarker Search Results ({total} found)",
159 |         "",
160 |     ]
161 | 
162 |     if note:
163 |         lines.extend([
164 |             f"*Note: {note}*",
165 |             "",
166 |         ])
167 | 
168 |     return lines
169 | 
170 | 
171 | def _format_single_biomarker(biomarker: dict[str, Any]) -> list[str]:
172 |     """Format a single biomarker record."""
173 |     bio_id = biomarker.get("id", biomarker.get("biomarker_id", "Unknown"))
174 |     name = biomarker.get("name", "Unknown Biomarker")
175 |     gene = biomarker.get("gene", biomarker.get("gene_symbol", ""))
176 |     bio_type = biomarker.get("type", biomarker.get("category", ""))
177 | 
178 |     lines = [
179 |         f"### {name}",
180 |         f"- **ID**: {bio_id}",
181 |     ]
182 | 
183 |     if gene:
184 |         lines.append(f"- **Gene**: {gene}")
185 |     if bio_type:
186 |         lines.append(f"- **Type**: {bio_type}")
187 | 
188 |     # Add assay information if available
189 |     if biomarker.get("assay_type"):
190 |         lines.append(f"- **Assay**: {biomarker['assay_type']}")
191 | 
192 |     # Add criteria examples if available
193 |     if biomarker.get("criteria_examples"):
194 |         examples = biomarker["criteria_examples"]
195 |         if isinstance(examples, list) and examples:
196 |             lines.append("- **Example Criteria**:")
197 |             for ex in examples[:3]:  # Show up to 3 examples
198 |                 lines.append(f"  - {ex}")
199 |             if len(examples) > 3:
200 |                 lines.append(f"  *(and {len(examples) - 3} more)*")
201 | 
202 |     # Add trial count if available
203 |     if biomarker.get("trial_count"):
204 |         lines.append(
205 |             f"- **Trials Using This Biomarker**: {biomarker['trial_count']}"
206 |         )
207 | 
208 |     lines.append("")
209 |     return lines
210 | 
211 | 
212 | async def search_biomarkers_with_or(
213 |     name_query: str,
214 |     eligibility_criterion: str | None = None,
215 |     biomarker_type: str | None = None,
216 |     codes: list[str] | None = None,
217 |     assay_purpose: str | None = None,
218 |     include: list[str] | None = None,
219 |     sort: str | None = None,
220 |     order: str | None = None,
221 |     page_size: int = 20,
222 |     page: int = 1,
223 |     api_key: str | None = None,
224 | ) -> dict[str, Any]:
225 |     """
226 |     Search for biomarkers with OR query support.
227 | 
228 |     This function handles OR queries by making multiple API calls and combining results.
229 |     For example: "PD-L1 OR CD274 OR programmed death ligand 1" will search for each term.
230 | 
231 |     Args:
232 |         name_query: Name query that may contain OR operators
233 |         Other args same as search_biomarkers
234 | 
235 |     Returns:
236 |         Combined results from all searches with duplicates removed
237 |     """
238 |     # Check if this is an OR query
239 |     if " OR " in name_query or " or " in name_query:
240 |         search_terms = parse_or_query(name_query)
241 |         logger.info(f"Parsed OR query into terms: {search_terms}")
242 |     else:
243 |         # Single term search
244 |         search_terms = [name_query]
245 | 
246 |     # Collect all unique biomarkers
247 |     all_biomarkers = {}
248 |     total_found = 0
249 | 
250 |     # Search for each term
251 |     for term in search_terms:
252 |         logger.info(f"Searching biomarkers for term: {term}")
253 |         try:
254 |             results = await search_biomarkers(
255 |                 name=term,
256 |                 eligibility_criterion=eligibility_criterion,
257 |                 biomarker_type=biomarker_type,
258 |                 codes=codes,
259 |                 assay_purpose=assay_purpose,
260 |                 include=include,
261 |                 sort=sort,
262 |                 order=order,
263 |                 page_size=page_size,  # Get full page size for each term
264 |                 page=page,
265 |                 api_key=api_key,
266 |             )
267 | 
268 |             # Add unique biomarkers (deduplicate by ID)
269 |             for biomarker in results.get("biomarkers", []):
270 |                 bio_id = biomarker.get("id", biomarker.get("biomarker_id"))
271 |                 if bio_id and bio_id not in all_biomarkers:
272 |                     all_biomarkers[bio_id] = biomarker
273 | 
274 |             total_found += results.get("total", 0)
275 | 
276 |         except Exception as e:
277 |             logger.warning(f"Failed to search for term '{term}': {e}")
278 |             # Continue with other terms
279 | 
280 |     # Convert back to list and apply pagination
281 |     unique_biomarkers = list(all_biomarkers.values())
282 | 
283 |     # Sort if requested (by name by default for consistent results)
284 |     if sort == "name" or sort is None:
285 |         unique_biomarkers.sort(key=lambda x: x.get("name", "").lower())
286 | 
287 |     # Apply pagination to combined results
288 |     start_idx = (page - 1) * page_size
289 |     end_idx = start_idx + page_size
290 |     paginated_biomarkers = unique_biomarkers[start_idx:end_idx]
291 | 
292 |     return {
293 |         "biomarkers": paginated_biomarkers,
294 |         "total": len(unique_biomarkers),
295 |         "page": page,
296 |         "page_size": page_size,
297 |         "search_terms": search_terms,  # Include what we searched for
298 |         "total_found_across_terms": total_found,  # Total before deduplication
299 |     }
300 | 
301 | 
def format_biomarker_results(results: dict[str, Any]) -> str:
    """
    Render a biomarker search-result dictionary as markdown text.

    Args:
        results: Search results dictionary; the "biomarkers", "total" and
            "note" keys are read, each optional.

    Returns:
        A markdown string: a header followed by one section per biomarker,
        or a "no biomarkers found" message (with the note appended, if any)
        when the result list is empty.
    """
    records = results.get("biomarkers", [])
    note = results.get("note", "")

    # Nothing to list: short message, optionally followed by the note.
    if not records:
        empty_message = "No biomarkers found matching the search criteria."
        return empty_message + f"\n\n*Note: {note}*" if note else empty_message

    header = _format_biomarker_header(results.get("total", 0), note)
    sections = [
        line
        for record in records
        for line in _format_single_biomarker(record)
    ]
    return "\n".join(header + sections)
329 | 
```