This is page 11 of 19. Use http://codebase.md/genomoncology/biomcp?lines=true&page={x} to view the full context. # Directory Structure ``` ├── .github │ ├── actions │ │ └── setup-python-env │ │ └── action.yml │ ├── dependabot.yml │ └── workflows │ ├── ci.yml │ ├── deploy-docs.yml │ ├── main.yml.disabled │ ├── on-release-main.yml │ └── validate-codecov-config.yml ├── .gitignore ├── .pre-commit-config.yaml ├── BIOMCP_DATA_FLOW.md ├── CHANGELOG.md ├── CNAME ├── codecov.yaml ├── docker-compose.yml ├── Dockerfile ├── docs │ ├── apis │ │ ├── error-codes.md │ │ ├── overview.md │ │ └── python-sdk.md │ ├── assets │ │ ├── biomcp-cursor-locations.png │ │ ├── favicon.ico │ │ ├── icon.png │ │ ├── logo.png │ │ ├── mcp_architecture.txt │ │ └── remote-connection │ │ ├── 00_connectors.png │ │ ├── 01_add_custom_connector.png │ │ ├── 02_connector_enabled.png │ │ ├── 03_connect_to_biomcp.png │ │ ├── 04_select_google_oauth.png │ │ └── 05_success_connect.png │ ├── backend-services-reference │ │ ├── 01-overview.md │ │ ├── 02-biothings-suite.md │ │ ├── 03-cbioportal.md │ │ ├── 04-clinicaltrials-gov.md │ │ ├── 05-nci-cts-api.md │ │ ├── 06-pubtator3.md │ │ └── 07-alphagenome.md │ ├── blog │ │ ├── ai-assisted-clinical-trial-search-analysis.md │ │ ├── images │ │ │ ├── deep-researcher-video.png │ │ │ ├── researcher-announce.png │ │ │ ├── researcher-drop-down.png │ │ │ ├── researcher-prompt.png │ │ │ ├── trial-search-assistant.png │ │ │ └── what_is_biomcp_thumbnail.png │ │ └── researcher-persona-resource.md │ ├── changelog.md │ ├── CNAME │ ├── concepts │ │ ├── 01-what-is-biomcp.md │ │ ├── 02-the-deep-researcher-persona.md │ │ └── 03-sequential-thinking-with-the-think-tool.md │ ├── developer-guides │ │ ├── 01-server-deployment.md │ │ ├── 02-contributing-and-testing.md │ │ ├── 03-third-party-endpoints.md │ │ ├── 04-transport-protocol.md │ │ ├── 05-error-handling.md │ │ ├── 06-http-client-and-caching.md │ │ ├── 07-performance-optimizations.md │ │ └── generate_endpoints.py │ ├── faq-condensed.md │ 
├── FDA_SECURITY.md │ ├── genomoncology.md │ ├── getting-started │ │ ├── 01-quickstart-cli.md │ │ ├── 02-claude-desktop-integration.md │ │ └── 03-authentication-and-api-keys.md │ ├── how-to-guides │ │ ├── 01-find-articles-and-cbioportal-data.md │ │ ├── 02-find-trials-with-nci-and-biothings.md │ │ ├── 03-get-comprehensive-variant-annotations.md │ │ ├── 04-predict-variant-effects-with-alphagenome.md │ │ ├── 05-logging-and-monitoring-with-bigquery.md │ │ └── 06-search-nci-organizations-and-interventions.md │ ├── index.md │ ├── policies.md │ ├── reference │ │ ├── architecture-diagrams.md │ │ ├── quick-architecture.md │ │ ├── quick-reference.md │ │ └── visual-architecture.md │ ├── robots.txt │ ├── stylesheets │ │ ├── announcement.css │ │ └── extra.css │ ├── troubleshooting.md │ ├── tutorials │ │ ├── biothings-prompts.md │ │ ├── claude-code-biomcp-alphagenome.md │ │ ├── nci-prompts.md │ │ ├── openfda-integration.md │ │ ├── openfda-prompts.md │ │ ├── pydantic-ai-integration.md │ │ └── remote-connection.md │ ├── user-guides │ │ ├── 01-command-line-interface.md │ │ ├── 02-mcp-tools-reference.md │ │ └── 03-integrating-with-ides-and-clients.md │ └── workflows │ └── all-workflows.md ├── example_scripts │ ├── mcp_integration.py │ └── python_sdk.py ├── glama.json ├── LICENSE ├── lzyank.toml ├── Makefile ├── mkdocs.yml ├── package-lock.json ├── package.json ├── pyproject.toml ├── README.md ├── scripts │ ├── check_docs_in_mkdocs.py │ ├── check_http_imports.py │ └── generate_endpoints_doc.py ├── smithery.yaml ├── src │ └── biomcp │ ├── __init__.py │ ├── __main__.py │ ├── articles │ │ ├── __init__.py │ │ ├── autocomplete.py │ │ ├── fetch.py │ │ ├── preprints.py │ │ ├── search_optimized.py │ │ ├── search.py │ │ └── unified.py │ ├── biomarkers │ │ ├── __init__.py │ │ └── search.py │ ├── cbioportal_helper.py │ ├── circuit_breaker.py │ ├── cli │ │ ├── __init__.py │ │ ├── articles.py │ │ ├── biomarkers.py │ │ ├── diseases.py │ │ ├── health.py │ │ ├── interventions.py │ │ ├── main.py │ │ 
├── openfda.py │ │ ├── organizations.py │ │ ├── server.py │ │ ├── trials.py │ │ └── variants.py │ ├── connection_pool.py │ ├── constants.py │ ├── core.py │ ├── diseases │ │ ├── __init__.py │ │ ├── getter.py │ │ └── search.py │ ├── domain_handlers.py │ ├── drugs │ │ ├── __init__.py │ │ └── getter.py │ ├── exceptions.py │ ├── genes │ │ ├── __init__.py │ │ └── getter.py │ ├── http_client_simple.py │ ├── http_client.py │ ├── individual_tools.py │ ├── integrations │ │ ├── __init__.py │ │ ├── biothings_client.py │ │ └── cts_api.py │ ├── interventions │ │ ├── __init__.py │ │ ├── getter.py │ │ └── search.py │ ├── logging_filter.py │ ├── metrics_handler.py │ ├── metrics.py │ ├── openfda │ │ ├── __init__.py │ │ ├── adverse_events_helpers.py │ │ ├── adverse_events.py │ │ ├── cache.py │ │ ├── constants.py │ │ ├── device_events_helpers.py │ │ ├── device_events.py │ │ ├── drug_approvals.py │ │ ├── drug_labels_helpers.py │ │ ├── drug_labels.py │ │ ├── drug_recalls_helpers.py │ │ ├── drug_recalls.py │ │ ├── drug_shortages_detail_helpers.py │ │ ├── drug_shortages_helpers.py │ │ ├── drug_shortages.py │ │ ├── exceptions.py │ │ ├── input_validation.py │ │ ├── rate_limiter.py │ │ ├── utils.py │ │ └── validation.py │ ├── organizations │ │ ├── __init__.py │ │ ├── getter.py │ │ └── search.py │ ├── parameter_parser.py │ ├── prefetch.py │ ├── query_parser.py │ ├── query_router.py │ ├── rate_limiter.py │ ├── render.py │ ├── request_batcher.py │ ├── resources │ │ ├── __init__.py │ │ ├── getter.py │ │ ├── instructions.md │ │ └── researcher.md │ ├── retry.py │ ├── router_handlers.py │ ├── router.py │ ├── shared_context.py │ ├── thinking │ │ ├── __init__.py │ │ ├── sequential.py │ │ └── session.py │ ├── thinking_tool.py │ ├── thinking_tracker.py │ ├── trials │ │ ├── __init__.py │ │ ├── getter.py │ │ ├── nci_getter.py │ │ ├── nci_search.py │ │ └── search.py │ ├── utils │ │ ├── __init__.py │ │ ├── cancer_types_api.py │ │ ├── cbio_http_adapter.py │ │ ├── endpoint_registry.py │ │ ├── 
gene_validator.py │ │ ├── metrics.py │ │ ├── mutation_filter.py │ │ ├── query_utils.py │ │ ├── rate_limiter.py │ │ └── request_cache.py │ ├── variants │ │ ├── __init__.py │ │ ├── alphagenome.py │ │ ├── cancer_types.py │ │ ├── cbio_external_client.py │ │ ├── cbioportal_mutations.py │ │ ├── cbioportal_search_helpers.py │ │ ├── cbioportal_search.py │ │ ├── constants.py │ │ ├── external.py │ │ ├── filters.py │ │ ├── getter.py │ │ ├── links.py │ │ └── search.py │ └── workers │ ├── __init__.py │ ├── worker_entry_stytch.js │ ├── worker_entry.js │ └── worker.py ├── tests │ ├── bdd │ │ ├── cli_help │ │ │ ├── help.feature │ │ │ └── test_help.py │ │ ├── conftest.py │ │ ├── features │ │ │ └── alphagenome_integration.feature │ │ ├── fetch_articles │ │ │ ├── fetch.feature │ │ │ └── test_fetch.py │ │ ├── get_trials │ │ │ ├── get.feature │ │ │ └── test_get.py │ │ ├── get_variants │ │ │ ├── get.feature │ │ │ └── test_get.py │ │ ├── search_articles │ │ │ ├── autocomplete.feature │ │ │ ├── search.feature │ │ │ ├── test_autocomplete.py │ │ │ └── test_search.py │ │ ├── search_trials │ │ │ ├── search.feature │ │ │ └── test_search.py │ │ ├── search_variants │ │ │ ├── search.feature │ │ │ └── test_search.py │ │ └── steps │ │ └── test_alphagenome_steps.py │ ├── config │ │ └── test_smithery_config.py │ ├── conftest.py │ ├── data │ │ ├── ct_gov │ │ │ ├── clinical_trials_api_v2.yaml │ │ │ ├── trials_NCT04280705.json │ │ │ └── trials_NCT04280705.txt │ │ ├── myvariant │ │ │ ├── myvariant_api.yaml │ │ │ ├── myvariant_field_descriptions.csv │ │ │ ├── variants_full_braf_v600e.json │ │ │ ├── variants_full_braf_v600e.txt │ │ │ └── variants_part_braf_v600_multiple.json │ │ ├── openfda │ │ │ ├── drugsfda_detail.json │ │ │ ├── drugsfda_search.json │ │ │ ├── enforcement_detail.json │ │ │ └── enforcement_search.json │ │ └── pubtator │ │ ├── pubtator_autocomplete.json │ │ └── pubtator3_paper.txt │ ├── integration │ │ ├── test_openfda_integration.py │ │ ├── test_preprints_integration.py │ │ ├── 
test_simple.py │ │ └── test_variants_integration.py │ ├── tdd │ │ ├── articles │ │ │ ├── test_autocomplete.py │ │ │ ├── test_cbioportal_integration.py │ │ │ ├── test_fetch.py │ │ │ ├── test_preprints.py │ │ │ ├── test_search.py │ │ │ └── test_unified.py │ │ ├── conftest.py │ │ ├── drugs │ │ │ ├── __init__.py │ │ │ └── test_drug_getter.py │ │ ├── openfda │ │ │ ├── __init__.py │ │ │ ├── test_adverse_events.py │ │ │ ├── test_device_events.py │ │ │ ├── test_drug_approvals.py │ │ │ ├── test_drug_labels.py │ │ │ ├── test_drug_recalls.py │ │ │ ├── test_drug_shortages.py │ │ │ └── test_security.py │ │ ├── test_biothings_integration_real.py │ │ ├── test_biothings_integration.py │ │ ├── test_circuit_breaker.py │ │ ├── test_concurrent_requests.py │ │ ├── test_connection_pool.py │ │ ├── test_domain_handlers.py │ │ ├── test_drug_approvals.py │ │ ├── test_drug_recalls.py │ │ ├── test_drug_shortages.py │ │ ├── test_endpoint_documentation.py │ │ ├── test_error_scenarios.py │ │ ├── test_europe_pmc_fetch.py │ │ ├── test_mcp_integration.py │ │ ├── test_mcp_tools.py │ │ ├── test_metrics.py │ │ ├── test_nci_integration.py │ │ ├── test_nci_mcp_tools.py │ │ ├── test_network_policies.py │ │ ├── test_offline_mode.py │ │ ├── test_openfda_unified.py │ │ ├── test_pten_r173_search.py │ │ ├── test_render.py │ │ ├── test_request_batcher.py.disabled │ │ ├── test_retry.py │ │ ├── test_router.py │ │ ├── test_shared_context.py.disabled │ │ ├── test_unified_biothings.py │ │ ├── thinking │ │ │ ├── __init__.py │ │ │ └── test_sequential.py │ │ ├── trials │ │ │ ├── test_backward_compatibility.py │ │ │ ├── test_getter.py │ │ │ └── test_search.py │ │ ├── utils │ │ │ ├── test_gene_validator.py │ │ │ ├── test_mutation_filter.py │ │ │ ├── test_rate_limiter.py │ │ │ └── test_request_cache.py │ │ ├── variants │ │ │ ├── constants.py │ │ │ ├── test_alphagenome_api_key.py │ │ │ ├── test_alphagenome_comprehensive.py │ │ │ ├── test_alphagenome.py │ │ │ ├── test_cbioportal_mutations.py │ │ │ ├── 
test_cbioportal_search.py │ │ │ ├── test_external_integration.py │ │ │ ├── test_external.py │ │ │ ├── test_extract_gene_aa_change.py │ │ │ ├── test_filters.py │ │ │ ├── test_getter.py │ │ │ ├── test_links.py │ │ │ └── test_search.py │ │ └── workers │ │ └── test_worker_sanitization.js │ └── test_pydantic_ai_integration.py ├── THIRD_PARTY_ENDPOINTS.md ├── tox.ini ├── uv.lock └── wrangler.toml ``` # Files -------------------------------------------------------------------------------- /docs/changelog.md: -------------------------------------------------------------------------------- ```markdown 1 | # Changelog 2 | 3 | All notable changes to the BioMCP project will be documented in this file. 4 | 5 | The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), 6 | and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). 7 | 8 | ## [0.6.2] - 2025-08-05 9 | 10 | ### Added 11 | 12 | - **NCI Clinical Trials Search API Integration** - Enhanced cancer trial search capabilities: 13 | - Dual source support for trial search/getter tools (ClinicalTrials.gov + NCI) 14 | - NCI API key handling via `NCI_API_KEY` environment variable or parameter 15 | - Advanced trial filters: biomarkers, prior therapy, brain metastases acceptance 16 | - **6 New MCP Tools** for NCI-specific searches: 17 | - `nci_organization_searcher` / `nci_organization_getter`: Cancer centers, hospitals, research institutions 18 | - `nci_intervention_searcher` / `nci_intervention_getter`: Drugs, devices, procedures, biologicals 19 | - `nci_biomarker_searcher`: Trial eligibility biomarkers (reference genes, branches) 20 | - `nci_disease_searcher`: NCI's controlled vocabulary of cancer conditions 21 | - **OR Query Support**: All NCI endpoints support OR queries (e.g., "PD-L1 OR CD274") 22 | - Real-time access to NCI's curated cancer trials database 23 | - Automatic cBioPortal integration for gene searches 24 | - Proper NCI parameter mapping (org_city, 
org_state_or_province, etc.) 25 | - Comprehensive error handling for Elasticsearch limits 26 | 27 | ### Changed 28 | 29 | - Enhanced unified search router to properly handle NCI domains 30 | - Trial search/getter tools now accept `source` parameter ("clinicaltrials" or "nci") 31 | - Improved domain-specific search logic for query+domain combinations 32 | 33 | ### Added CLI Commands 34 | 35 | ```bash 36 | # Organization search/get 37 | biomcp organization search "MD Anderson" --api-key YOUR_KEY 38 | biomcp organization get 12345 --api-key YOUR_KEY 39 | 40 | # Intervention search/get 41 | biomcp intervention search pembrolizumab --type Drug --api-key YOUR_KEY 42 | biomcp intervention get 67890 --api-key YOUR_KEY 43 | 44 | # Biomarker search 45 | biomcp biomarker search --name "PD-L1" --api-key YOUR_KEY 46 | 47 | # Disease search 48 | biomcp disease search melanoma --source nci --api-key YOUR_KEY 49 | 50 | # Enhanced trial commands with source selection 51 | biomcp trial search --condition melanoma --source nci --api-key YOUR_KEY 52 | biomcp trial get NCT04280705 --source nci --api-key YOUR_KEY 53 | ``` 54 | 55 | ### Documentation 56 | 57 | - Added NCI tutorial with example prompts: `docs/tutorials/nci-prompts.md` 58 | - Created API parameter reference: `docs/api-changes/nci-api-parameters.md` 59 | - Updated CLAUDE.md with NCI usage instructions and parameter notes 60 | - Requires NCI API key from: https://clinicaltrialsapi.cancer.gov/ 61 | 62 | ## [0.6.0] - 2025-08-01 63 | 64 | ### Added 65 | 66 | - **Streamable HTTP Transport Support** (#45) - MCP specification version 2025-03-26: 67 | - Enabled FastMCP's native `/mcp` endpoint for Streamable HTTP transport 68 | - MCP specification compliant transport (2025-03-26 spec) via FastMCP 1.12.3+ 69 | - CLI support via `biomcp run --mode streamable_http` (uses native FastMCP implementation) 70 | - Full backward compatibility with legacy SSE endpoints 71 | - Cloudflare Worker updated with POST /mcp route for full spec 
compliance 72 | - Simplified worker implementation to leverage FastMCP's built-in transport support 73 | - Added comprehensive integration tests for streamable HTTP functionality 74 | - New transport protocol documentation guide 75 | 76 | ### Changed 77 | 78 | - Enhanced CLI with transport modes (stdio, worker, streamable_http) 79 | - Added configurable host and port options for HTTP-based transports 80 | - Simplified server modes by removing redundant `http` mode 81 | - Cloudflare Worker now supports both GET and POST methods on /mcp endpoint 82 | - Pinned FastMCP dependency to version range >=1.12.3,<2.0.0 for stability 83 | - Standardized documentation file naming to lowercase with hyphens for consistency 84 | 85 | ### Migration Notes 86 | 87 | - **From SSE to Streamable HTTP**: Update your server startup from `--mode worker` to `--mode streamable_http` 88 | - **Docker deployments**: Ensure you're using `--host 0.0.0.0` for proper container networking 89 | - **Cloudflare Workers**: The worker now automatically handles both transport types on `/mcp` 90 | - See the new [Transport Protocol Guide](https://biomcp.org/transport-protocol/) for detailed migration instructions 91 | 92 | ## [0.5.0] - 2025-08-01 93 | 94 | ### Added 95 | 96 | - **BioThings Integration** for real-time biomedical data access: 97 | - **New MCP Tools** (3 tools added, total now 17): 98 | - `gene_getter`: Query MyGene.info for gene information (symbols, names, summaries) 99 | - `drug_getter`: Query MyChem.info for drug/chemical data (formulas, indications, mechanisms) 100 | - `disease_getter`: Query MyDisease.info for disease information (definitions, synonyms, ontologies) 101 | - **Unified Search/Fetch Enhancement**: 102 | - Added `gene`, `drug`, `disease` as new searchable domains alongside article, trial, variant 103 | - Integrated into unified search syntax: `search(domain="gene", keywords=["BRAF"])` 104 | - Query language support: `gene:BRAF`, `drug:pembrolizumab`, `disease:melanoma` 105 | 
- Full fetch support: `fetch(domain="drug", id="DB00945")` 106 | - **Clinical Trial Enhancement**: 107 | - Automatic disease synonym expansion for trial searches 108 | - Real-time synonym lookup from MyDisease.info 109 | - Example: searching for "GIST" automatically includes "gastrointestinal stromal tumor" 110 | - **Smart Caching & Performance**: 111 | - Batch operations for multiple gene/drug lookups 112 | - Intelligent caching with TTL (gene: 24h, drug: 48h, disease: 72h) 113 | - Rate limiting to respect API guidelines 114 | 115 | ### Changed 116 | 117 | - Trial search now expands disease terms by default (disable with `expand_synonyms=False`) 118 | - Enhanced error handling for BioThings API responses 119 | - Improved network reliability with automatic retries 120 | 121 | ## [0.4.6] - 2025-07-09 122 | 123 | ### Added 124 | 125 | - MkDocs documentation deployment 126 | 127 | ## [0.4.5] - 2025-07-09 128 | 129 | ### Added 130 | 131 | - Unified search and fetch tools following OpenAI MCP guidelines 132 | - Additional variant sources (TCGA/GDC, 1000 Genomes) enabled by default in fetch operations 133 | - Additional article sources (bioRxiv, medRxiv, Europe PMC) enabled by default in search operations 134 | 135 | ### Changed 136 | 137 | - Consolidated 10 separate MCP tools into 2 unified tools (search and fetch) 138 | - Updated response formats to comply with OpenAI MCP specifications 139 | 140 | ### Fixed 141 | 142 | - OpenAI MCP compliance issues to enable integration 143 | 144 | ## [0.4.4] - 2025-07-08 145 | 146 | ### Added 147 | 148 | - **Performance Optimizations**: 149 | - Connection pooling with event loop lifecycle management (30% latency reduction) 150 | - Parallel test execution with pytest-xdist (5x faster test runs) 151 | - Request batching for cBioPortal API calls (80% fewer API calls) 152 | - Smart caching with LRU eviction and fast hash keys (10x faster cache operations) 153 | - Major performance improvements achieving ~3x faster test execution (120s → 
42s) 154 | 155 | ### Fixed 156 | 157 | - Non-critical ASGI errors suppressed 158 | - Performance issues in article_searcher 159 | 160 | ## [0.4.3] - 2025-07-08 161 | 162 | ### Added 163 | 164 | - Complete HTTP centralization and improved code quality 165 | - Comprehensive constants module for better maintainability 166 | - Domain-specific handlers for result formatting 167 | - Parameter parser for robust input validation 168 | - Custom exception hierarchy for better error handling 169 | 170 | ### Changed 171 | 172 | - Refactored domain handlers to use static methods for better performance 173 | - Enhanced type safety throughout the codebase 174 | - Refactored complex functions to meet code quality standards 175 | 176 | ### Fixed 177 | 178 | - Type errors in router.py for full mypy compliance 179 | - Complex functions exceeding cyclomatic complexity thresholds 180 | 181 | ## [0.4.2] - 2025-07-07 182 | 183 | ### Added 184 | 185 | - Europe PMC DOI support for article fetching 186 | - Pagination support for Europe PMC searches 187 | - OR logic support for variant notation searches (e.g., R173 vs Arg173 vs p.R173) 188 | 189 | ### Changed 190 | 191 | - Enhanced variant notation search capabilities 192 | 193 | ## [0.4.1] - 2025-07-03 194 | 195 | ### Added 196 | 197 | - AlphaGenome as an optional dependency to predict variant effects on gene regulation 198 | - Per-request API key support for AlphaGenome integration 199 | - AI predictions to complement existing database lookups 200 | 201 | ### Security 202 | 203 | - Comprehensive sanitization in Cloudflare Worker to prevent sensitive data logging 204 | - Secure usage in hosted environments where users provide their own keys 205 | 206 | ## [0.4.0] - 2025-06-27 207 | 208 | ### Added 209 | 210 | - **cBioPortal Integration** for article searches: 211 | - Automatic gene-level mutation summaries when searching with gene parameters 212 | - Mutation-specific search capabilities (e.g., BRAF V600E, SRSF2 F57\*) 213 | - Dynamic cancer 
type resolution using cBioPortal API 214 | - Smart caching and rate limiting for optimal performance 215 | 216 | ## [0.3.3] - 2025-06-20 217 | 218 | ### Changed 219 | 220 | - Release workflow updates 221 | 222 | ## [0.3.2] - 2025-06-20 223 | 224 | ### Changed 225 | 226 | - Release workflow updates 227 | 228 | ## [0.3.1] - 2025-06-20 229 | 230 | ### Fixed 231 | 232 | - Build and release process improvements 233 | 234 | ## [0.3.0] - 2025-06-20 235 | 236 | ### Added 237 | 238 | - Expanded search capabilities 239 | - Integration tests for MCP server functionality 240 | - Utility modules for gene validation, mutation filtering, and request caching 241 | 242 | ## [0.2.1] - 2025-06-19 243 | 244 | ### Added 245 | 246 | - Remote MCP policies 247 | 248 | ## [0.2.0] - 2025-06-17 249 | 250 | ### Added 251 | 252 | - Sequential thinking tool for systematic problem-solving 253 | - Session-based thinking to replace global state 254 | - Extracted router handlers to reduce complexity 255 | 256 | ### Changed 257 | 258 | - Replaced global state in thinking module with session management 259 | 260 | ### Removed 261 | 262 | - Global state from sequential thinking module 263 | 264 | ### Fixed 265 | 266 | - Race conditions in sequential thinking with concurrent usage 267 | 268 | ## [0.1.11] - 2025-06-12 269 | 270 | ### Added 271 | 272 | - Advanced eligibility criteria filters to clinical trial search 273 | 274 | ## [0.1.10] - 2025-05-21 275 | 276 | ### Added 277 | 278 | - OAuth support on the Cloudflare worker via Stytch 279 | 280 | ## [0.1.9] - 2025-05-17 281 | 282 | ### Fixed 283 | 284 | - Refactor: Bump minimum Python version to 3.10 285 | 286 | ## [0.1.8] - 2025-05-14 287 | 288 | ### Fixed 289 | 290 | - Article searcher fixes 291 | 292 | ## [0.1.7] - 2025-05-07 293 | 294 | ### Added 295 | 296 | - Remote OAuth support 297 | 298 | ## [0.1.6] - 2025-05-05 299 | 300 | ### Added 301 | 302 | - Updates to handle cursor integration 303 | 304 | ## [0.1.5] - 2025-05-01 305 | 306 | ### Added 307 
| 308 | - Updates to Smithery YAML to account for object types needed for remote calls 309 | - Documentation and Lzyank updates 310 | 311 | ## [0.1.3] - 2025-05-01 312 | 313 | ### Added 314 | 315 | - Health check functionality to assist with API call issues 316 | - System resources and network & environment information gathering 317 | - Remote MCP capability via Cloudflare using SSE 318 | 319 | ## [0.1.2] - 2025-04-18 320 | 321 | ### Added 322 | 323 | - Researcher persona and BioMCP v0.1.2 release 324 | - Deep Researcher Persona blog post 325 | - Researcher persona video demo 326 | 327 | ## [0.1.1] - 2025-04-14 328 | 329 | ### Added 330 | 331 | - Claude Desktop and MCP Inspector tutorials 332 | - Improved Claude Desktop Tutorial for BioMCP 333 | - Troubleshooting guide and blog post 334 | 335 | ### Fixed 336 | 337 | - Log tool names as comma-separated string 338 | - Server hanging issues 339 | - Error responses in variant count check 340 | 341 | ## [0.1.0] - 2025-04-08 342 | 343 | ### Added 344 | 345 | - Initial release of BioMCP 346 | - PubMed/PubTator3 article search integration 347 | - ClinicalTrials.gov trial search integration 348 | - MyVariant.info variant search integration 349 | - Command-line interface for direct usage 350 | - MCP server for AI assistant integration 351 | - Cloudflare Worker support for remote deployment 352 | - Comprehensive test suite with pytest-bdd 353 | - GenomOncology introduction 354 | - Blog post on AI-assisted clinical trial search 355 | - macOS troubleshooting guide 356 | 357 | ### Security 358 | 359 | - API keys properly externalized 360 | - Input validation using Pydantic models 361 | - Safe string handling in all API calls 362 | 363 | [Unreleased]: https://github.com/genomoncology/biomcp/compare/v0.6.2...HEAD 364 | [0.6.2]: https://github.com/genomoncology/biomcp/releases/tag/v0.6.2 365 | [0.6.0]: https://github.com/genomoncology/biomcp/releases/tag/v0.6.0 366 | [0.5.0]: https://github.com/genomoncology/biomcp/releases/tag/v0.5.0 367 | 
[0.4.6]: https://github.com/genomoncology/biomcp/releases/tag/v0.4.6 368 | [0.4.5]: https://github.com/genomoncology/biomcp/releases/tag/v0.4.5 369 | [0.4.4]: https://github.com/genomoncology/biomcp/releases/tag/v0.4.4 370 | [0.4.3]: https://github.com/genomoncology/biomcp/releases/tag/v0.4.3 371 | [0.4.2]: https://github.com/genomoncology/biomcp/releases/tag/v0.4.2 372 | [0.4.1]: https://github.com/genomoncology/biomcp/releases/tag/v0.4.1 373 | [0.4.0]: https://github.com/genomoncology/biomcp/releases/tag/v0.4.0 374 | [0.3.3]: https://github.com/genomoncology/biomcp/releases/tag/v0.3.3 375 | [0.3.2]: https://github.com/genomoncology/biomcp/releases/tag/v0.3.2 376 | [0.3.1]: https://github.com/genomoncology/biomcp/releases/tag/v0.3.1 377 | [0.3.0]: https://github.com/genomoncology/biomcp/releases/tag/v0.3.0 378 | [0.2.1]: https://github.com/genomoncology/biomcp/releases/tag/v0.2.1 379 | [0.2.0]: https://github.com/genomoncology/biomcp/releases/tag/v0.2.0 380 | [0.1.11]: https://github.com/genomoncology/biomcp/releases/tag/v0.1.11 381 | [0.1.10]: https://github.com/genomoncology/biomcp/releases/tag/v0.1.10 382 | [0.1.9]: https://github.com/genomoncology/biomcp/releases/tag/v0.1.9 383 | [0.1.8]: https://github.com/genomoncology/biomcp/releases/tag/v0.1.8 384 | [0.1.7]: https://github.com/genomoncology/biomcp/releases/tag/v0.1.7 385 | [0.1.6]: https://github.com/genomoncology/biomcp/releases/tag/v0.1.6 386 | [0.1.5]: https://github.com/genomoncology/biomcp/releases/tag/v0.1.5 387 | [0.1.3]: https://github.com/genomoncology/biomcp/releases/tag/v0.1.3 388 | [0.1.2]: https://github.com/genomoncology/biomcp/releases/tag/v0.1.2 389 | [0.1.1]: https://github.com/genomoncology/biomcp/releases/tag/v0.1.1 390 | [0.1.0]: https://github.com/genomoncology/biomcp/releases/tag/v0.1.0 391 | ``` -------------------------------------------------------------------------------- /tests/tdd/openfda/test_drug_recalls.py: 
-------------------------------------------------------------------------------- ```python 1 | """Tests for FDA drug recall search and retrieval.""" 2 | 3 | from unittest.mock import patch 4 | 5 | import pytest 6 | 7 | from biomcp.openfda.drug_recalls import ( 8 | get_drug_recall, 9 | search_drug_recalls, 10 | ) 11 | 12 | 13 | class TestDrugRecalls: 14 | """Test FDA drug recall functions.""" 15 | 16 | @pytest.mark.asyncio 17 | async def test_search_drug_recalls_success(self): 18 | """Test successful drug recall search.""" 19 | mock_response = { 20 | "meta": {"results": {"skip": 0, "limit": 10, "total": 2}}, 21 | "results": [ 22 | { 23 | "recall_number": "D-123-2024", 24 | "status": "Ongoing", 25 | "classification": "Class II", 26 | "product_description": "Metformin HCl Extended Release Tablets, 500mg", 27 | "reason_for_recall": "Presence of N-Nitrosodimethylamine (NDMA) impurity above acceptable limits", 28 | "recalling_firm": "Generic Pharma Inc", 29 | "city": "New York", 30 | "state": "NY", 31 | "country": "United States", 32 | "recall_initiation_date": "20240115", 33 | "center_classification_date": "20240120", 34 | "termination_date": "", 35 | "report_date": "20240125", 36 | "code_info": "Lot# ABC123, EXP 06/2025", 37 | "product_quantity": "50,000 bottles", 38 | "distribution_pattern": "Nationwide", 39 | "voluntary_mandated": "Voluntary: Firm Initiated", 40 | "initial_firm_notification": "Letter", 41 | }, 42 | { 43 | "recall_number": "D-456-2024", 44 | "status": "Terminated", 45 | "classification": "Class I", 46 | "product_description": "Valsartan Tablets, 160mg", 47 | "reason_for_recall": "Contamination with carcinogenic impurity", 48 | "recalling_firm": "BigPharma Corp", 49 | "city": "Los Angeles", 50 | "state": "CA", 51 | "country": "United States", 52 | "recall_initiation_date": "20240101", 53 | "termination_date": "20240201", 54 | "report_date": "20240105", 55 | }, 56 | ], 57 | } 58 | 59 | with patch( 60 | "biomcp.openfda.drug_recalls.make_openfda_request" 
61 | ) as mock_request: 62 | mock_request.return_value = (mock_response, None) 63 | 64 | result = await search_drug_recalls(drug="metformin", limit=10) 65 | 66 | # Check that result contains expected recall information 67 | assert "D-123-2024" in result 68 | assert "Metformin" in result 69 | assert "Class II" in result 70 | assert "NDMA" in result 71 | assert "Generic Pharma Inc" in result 72 | 73 | # Check for disclaimer 74 | assert "FDA Data Notice" in result 75 | 76 | # Check summary statistics 77 | assert "Total Recalls Found**: 2 recalls" in result 78 | assert "Ongoing" in result 79 | 80 | @pytest.mark.asyncio 81 | async def test_search_drug_recalls_by_classification(self): 82 | """Test drug recall search filtered by classification.""" 83 | mock_response = { 84 | "meta": {"results": {"skip": 0, "limit": 10, "total": 3}}, 85 | "results": [ 86 | { 87 | "recall_number": "D-001-2024", 88 | "classification": "Class I", 89 | "product_description": "Critical Drug A", 90 | "reason_for_recall": "Life-threatening contamination", 91 | "status": "Ongoing", 92 | }, 93 | { 94 | "recall_number": "D-002-2024", 95 | "classification": "Class I", 96 | "product_description": "Critical Drug B", 97 | "reason_for_recall": "Severe adverse reactions", 98 | "status": "Ongoing", 99 | }, 100 | ], 101 | } 102 | 103 | with patch( 104 | "biomcp.openfda.drug_recalls.make_openfda_request" 105 | ) as mock_request: 106 | mock_request.return_value = (mock_response, None) 107 | 108 | result = await search_drug_recalls( 109 | recall_class="Class I", limit=10 110 | ) 111 | 112 | assert "Class I" in result 113 | assert "Total Recalls Found**: 3 recalls" in result 114 | assert "Life-threatening" in result 115 | assert "🔴 **Class I**" in result # High severity indicator 116 | 117 | @pytest.mark.asyncio 118 | async def test_search_drug_recalls_no_results(self): 119 | """Test drug recall search with no results.""" 120 | mock_response = { 121 | "meta": {"results": {"skip": 0, "limit": 10, "total": 0}}, 
122 | "results": [], 123 | } 124 | 125 | with patch( 126 | "biomcp.openfda.drug_recalls.make_openfda_request" 127 | ) as mock_request: 128 | mock_request.return_value = (mock_response, None) 129 | 130 | result = await search_drug_recalls( 131 | drug="nonexistentdrug999", limit=10 132 | ) 133 | 134 | assert "No drug recall records found" in result 135 | 136 | @pytest.mark.asyncio 137 | async def test_get_drug_recall_success(self): 138 | """Test successful retrieval of specific drug recall.""" 139 | mock_response = { 140 | "results": [ 141 | { 142 | "recall_number": "D-123-2024", 143 | "status": "Ongoing", 144 | "classification": "Class II", 145 | "product_description": "Metformin HCl Extended Release Tablets, 500mg, 90 count bottles", 146 | "reason_for_recall": "Presence of N-Nitrosodimethylamine (NDMA) impurity above the acceptable daily intake limit of 96 ng/day", 147 | "recalling_firm": "Generic Pharma Inc", 148 | "address1": "123 Pharma Street", 149 | "city": "New York", 150 | "state": "NY", 151 | "postal_code": "10001", 152 | "country": "United States", 153 | "recall_initiation_date": "20240115", 154 | "center_classification_date": "20240120", 155 | "report_date": "20240125", 156 | "code_info": "Lot Numbers: ABC123 (EXP 06/2025), DEF456 (EXP 07/2025), GHI789 (EXP 08/2025)", 157 | "product_quantity": "50,000 bottles", 158 | "distribution_pattern": "Nationwide distribution to pharmacies and distributors", 159 | "voluntary_mandated": "Voluntary: Firm Initiated", 160 | "initial_firm_notification": "Letter", 161 | "openfda": { 162 | "application_number": ["ANDA123456"], 163 | "brand_name": ["METFORMIN HCL ER"], 164 | "generic_name": ["METFORMIN HYDROCHLORIDE"], 165 | "manufacturer_name": ["GENERIC PHARMA INC"], 166 | "product_ndc": ["12345-678-90"], 167 | "product_type": ["HUMAN PRESCRIPTION DRUG"], 168 | "route": ["ORAL"], 169 | "substance_name": ["METFORMIN HYDROCHLORIDE"], 170 | }, 171 | } 172 | ] 173 | } 174 | 175 | with patch( 176 | 
"biomcp.openfda.drug_recalls.make_openfda_request" 177 | ) as mock_request: 178 | mock_request.return_value = (mock_response, None) 179 | 180 | result = await get_drug_recall("D-123-2024") 181 | 182 | # Check basic information 183 | assert "D-123-2024" in result 184 | assert "Class II" in result 185 | assert "Metformin" in result 186 | assert "NDMA" in result 187 | 188 | # Check detailed information 189 | assert "Generic Pharma Inc" in result 190 | assert "New York, NY" in result 191 | assert "ABC123" in result 192 | assert "50,000 bottles" in result 193 | assert "Nationwide" in result 194 | 195 | # Check dates (should be formatted) 196 | assert "2024-01-15" in result # Formatted date 197 | 198 | # Check OpenFDA enrichment 199 | assert "METFORMIN HYDROCHLORIDE" in result 200 | assert "ORAL" in result 201 | 202 | # Check disclaimer 203 | assert "FDA Data Notice" in result 204 | 205 | @pytest.mark.asyncio 206 | async def test_get_drug_recall_not_found(self): 207 | """Test retrieval of non-existent drug recall.""" 208 | mock_response = {"results": []} 209 | 210 | with patch( 211 | "biomcp.openfda.drug_recalls.make_openfda_request" 212 | ) as mock_request: 213 | mock_request.return_value = (mock_response, None) 214 | 215 | result = await get_drug_recall("INVALID-RECALL-999") 216 | 217 | assert "No recall record found" in result 218 | assert "INVALID-RECALL-999" in result 219 | 220 | @pytest.mark.asyncio 221 | async def test_search_drug_recalls_api_error(self): 222 | """Test drug recall search with API error.""" 223 | with patch( 224 | "biomcp.openfda.drug_recalls.make_openfda_request" 225 | ) as mock_request: 226 | mock_request.return_value = (None, "Connection timeout") 227 | 228 | result = await search_drug_recalls(drug="aspirin") 229 | 230 | assert "Error searching drug recalls" in result 231 | assert "Connection timeout" in result 232 | 233 | @pytest.mark.asyncio 234 | async def test_search_by_recalling_firm(self): 235 | """Test drug recall search by recalling 
firm.""" 236 | mock_response = { 237 | "meta": {"results": {"skip": 0, "limit": 10, "total": 5}}, 238 | "results": [ 239 | { 240 | "recall_number": f"D-{i:03d}-2024", 241 | "recalling_firm": "Pfizer Inc", 242 | "product_description": f"Product {i}", 243 | "classification": "Class II", 244 | "status": "Ongoing", 245 | } 246 | for i in range(1, 6) 247 | ], 248 | } 249 | 250 | with patch( 251 | "biomcp.openfda.drug_recalls.make_openfda_request" 252 | ) as mock_request: 253 | mock_request.return_value = (mock_response, None) 254 | 255 | # Function doesn't support recalling_firm parameter 256 | # Test with drug parameter instead 257 | result = await search_drug_recalls(drug="aspirin", limit=10) 258 | 259 | # Just verify the results format 260 | assert "Pfizer Inc" in result # From mock data 261 | assert "Total Recalls Found**: 5 recalls" in result 262 | 263 | @pytest.mark.asyncio 264 | async def test_search_ongoing_recalls(self): 265 | """Test search for ongoing recalls only.""" 266 | mock_response = { 267 | "meta": {"results": {"skip": 0, "limit": 10, "total": 8}}, 268 | "results": [ 269 | { 270 | "recall_number": "D-100-2024", 271 | "status": "Ongoing", 272 | "classification": "Class II", 273 | "product_description": "Active Recall Product", 274 | "recall_initiation_date": "20240201", 275 | } 276 | ], 277 | } 278 | 279 | with patch( 280 | "biomcp.openfda.drug_recalls.make_openfda_request" 281 | ) as mock_request: 282 | mock_request.return_value = (mock_response, None) 283 | 284 | result = await search_drug_recalls(status="Ongoing", limit=10) 285 | 286 | assert "Ongoing" in result 287 | assert "Total Recalls Found**: 8 recalls" in result 288 | assert "Active Recall Product" in result 289 | 290 | def test_recall_classification_validation(self): 291 | """Test validation of recall classification values.""" 292 | from biomcp.openfda.validation import validate_recall 293 | 294 | # Valid recall with proper classification 295 | valid_recall = { 296 | "recall_number": 
"D-123-2024", 297 | "classification": "Class II", 298 | "product_description": "Test Product", 299 | } 300 | 301 | assert validate_recall(valid_recall) is True 302 | 303 | # Invalid classification should log warning but not fail 304 | invalid_recall = { 305 | "recall_number": "D-456-2024", 306 | "classification": "Class IV", # Invalid class 307 | "product_description": "Test Product", 308 | } 309 | 310 | # Should still return True but log warning 311 | assert validate_recall(invalid_recall) is True 312 | 313 | @pytest.mark.asyncio 314 | async def test_recall_summary_statistics(self): 315 | """Test that recall search provides proper summary statistics.""" 316 | mock_response = { 317 | "meta": {"results": {"skip": 0, "limit": 100, "total": 15}}, 318 | "results": [ 319 | {"classification": "Class I", "status": "Ongoing"} 320 | for _ in range(3) 321 | ] 322 | + [ 323 | {"classification": "Class II", "status": "Ongoing"} 324 | for _ in range(7) 325 | ] 326 | + [ 327 | {"classification": "Class III", "status": "Terminated"} 328 | for _ in range(5) 329 | ], 330 | } 331 | 332 | with patch( 333 | "biomcp.openfda.drug_recalls.make_openfda_request" 334 | ) as mock_request: 335 | mock_request.return_value = (mock_response, None) 336 | 337 | result = await search_drug_recalls(limit=100) 338 | 339 | # Should show classification breakdown 340 | assert "Class I" in result 341 | assert "Class II" in result 342 | assert "Class III" in result 343 | 344 | # Should show status summary 345 | assert "Ongoing" in result 346 | assert "Terminated" in result 347 | ``` -------------------------------------------------------------------------------- /docs/apis/error-codes.md: -------------------------------------------------------------------------------- ```markdown 1 | # Error Codes Reference 2 | 3 | This document provides a comprehensive list of error codes returned by BioMCP APIs, their meanings, and recommended actions. 
4 | 5 | ## HTTP Status Codes 6 | 7 | ### Success Codes (2xx) 8 | 9 | | Code | Status | Description | 10 | | ---- | ---------- | ---------------------------------------- | 11 | | 200 | OK | Request successful | 12 | | 201 | Created | Resource created successfully | 13 | | 204 | No Content | Request successful, no content to return | 14 | 15 | ### Client Error Codes (4xx) 16 | 17 | | Code | Status | Description | Action | 18 | | ---- | -------------------- | -------------------------- | -------------------------------------- | 19 | | 400 | Bad Request | Invalid request parameters | Check parameter format and values | 20 | | 401 | Unauthorized | Missing or invalid API key | Verify API key is correct | 21 | | 403 | Forbidden | Access denied to resource | Check permissions for API key | 22 | | 404 | Not Found | Resource not found | Verify ID exists and is correct format | 23 | | 409 | Conflict | Resource conflict | Check for duplicate requests | 24 | | 422 | Unprocessable Entity | Validation error | Review validation errors in response | 25 | | 429 | Too Many Requests | Rate limit exceeded | Implement backoff and retry | 26 | 27 | ### Server Error Codes (5xx) 28 | 29 | | Code | Status | Description | Action | 30 | | ---- | --------------------- | ------------------------------- | --------------------------------- | 31 | | 500 | Internal Server Error | Server error | Retry with exponential backoff | 32 | | 502 | Bad Gateway | Upstream service error | Wait and retry | 33 | | 503 | Service Unavailable | Service temporarily unavailable | Check service status, retry later | 34 | | 504 | Gateway Timeout | Request timeout | Retry with smaller request | 35 | 36 | ## BioMCP-Specific Error Codes 37 | 38 | ### Article Errors (1xxx) 39 | 40 | | Code | Error | Description | Example | 41 | | ---- | -------------------- | --------------------------- | ------------------------------ | 42 | | 1001 | INVALID_PMID | Invalid PubMed ID format | "abc123" instead of "12345678" | 43 | | 1002 | 
ARTICLE_NOT_FOUND | Article does not exist | PMID not in PubMed | 44 | | 1003 | DOI_NOT_FOUND | DOI cannot be resolved | Invalid or non-existent DOI | 45 | | 1004 | PUBTATOR_ERROR | PubTator3 annotation failed | Service temporarily down | 46 | | 1005 | PREPRINT_NOT_INDEXED | Preprint not yet indexed | Recently submitted preprint | 47 | 48 | ### Trial Errors (2xxx) 49 | 50 | | Code | Error | Description | Example | 51 | | ---- | ---------------- | ------------------------------ | ---------------------------- | 52 | | 2001 | INVALID_NCT_ID | Invalid NCT ID format | Missing "NCT" prefix | 53 | | 2002 | TRIAL_NOT_FOUND | Trial does not exist | NCT ID not registered | 54 | | 2003 | INVALID_LOCATION | Invalid geographic coordinates | Latitude > 90 | 55 | | 2004 | NCI_API_REQUIRED | NCI API key required | Using NCI source without key | 56 | | 2005 | INVALID_STATUS | Invalid trial status | Status not recognized | 57 | 58 | ### Variant Errors (3xxx) 59 | 60 | | Code | Error | Description | Example | 61 | | ---- | -------------------- | --------------------------------- | ---------------------- | 62 | | 3001 | INVALID_HGVS | Invalid HGVS notation | Malformed HGVS string | 63 | | 3002 | VARIANT_NOT_FOUND | Variant not in database | Novel variant | 64 | | 3003 | INVALID_ASSEMBLY | Invalid genome assembly | Not hg19 or hg38 | 65 | | 3004 | COORDINATE_MISMATCH | Coordinates don't match reference | Position out of range | 66 | | 3005 | ALPHAGENOME_REQUIRED | AlphaGenome API key required | Prediction without key | 67 | 68 | ### Gene/Drug/Disease Errors (4xxx) 69 | 70 | | Code | Error | Description | Example | 71 | | ---- | --------------------- | --------------------------- | ------------------------ | 72 | | 4001 | GENE_NOT_FOUND | Gene symbol not recognized | Non-standard symbol | 73 | | 4002 | DRUG_NOT_FOUND | Drug/chemical not found | Misspelled drug name | 74 | | 4003 | DISEASE_NOT_FOUND | Disease term not recognized | Non-standard terminology | 75 | | 4004 | 
SPECIES_NOT_SUPPORTED | Only human genes supported | Requesting mouse gene | 76 | | 4005 | AMBIGUOUS_QUERY | Multiple matches found | Common drug name | 77 | 78 | ### Authentication Errors (5xxx) 79 | 80 | | Code | Error | Description | Action | 81 | | ---- | ------------------------ | ---------------------------------- | ------------------- | 82 | | 5001 | API_KEY_INVALID | API key format invalid | Check key format | 83 | | 5002 | API_KEY_EXPIRED | API key has expired | Renew API key | 84 | | 5003 | API_KEY_REVOKED | API key was revoked | Contact support | 85 | | 5004 | INSUFFICIENT_PERMISSIONS | API key lacks required permissions | Upgrade API key | 86 | | 5005 | IP_NOT_ALLOWED | IP address not whitelisted | Add IP to whitelist | 87 | 88 | ### Rate Limit Errors (6xxx) 89 | 90 | | Code | Error | Description | Headers | 91 | | ---- | -------------------- | ---------------------------- | ---------------------------- | 92 | | 6001 | RATE_LIMIT_EXCEEDED | Too many requests | X-RateLimit-Remaining: 0 | 93 | | 6002 | DAILY_LIMIT_EXCEEDED | Daily quota exceeded | X-RateLimit-Reset: timestamp | 94 | | 6003 | CONCURRENT_LIMIT | Too many concurrent requests | X-Concurrent-Limit: 10 | 95 | | 6004 | BURST_LIMIT_EXCEEDED | Short-term rate limit | Retry-After: 60 | 96 | 97 | ### Validation Errors (7xxx) 98 | 99 | | Code | Error | Description | Example | 100 | | ---- | ---------------------- | --------------------------- | ------------------------------- | 101 | | 7001 | MISSING_REQUIRED_FIELD | Required parameter missing | Missing gene for variant search | 102 | | 7002 | INVALID_FIELD_TYPE | Wrong parameter type | String instead of integer | 103 | | 7003 | VALUE_OUT_OF_RANGE | Value outside allowed range | Page number < 1 | 104 | | 7004 | INVALID_ENUM_VALUE | Invalid enumeration value | Phase "PHASE5" | 105 | | 7005 | MUTUALLY_EXCLUSIVE | Conflicting parameters | Both PMID and DOI provided | 106 | 107 | ### External Service Errors (8xxx) 108 | 109 | | Code | Error | Description 
| Service | 110 | | ---- | -------------------------- | ------------------------ | ---------------- | 111 | | 8001 | PUBMED_UNAVAILABLE | PubMed API down | NCBI E-utilities | 112 | | 8002 | CLINICALTRIALS_UNAVAILABLE | ClinicalTrials.gov down | CT.gov API | 113 | | 8003 | BIOTHINGS_UNAVAILABLE | BioThings API down | MyGene/MyVariant | 114 | | 8004 | CBIOPORTAL_UNAVAILABLE | cBioPortal unavailable | cBioPortal API | 115 | | 8005 | EXTERNAL_TIMEOUT | External service timeout | Any external API | 116 | 117 | ## Error Response Format 118 | 119 | ### Standard Error Response 120 | 121 | ```json 122 | { 123 | "error": { 124 | "code": 1002, 125 | "type": "ARTICLE_NOT_FOUND", 126 | "message": "Article with PMID 99999999 not found", 127 | "details": { 128 | "pmid": "99999999", 129 | "searched_in": ["pubmed", "pmc", "preprints"] 130 | } 131 | }, 132 | "request_id": "req_abc123", 133 | "timestamp": "2024-03-15T10:30:00Z" 134 | } 135 | ``` 136 | 137 | ### Validation Error Response 138 | 139 | ```json 140 | { 141 | "error": { 142 | "code": 7001, 143 | "type": "MISSING_REQUIRED_FIELD", 144 | "message": "Validation failed", 145 | "details": { 146 | "errors": [ 147 | { 148 | "field": "gene", 149 | "message": "Gene symbol is required for variant search" 150 | }, 151 | { 152 | "field": "assembly", 153 | "message": "Assembly must be 'hg19' or 'hg38'" 154 | } 155 | ] 156 | } 157 | } 158 | } 159 | ``` 160 | 161 | ### Rate Limit Error Response 162 | 163 | ```json 164 | { 165 | "error": { 166 | "code": 6001, 167 | "type": "RATE_LIMIT_EXCEEDED", 168 | "message": "Rate limit of 180 requests per minute exceeded", 169 | "details": { 170 | "limit": 180, 171 | "remaining": 0, 172 | "reset": 1710504000, 173 | "retry_after": 45 174 | } 175 | }, 176 | "headers": { 177 | "X-RateLimit-Limit": "180", 178 | "X-RateLimit-Remaining": "0", 179 | "X-RateLimit-Reset": "1710504000", 180 | "Retry-After": "45" 181 | } 182 | } 183 | ``` 184 | 185 | ## Error Handling Best Practices 186 | 187 | ### 1. 
Implement Exponential Backoff 188 | 189 | ```python 190 | import asyncio 191 | import random 192 | 193 | def exponential_backoff(attempt: int, base_delay: float = 1.0): 194 | """Calculate exponential backoff with jitter.""" 195 | delay = base_delay * (2 ** attempt) 196 | jitter = random.uniform(0, delay * 0.1) 197 | return delay + jitter 198 | 199 | # Usage 200 | for attempt in range(5): 201 | try: 202 | response = await client.search(...) 203 | break 204 | except RateLimitError: 205 | delay = exponential_backoff(attempt) 206 | await asyncio.sleep(delay) 207 | ``` 208 | 209 | ### 2. Handle Specific Error Types 210 | 211 | ```python 212 | try: 213 | article = await client.articles.get(pmid) 214 | except BioMCPError as e: 215 | if e.code == 1002: # ARTICLE_NOT_FOUND 216 | # Try alternative sources 217 | article = await search_preprints(pmid) 218 | elif e.code == 6001: # RATE_LIMIT_EXCEEDED 219 | # Wait and retry 220 | await asyncio.sleep(e.retry_after) 221 | article = await client.articles.get(pmid) 222 | else: 223 | # Log and re-raise 224 | logger.error(f"Unexpected error: {e}") 225 | raise 226 | ``` 227 | 228 | ### 3. Parse Error Details 229 | 230 | ```python 231 | def handle_validation_error(error_response): 232 | """Extract and handle validation errors.""" 233 | if 7000 <= error_response["error"]["code"] < 8000: # Validation errors (7xxx) 234 | for error in error_response["error"]["details"]["errors"]: 235 | field = error["field"] 236 | message = error["message"] 237 | print(f"Validation error on {field}: {message}") 238 | ``` 239 | 240 | ### 4. 
Monitor Rate Limits 241 | 242 | ```python 243 | class RateLimitMonitor: 244 | def __init__(self): 245 | self.limits = {} 246 | 247 | def update_from_headers(self, headers): 248 | """Update rate limit state from response headers.""" 249 | self.limits["remaining"] = int(headers.get("X-RateLimit-Remaining", 0)) 250 | self.limits["reset"] = int(headers.get("X-RateLimit-Reset", 0)) 251 | 252 | if self.limits["remaining"] < 10: 253 | logger.warning(f"Rate limit low: {self.limits['remaining']} remaining") 254 | 255 | def should_delay(self): 256 | """Check if we should delay before next request.""" 257 | return self.limits.get("remaining", 100) < 5 258 | ``` 259 | 260 | ## Common Error Scenarios 261 | 262 | ### Scenario 1: Gene Symbol Not Found 263 | 264 | **Error:** 265 | 266 | ```json 267 | { 268 | "error": { 269 | "code": 4001, 270 | "type": "GENE_NOT_FOUND", 271 | "message": "Gene symbol 'HER2' not found. Did you mean 'ERBB2'?", 272 | "details": { 273 | "query": "HER2", 274 | "suggestions": ["ERBB2", "ERBB2IP"] 275 | } 276 | } 277 | } 278 | ``` 279 | 280 | **Solution:** 281 | 282 | ```python 283 | try: 284 | gene = await client.genes.get("HER2") 285 | except GeneNotFoundError as e: 286 | if e.suggestions: 287 | # Try first suggestion 288 | gene = await client.genes.get(e.suggestions[0]) 289 | ``` 290 | 291 | ### Scenario 2: Location Search Without Coordinates 292 | 293 | **Error:** 294 | 295 | ```json 296 | { 297 | "error": { 298 | "code": 7001, 299 | "type": "MISSING_REQUIRED_FIELD", 300 | "message": "Latitude and longitude required for location search", 301 | "details": { 302 | "hint": "Use geocoding service to convert city names to coordinates" 303 | } 304 | } 305 | } 306 | ``` 307 | 308 | **Solution:** 309 | 310 | ```python 311 | # Use a geocoding service first 312 | coords = await geocode("Boston, MA") 313 | trials = await client.trials.search( 314 | conditions=["cancer"], 315 | lat=coords.lat, 316 | long=coords.long, 317 | distance=50 318 | ) 319 | ``` 320 | 321 
| ### Scenario 3: API Key Required 322 | 323 | **Error:** 324 | 325 | ```json 326 | { 327 | "error": { 328 | "code": 2004, 329 | "type": "NCI_API_REQUIRED", 330 | "message": "NCI API key required for this operation", 331 | "details": { 332 | "get_key_url": "https://api.cancer.gov", 333 | "feature": "biomarker_search" 334 | } 335 | } 336 | } 337 | ``` 338 | 339 | **Solution:** 340 | 341 | ```python 342 | # Initialize client with API key 343 | client = BioMCPClient(nci_api_key=os.getenv("NCI_API_KEY")) 344 | 345 | # Or provide per-request 346 | trials = await client.trials.search( 347 | source="nci", 348 | conditions=["melanoma"], 349 | api_key="your-nci-key" 350 | ) 351 | ``` 352 | 353 | ## Debugging Tips 354 | 355 | ### 1. Enable Debug Logging 356 | 357 | ```python 358 | import logging 359 | 360 | logging.basicConfig(level=logging.DEBUG) 361 | logger = logging.getLogger("biomcp") 362 | ``` 363 | 364 | ### 2. Inspect Raw Responses 365 | 366 | ```python 367 | # Enable raw response mode 368 | client = BioMCPClient(debug=True) 369 | 370 | # Access raw response 371 | response = await client.articles.search(genes=["BRAF"]) 372 | print(response.raw_response) 373 | ``` 374 | 375 | ### 3. Capture Request IDs 376 | 377 | ```python 378 | try: 379 | result = await client.search(...) 380 | except BioMCPError as e: 381 | print(f"Request ID: {e.request_id}") 382 | # Include request_id when reporting issues 383 | ``` 384 | 385 | ## Support 386 | 387 | For error codes not listed here or persistent issues: 388 | 389 | 1. Check [FAQ](../faq-condensed.md) for common issues 390 | 2. Search [GitHub Issues](https://github.com/genomoncology/biomcp/issues) 391 | 3. 
Report new issues with: 392 | - Error code and message 393 | - Request ID if available 394 | - Minimal code to reproduce 395 | - BioMCP version 396 | ``` -------------------------------------------------------------------------------- /docs/policies.md: -------------------------------------------------------------------------------- ```markdown 1 | # GenomOncology Remote MCP 2 | 3 | **Privacy Policy** 4 | **Version 1.2 – Effective June 18, 2025** 5 | 6 | ## 1. Data We Collect 7 | 8 | | Type | Examples | Source | Storage | 9 | | ------------------------- | ---------------------------------------- | -------------------- | -------------- | 10 | | **Account** | Google user ID, email, display name | From Google OAuth | BigQuery | 11 | | **Queries** | Prompts, timestamps | User input | BigQuery | 12 | | **Operational** | IP address, user-agent | Automatic | Temporary only | 13 | | **Usage** | Token counts, latency, model performance | Derived metrics | Aggregated | 14 | | **Third-Party Responses** | API responses from PubMed, bioRxiv, etc. | Third-party services | Not stored | 15 | 16 | We do **not** collect sensitive health or demographic information. 17 | 18 | --- 19 | 20 | ## 2. How We Use It 21 | 22 | - Authenticate and secure the service 23 | - Improve quality, accuracy, and speed of model output 24 | - Analyze aggregate usage for insights 25 | - Monitor third-party API performance (without storing responses) 26 | - Comply with laws 27 | 28 | --- 29 | 30 | ## 3. Legal Basis (GDPR/UK) 31 | 32 | - **Contractual necessity** (Art. 6(1)(b) GDPR) 33 | - **Legitimate interests** (Art. 6(1)(f)) 34 | - **Consent**, where applicable 35 | 36 | --- 37 | 38 | ## 4. 
Who We Share With 39 | 40 | - **Google Cloud / Cloudflare** – Hosting & Auth 41 | - **API providers** – e.g., PubMed, bioRxiv 42 | - Your queries are transmitted to these services 43 | - We do not control their data retention practices 44 | - We do not store third-party responses 45 | - **Analytics tools** – e.g., BigQuery 46 | - **Authorities** – if required by law 47 | 48 | We **do not sell** your personal data. 49 | 50 | --- 51 | 52 | ## 5. Third-Party Data Handling 53 | 54 | When you use the Service: 55 | 56 | - Your queries may be sent to third-party APIs (PubMed, bioRxiv, TCGA, 1000 Genomes) 57 | - These services have their own privacy policies and data practices 58 | - We use third-party responses to generate output but do not store them 59 | - Third parties may independently retain query data per their policies 60 | - Only your username and queries are stored in our systems 61 | 62 | --- 63 | 64 | ## 6. Cookies 65 | 66 | We use only **Google OAuth** session cookies. 67 | No additional tracking cookies are set. 68 | 69 | --- 70 | 71 | ## 7. Data Retention 72 | 73 | - **BigQuery storage** (usernames & queries): Retained indefinitely 74 | - **Operational data** (IP, user-agent): Not retained 75 | - **Third-party responses**: Not stored 76 | - **Aggregated metrics**: Retained indefinitely 77 | - **Account Username**: Retained until deletion requested 78 | 79 | --- 80 | 81 | ## 8. 
Security 82 | 83 | - All data encrypted in transit (TLS 1.3) 84 | - Least-privilege access enforced via IAM 85 | - Username and query data stored in BigQuery with strict access control 86 | - Operational data (IP, user-agent) processed but not retained 87 | - **Incident Response**: Security incidents investigated within 24 hours 88 | - **Breach Notification**: Users notified within 72 hours of confirmed breach 89 | - **Security Audits**: Annual third-party security assessments 90 | - **Vulnerability Reporting**: See our [SECURITY.md](https://github.com/genomoncology/biomcp/blob/main/docs/biomcp-security.md) 91 | 92 | --- 93 | 94 | ## 9. International Transfers 95 | 96 | Data is stored in **Google Cloud's `us-central1`**. 97 | Transfers from the EU/UK rely on **SCCs**. 98 | 99 | --- 100 | 101 | ## 10. Your Rights 102 | 103 | Depending on your location, you may request to: 104 | 105 | - Access, correct, or delete your data 106 | - Restrict or object to processing 107 | - Port your data 108 | - File a complaint (EEA/UK) 109 | - Opt out (California residents) 110 | 111 | **Data Export**: 112 | 113 | - Available in JSON or CSV format 114 | - Requests fulfilled within 30 days 115 | - Includes: account info, queries, timestamps 116 | - Excludes: operational data, third-party responses, aggregated metrics 117 | 118 | Email: **[email protected]** 119 | 120 | --- 121 | 122 | ## 11. Children's Privacy 123 | 124 | The Service is not intended for use by anyone under **16 years old**. 125 | 126 | --- 127 | 128 | ## 12. Policy Changes 129 | 130 | We will update this document at `/privacy` with an updated Effective Date. 131 | Material changes will be announced by email. 132 | Version history maintained at: [github.com/genomoncology/biomcp/blob/main/docs/biomcp-privacy.md](https://github.com/genomoncology/biomcp/blob/main/docs/biomcp-privacy.md) 133 | 134 | --- 135 | 136 | ## 13. 
Contact 137 | 138 | **Data Protection Officer** 139 | 📧 **[email protected]** 140 | 📮 GenomOncology LLC – Privacy Office 141 | 1138 West 9th Street, Suite 400 142 | Cleveland, OH 44113 143 | 144 | # Security Policy 145 | 146 | ## Reporting a Vulnerability 147 | 148 | We take the security of biomcp seriously. If you believe you have found a security vulnerability, please report it to us as described below. 149 | 150 | ### Please do NOT: 151 | 152 | - Open a public GitHub issue 153 | - Discuss the vulnerability publicly before it has been addressed 154 | 155 | ### Please DO: 156 | 157 | - Email us at **[email protected]** 158 | - Include the word "SECURITY" in the subject line 159 | - Provide detailed steps to reproduce the vulnerability 160 | - Include the impact and potential attack scenarios 161 | 162 | ### What to expect: 163 | 164 | - **Acknowledgment**: Within 24 hours 165 | - **Initial Assessment**: Within 72 hours 166 | - **Status Updates**: At least every 5 business days 167 | - **Resolution Target**: Critical issues within 30 days 168 | 169 | ### Scope 170 | 171 | Vulnerabilities in the following areas are in scope: 172 | 173 | - Authentication bypass or privilege escalation 174 | - Data exposure or unauthorized access to user queries 175 | - Injection vulnerabilities (SQL, command, etc.) 176 | - Cross-site scripting (XSS) or request forgery (CSRF) 177 | - Denial of service vulnerabilities 178 | - Insecure cryptographic implementations 179 | - Third-party API key exposure 180 | 181 | ### Out of Scope: 182 | 183 | - Vulnerabilities in third-party services (PubMed, bioRxiv, etc.) 
184 | - Issues in dependencies with existing patches 185 | - Social engineering attacks 186 | - Physical attacks 187 | - Attacks requiring authenticated admin access 188 | 189 | ## Disclosure Policy 190 | 191 | - We will work with you to understand and validate the issue 192 | - We will prepare a fix and release it as soon as possible 193 | - We will publicly disclose the vulnerability after the fix is released 194 | - We will credit you for the discovery (unless you prefer to remain anonymous) 195 | 196 | ## Safe Harbor 197 | 198 | Any activities conducted in a manner consistent with this policy will be considered authorized conduct, and we will not initiate legal action against you. If legal action is initiated by a third party against you in connection with activities conducted under this policy, we will take steps to make it known that your actions were conducted in compliance with this policy. 199 | 200 | ## Contact 201 | 202 | **Security Team Email**: [email protected] 203 | **PGP Key**: Available upon request 204 | 205 | Thank you for helping keep biomcp and our users safe! 206 | 207 | # GenomOncology Remote MCP 208 | 209 | **Terms of Service** 210 | **Version 1.2 – Effective June 18, 2025** 211 | 212 | > This document applies to the **hosted Remote MCP service** (the "Service") provided by **GenomOncology LLC**. 213 | > 214 | > For use of the **open-source code** available at [https://github.com/genomoncology/biomcp](https://github.com/genomoncology/biomcp), refer to the repository's LICENSE file (e.g., MIT License). 215 | 216 | --- 217 | 218 | ## 1. Definitions 219 | 220 | | Term | Meaning | 221 | | --------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------- | 222 | | **Service** | The hosted Model Context Protocol (MCP) instance available via Cloudflare and secured by Google OAuth. 
| 223 | | **User Content** | Prompts, messages, files, code, or other material submitted by you. | 224 | | **Output** | Model-generated text or data produced in response to your User Content. | 225 | | **Personal Data** | Information that identifies or relates to an identifiable individual, including Google account identifiers and query text. | 226 | | **Commercial Use** | Any use that directly or indirectly generates revenue, including but not limited to: selling access, integrating into paid products, or using for business operations. | 227 | | **Academic Research** | Non-commercial research conducted by accredited educational institutions for scholarly purposes. | 228 | 229 | --- 230 | 231 | ## 2. Eligibility & Accounts 232 | 233 | You must: 234 | 235 | - Be at least 16 years old 236 | - Have a valid Google account 237 | - Not be barred from receiving services under applicable law 238 | 239 | Authentication is handled via **Google OAuth**. Keep your credentials secure. 240 | 241 | --- 242 | 243 | ## 3. License & Intellectual Property 244 | 245 | You are granted a **limited, revocable, non-exclusive, non-transferable** license to use the Service for **internal research and non-commercial evaluation**. 246 | 247 | **Permitted Uses:** 248 | 249 | - Personal research and learning 250 | - Academic research (with attribution) 251 | - Evaluation for potential commercial licensing 252 | - Open-source development (non-commercial) 253 | 254 | **Prohibited Commercial Uses:** 255 | 256 | - Reselling or redistributing Service access 257 | - Integration into commercial products/services 258 | - Use in revenue-generating operations 259 | - Commercial data analysis or insights 260 | 261 | For commercial licensing inquiries, contact: **[email protected]** 262 | 263 | We retain all rights in the Service and its software. 
264 | You retain ownership of your User Content, but grant us a royalty-free, worldwide license to use it (and the resulting Output) to provide, secure, and improve the Service. 265 | 266 | --- 267 | 268 | ## 4. Acceptable Use & Rate Limits 269 | 270 | You **must not**: 271 | 272 | 1. Violate any law or regulation 273 | 2. Reverse-engineer, scrape, or probe the Service or model weights 274 | 3. Exceed rate limits or disrupt the Service 275 | 276 | **Rate Limits:** 277 | 278 | - **Standard tier**: 100 requests per hour, 1000 per day 279 | - **Burst limit**: 10 requests per minute 280 | - **Payload size**: 50KB per request 281 | 282 | **Exceeding Limits:** 283 | 284 | - First violation: 1-hour suspension 285 | - Repeated violations: Account review and possible termination 286 | - Higher limits available upon request: **[email protected]** 287 | 288 | --- 289 | 290 | ## 5. Privacy, Logging & Improvement 291 | 292 | We store **Google user ID**, **email address**, and **query text** with **timestamps** in **Google BigQuery**. This data is analyzed to: 293 | 294 | - Operate and secure the Service 295 | - Improve system performance and user experience 296 | - Tune models and develop features 297 | - Generate usage analytics 298 | 299 | **Note**: We process but do not retain operational data like IP addresses or user-agents. Third-party API responses are used in real-time but not stored. 300 | 301 | See our [Privacy Policy](https://github.com/genomoncology/biomcp/blob/main/docs/biomcp-privacy.md) for details. 302 | 303 | --- 304 | 305 | ## 6. Third‑Party Services 306 | 307 | The Service queries third-party APIs and knowledge sources (e.g., **PubMed, bioRxiv, TCGA, 1000 Genomes**) to respond to user prompts. 
308 | 309 | **Important:** 310 | 311 | - Your queries are transmitted to these services 312 | - Third-party services have independent terms and privacy policies 313 | - We cannot guarantee their availability, accuracy, or uptime 314 | - Third parties may retain your query data per their policies 315 | - API responses are used to generate output but not stored by us 316 | 317 | You acknowledge that third-party content is subject to their respective licenses and terms. 318 | 319 | --- 320 | 321 | ## 7. Disclaimers 322 | 323 | - **AI Output:** May be inaccurate or biased. **Do not rely on it for medical or legal decisions.** 324 | - **AS‑IS:** The Service is provided _"as is"_ with no warranties or guarantees. 325 | - **Third-Party Content:** We are not responsible for accuracy or availability of third-party data. 326 | 327 | --- 328 | 329 | ## 8. Limitation of Liability 330 | 331 | To the extent permitted by law, **GenomOncology** is not liable for indirect, incidental, or consequential damages, including: 332 | 333 | - Data loss 334 | - Business interruption 335 | - Inaccurate output 336 | - Third-party service failures 337 | 338 | --- 339 | 340 | ## 9. Indemnification 341 | 342 | You agree to indemnify and hold GenomOncology harmless from any claim resulting from your misuse of the Service. 343 | 344 | --- 345 | 346 | ## 10. Termination 347 | 348 | We may suspend or terminate access at any time. Upon termination: 349 | 350 | - Your license ends immediately 351 | - We retain stored data (username & queries) per our Privacy Policy 352 | - You may request data export within 30 days 353 | 354 | --- 355 | 356 | ## 11. Governing Law & Dispute Resolution 357 | 358 | These Terms are governed by the laws of **Ohio, USA**. 359 | Disputes will be resolved via binding arbitration in **Cuyahoga County, Ohio**, under **JAMS Streamlined Rules**. 360 | 361 | --- 362 | 363 | ## 12. Changes 364 | 365 | We may update these Terms by posting to `/terms`. 
366 | Material changes will be emailed. Continued use constitutes acceptance. 367 | Version history: [github.com/genomoncology/biomcp/blob/main/docs/biomcp-terms.md](https://github.com/genomoncology/biomcp/blob/main/docs/biomcp-terms.md) 368 | 369 | --- 370 | 371 | ## 13. Security & Vulnerability Reporting 372 | 373 | Found a security issue? Please report it responsibly: 374 | 375 | - Email: **[email protected]** 376 | - See: [SECURITY.md](https://github.com/genomoncology/biomcp/blob/main/SECURITY.md) 377 | 378 | --- 379 | 380 | ## 14. Contact 381 | 382 | GenomOncology LLC 383 | 1138 West 9th Street, Suite 400 384 | Cleveland, OH 44113 385 | 📧 **[email protected]** 386 | 387 | --- 388 | 389 | ## Appendix A – Acceptable Use Policy (AUP) 390 | 391 | - Do not submit illegal, harassing, or hateful content 392 | - Do not generate malware, spam, or scrape personal data 393 | - Respect copyright and IP laws 394 | - Do not attempt to re-identify individuals from model output 395 | - Do not use the Service to process protected health information (PHI) 396 | - Do not submit personally identifiable genetic data 397 | ``` -------------------------------------------------------------------------------- /tests/bdd/steps/test_alphagenome_steps.py: -------------------------------------------------------------------------------- ```python 1 | """Step definitions for AlphaGenome integration BDD tests.""" 2 | 3 | import asyncio 4 | import os 5 | from unittest.mock import MagicMock, patch 6 | 7 | import pandas as pd 8 | import pytest 9 | from pytest_bdd import given, parsers, scenarios, then, when 10 | 11 | from biomcp.variants.alphagenome import predict_variant_effects 12 | 13 | # Load all scenarios from the feature file 14 | scenarios("../features/alphagenome_integration.feature") 15 | 16 | 17 | @pytest.fixture 18 | def alphagenome_context(): 19 | """Fixture to maintain test context.""" 20 | context = {} 21 | yield context 22 | # Cleanup: restore original API key if it was stored 23 
@given("the ALPHAGENOME_API_KEY is not set")
def no_api_key(alphagenome_context):
    """Remove ALPHAGENOME_API_KEY from the environment for this scenario.

    The value that was set beforehand (``None`` when the variable was
    absent) is stored under ``alphagenome_context["original_key"]``.
    """
    # pop() with a default reads the old value and removes the variable in
    # one step, and is a no-op when the variable is already unset.
    alphagenome_context["original_key"] = os.environ.pop(
        "ALPHAGENOME_API_KEY", None
    )
@when(
    parsers.parse(
        "I request predictions for variant {variant} with threshold {threshold:f}"
    )
)
def request_prediction_with_threshold(alphagenome_context, variant, threshold):
    """Run a prediction against mocked AlphaGenome modules with a custom threshold.

    ``variant`` is split on whitespace into a ``chrom:pos`` token and a
    ``ref>alt`` token.  The patched scorer returns a fixed four-row score
    table and the prediction is requested with
    ``significance_threshold=threshold``.  Both the returned value and the
    threshold are stored in ``alphagenome_context``.
    """
    # Stand-ins for the parts of the alphagenome package that are patched in.
    genome_stub = MagicMock()
    client_stub = MagicMock()
    client_stub.create.return_value = MagicMock()

    # Fixed scores of differing magnitude and sign.
    score_table = pd.DataFrame({
        "output_type": ["RNA_SEQ", "RNA_SEQ", "ATAC", "SPLICE"],
        "raw_score": [0.2, 0.4, -0.35, 0.6],
        "gene_name": ["GENE1", "GENE2", None, None],
        "track_name": [None, None, "tissue1", None],
    })
    scorers_stub = MagicMock()
    scorers_stub.tidy_scores.return_value = score_table
    scorers_stub.get_recommended_scorers.return_value = []

    fake_env = {"ALPHAGENOME_API_KEY": "test-key"}
    fake_modules = {
        "alphagenome.data": MagicMock(genome=genome_stub),
        "alphagenome.models": MagicMock(
            dna_client=client_stub, variant_scorers=scorers_stub
        ),
    }

    with patch.dict("os.environ", fake_env), patch.dict(
        "sys.modules", fake_modules
    ):
        # "chrN:pos ref>alt" -> chromosome, position, reference, alternate
        tokens = variant.split()
        locus, allele_pair = tokens[0], tokens[1]
        chromosome, position = locus.split(":")
        reference, alternate = allele_pair.split(">")

        prediction = asyncio.run(
            predict_variant_effects(
                chromosome,
                int(position),
                reference,
                alternate,
                significance_threshold=threshold,
            )
        )

    alphagenome_context["result"] = prediction
    alphagenome_context["threshold"] = threshold
@then("I should receive instructions on how to obtain an API key")
def check_api_key_instructions(alphagenome_context):
    """Assert that the stored result carries the API-key instructions.

    The result must contain, in this order of checking, the "key required"
    notice, the AlphaGenome URL and the "ACTION REQUIRED" marker.
    """
    message = alphagenome_context["result"]
    required_fragments = (
        "AlphaGenome API key required",
        "https://deepmind.google.com/science/alphagenome",
        "ACTION REQUIRED",
    )
    for fragment in required_fragments:
        assert fragment in message
@then("the summary should reflect the custom threshold value")
def check_custom_threshold(alphagenome_context):
    """Assert that the result text quotes the threshold stored in the context."""
    summary_text = alphagenome_context["result"]
    expected_fragment = f"|log₂| > {alphagenome_context['threshold']}"
    assert expected_fragment in summary_text
@then("the context should show the specified tissue types")
def check_tissue_context(alphagenome_context):
    """Assert that each requested tissue is covered by the result.

    A tissue counts as covered when its name appears in the result or when
    the result mentions "AlphaGenome" at all.  With no stored tissues the
    check passes trivially.
    """
    output = alphagenome_context["result"]
    requested = alphagenome_context.get("tissues", [])
    assert all(
        (name in output) or ("AlphaGenome" in output) for name in requested
    )
class TestUnifiedBioThingsSearch:
    """Unified ``search`` against the gene, drug and disease domains.

    Every test replaces ``biomcp.router.BioThingsClient`` with a local stub
    and expects exactly one hit once the ``thinking-reminder`` entry, if
    present, has been filtered out.
    """

    @staticmethod
    def _without_reminder(payload):
        """Return the entries of ``payload["results"]`` whose id is not ``thinking-reminder``."""
        kept = []
        for entry in payload["results"]:
            if entry["id"] != "thinking-reminder":
                kept.append(entry)
        return kept

    @pytest.mark.asyncio
    async def test_search_gene_domain(self, monkeypatch):
        """A gene keyword search returns the stubbed BRAF record."""
        query_hits = [{"_id": "673", "symbol": "BRAF"}]
        gene_record = {
            "_id": "673",
            "symbol": "BRAF",
            "name": "B-Raf proto-oncogene, serine/threonine kinase",
            "summary": "This gene encodes a protein belonging to the RAF family...",
            "entrezgene": 673,
        }

        class FakeBioThingsClient:
            """Stub client with canned gene responses."""

            async def _query_gene(self, query):
                return query_hits

            async def _get_gene_by_id(self, gene_id):
                from biomcp.integrations.biothings_client import GeneInfo

                return GeneInfo(**gene_record)

        monkeypatch.setattr(
            "biomcp.router.BioThingsClient", FakeBioThingsClient
        )

        response = await search(query="", domain="gene", keywords=["BRAF"])

        assert "results" in response
        hits = self._without_reminder(response)
        assert len(hits) == 1
        only_hit = hits[0]
        assert only_hit["id"] == "673"
        assert "BRAF" in only_hit["title"]

    @pytest.mark.asyncio
    async def test_search_drug_domain(self, monkeypatch):
        """A drug keyword search returns the stubbed Imatinib record."""
        query_hits = [{"_id": "CHEMBL941"}]
        drug_record = {
            "_id": "CHEMBL941",
            "name": "Imatinib",
            "drugbank_id": "DB00619",
            "description": "Imatinib is a tyrosine kinase inhibitor...",
            "indication": "Treatment of chronic myeloid leukemia...",
        }

        class FakeBioThingsClient:
            """Stub client with canned drug responses."""

            async def _query_drug(self, query):
                return query_hits

            async def _get_drug_by_id(self, drug_id):
                from biomcp.integrations.biothings_client import DrugInfo

                return DrugInfo(**drug_record)

        monkeypatch.setattr(
            "biomcp.router.BioThingsClient", FakeBioThingsClient
        )

        response = await search(query="", domain="drug", keywords=["imatinib"])

        assert "results" in response
        hits = self._without_reminder(response)
        assert len(hits) == 1
        only_hit = hits[0]
        assert only_hit["id"] == "CHEMBL941"
        assert "Imatinib" in only_hit["title"]

    @pytest.mark.asyncio
    async def test_search_disease_domain(self, monkeypatch):
        """A disease keyword search returns the stubbed melanoma record."""
        query_hits = [{"_id": "MONDO:0005105"}]
        disease_record = {
            "_id": "MONDO:0005105",
            "name": "melanoma",
            "definition": "A malignant neoplasm composed of melanocytes.",
            "mondo": {"id": "MONDO:0005105"},
            "phenotypes": [],
        }

        class FakeBioThingsClient:
            """Stub client with canned disease responses."""

            async def _query_disease(self, query):
                return query_hits

            async def _get_disease_by_id(self, disease_id):
                from biomcp.integrations.biothings_client import DiseaseInfo

                return DiseaseInfo(**disease_record)

        monkeypatch.setattr(
            "biomcp.router.BioThingsClient", FakeBioThingsClient
        )

        response = await search(
            query="", domain="disease", keywords=["melanoma"]
        )

        assert "results" in response
        hits = self._without_reminder(response)
        assert len(hits) == 1
        only_hit = hits[0]
        assert only_hit["id"] == "MONDO:0005105"
        assert "melanoma" in only_hit["title"]
family...", 141 | "entrezgene": 673, 142 | "type_of_gene": "protein-coding", 143 | "alias": ["BRAF1", "B-RAF1"], 144 | } 145 | 146 | class MockBioThingsClient: 147 | async def get_gene_info(self, gene_id): 148 | from biomcp.integrations.biothings_client import GeneInfo 149 | 150 | return GeneInfo(**mock_gene_info) 151 | 152 | monkeypatch.setattr( 153 | "biomcp.router.BioThingsClient", MockBioThingsClient 154 | ) 155 | 156 | # Test gene fetch 157 | result = await fetch(id="BRAF", domain="gene") 158 | 159 | assert result["id"] == "673" 160 | assert "BRAF" in result["title"] 161 | assert "B-Raf proto-oncogene" in result["title"] 162 | assert "Entrez ID: 673" in result["text"] 163 | assert "Type: protein-coding" in result["text"] 164 | 165 | @pytest.mark.asyncio 166 | async def test_fetch_drug(self, monkeypatch): 167 | """Test fetching drug information.""" 168 | mock_drug_info = { 169 | "_id": "CHEMBL941", 170 | "name": "Imatinib", 171 | "drugbank_id": "DB00619", 172 | "description": "Imatinib is a tyrosine kinase inhibitor...", 173 | "indication": "Treatment of chronic myeloid leukemia...", 174 | "mechanism_of_action": "Inhibits BCR-ABL tyrosine kinase...", 175 | "tradename": ["Gleevec", "Glivec"], 176 | "formula": "C29H31N7O", 177 | } 178 | 179 | class MockBioThingsClient: 180 | async def get_drug_info(self, drug_id): 181 | from biomcp.integrations.biothings_client import DrugInfo 182 | 183 | return DrugInfo(**mock_drug_info) 184 | 185 | monkeypatch.setattr( 186 | "biomcp.router.BioThingsClient", MockBioThingsClient 187 | ) 188 | 189 | # Test drug fetch 190 | result = await fetch(id="imatinib", domain="drug") 191 | 192 | assert result["id"] == "CHEMBL941" 193 | assert "Imatinib" in result["title"] 194 | assert "DrugBank ID: DB00619" in result["text"] 195 | assert "Formula: C29H31N7O" in result["text"] 196 | assert "Trade Names: Gleevec, Glivec" in result["text"] 197 | 198 | @pytest.mark.asyncio 199 | async def test_fetch_disease(self, monkeypatch): 200 | """Test 
class TestUnifiedQueryLanguage:
    """Routing of unified query strings to the domain searchers.

    ``biomcp.router.execute_routing_plan`` is replaced by a recorder that
    collects ``plan.tools_to_call`` and returns JSON-encoded empty lists.
    """

    @pytest.mark.asyncio
    async def test_cross_domain_gene_search(self, monkeypatch):
        """``gene:BRAF`` must call the gene, article and variant searchers."""
        called_tools = []

        async def record_plan(plan, output_json=True):
            called_tools.extend(plan.tools_to_call)
            empty = json.dumps([])
            return {
                "articles": empty,
                "variants": empty,
                "genes": empty,
                "trials": empty,
            }

        monkeypatch.setattr("biomcp.router.execute_routing_plan", record_plan)

        await search(query="gene:BRAF")

        for tool in ("gene_searcher", "article_searcher", "variant_searcher"):
            assert tool in called_tools

    @pytest.mark.asyncio
    async def test_cross_domain_disease_search(self, monkeypatch):
        """``disease:melanoma`` must call the disease, article and trial searchers."""
        called_tools = []

        async def record_plan(plan, output_json=True):
            called_tools.extend(plan.tools_to_call)
            empty = json.dumps([])
            return {
                "articles": empty,
                "trials": empty,
                "diseases": empty,
            }

        monkeypatch.setattr("biomcp.router.execute_routing_plan", record_plan)

        await search(query="disease:melanoma")

        for tool in ("disease_searcher", "article_searcher", "trial_searcher"):
            assert tool in called_tools

    @pytest.mark.asyncio
    async def test_domain_specific_query(self, monkeypatch):
        """``genes.symbol:BRAF`` must call the gene searcher and nothing else."""
        called_tools = []

        async def record_plan(plan, output_json=True):
            called_tools.extend(plan.tools_to_call)
            return {"genes": json.dumps([])}

        monkeypatch.setattr("biomcp.router.execute_routing_plan", record_plan)

        await search(query="genes.symbol:BRAF")

        assert "gene_searcher" in called_tools
        # Exactly one tool recorded: only the gene domain was searched.
        assert len(called_tools) == 1
322 | "biomcp.router.BioThingsClient", MockBioThingsClient 323 | ) 324 | 325 | # Test that search handles the error gracefully 326 | with pytest.raises(Exception) as exc_info: 327 | await search(query="", domain="gene", keywords=["BRAF"]) 328 | 329 | assert "API connection failed" in str(exc_info.value) 330 | 331 | @pytest.mark.asyncio 332 | async def test_drug_not_found(self, monkeypatch): 333 | """Test handling when drug is not found.""" 334 | 335 | class MockBioThingsClient: 336 | async def _query_drug(self, query): 337 | return [] # No results 338 | 339 | monkeypatch.setattr( 340 | "biomcp.router.BioThingsClient", MockBioThingsClient 341 | ) 342 | 343 | results = await search( 344 | query="", domain="drug", keywords=["nonexistent"] 345 | ) 346 | assert "results" in results 347 | actual_results = [ 348 | r for r in results["results"] if r["id"] != "thinking-reminder" 349 | ] 350 | assert len(actual_results) == 0 351 | 352 | @pytest.mark.asyncio 353 | async def test_disease_invalid_id(self, monkeypatch): 354 | """Test handling of invalid disease ID in fetch.""" 355 | 356 | class MockBioThingsClient: 357 | async def get_disease_info(self, disease_id): 358 | return None # Not found 359 | 360 | monkeypatch.setattr( 361 | "biomcp.router.BioThingsClient", MockBioThingsClient 362 | ) 363 | 364 | result = await fetch(id="INVALID:12345", domain="disease") 365 | assert "error" in result 366 | assert "not found" in result["error"].lower() 367 | 368 | @pytest.mark.asyncio 369 | async def test_gene_partial_data(self, monkeypatch): 370 | """Test handling of incomplete gene data.""" 371 | mock_gene_query = [{"_id": "673"}] # Missing symbol 372 | mock_gene_details = { 373 | "_id": "673", 374 | # Missing symbol, name, summary 375 | "entrezgene": 673, 376 | } 377 | 378 | class MockBioThingsClient: 379 | async def _query_gene(self, query): 380 | return mock_gene_query 381 | 382 | async def _get_gene_by_id(self, gene_id): 383 | from biomcp.integrations.biothings_client import 
class TestOrganizationTools:
    """NCI organization searcher/getter tools with the backend calls patched out."""

    @pytest.mark.asyncio
    async def test_organization_searcher_tool(self):
        """The searcher returns the formatted text and forwards its arguments."""
        canned_search = {
            "total": 2,
            "organizations": [
                {
                    "id": "ORG001",
                    "name": "Test Cancer Center",
                    "type": "Academic",
                    "city": "Boston",
                    "state": "MA",
                    "country": "US",
                },
                {
                    "id": "ORG002",
                    "name": "Another Cancer Center",
                    "type": "Academic",
                    "city": "New York",
                    "state": "NY",
                    "country": "US",
                },
            ],
        }
        canned_text = "## Organization Search Results\n\nFound 2 organizations"

        search_patch = patch(
            "biomcp.organizations.search_organizations",
            return_value=canned_search,
        )
        format_patch = patch(
            "biomcp.organizations.search.format_organization_results",
            return_value=canned_text,
        )
        with search_patch as search_mock, format_patch:
            output = await nci_organization_searcher(
                name="Cancer Center",
                organization_type="Academic",
                city="Boston",
                api_key="test-key",
            )

            assert "Found 2 organizations" in output
            # Keyword arguments the backend search is expected to receive.
            expected_kwargs = {
                "name": "Cancer Center",
                "org_type": "Academic",
                "city": "Boston",
                "state": None,
                "page_size": 20,
                "page": 1,
                "api_key": "test-key",
            }
            search_mock.assert_called_once_with(**expected_kwargs)

    @pytest.mark.asyncio
    async def test_organization_getter_tool(self):
        """The getter returns the formatted text and forwards id and API key."""
        canned_org = {
            "id": "ORG001",
            "name": "Test Cancer Center",
            "type": "Academic",
            "address": {
                "street": "123 Medical Way",
                "city": "Boston",
                "state": "MA",
                "zip": "02115",
                "country": "US",
            },
            "contact": {"phone": "555-1234", "email": "[email protected]"},
        }
        canned_text = (
            "## Test Cancer Center\n\nType: Academic\nLocation: Boston, MA"
        )

        get_patch = patch(
            "biomcp.organizations.get_organization", return_value=canned_org
        )
        format_patch = patch(
            "biomcp.organizations.getter.format_organization_details",
            return_value=canned_text,
        )
        with get_patch as get_mock, format_patch:
            output = await nci_organization_getter(
                organization_id="ORG001", api_key="test-key"
            )

            for fragment in ("Test Cancer Center", "Academic"):
                assert fragment in output
            get_mock.assert_called_once_with(
                org_id="ORG001",
                api_key="test-key",
            )
class TestToolsWithoutAPIKey:
    """Searcher tools when the patched backend raises ``CTSAPIError`` for a missing key.

    Both tests expect the error to propagate out of the tool unchanged.
    """

    @pytest.mark.asyncio
    async def test_organization_searcher_no_api_key(self):
        """The organization searcher re-raises the backend's ``CTSAPIError``."""
        from biomcp.integrations.cts_api import CTSAPIError

        backend_patch = patch(
            "biomcp.organizations.search_organizations",
            side_effect=CTSAPIError("NCI API key required"),
        )
        with backend_patch, pytest.raises(
            CTSAPIError, match="NCI API key required"
        ):
            await nci_organization_searcher(name="Cancer Center")

    @pytest.mark.asyncio
    async def test_intervention_searcher_no_api_key(self):
        """The intervention searcher re-raises the backend's ``CTSAPIError``."""
        from biomcp.integrations.cts_api import CTSAPIError

        backend_patch = patch(
            "biomcp.interventions.search_interventions",
            side_effect=CTSAPIError("NCI API key required"),
        )
        with backend_patch, pytest.raises(
            CTSAPIError, match="NCI API key required"
        ):
            await nci_intervention_searcher(name="pembrolizumab")
class TestBiomarkerTools:
    """Exercise the biomarker MCP tool."""

    @pytest.mark.asyncio
    async def test_biomarker_searcher_tool(self):
        """Searcher returns the formatted text and forwards its default paging arguments."""
        from biomcp.individual_tools import nci_biomarker_searcher

        pd_l1 = {
            "id": "BIO001",
            "name": "PD-L1 Expression",
            "gene": "CD274",
            "type": "expression",
            "assay_type": "IHC",
        }
        egfr = {
            "id": "BIO002",
            "name": "EGFR Mutation",
            "gene": "EGFR",
            "type": "mutation",
            "assay_type": "NGS",
        }
        search_payload = {"total": 2, "biomarkers": [pd_l1, egfr]}
        formatted = "## Biomarker Search Results (2 found)\n\nFound 2 biomarkers"

        with (
            patch(
                "biomcp.biomarkers.search_biomarkers",
                return_value=search_payload,
            ) as mock_search,
            patch(
                "biomcp.biomarkers.search.format_biomarker_results",
                return_value=formatted,
            ),
        ):
            result = await nci_biomarker_searcher(
                name="PD-L1", api_key="test-key"
            )

            assert "Found 2 biomarkers" in result
            # Arguments the tool is expected to pass on to the search layer.
            expected_call = {
                "name": "PD-L1",
                "biomarker_type": None,
                "page_size": 20,
                "page": 1,
                "api_key": "test-key",
            }
            mock_search.assert_called_once_with(**expected_call)
class TestNCIDiseaseTools:
    """Exercise the NCI disease MCP tool."""

    @pytest.mark.asyncio
    async def test_nci_disease_searcher_tool(self):
        """Searcher returns the formatted text and forwards its default paging arguments."""
        from biomcp.individual_tools import nci_disease_searcher

        breast_cancer = {
            "id": "C4872",
            "name": "Breast Cancer",
            "synonyms": ["Breast Carcinoma", "Mammary Cancer"],
            "category": "maintype",
        }
        melanoma = {
            "id": "C3790",
            "name": "Melanoma",
            "synonyms": ["Malignant Melanoma"],
            "category": "maintype",
        }
        search_payload = {"total": 2, "diseases": [breast_cancer, melanoma]}
        formatted = "## Disease Search Results (2 found)\n\nFound 2 diseases"

        with (
            patch(
                "biomcp.diseases.search_diseases",
                return_value=search_payload,
            ) as mock_search,
            patch(
                "biomcp.diseases.search.format_disease_results",
                return_value=formatted,
            ),
        ):
            result = await nci_disease_searcher(
                name="cancer", include_synonyms=True, api_key="test-key"
            )

            assert "Found 2 diseases" in result
            # Arguments the tool is expected to pass on to the search layer.
            expected_call = {
                "name": "cancer",
                "include_synonyms": True,
                "category": None,
                "page_size": 20,
                "page": 1,
                "api_key": "test-key",
            }
            mock_search.assert_called_once_with(**expected_call)
class TestToolsIntegration:
    """Check that the tool functions resolve their patched module dependencies."""

    @pytest.mark.asyncio
    async def test_organization_searcher_imports_work(self):
        """Organization searcher returns whatever the patched formatter produces."""
        # Exercises the dynamic imports performed inside the tool function.
        empty_payload = {"total": 0, "organizations": []}
        with (
            patch(
                "biomcp.organizations.search_organizations",
                return_value=empty_payload,
            ),
            patch(
                "biomcp.organizations.search.format_organization_results",
                return_value="No organizations found",
            ),
        ):
            result = await nci_organization_searcher(
                name="Nonexistent", api_key="test-key"
            )

        assert result == "No organizations found"

    @pytest.mark.asyncio
    async def test_intervention_searcher_imports_work(self):
        """Intervention searcher returns whatever the patched formatter produces."""
        # Exercises the dynamic imports performed inside the tool function.
        empty_payload = {"total": 0, "interventions": []}
        with (
            patch(
                "biomcp.interventions.search_interventions",
                return_value=empty_payload,
            ),
            patch(
                "biomcp.interventions.search.format_intervention_results",
                return_value="No interventions found",
            ),
        ):
            result = await nci_intervention_searcher(
                name="Nonexistent", api_key="test-key"
            )

        assert result == "No interventions found"
class TestInputValidation:
    """Exercise the OpenFDA input sanitizers and validators."""

    def test_sanitize_input_removes_injection_chars(self):
        """Markup is stripped while the plain text survives."""
        cleaned = sanitize_input("test<script>alert('xss')</script>")

        assert "<script>" not in cleaned
        assert "alert" in cleaned  # ordinary text is kept
        assert "'" not in cleaned  # quote characters are dropped

    def test_sanitize_input_truncates_long_input(self):
        """Input longer than max_length is cut down to exactly max_length."""
        cleaned = sanitize_input("a" * 1000, max_length=100)
        assert len(cleaned) == 100

    def test_validate_drug_name_rejects_special_chars(self):
        """Drug names keep letters, digits, '-' and '/'; other characters are removed."""
        # (input, expected output) - special characters are stripped rather
        # than causing the whole name to be rejected.
        cases = [
            ("Aspirin", "Aspirin"),
            ("Tylenol-500", "Tylenol-500"),
            ("Drug/Combo", "Drug/Combo"),
            ("Drug<script>", "Drugscript"),
            ("'; DROP TABLE;", "DROP TABLE"),
        ]
        for raw_name, expected in cases:
            assert validate_drug_name(raw_name) == expected

    def test_validate_date_format(self):
        """Only well-formed YYYY-MM-DD dates are accepted."""
        assert validate_date("2024-01-15") == "2024-01-15"

        rejected = [
            "2024-13-01",  # month out of range
            "2024-01-32",  # day out of range
            "24-01-15",  # two-digit year
            "2024/01/15",  # wrong separator
        ]
        for bad_date in rejected:
            assert validate_date(bad_date) is None

    def test_validate_api_key(self):
        """Well-formed keys pass through unchanged; malformed keys yield None."""
        for good_key in (
            "abc123def456",
            "key-with-hyphens",
            "key_with_underscores",
        ):
            assert validate_api_key(good_key) == good_key

        rejected = [
            "key with spaces",
            "key<script>",
            "a" * 101,  # too long
            "short",  # too short
        ]
        for bad_key in rejected:
            assert validate_api_key(bad_key) is None

    def test_build_safe_query(self):
        """Query parameters are sanitized value by value and bad keys are dropped."""
        safe = build_safe_query(
            {
                "drug": "Aspirin<script>",
                "limit": "100; DROP TABLE",
                "api_key": "secret123456",  # long enough to be a valid key
                "date": "2024-01-15",
                "invalid_key!": "value",
            }
        )

        assert safe["drug"] == "Aspirinscript"  # markup characters removed
        assert safe["limit"] == 25  # unparsable limit falls back to the default
        assert safe["api_key"] == "secret123456"  # valid key kept as-is
        assert safe["date"] == "2024-01-15"  # valid date kept as-is
        assert "invalid_key!" not in safe  # malformed parameter name dropped
class TestCacheSecurity:
    """Exercise the security properties of the OpenFDA response cache."""

    def test_api_key_not_in_cache_key(self):
        """Credential-like parameters do not take part in cache-key generation."""
        endpoint = "https://api.fda.gov/drug/event.json"
        cache_key = _generate_cache_key(
            endpoint,
            {
                "drug": "aspirin",
                "limit": 10,
                "api_key": "super_secret_key_123",
                "apikey": "another_secret",
                "token": "bearer_token",
            },
        )

        # A SHA-256 hex digest is always 64 characters long.
        assert len(cache_key) == 64

        # Rebuild the digest from the non-sensitive parameters only; it must
        # equal the generated key.
        non_sensitive = json.dumps({"drug": "aspirin", "limit": 10}, sort_keys=True)
        hashed_input = f"{endpoint}:{non_sensitive}"
        assert cache_key == hashlib.sha256(hashed_input.encode()).hexdigest()

    def test_cache_response_size_limit(self):
        """An oversized response is not stored in the cache."""
        from biomcp.openfda.cache import (
            clear_cache,
            get_cached_response,
            set_cached_response,
        )

        endpoint = "https://api.fda.gov/test"
        clear_cache()

        # sys.getsizeof does not follow nested structures, so the payload is
        # made far larger than any plausible limit.
        oversized = {"data": ["x" * 100000 for _ in range(1000)]}
        set_cached_response(endpoint, {"drug": "test"}, oversized)

        assert get_cached_response(endpoint, {"drug": "test"}) is None
class TestRateLimiting:
    """Test rate limiting and circuit breaker."""

    @pytest.mark.asyncio
    async def test_rate_limiter_blocks_excessive_requests(self):
        """Test that rate limiter blocks when limit exceeded."""
        limiter = RateLimiter(rate=2, per=1.0)  # 2 requests per second

        # get_running_loop() is the documented way to reach the loop from
        # inside a coroutine; loop.time() reads the loop's monotonic clock.
        loop = asyncio.get_running_loop()
        start = loop.time()

        # First two are expected to be immediate
        await limiter.acquire()
        await limiter.acquire()

        # Third is expected to wait for a token
        await limiter.acquire()

        elapsed = loop.time() - start

        # Waiting for one token at 2/s takes about 0.5 seconds; 0.4 leaves
        # some margin for timer jitter.
        assert elapsed >= 0.4

    @pytest.mark.asyncio
    async def test_circuit_breaker_opens_on_failures(self):
        """Test that circuit breaker opens after threshold failures."""
        breaker = CircuitBreaker(failure_threshold=3, recovery_timeout=1)

        async def failing_func():
            raise Exception("API Error")

        # The first 3 failures pass through and count towards the threshold
        for _ in range(3):
            with pytest.raises(Exception, match="API Error"):
                await breaker.call(failing_func)

        # Circuit should now be open
        assert breaker.is_open
        assert breaker.state == CircuitState.OPEN

        # Next call should be rejected by the circuit breaker itself
        with pytest.raises(Exception) as exc_info:
            await breaker.call(failing_func)
        # Inspect the exception only after the block has exited: a statement
        # placed after the raising call inside the block would never run.
        assert "Circuit breaker is OPEN" in str(exc_info.value)

    @pytest.mark.asyncio
    async def test_circuit_breaker_recovers(self):
        """Test that circuit breaker recovers after timeout."""
        breaker = CircuitBreaker(failure_threshold=2, recovery_timeout=0.1)

        call_count = 0

        async def intermittent_func():
            # Fails on the first two invocations, succeeds afterwards.
            nonlocal call_count
            call_count += 1
            if call_count <= 2:
                raise Exception("API Error")
            return "Success"

        # Trigger circuit to open
        for _ in range(2):
            with pytest.raises(Exception, match="API Error"):
                await breaker.call(intermittent_func)

        assert breaker.is_open

        # Wait slightly longer than recovery_timeout
        await asyncio.sleep(0.15)

        # Should enter half-open and succeed
        result = await breaker.call(intermittent_func)
        assert result == "Success"

        # Circuit should be closed again
        assert breaker.is_closed
class TestSecurityIntegration:
    """Integration tests for security features."""

    @pytest.mark.asyncio
    async def test_sql_injection_prevention(self):
        """Test that SQL injection attempts are sanitized."""
        from biomcp.openfda.utils import make_openfda_request

        with patch("biomcp.openfda.utils.request_api") as mock_request:
            mock_request.return_value = ({"results": []}, None)

            # Attempt SQL injection through the utils layer so the
            # sanitization applied at request level is exercised.
            _, error = await make_openfda_request(
                "https://api.fda.gov/drug/event.json",
                {"search": "drug:'; DROP TABLE users; --", "limit": 10},
            )

            # Request should succeed (no error)
            assert error is None

            # Check that input was sanitized before reaching the API.
            # NOTE(review): the guard makes these checks a no-op when the
            # HTTP layer was never reached - confirm that is intended.
            call_args = mock_request.call_args
            if call_args:
                params = call_args[1]["request"]  # keyword argument "request"
                search_term = str(params.get("search", ""))
                assert "';" not in search_term
                assert "--" not in search_term

    @pytest.mark.asyncio
    async def test_xss_prevention(self):
        """Test that XSS attempts are sanitized."""
        from biomcp.openfda.drug_labels import search_drug_labels

        with patch(
            "biomcp.openfda.drug_labels.make_openfda_request"
        ) as mock_request:
            mock_request.return_value = ({"results": []}, None)

            # Attempt XSS (use correct parameter name)
            await search_drug_labels(
                name="<script>alert('xss')</script>", limit=10
            )

            # Check that the dangerous input was sanitized.
            # NOTE(review): vacuous when no request was issued - see above.
            call_args = mock_request.call_args
            if call_args:
                params = call_args[0][1]  # second positional argument
                # Script tags should be removed
                assert "<script>" not in str(params)

    @pytest.mark.asyncio
    async def test_command_injection_prevention(self):
        """Test that a device search containing shell metacharacters still issues a request."""
        from biomcp.openfda.device_events import search_device_events

        with patch(
            "biomcp.openfda.device_events.make_openfda_request"
        ) as mock_request:
            mock_request.return_value = ({"results": []}, None)

            # Attempt command injection
            await search_device_events(device="pump; rm -rf /", limit=10)

            # The search string itself is deliberately not inspected:
            # semicolons may appear in it for other reasons, and the shell
            # command text is harmless because the FDA API does not execute
            # commands. Input validation is covered at the utils level, so
            # this only verifies that the call was made.
            assert mock_request.call_args is not None

    def test_api_key_not_logged(self):
        """Test that API keys are not logged."""
        import logging

        from biomcp.openfda.utils import get_api_key

        # Capture debug-level log calls made by the utils module
        with patch.object(
            logging.getLogger("biomcp.openfda.utils"), "debug"
        ) as mock_debug:
            # Call function that might log
            key = get_api_key()

            # Check logs don't contain actual key
            for call in mock_debug.call_args_list:
                log_message = str(call)
                # Should not contain actual API key values
                assert "secret" not in log_message.lower()
                if key:
                    assert key not in log_message

    @pytest.mark.asyncio
    async def test_rate_limit_applied_to_requests(self):
        """Test that concurrent requests complete without tripping the circuit breaker."""
        from biomcp.openfda.utils import make_openfda_request

        with patch("biomcp.openfda.utils.request_api") as mock_api:
            mock_api.return_value = ({"results": []}, None)

            # Fire three requests concurrently; they are expected to pass
            # through the rate limiter.
            pending = [
                make_openfda_request(
                    "https://api.fda.gov/test", {"drug": f"test{i}"}
                )
                for i in range(3)
            ]
            results = await asyncio.gather(*pending)

            # None of them may have been rejected by the circuit breaker
            for _result, error in results:
                assert error is None or "circuit breaker" not in error.lower()
class TestFileOperationSecurity:
    """Test file operation security."""

    def test_cache_file_permissions(self):
        """Test that files created in the cache directory are not world-writable."""
        import stat

        from biomcp.openfda.drug_shortages import CACHE_DIR

        # Ensure directory exists
        CACHE_DIR.mkdir(parents=True, exist_ok=True)

        # Create a probe file
        test_file = CACHE_DIR / "test_permissions.json"
        test_file.write_text("{}")

        try:
            mode = test_file.stat().st_mode

            # Others must not have write permission
            assert not (mode & stat.S_IWOTH)
        finally:
            # Remove the probe file even when the assertion fails, so a
            # failing run does not leave it behind in the cache directory.
            test_file.unlink()

    @pytest.mark.asyncio
    async def test_atomic_file_operations(self):
        """Test that concurrent cache reads each yield None or a dict."""
        from biomcp.openfda.drug_shortages import _get_cached_shortage_data

        # The function is expected to use atomic operations internally
        with patch(
            "biomcp.openfda.drug_shortages._fetch_shortage_data"
        ) as mock_fetch:
            mock_fetch.return_value = {
                "test": "data",
                "_fetched_at": "2024-01-01T00:00:00",
            }

            # Issue five concurrent lookups; exceptions are collected rather
            # than raised so every result can be inspected.
            pending = [_get_cached_shortage_data() for _ in range(5)]
            results = await asyncio.gather(*pending, return_exceptions=True)

            # Every non-failing lookup must produce None or a dict
            for result in results:
                if not isinstance(result, Exception):
                    assert result is None or isinstance(result, dict)
class CBioPortalVariantData(BaseModel):
    """Summary of cBioPortal annotations for one variant."""

    # Scalar statistics default to None/0; collections default to empty
    # containers created per instance.
    total_cases: int | None = Field(
        default=None,
        description="Total number of cases with this variant",
    )
    studies: list[str] = Field(
        description="List of studies containing this variant",
        default_factory=list,
    )
    cancer_type_distribution: dict[str, int] = Field(
        description="Distribution of mutation across cancer types",
        default_factory=dict,
    )
    mutation_types: dict[str, int] = Field(
        description="Distribution of mutation types (missense, nonsense, etc)",
        default_factory=dict,
    )
    hotspot_count: int = Field(
        default=0,
        description="Number of samples where this is a known hotspot",
    )
    mean_vaf: float | None = Field(
        default=None,
        description="Mean variant allele frequency across samples",
    )
    sample_types: dict[str, int] = Field(
        description="Distribution across sample types (primary, metastatic)",
        default_factory=dict,
    )
43 | ) 44 | 45 | 46 | class CBioPortalExternalClient: 47 | """Refactored cBioPortal client using centralized HTTP.""" 48 | 49 | def __init__(self) -> None: 50 | self.http_adapter = CBioHTTPAdapter() 51 | self._study_cache: dict[str, dict[str, Any]] = {} 52 | 53 | async def get_variant_data( 54 | self, gene_aa: str 55 | ) -> CBioPortalVariantData | None: 56 | """Fetch variant data from cBioPortal. 57 | 58 | Args: 59 | gene_aa: Gene and AA change format (e.g., "BRAF V600E") 60 | """ 61 | logger.info( 62 | f"CBioPortalExternalClient.get_variant_data called with: {gene_aa}" 63 | ) 64 | try: 65 | # Split gene and AA change 66 | parts = gene_aa.split(" ", 1) 67 | if len(parts) != 2: 68 | logger.warning(f"Invalid gene_aa format: {gene_aa}") 69 | return None 70 | 71 | gene, aa_change = parts 72 | logger.info(f"Extracted gene={gene}, aa_change={aa_change}") 73 | 74 | # Get gene ID 75 | gene_id = await self._get_gene_id(gene) 76 | if not gene_id: 77 | return None 78 | 79 | # Get relevant mutation profiles 80 | mutation_profiles = await self._get_mutation_profiles(gene) 81 | if not mutation_profiles: 82 | logger.info(f"No relevant mutation profiles found for {gene}") 83 | return CBioPortalVariantData() 84 | 85 | # Fetch mutations 86 | mutations_data = await self._fetch_mutations( 87 | gene_id, mutation_profiles 88 | ) 89 | if not mutations_data: 90 | return CBioPortalVariantData() 91 | 92 | # Filter mutations by AA change 93 | matching_mutations = self._filter_mutations_by_aa_change( 94 | mutations_data, aa_change 95 | ) 96 | if not matching_mutations: 97 | return None 98 | 99 | # Aggregate mutation data 100 | return await self._aggregate_mutation_data(matching_mutations) 101 | 102 | except Exception as e: 103 | logger.error( 104 | f"Error getting cBioPortal data for {gene_aa}: {type(e).__name__}: {e}" 105 | ) 106 | return None 107 | 108 | async def _get_gene_id(self, gene: str) -> int | None: 109 | """Get Entrez gene ID from gene symbol. 
110 | 111 | Args: 112 | gene: Gene symbol (e.g., "BRAF") 113 | 114 | Returns: 115 | Entrez gene ID if found, None otherwise 116 | """ 117 | gene_data, gene_error = await self.http_adapter.get( 118 | f"/genes/{gene}", 119 | endpoint_key="cbioportal_genes", 120 | cache_ttl=3600, # 1 hour 121 | ) 122 | 123 | if gene_error or not gene_data: 124 | logger.warning(f"Failed to fetch gene info for {gene}") 125 | return None 126 | 127 | gene_id = gene_data.get("entrezGeneId") 128 | if not gene_id: 129 | logger.warning(f"No entrezGeneId in gene response: {gene_data}") 130 | return None 131 | 132 | logger.info(f"Got entrezGeneId: {gene_id}") 133 | return gene_id 134 | 135 | async def _get_mutation_profiles(self, gene: str) -> list[dict[str, Any]]: 136 | """Get relevant mutation profiles for a gene. 137 | 138 | Args: 139 | gene: Gene symbol to find profiles for 140 | 141 | Returns: 142 | List of mutation profile dictionaries filtered by cancer relevance 143 | """ 144 | profiles, prof_error = await self.http_adapter.get( 145 | "/molecular-profiles", 146 | endpoint_key="cbioportal_molecular_profiles", 147 | cache_ttl=3600, # 1 hour 148 | ) 149 | 150 | if prof_error or not profiles: 151 | logger.warning("Failed to fetch molecular profiles") 152 | return [] 153 | 154 | # Get cancer keywords from configuration 155 | cancer_keywords = get_cancer_keywords(gene) 156 | 157 | # Collect mutation profiles to query 158 | mutation_profiles: list[dict[str, Any]] = [] 159 | if not isinstance(profiles, list): 160 | return [] 161 | 162 | for p in profiles: 163 | if ( 164 | isinstance(p, dict) 165 | and p.get("molecularAlterationType") == "MUTATION_EXTENDED" 166 | ): 167 | study_id = p.get("studyId", "").lower() 168 | if any(keyword in study_id for keyword in cancer_keywords): 169 | mutation_profiles.append(p) 170 | if len(mutation_profiles) >= MAX_STUDIES_PER_GENE: 171 | break 172 | 173 | logger.info( 174 | f"Found {len(mutation_profiles)} relevant mutation profiles" 175 | ) 176 | return 
mutation_profiles 177 | 178 | async def _fetch_mutations( 179 | self, gene_id: int, mutation_profiles: list[dict[str, Any]] 180 | ) -> list[dict[str, Any]]: 181 | """Fetch mutations for a gene from mutation profiles. 182 | 183 | Args: 184 | gene_id: Entrez gene ID 185 | mutation_profiles: List of molecular profile dictionaries 186 | 187 | Returns: 188 | List of mutation records from cBioPortal 189 | """ 190 | profile_ids = [p["molecularProfileId"] for p in mutation_profiles] 191 | logger.info(f"Querying {len(profile_ids)} profiles for mutations") 192 | 193 | mutations_data, mut_error = await self.http_adapter.post( 194 | "/mutations/fetch", 195 | data={ 196 | "entrezGeneIds": [gene_id], 197 | "molecularProfileIds": profile_ids, 198 | }, 199 | endpoint_key="cbioportal_mutations", 200 | cache_ttl=1800, # 30 minutes 201 | ) 202 | 203 | if mut_error or not mutations_data: 204 | logger.warning(f"Failed to fetch mutations: {mut_error}") 205 | return [] 206 | 207 | if not isinstance(mutations_data, list): 208 | return [] 209 | 210 | return mutations_data 211 | 212 | def _filter_mutations_by_aa_change( 213 | self, mutations_data: list[dict[str, Any]], aa_change: str 214 | ) -> list[dict[str, Any]]: 215 | """Filter mutations by amino acid change. 
216 | 217 | Args: 218 | mutations_data: List of mutation records from cBioPortal 219 | aa_change: Amino acid change notation (e.g., "V600E") 220 | 221 | Returns: 222 | Filtered list containing only mutations matching the AA change 223 | """ 224 | matching_mutations = [] 225 | aa_patterns = self._get_aa_patterns(aa_change) 226 | 227 | for mut in mutations_data: 228 | protein_change = mut.get("proteinChange", "") 229 | if any(pattern.match(protein_change) for pattern in aa_patterns): 230 | matching_mutations.append(mut) 231 | 232 | logger.info(f"Found {len(matching_mutations)} matching mutations") 233 | return matching_mutations 234 | 235 | async def _aggregate_mutation_data( 236 | self, matching_mutations: list[dict[str, Any]] 237 | ) -> CBioPortalVariantData: 238 | """Aggregate mutation data into summary statistics. 239 | 240 | Args: 241 | matching_mutations: List of mutations matching the query criteria 242 | 243 | Returns: 244 | Aggregated variant data with statistics across all samples 245 | """ 246 | # Get unique study IDs 247 | study_ids = list({ 248 | mut.get("studyId", "") 249 | for mut in matching_mutations 250 | if mut.get("studyId") 251 | }) 252 | 253 | # Fetch study metadata in parallel 254 | study_cancer_types = await self._fetch_study_metadata_parallel( 255 | study_ids 256 | ) 257 | 258 | # Aggregate data 259 | sample_ids: set[str] = set() 260 | cancer_type_dist: dict[str, int] = {} 261 | mutation_type_dist: dict[str, int] = {} 262 | vaf_values: list[float] = [] 263 | sample_type_dist: dict[str, int] = {} 264 | 265 | for mut in matching_mutations: 266 | # Count samples 267 | sample_id = mut.get("sampleId") 268 | if sample_id: 269 | sample_ids.add(sample_id) 270 | 271 | # Count cancer types 272 | study_id = mut.get("studyId", "") 273 | if study_id in study_cancer_types: 274 | cancer_type = study_cancer_types[study_id] 275 | cancer_type_dist[cancer_type] = ( 276 | cancer_type_dist.get(cancer_type, 0) + 1 277 | ) 278 | 279 | # Count mutation types 280 | 
mut_type = mut.get("mutationType", "Unknown") 281 | mutation_type_dist[mut_type] = ( 282 | mutation_type_dist.get(mut_type, 0) + 1 283 | ) 284 | 285 | # Calculate VAF if data available 286 | tumor_alt = mut.get("tumorAltCount") 287 | tumor_ref = mut.get("tumorRefCount") 288 | if ( 289 | tumor_alt is not None 290 | and tumor_ref is not None 291 | and (tumor_alt + tumor_ref) > 0 292 | ): 293 | vaf = tumor_alt / (tumor_alt + tumor_ref) 294 | vaf_values.append(vaf) 295 | 296 | # Count sample types 297 | sample_type = mut.get("sampleType", "Unknown") 298 | sample_type_dist[sample_type] = ( 299 | sample_type_dist.get(sample_type, 0) + 1 300 | ) 301 | 302 | # Calculate mean VAF 303 | mean_vaf = None 304 | if vaf_values: 305 | mean_vaf = round(sum(vaf_values) / len(vaf_values), 3) 306 | 307 | # Check for hotspots (simplified - just check if it's a common mutation) 308 | hotspot_count = ( 309 | len(matching_mutations) if len(matching_mutations) > 10 else 0 310 | ) 311 | 312 | return CBioPortalVariantData( 313 | total_cases=len(sample_ids), 314 | studies=sorted(study_ids)[:10], # Top 10 studies 315 | cancer_type_distribution=cancer_type_dist, 316 | mutation_types=mutation_type_dist, 317 | hotspot_count=hotspot_count, 318 | mean_vaf=mean_vaf, 319 | sample_types=sample_type_dist, 320 | ) 321 | 322 | def _get_aa_patterns(self, aa_change: str) -> list[re.Pattern]: 323 | """Get regex patterns to match amino acid changes. 324 | 325 | Handles various notation formats: 326 | - Direct match (e.g., "V600E") 327 | - With p. prefix (e.g., "p.V600E") 328 | - Position wildcards (e.g., "V600*" matches V600E, V600K, etc.) 329 | 330 | Args: 331 | aa_change: Amino acid change notation 332 | 333 | Returns: 334 | List of compiled regex patterns for flexible matching 335 | """ 336 | patterns = [] 337 | 338 | # Direct match 339 | patterns.append(re.compile(re.escape(aa_change))) 340 | 341 | # Handle p. 
prefix 342 | if not aa_change.startswith("p."): 343 | patterns.append(re.compile(f"p\\.{re.escape(aa_change)}")) 344 | else: 345 | # Also try without p. 346 | patterns.append(re.compile(re.escape(aa_change[2:]))) 347 | 348 | # Handle special cases like V600E/V600K 349 | base_match = re.match(r"([A-Z])(\d+)([A-Z])", aa_change) 350 | if base_match: 351 | ref_aa, position, _ = base_match.groups() 352 | # Match any mutation at this position 353 | patterns.append(re.compile(f"p?\\.?{ref_aa}{position}[A-Z]")) 354 | 355 | return patterns 356 | 357 | async def _fetch_study_metadata_parallel( 358 | self, study_ids: list[str] 359 | ) -> dict[str, str]: 360 | """Fetch study metadata in parallel for cancer type information. 361 | 362 | Args: 363 | study_ids: List of study IDs to fetch 364 | 365 | Returns: 366 | Dict mapping study ID to cancer type name 367 | """ 368 | # Check cache first 369 | study_cancer_types = {} 370 | uncached_ids = [] 371 | 372 | for study_id in study_ids: 373 | if study_id in self._study_cache: 374 | study_data = self._study_cache[study_id] 375 | cancer_type = study_data.get("cancerType", {}) 376 | study_cancer_types[study_id] = cancer_type.get( 377 | "name", "Unknown" 378 | ) 379 | else: 380 | uncached_ids.append(study_id) 381 | 382 | if uncached_ids: 383 | # Fetch uncached studies in parallel 384 | tasks = [] 385 | for study_id in uncached_ids[:10]: # Limit parallel requests 386 | tasks.append(self._fetch_single_study(study_id)) 387 | 388 | results = await asyncio.gather(*tasks, return_exceptions=True) 389 | 390 | for study_id, result in zip( 391 | uncached_ids[:10], results, strict=False 392 | ): 393 | if isinstance(result, Exception): 394 | logger.debug( 395 | f"Failed to fetch study {study_id}: {type(result).__name__}" 396 | ) 397 | study_cancer_types[study_id] = "Unknown" 398 | elif isinstance(result, dict): 399 | # Cache the study data 400 | self._study_cache[study_id] = result 401 | cancer_type = result.get("cancerType", {}) 402 | 
study_cancer_types[study_id] = cancer_type.get( 403 | "name", "Unknown" 404 | ) 405 | else: 406 | study_cancer_types[study_id] = "Unknown" 407 | 408 | return study_cancer_types 409 | 410 | async def _fetch_single_study( 411 | self, study_id: str 412 | ) -> dict[str, Any] | None: 413 | """Fetch metadata for a single study.""" 414 | study_data, error = await self.http_adapter.get( 415 | f"/studies/{study_id}", 416 | endpoint_key="cbioportal_studies", 417 | cache_ttl=3600, # 1 hour 418 | ) 419 | 420 | if error or not study_data: 421 | logger.debug(f"Failed to fetch study {study_id}: {error}") 422 | return None 423 | 424 | return study_data 425 | ``` -------------------------------------------------------------------------------- /tests/data/myvariant/myvariant_api.yaml: -------------------------------------------------------------------------------- ```yaml 1 | openapi: 3.0.3 2 | info: 3 | contact: 4 | email: [email protected] 5 | name: Chunlei Wu 6 | x-id: https://github.com/newgene 7 | x-role: responsible developer 8 | description: 9 | Documentation of the MyVariant.info genetic variant query web services. 10 | Learn more about [MyVariant.info](https://docs.myvariant.info/en/latest/index.html) 11 | termsOfService: https://myvariant.info/terms/ 12 | title: MyVariant.info API 13 | version: "1.0" 14 | x-translator: 15 | biolink-version: 4.2.2 16 | component: KP 17 | infores: infores:myvariant-info 18 | team: 19 | - Service Provider 20 | servers: 21 | - description: Encrypted Production server 22 | url: https://myvariant.info/v1 23 | x-maturity: production 24 | tags: 25 | - name: variant 26 | - name: query 27 | - name: metadata 28 | - name: translator 29 | - name: biothings 30 | paths: 31 | /metadata: 32 | get: 33 | description: Get metadata about the data available from the API 34 | responses: 35 | "200": 36 | description: 37 | A 200 status code indicates a successful query, and is accompanied 38 | by the query response payload. 
39 | tags: 40 | - metadata 41 | /metadata/fields: 42 | get: 43 | description: Get metadata about the data fields available from the API 44 | responses: 45 | "200": 46 | description: 47 | A 200 status code indicates a successful query, and is accompanied 48 | by the query response payload. 49 | tags: 50 | - metadata 51 | /query: 52 | get: 53 | description: 54 | MyChem.info chemical query web service. In the output, "total" 55 | in the output gives the total number of matching hits, while the actual hits 56 | are returned under "hits" field. 57 | parameters: 58 | - description: 59 | Required, passing user query. The detailed query syntax for parameter 60 | is explained [here](https://docs.myvariant.info/en/latest/doc/variant_query_service.html#query-syntax). 61 | example: rs58991260 62 | in: query 63 | name: q 64 | required: true 65 | schema: 66 | type: string 67 | - $ref: "#/components/parameters/fields" 68 | - $ref: "#/components/parameters/size" 69 | - $ref: "#/components/parameters/from" 70 | - $ref: "#/components/parameters/fetch_all" 71 | - $ref: "#/components/parameters/scroll_id" 72 | - $ref: "#/components/parameters/sort" 73 | - $ref: "#/components/parameters/facets" 74 | - $ref: "#/components/parameters/facet_size" 75 | - $ref: "#/components/parameters/callback" 76 | - $ref: "#/components/parameters/dotfield" 77 | - $ref: "#/components/parameters/email" 78 | responses: 79 | "200": 80 | description: 81 | A 200 status code indicates a successful query, and is accompanied 82 | by the query response payload. 83 | tags: 84 | - query 85 | post: 86 | description: 87 | 'Although making simple GET requests above to our variant query 88 | service is sufficient for most use cases, there are times you might find 89 | it more efficient to make batch queries (e.g., retrieving variant annotation for 90 | multiple variants). Fortunately, you can also make batch queries via POST 91 | requests when you need to. 
92 | 93 | 94 | The "query" field in the returned object indicates the matching query term. 95 | If a query term has no match, it will return with a "notfound" field with 96 | the value "true".' 97 | parameters: 98 | - description: 99 | "Accepts multiple values separated by commas. Note that currently 100 | we only take the input values up to 1000 maximum, the rest will be omitted. 101 | 102 | 103 | The request body can also be used to provide these ids." 104 | in: query 105 | name: q 106 | required: false 107 | schema: 108 | items: 109 | type: string 110 | type: array 111 | - description: 112 | 'Optional, specify one or more fields (separated by commas) to 113 | search, e.g., "scopes=dbsnp.rsid". The available "fields" can be passed 114 | to "scopes" parameter are listed [here](https://docs.myvariant.info/en/latest/doc/data.html#available-fields). 115 | Default: _id 116 | 117 | 118 | The request body can also be used to provide this information.' 119 | in: query 120 | name: scopes 121 | required: false 122 | schema: 123 | type: string 124 | - $ref: "#/components/parameters/fields" 125 | - $ref: "#/components/parameters/email" 126 | - $ref: "#/components/parameters/size" 127 | - $ref: "#/components/parameters/from" 128 | - $ref: "#/components/parameters/fetch_all" 129 | - $ref: "#/components/parameters/scroll_id" 130 | requestBody: 131 | content: 132 | application/json: 133 | example: 134 | q: 135 | - rs58991260 136 | - rs928128624 137 | scopes: 138 | - dbsnp.rsid 139 | schema: 140 | properties: 141 | q: 142 | description: 143 | Accepts multiple values separated by commas. Note that 144 | currently we only take the input values up to 1000 maximum, the 145 | rest will be omitted. 146 | items: 147 | type: string 148 | type: array 149 | scopes: 150 | description: 151 | 'Specify one or more fields (separated by commas) to 152 | search, e.g., "scopes=dbsnp.rsid". 
The available "fields" can 153 | be passed to "scopes" parameter are listed [here](https://docs.myvariant.info/en/latest/doc/data.html#available-fields). 154 | Default: _id' 155 | items: 156 | type: string 157 | type: array 158 | type: object 159 | responses: 160 | "200": 161 | description: 162 | A 200 status code indicates a successful query, and is accompanied 163 | by the query response payload. 164 | tags: 165 | - query 166 | /variant: 167 | post: 168 | description: 169 | Although making simple GET requests above to our variant query 170 | service is sufficient in most use cases, there are some times you might find 171 | it easier to batch query (e.g., retrieving variant annotations for multiple 172 | variants). Fortunately, you can also make batch queries via POST requests 173 | when you need to. 174 | parameters: 175 | - description: 176 | 'Required. Accepts multiple HGVS variant ids separated by comma, e.g., 177 | "ids=chr6:g.152708291G>A,chr7:g.55241707G>T,chr16:g.28883241A>G". Note 178 | that currently we only take the input ids up to 1000 maximum, the rest will 179 | be omitted. 180 | 181 | 182 | The request body can also be used to provide these ids.' 183 | in: query 184 | name: ids 185 | required: false 186 | schema: 187 | type: string 188 | - $ref: "#/components/parameters/fields" 189 | - $ref: "#/components/parameters/email" 190 | - $ref: "#/components/parameters/size" 191 | requestBody: 192 | content: 193 | application/json: 194 | example: 195 | ids: 196 | - chr6:g.152708291G>A 197 | - chr7:g.55241707G>T 198 | schema: 199 | properties: 200 | ids: 201 | description: 202 | Accepts multiple variant ids. Note that currently we 203 | only take the input ids up to 1000 maximum, the rest will be 204 | omitted. 205 | items: 206 | type: string 207 | type: array 208 | type: object 209 | responses: 210 | "200": 211 | description: 212 | A 200 status code indicates a successful query, and is accompanied 213 | by the query response payload. 
214 | tags: 215 | - variant 216 | /variant/{id}: 217 | get: 218 | description: 219 | 'By default, this will return the complete variant annotation object 220 | in JSON format. See [here](https://docs.myvariant.info/en/latest/doc/variant_annotation_service.html#returned-object) for 221 | an example and [here](https://docs.myvariant.info/en/latest/doc/data.html#variant-object) 222 | for more details. If the input variant ID is not valid, 404 (NOT FOUND) will 223 | be returned. 224 | 225 | 226 | Optionally, you can pass a "fields" parameter to return only the annotation 227 | you want (by filtering returned object fields). "fields" accepts any attributes 228 | (a.k.a fields) available from the object. Multiple attributes should be separated 229 | by commas. If an attribute is not available for a specific variant object, 230 | it will be ignored. Note that the attribute names are case-sensitive. 231 | 232 | 233 | Just like the variant query service, you can also pass a "callback" parameter 234 | to make a JSONP call.' 235 | parameters: 236 | - description: 237 | Retrieve chemical data based on ID - currently the HGVS-based 238 | id using genomic location based on hg19 human genome assembly 239 | example: chr6:g.152708291G>A 240 | in: path 241 | name: id 242 | required: true 243 | schema: 244 | type: string 245 | - $ref: "#/components/parameters/fields" 246 | - $ref: "#/components/parameters/callback" 247 | - $ref: "#/components/parameters/email" 248 | - $ref: "#/components/parameters/size" 249 | responses: 250 | "200": 251 | description: 252 | A 200 status code indicates a successful query, and is accompanied 253 | by the query response payload. 254 | tags: 255 | - variant 256 | components: 257 | parameters: 258 | assembly: 259 | in: query 260 | name: assembly 261 | required: false 262 | schema: 263 | default: hg19 264 | type: string 265 | callback: 266 | description: Optional, you can pass a "callback" parameter to make a JSONP call. 
267 | in: query 268 | name: callback 269 | required: false 270 | schema: 271 | type: string 272 | dotfield: 273 | description: 274 | 'Optional, can be used to control the format of the returned object. If 275 | "dotfield" is true, the returned data object is returned flattened (no nested 276 | objects) using dotfield notation for key names. Default: false.' 277 | in: query 278 | name: dotfield 279 | required: false 280 | schema: 281 | default: false 282 | type: boolean 283 | email: 284 | description: 285 | Optional, if you are regular users of our services, we encourage 286 | you to provide us an email, so that we can better track the usage or follow 287 | up with you. 288 | in: query 289 | name: email 290 | required: false 291 | schema: 292 | type: string 293 | facet_size: 294 | description: 295 | Optional, an integer (1 <= facet_size <= 1000) that specifies how 296 | many buckets to return in a [faceted query](https://docs.mychem.info/en/latest/doc/chem_query_service.html?highlight=from#faceted-queries). 297 | in: query 298 | name: facet_size 299 | required: false 300 | schema: 301 | default: 10 302 | type: integer 303 | facets: 304 | description: 305 | Optional, a single field or comma-separated fields to return facets, 306 | can only be used on non-free text fields. E.g. "facets=chembl.molecule_properties.full_mwt". 307 | See [examples of faceted queries here](https://docs.mychem.info/en/latest/doc/chem_query_service.html?highlight=from#faceted-queries). 308 | in: query 309 | name: facets 310 | required: false 311 | schema: 312 | items: 313 | type: string 314 | type: array 315 | fetch_all: 316 | description: 317 | "Optional, a boolean, which when TRUE, allows fast retrieval of 318 | all unsorted query hits. The return object contains a _scroll_id field, which 319 | when passed as a parameter to the query endpoint (see the scroll_id parameter), 320 | returns the next 1000 query results. 
Setting fetch_all = TRUE causes the 321 | results to be inherently unsorted, therefore the sort parameter is ignored. 322 | For more information, see [examples using fetch_all here](https://docs.mychem.info/en/latest/doc/chem_query_service.html?highlight=from#scrolling-queries). Default: 323 | FALSE." 324 | in: query 325 | name: fetch_all 326 | required: false 327 | schema: 328 | default: false 329 | type: boolean 330 | fields: 331 | description: 332 | "Optional, can be a comma-separated list to limit the fields returned\ 333 | \ from the object. If \"fields=all\", all available fields will be returned.\ 334 | \ Look [here](https://docs.mychem.info/en/latest/doc/data.html#available-fields)\ 335 | \ for a list of available fields. \n\nNote that it supports dot notation as\ 336 | \ well, e.g., you can pass \"chebi.name\". Default: \"fields=all\". The\ 337 | \ parameter \"filter\" is an alias for this parameter." 338 | in: query 339 | name: fields 340 | required: false 341 | schema: 342 | default: all 343 | type: string 344 | from: 345 | description: 346 | "Optional, the number of matching hits to skip, starting from 0. 347 | Default: 0. " 348 | in: query 349 | name: from 350 | required: false 351 | schema: 352 | default: 0 353 | type: integer 354 | scroll_id: 355 | description: 356 | Optional, a string containing the _scroll_id returned from a query 357 | request with fetch_all = TRUE. Supplying a valid scroll_id will return the 358 | next 1000 unordered results. If the next results are not obtained within 359 | 1 minute of the previous set of results, the scroll_id becomes stale, and 360 | a new one must be obtained with another query request with fetch_all = TRUE. 361 | All other parameters are ignored when the scroll_id parameter is supplied. 362 | For more information see [examples using scroll_id here](https://docs.mychem.info/en/latest/doc/chem_query_service.html?highlight=from#scrolling-queries). 
363 | in: query 364 | name: scroll_id 365 | required: false 366 | schema: 367 | type: string 368 | size: 369 | description: 370 | 'Optional, the maximum number of matching hits to return (with 371 | a cap of 1000 at the moment). Default: 10. The combination of "size" and "from" 372 | parameters can be used to get paging for a large query.' 373 | in: query 374 | name: size 375 | required: false 376 | schema: 377 | default: 10 378 | type: integer 379 | sort: 380 | description: 381 | 'Optional, the comma-separated fields to sort on. Prefix with "-" 382 | for descending order, otherwise in ascending order. Default: sort by matching 383 | scores in descending order.' 384 | in: query 385 | name: sort 386 | required: false 387 | schema: 388 | items: 389 | type: string 390 | type: array 391 | ``` -------------------------------------------------------------------------------- /src/biomcp/variants/cbioportal_search.py: -------------------------------------------------------------------------------- ```python 1 | """cBioPortal search enhancements for variant queries.""" 2 | 3 | import asyncio 4 | import logging 5 | from typing import Any 6 | 7 | from pydantic import BaseModel, Field 8 | 9 | from ..utils.cbio_http_adapter import CBioHTTPAdapter 10 | from ..utils.gene_validator import is_valid_gene_symbol, sanitize_gene_symbol 11 | from ..utils.request_cache import request_cache 12 | from .cancer_types import get_cancer_keywords 13 | 14 | logger = logging.getLogger(__name__) 15 | 16 | # Cache for frequently accessed data 17 | _cancer_type_cache: dict[str, dict[str, Any]] = {} 18 | _gene_panel_cache: dict[str, list[str]] = {} 19 | 20 | 21 | class GeneHotspot(BaseModel): 22 | """Hotspot mutation information.""" 23 | 24 | position: int 25 | amino_acid_change: str 26 | count: int 27 | frequency: float 28 | cancer_types: list[str] = Field(default_factory=list) 29 | 30 | 31 | class CBioPortalSearchSummary(BaseModel): 32 | """Summary data from cBioPortal for a gene search.""" 33 | 34 
| gene: str 35 | total_mutations: int = 0 36 | total_samples_tested: int = 0 37 | mutation_frequency: float = 0.0 38 | hotspots: list[GeneHotspot] = Field(default_factory=list) 39 | cancer_distribution: dict[str, int] = Field(default_factory=dict) 40 | study_coverage: dict[str, Any] = Field(default_factory=dict) 41 | top_studies: list[str] = Field(default_factory=list) 42 | 43 | 44 | class CBioPortalSearchClient: 45 | """Client for cBioPortal search operations.""" 46 | 47 | def __init__(self): 48 | self.http_adapter = CBioHTTPAdapter() 49 | 50 | @request_cache(ttl=900) # Cache for 15 minutes 51 | async def get_gene_search_summary( 52 | self, gene: str, max_studies: int = 10 53 | ) -> CBioPortalSearchSummary | None: 54 | """Get summary statistics for a gene across cBioPortal. 55 | 56 | Args: 57 | gene: Gene symbol (e.g., "BRAF") 58 | max_studies: Maximum number of studies to query 59 | 60 | Returns: 61 | Summary statistics or None if gene not found 62 | """ 63 | # Validate and sanitize gene symbol 64 | if not is_valid_gene_symbol(gene): 65 | logger.warning(f"Invalid gene symbol: {gene}") 66 | return None 67 | 68 | gene = sanitize_gene_symbol(gene) 69 | 70 | try: 71 | # Get gene info first 72 | gene_data, error = await self.http_adapter.get( 73 | f"/genes/{gene}", endpoint_key="cbioportal_genes" 74 | ) 75 | if error or not gene_data: 76 | logger.warning(f"Gene {gene} not found in cBioPortal") 77 | return None 78 | 79 | gene_id = gene_data.get("entrezGeneId") 80 | 81 | if not gene_id: 82 | return None 83 | 84 | # Get cancer type keywords for this gene 85 | cancer_keywords = get_cancer_keywords(gene) 86 | 87 | # Get relevant molecular profiles in parallel with cancer types 88 | profiles_task = self._get_relevant_profiles(gene, cancer_keywords) 89 | cancer_types_task = self._get_cancer_types() 90 | 91 | profiles, cancer_types = await asyncio.gather( 92 | profiles_task, cancer_types_task 93 | ) 94 | 95 | if not profiles: 96 | logger.info(f"No relevant profiles found for 
{gene}") 97 | return None 98 | 99 | # Query mutations from top studies 100 | selected_profiles = profiles[:max_studies] 101 | mutation_summary = await self._get_mutation_summary( 102 | gene_id, selected_profiles, cancer_types 103 | ) 104 | 105 | # Build summary 106 | summary = CBioPortalSearchSummary( 107 | gene=gene, 108 | total_mutations=mutation_summary.get("total_mutations", 0), 109 | total_samples_tested=mutation_summary.get("total_samples", 0), 110 | mutation_frequency=mutation_summary.get("frequency", 0.0), 111 | hotspots=mutation_summary.get("hotspots", []), 112 | cancer_distribution=mutation_summary.get( 113 | "cancer_distribution", {} 114 | ), 115 | study_coverage={ 116 | "total_studies": len(profiles), 117 | "queried_studies": len(selected_profiles), 118 | "studies_with_data": mutation_summary.get( 119 | "studies_with_data", 0 120 | ), 121 | }, 122 | top_studies=[ 123 | p.get("studyId", "") 124 | for p in selected_profiles 125 | if p.get("studyId") 126 | ][:5], 127 | ) 128 | 129 | return summary 130 | 131 | except TimeoutError: 132 | logger.error( 133 | f"cBioPortal API timeout for gene {gene}. " 134 | "The API may be slow or unavailable. Try again later." 135 | ) 136 | return None 137 | except ConnectionError as e: 138 | logger.error( 139 | f"Network error accessing cBioPortal for gene {gene}: {e}. " 140 | "Check your internet connection." 141 | ) 142 | return None 143 | except Exception as e: 144 | logger.error( 145 | f"Unexpected error getting cBioPortal summary for {gene}: " 146 | f"{type(e).__name__}: {e}. " 147 | "This may be a temporary issue. If it persists, please report it." 
148 | ) 149 | return None 150 | 151 | async def _get_cancer_types(self) -> dict[str, dict[str, Any]]: 152 | """Get cancer type hierarchy (cached).""" 153 | if _cancer_type_cache: 154 | return _cancer_type_cache 155 | 156 | try: 157 | cancer_types, error = await self.http_adapter.get( 158 | "/cancer-types", 159 | endpoint_key="cbioportal_cancer_types", 160 | cache_ttl=86400, # Cache for 24 hours 161 | ) 162 | if not error and cancer_types: 163 | # Build lookup by ID 164 | for ct in cancer_types: 165 | ct_id = ct.get("cancerTypeId") 166 | if ct_id: 167 | _cancer_type_cache[ct_id] = ct 168 | return _cancer_type_cache 169 | except Exception as e: 170 | logger.warning(f"Failed to get cancer types: {e}") 171 | 172 | return {} 173 | 174 | async def _get_relevant_profiles( 175 | self, 176 | gene: str, 177 | cancer_keywords: list[str], 178 | ) -> list[dict[str, Any]]: 179 | """Get molecular profiles relevant to the gene.""" 180 | try: 181 | # Get all mutation profiles 182 | all_profiles, error = await self.http_adapter.get( 183 | "/molecular-profiles", 184 | params={"molecularAlterationType": "MUTATION_EXTENDED"}, 185 | endpoint_key="cbioportal_molecular_profiles", 186 | cache_ttl=3600, # Cache for 1 hour 187 | ) 188 | 189 | if error or not all_profiles: 190 | return [] 191 | 192 | # Filter by cancer keywords 193 | relevant_profiles = [] 194 | for profile in all_profiles: 195 | study_id = profile.get("studyId", "").lower() 196 | if any(keyword in study_id for keyword in cancer_keywords): 197 | relevant_profiles.append(profile) 198 | 199 | # Sort by sample count (larger studies first) 200 | # Note: We'd need to fetch study details for actual sample counts 201 | # For now, prioritize known large studies 202 | priority_studies = [ 203 | "msk_impact", 204 | "tcga", 205 | "genie", 206 | "metabric", 207 | "broad", 208 | ] 209 | 210 | def study_priority(profile): 211 | study_id = profile.get("studyId", "").lower() 212 | for i, priority in enumerate(priority_studies): 213 | if 
priority in study_id: 214 | return i 215 | return len(priority_studies) 216 | 217 | relevant_profiles.sort(key=study_priority) 218 | 219 | return relevant_profiles 220 | 221 | except Exception as e: 222 | logger.warning(f"Failed to get profiles: {e}") 223 | return [] 224 | 225 | async def _get_mutation_summary( 226 | self, 227 | gene_id: int, 228 | profiles: list[dict[str, Any]], 229 | cancer_types: dict[str, dict[str, Any]], 230 | ) -> dict[str, Any]: 231 | """Get mutation summary across selected profiles.""" 232 | # Batch mutations queries for better performance 233 | BATCH_SIZE = ( 234 | 5 # Process 5 profiles at a time to avoid overwhelming the API 235 | ) 236 | 237 | mutation_results = [] 238 | study_ids = [] 239 | 240 | for i in range(0, len(profiles), BATCH_SIZE): 241 | batch = profiles[i : i + BATCH_SIZE] 242 | batch_tasks = [] 243 | batch_study_ids = [] 244 | 245 | for profile in batch: 246 | profile_id = profile.get("molecularProfileId") 247 | study_id = profile.get("studyId") 248 | if profile_id and study_id: 249 | task = self._get_profile_mutations( 250 | gene_id, profile_id, study_id 251 | ) 252 | batch_tasks.append(task) 253 | batch_study_ids.append(study_id) 254 | 255 | if batch_tasks: 256 | # Execute batch in parallel 257 | batch_results = await asyncio.gather( 258 | *batch_tasks, return_exceptions=True 259 | ) 260 | mutation_results.extend(batch_results) 261 | study_ids.extend(batch_study_ids) 262 | 263 | # Small delay between batches to avoid rate limiting 264 | if i + BATCH_SIZE < len(profiles): 265 | await asyncio.sleep(0.05) # 50ms delay 266 | 267 | results = mutation_results 268 | 269 | # Process results using helper function 270 | from .cbioportal_search_helpers import ( 271 | format_hotspots, 272 | process_mutation_results, 273 | ) 274 | 275 | mutation_data = await process_mutation_results( 276 | list(zip(results, study_ids, strict=False)), 277 | cancer_types, 278 | self, 279 | ) 280 | 281 | # Calculate frequency 282 | frequency = ( 283 | 
mutation_data["total_mutations"] / mutation_data["total_samples"] 284 | if mutation_data["total_samples"] > 0 285 | else 0.0 286 | ) 287 | 288 | # Format hotspots 289 | hotspots = format_hotspots( 290 | mutation_data["hotspot_counts"], mutation_data["total_mutations"] 291 | ) 292 | 293 | return { 294 | "total_mutations": mutation_data["total_mutations"], 295 | "total_samples": mutation_data["total_samples"], 296 | "frequency": frequency, 297 | "hotspots": hotspots, 298 | "cancer_distribution": mutation_data["cancer_distribution"], 299 | "studies_with_data": mutation_data["studies_with_data"], 300 | } 301 | 302 | async def _get_profile_mutations( 303 | self, 304 | gene_id: int, 305 | profile_id: str, 306 | study_id: str, 307 | ) -> dict[str, Any] | None: 308 | """Get mutations for a gene in a specific profile.""" 309 | try: 310 | # Get sample count for the study 311 | samples, samples_error = await self.http_adapter.get( 312 | f"/studies/{study_id}/samples", 313 | params={"projection": "SUMMARY"}, 314 | endpoint_key="cbioportal_studies", 315 | cache_ttl=3600, # Cache for 1 hour 316 | ) 317 | 318 | sample_count = len(samples) if samples and not samples_error else 0 319 | 320 | # Get mutations 321 | mutations, mut_error = await self.http_adapter.get( 322 | f"/molecular-profiles/{profile_id}/mutations", 323 | params={ 324 | "sampleListId": f"{study_id}_all", 325 | "geneIdType": "ENTREZ_GENE_ID", 326 | "geneIds": str(gene_id), 327 | "projection": "SUMMARY", 328 | }, 329 | endpoint_key="cbioportal_mutations", 330 | cache_ttl=900, # Cache for 15 minutes 331 | ) 332 | 333 | if not mut_error and mutations: 334 | return {"mutations": mutations, "sample_count": sample_count} 335 | 336 | except Exception as e: 337 | logger.debug( 338 | f"Failed to get mutations for {profile_id}: {type(e).__name__}" 339 | ) 340 | 341 | return None 342 | 343 | async def _get_study_cancer_type( 344 | self, 345 | study_id: str, 346 | cancer_types: dict[str, dict[str, Any]], 347 | ) -> str: 348 | 
"""Get cancer type name for a study.""" 349 | try: 350 | study, error = await self.http_adapter.get( 351 | f"/studies/{study_id}", 352 | endpoint_key="cbioportal_studies", 353 | cache_ttl=3600, # Cache for 1 hour 354 | ) 355 | if not error and study: 356 | cancer_type_id = study.get("cancerTypeId") 357 | if cancer_type_id and cancer_type_id in cancer_types: 358 | return cancer_types[cancer_type_id].get("name", "Unknown") 359 | elif cancer_type := study.get("cancerType"): 360 | return cancer_type.get("name", "Unknown") 361 | except Exception: 362 | logger.debug(f"Failed to get cancer type for study {study_id}") 363 | 364 | # Fallback: infer from study ID 365 | study_lower = study_id.lower() 366 | if "brca" in study_lower or "breast" in study_lower: 367 | return "Breast Cancer" 368 | elif "lung" in study_lower or "nsclc" in study_lower: 369 | return "Lung Cancer" 370 | elif "coad" in study_lower or "colorectal" in study_lower: 371 | return "Colorectal Cancer" 372 | elif "skcm" in study_lower or "melanoma" in study_lower: 373 | return "Melanoma" 374 | elif "prad" in study_lower or "prostate" in study_lower: 375 | return "Prostate Cancer" 376 | 377 | return "Unknown" 378 | 379 | 380 | def format_cbioportal_search_summary( 381 | summary: CBioPortalSearchSummary | None, 382 | ) -> str: 383 | """Format cBioPortal search summary for display.""" 384 | if not summary: 385 | return "" 386 | 387 | lines = [ 388 | f"\n### cBioPortal Summary for {summary.gene}", 389 | f"- **Mutation Frequency**: {summary.mutation_frequency:.1%} ({summary.total_mutations:,} mutations in {summary.total_samples_tested:,} samples)", 390 | f"- **Studies**: {summary.study_coverage.get('studies_with_data', 0)} of {summary.study_coverage.get('queried_studies', 0)} studies have mutations", 391 | ] 392 | 393 | if summary.hotspots: 394 | lines.append("\n**Top Hotspots:**") 395 | for hs in summary.hotspots[:3]: 396 | lines.append( 397 | f"- {hs.amino_acid_change}: {hs.count} cases ({hs.frequency:.1%}) in 
{', '.join(hs.cancer_types[:3])}" 398 | ) 399 | 400 | if summary.cancer_distribution: 401 | lines.append("\n**Cancer Type Distribution:**") 402 | for cancer_type, count in sorted( 403 | summary.cancer_distribution.items(), 404 | key=lambda x: x[1], 405 | reverse=True, 406 | )[:5]: 407 | lines.append(f"- {cancer_type}: {count} mutations") 408 | 409 | return "\n".join(lines) 410 | ```