This is page 5 of 15. Use http://codebase.md/genomoncology/biomcp?page={x} to view the full context. # Directory Structure ``` ├── .github │ ├── actions │ │ └── setup-python-env │ │ └── action.yml │ ├── dependabot.yml │ └── workflows │ ├── ci.yml │ ├── deploy-docs.yml │ ├── main.yml.disabled │ ├── on-release-main.yml │ └── validate-codecov-config.yml ├── .gitignore ├── .pre-commit-config.yaml ├── BIOMCP_DATA_FLOW.md ├── CHANGELOG.md ├── CNAME ├── codecov.yaml ├── docker-compose.yml ├── Dockerfile ├── docs │ ├── apis │ │ ├── error-codes.md │ │ ├── overview.md │ │ └── python-sdk.md │ ├── assets │ │ ├── biomcp-cursor-locations.png │ │ ├── favicon.ico │ │ ├── icon.png │ │ ├── logo.png │ │ ├── mcp_architecture.txt │ │ └── remote-connection │ │ ├── 00_connectors.png │ │ ├── 01_add_custom_connector.png │ │ ├── 02_connector_enabled.png │ │ ├── 03_connect_to_biomcp.png │ │ ├── 04_select_google_oauth.png │ │ └── 05_success_connect.png │ ├── backend-services-reference │ │ ├── 01-overview.md │ │ ├── 02-biothings-suite.md │ │ ├── 03-cbioportal.md │ │ ├── 04-clinicaltrials-gov.md │ │ ├── 05-nci-cts-api.md │ │ ├── 06-pubtator3.md │ │ └── 07-alphagenome.md │ ├── blog │ │ ├── ai-assisted-clinical-trial-search-analysis.md │ │ ├── images │ │ │ ├── deep-researcher-video.png │ │ │ ├── researcher-announce.png │ │ │ ├── researcher-drop-down.png │ │ │ ├── researcher-prompt.png │ │ │ ├── trial-search-assistant.png │ │ │ └── what_is_biomcp_thumbnail.png │ │ └── researcher-persona-resource.md │ ├── changelog.md │ ├── CNAME │ ├── concepts │ │ ├── 01-what-is-biomcp.md │ │ ├── 02-the-deep-researcher-persona.md │ │ └── 03-sequential-thinking-with-the-think-tool.md │ ├── developer-guides │ │ ├── 01-server-deployment.md │ │ ├── 02-contributing-and-testing.md │ │ ├── 03-third-party-endpoints.md │ │ ├── 04-transport-protocol.md │ │ ├── 05-error-handling.md │ │ ├── 06-http-client-and-caching.md │ │ ├── 07-performance-optimizations.md │ │ └── generate_endpoints.py │ ├── faq-condensed.md │ ├── 
FDA_SECURITY.md │ ├── genomoncology.md │ ├── getting-started │ │ ├── 01-quickstart-cli.md │ │ ├── 02-claude-desktop-integration.md │ │ └── 03-authentication-and-api-keys.md │ ├── how-to-guides │ │ ├── 01-find-articles-and-cbioportal-data.md │ │ ├── 02-find-trials-with-nci-and-biothings.md │ │ ├── 03-get-comprehensive-variant-annotations.md │ │ ├── 04-predict-variant-effects-with-alphagenome.md │ │ ├── 05-logging-and-monitoring-with-bigquery.md │ │ └── 06-search-nci-organizations-and-interventions.md │ ├── index.md │ ├── policies.md │ ├── reference │ │ ├── architecture-diagrams.md │ │ ├── quick-architecture.md │ │ ├── quick-reference.md │ │ └── visual-architecture.md │ ├── robots.txt │ ├── stylesheets │ │ ├── announcement.css │ │ └── extra.css │ ├── troubleshooting.md │ ├── tutorials │ │ ├── biothings-prompts.md │ │ ├── claude-code-biomcp-alphagenome.md │ │ ├── nci-prompts.md │ │ ├── openfda-integration.md │ │ ├── openfda-prompts.md │ │ ├── pydantic-ai-integration.md │ │ └── remote-connection.md │ ├── user-guides │ │ ├── 01-command-line-interface.md │ │ ├── 02-mcp-tools-reference.md │ │ └── 03-integrating-with-ides-and-clients.md │ └── workflows │ └── all-workflows.md ├── example_scripts │ ├── mcp_integration.py │ └── python_sdk.py ├── glama.json ├── LICENSE ├── lzyank.toml ├── Makefile ├── mkdocs.yml ├── package-lock.json ├── package.json ├── pyproject.toml ├── README.md ├── scripts │ ├── check_docs_in_mkdocs.py │ ├── check_http_imports.py │ └── generate_endpoints_doc.py ├── smithery.yaml ├── src │ └── biomcp │ ├── __init__.py │ ├── __main__.py │ ├── articles │ │ ├── __init__.py │ │ ├── autocomplete.py │ │ ├── fetch.py │ │ ├── preprints.py │ │ ├── search_optimized.py │ │ ├── search.py │ │ └── unified.py │ ├── biomarkers │ │ ├── __init__.py │ │ └── search.py │ ├── cbioportal_helper.py │ ├── circuit_breaker.py │ ├── cli │ │ ├── __init__.py │ │ ├── articles.py │ │ ├── biomarkers.py │ │ ├── diseases.py │ │ ├── health.py │ │ ├── interventions.py │ │ ├── main.py │ │ ├── 
openfda.py │ │ ├── organizations.py │ │ ├── server.py │ │ ├── trials.py │ │ └── variants.py │ ├── connection_pool.py │ ├── constants.py │ ├── core.py │ ├── diseases │ │ ├── __init__.py │ │ ├── getter.py │ │ └── search.py │ ├── domain_handlers.py │ ├── drugs │ │ ├── __init__.py │ │ └── getter.py │ ├── exceptions.py │ ├── genes │ │ ├── __init__.py │ │ └── getter.py │ ├── http_client_simple.py │ ├── http_client.py │ ├── individual_tools.py │ ├── integrations │ │ ├── __init__.py │ │ ├── biothings_client.py │ │ └── cts_api.py │ ├── interventions │ │ ├── __init__.py │ │ ├── getter.py │ │ └── search.py │ ├── logging_filter.py │ ├── metrics_handler.py │ ├── metrics.py │ ├── openfda │ │ ├── __init__.py │ │ ├── adverse_events_helpers.py │ │ ├── adverse_events.py │ │ ├── cache.py │ │ ├── constants.py │ │ ├── device_events_helpers.py │ │ ├── device_events.py │ │ ├── drug_approvals.py │ │ ├── drug_labels_helpers.py │ │ ├── drug_labels.py │ │ ├── drug_recalls_helpers.py │ │ ├── drug_recalls.py │ │ ├── drug_shortages_detail_helpers.py │ │ ├── drug_shortages_helpers.py │ │ ├── drug_shortages.py │ │ ├── exceptions.py │ │ ├── input_validation.py │ │ ├── rate_limiter.py │ │ ├── utils.py │ │ └── validation.py │ ├── organizations │ │ ├── __init__.py │ │ ├── getter.py │ │ └── search.py │ ├── parameter_parser.py │ ├── prefetch.py │ ├── query_parser.py │ ├── query_router.py │ ├── rate_limiter.py │ ├── render.py │ ├── request_batcher.py │ ├── resources │ │ ├── __init__.py │ │ ├── getter.py │ │ ├── instructions.md │ │ └── researcher.md │ ├── retry.py │ ├── router_handlers.py │ ├── router.py │ ├── shared_context.py │ ├── thinking │ │ ├── __init__.py │ │ ├── sequential.py │ │ └── session.py │ ├── thinking_tool.py │ ├── thinking_tracker.py │ ├── trials │ │ ├── __init__.py │ │ ├── getter.py │ │ ├── nci_getter.py │ │ ├── nci_search.py │ │ └── search.py │ ├── utils │ │ ├── __init__.py │ │ ├── cancer_types_api.py │ │ ├── cbio_http_adapter.py │ │ ├── endpoint_registry.py │ │ ├── gene_validator.py │ 
│ ├── metrics.py │ │ ├── mutation_filter.py │ │ ├── query_utils.py │ │ ├── rate_limiter.py │ │ └── request_cache.py │ ├── variants │ │ ├── __init__.py │ │ ├── alphagenome.py │ │ ├── cancer_types.py │ │ ├── cbio_external_client.py │ │ ├── cbioportal_mutations.py │ │ ├── cbioportal_search_helpers.py │ │ ├── cbioportal_search.py │ │ ├── constants.py │ │ ├── external.py │ │ ├── filters.py │ │ ├── getter.py │ │ ├── links.py │ │ └── search.py │ └── workers │ ├── __init__.py │ ├── worker_entry_stytch.js │ ├── worker_entry.js │ └── worker.py ├── tests │ ├── bdd │ │ ├── cli_help │ │ │ ├── help.feature │ │ │ └── test_help.py │ │ ├── conftest.py │ │ ├── features │ │ │ └── alphagenome_integration.feature │ │ ├── fetch_articles │ │ │ ├── fetch.feature │ │ │ └── test_fetch.py │ │ ├── get_trials │ │ │ ├── get.feature │ │ │ └── test_get.py │ │ ├── get_variants │ │ │ ├── get.feature │ │ │ └── test_get.py │ │ ├── search_articles │ │ │ ├── autocomplete.feature │ │ │ ├── search.feature │ │ │ ├── test_autocomplete.py │ │ │ └── test_search.py │ │ ├── search_trials │ │ │ ├── search.feature │ │ │ └── test_search.py │ │ ├── search_variants │ │ │ ├── search.feature │ │ │ └── test_search.py │ │ └── steps │ │ └── test_alphagenome_steps.py │ ├── config │ │ └── test_smithery_config.py │ ├── conftest.py │ ├── data │ │ ├── ct_gov │ │ │ ├── clinical_trials_api_v2.yaml │ │ │ ├── trials_NCT04280705.json │ │ │ └── trials_NCT04280705.txt │ │ ├── myvariant │ │ │ ├── myvariant_api.yaml │ │ │ ├── myvariant_field_descriptions.csv │ │ │ ├── variants_full_braf_v600e.json │ │ │ ├── variants_full_braf_v600e.txt │ │ │ └── variants_part_braf_v600_multiple.json │ │ ├── openfda │ │ │ ├── drugsfda_detail.json │ │ │ ├── drugsfda_search.json │ │ │ ├── enforcement_detail.json │ │ │ └── enforcement_search.json │ │ └── pubtator │ │ ├── pubtator_autocomplete.json │ │ └── pubtator3_paper.txt │ ├── integration │ │ ├── test_openfda_integration.py │ │ ├── test_preprints_integration.py │ │ ├── test_simple.py │ │ └── 
test_variants_integration.py │ ├── tdd │ │ ├── articles │ │ │ ├── test_autocomplete.py │ │ │ ├── test_cbioportal_integration.py │ │ │ ├── test_fetch.py │ │ │ ├── test_preprints.py │ │ │ ├── test_search.py │ │ │ └── test_unified.py │ │ ├── conftest.py │ │ ├── drugs │ │ │ ├── __init__.py │ │ │ └── test_drug_getter.py │ │ ├── openfda │ │ │ ├── __init__.py │ │ │ ├── test_adverse_events.py │ │ │ ├── test_device_events.py │ │ │ ├── test_drug_approvals.py │ │ │ ├── test_drug_labels.py │ │ │ ├── test_drug_recalls.py │ │ │ ├── test_drug_shortages.py │ │ │ └── test_security.py │ │ ├── test_biothings_integration_real.py │ │ ├── test_biothings_integration.py │ │ ├── test_circuit_breaker.py │ │ ├── test_concurrent_requests.py │ │ ├── test_connection_pool.py │ │ ├── test_domain_handlers.py │ │ ├── test_drug_approvals.py │ │ ├── test_drug_recalls.py │ │ ├── test_drug_shortages.py │ │ ├── test_endpoint_documentation.py │ │ ├── test_error_scenarios.py │ │ ├── test_europe_pmc_fetch.py │ │ ├── test_mcp_integration.py │ │ ├── test_mcp_tools.py │ │ ├── test_metrics.py │ │ ├── test_nci_integration.py │ │ ├── test_nci_mcp_tools.py │ │ ├── test_network_policies.py │ │ ├── test_offline_mode.py │ │ ├── test_openfda_unified.py │ │ ├── test_pten_r173_search.py │ │ ├── test_render.py │ │ ├── test_request_batcher.py.disabled │ │ ├── test_retry.py │ │ ├── test_router.py │ │ ├── test_shared_context.py.disabled │ │ ├── test_unified_biothings.py │ │ ├── thinking │ │ │ ├── __init__.py │ │ │ └── test_sequential.py │ │ ├── trials │ │ │ ├── test_backward_compatibility.py │ │ │ ├── test_getter.py │ │ │ └── test_search.py │ │ ├── utils │ │ │ ├── test_gene_validator.py │ │ │ ├── test_mutation_filter.py │ │ │ ├── test_rate_limiter.py │ │ │ └── test_request_cache.py │ │ ├── variants │ │ │ ├── constants.py │ │ │ ├── test_alphagenome_api_key.py │ │ │ ├── test_alphagenome_comprehensive.py │ │ │ ├── test_alphagenome.py │ │ │ ├── test_cbioportal_mutations.py │ │ │ ├── test_cbioportal_search.py │ │ │ ├── 
@pytest.mark.asyncio
async def test_search_drug_recalls_success(self):
    """A mocked enforcement search is rendered as a recall report."""
    canned_reply = (MOCK_RECALLS_SEARCH, None)
    with patch(
        "biomcp.openfda.drug_recalls.make_openfda_request",
        new_callable=AsyncMock,
        return_value=canned_reply,
    ) as fake_request:
        report = await search_drug_recalls(drug="valsartan", limit=10)

    # Either heading variant is accepted for the report title.
    assert "Drug Recall" in report or "FDA Drug Recall" in report
    # The searched drug must be echoed somewhere in the output.
    assert "valsartan" in report.lower()
    # Check for presence of key recall info
    assert "Recall" in report or "recall" in report.lower()
    # The search must hit the (mocked) OpenFDA request helper exactly once.
    fake_request.assert_called_once()
@pytest.mark.asyncio
async def test_search_drug_recalls_no_results(self):
    """An empty ``results`` list yields the 'no records found' message."""
    empty_reply = ({"results": []}, None)
    with patch(
        "biomcp.openfda.drug_recalls.make_openfda_request",
        new_callable=AsyncMock,
        return_value=empty_reply,
    ):
        report = await search_drug_recalls(drug="nonexistent-drug")

    assert "No drug recall records found" in report
@pytest.mark.asyncio
async def test_get_drug_recall_with_api_key(self):
    """The caller-supplied API key is forwarded to the request helper."""
    with patch(
        "biomcp.openfda.drug_recalls.make_openfda_request",
        new_callable=AsyncMock,
        return_value=(MOCK_RECALL_DETAIL, None),
    ) as fake_request:
        report = await get_drug_recall("D-0001-2023", api_key="test-api-key")

    assert "Drug Recall" in report or "D-0001-2023" in report

    # Verify API key was passed as the 4th positional argument
    positional_args = fake_request.call_args[0]
    assert positional_args[3] == "test-api-key"
@pytest.mark.asyncio
async def test_date_filtering(self):
    """A compact ``since_date`` is expanded into a dated range query."""
    with patch(
        "biomcp.openfda.drug_recalls.make_openfda_request",
        new_callable=AsyncMock,
        return_value=(MOCK_RECALLS_SEARCH, None),
    ) as fake_request:
        await search_drug_recalls(since_date="20230615")

    # params is the 2nd positional argument of the request helper
    search_expr = fake_request.call_args[0][1]["search"]

    # Check that date was properly formatted in query
    assert "recall_initiation_date" in search_expr
    assert "[2023-06-15 TO *]" in search_expr
def validate_search_response(response: dict[str, Any]) -> bool:
    """
    Check that an FDA search response has the expected shape.

    A search response must carry a ``results`` key holding a list. When a
    ``meta`` key is present it is additionally checked by
    ``validate_meta_field``.

    Args:
        response: FDA search response

    Returns:
        True if valid

    Raises:
        OpenFDAValidationError: If validation fails
    """
    has_results = "results" in response
    if not has_results:
        raise OpenFDAValidationError("Search response missing 'results' field")

    results = response["results"]
    if not isinstance(results, list):
        raise OpenFDAValidationError(
            f"Expected 'results' to be a list, got {type(results).__name__}"
        )

    # ``meta`` is optional, but if it is there it has to be well-formed.
    if "meta" in response:
        validate_meta_field(response["meta"])

    return True
def validate_meta_field(meta: dict[str, Any]) -> bool:
    """
    Check the ``meta`` section of an FDA response.

    ``meta`` must be a dict. If it contains a ``results`` entry, that entry
    must be a dict too, and any of ``skip`` / ``limit`` / ``total`` found in
    it must be an ``int`` or ``float``.

    Args:
        meta: Meta field from FDA response

    Returns:
        True if valid

    Raises:
        OpenFDAValidationError: If validation fails
    """
    if not isinstance(meta, dict):
        raise OpenFDAValidationError(
            f"Expected 'meta' to be a dict, got {type(meta).__name__}"
        )

    # Nothing further to check when there is no results metadata.
    if "results" not in meta:
        return True

    pagination = meta["results"]
    if not isinstance(pagination, dict):
        raise OpenFDAValidationError(
            f"Expected 'meta.results' to be a dict, got {type(pagination).__name__}"
        )

    # Validate pagination fields if present
    for name in ("skip", "limit", "total"):
        if name not in pagination:
            continue
        entry = pagination[name]
        if isinstance(entry, (int, float)):
            continue
        raise OpenFDAValidationError(
            f"Expected 'meta.results.{name}' to be numeric, "
            f"got {type(entry).__name__}"
        )

    return True
def validate_drug_label(label: dict[str, Any]) -> bool:
    """
    Check a single drug label record.

    Only a non-dict record is rejected. A missing ``openfda`` section, or the
    absence of every standard label section, is logged as a warning and the
    record is still reported as valid.

    Args:
        label: Drug label record

    Returns:
        True if valid

    Raises:
        OpenFDAValidationError: If validation fails
    """
    if not isinstance(label, dict):
        raise OpenFDAValidationError(
            f"Expected drug label to be a dict, got {type(label).__name__}"
        )

    # Labels should have OpenFDA section
    if "openfda" not in label:
        logger.warning("Drug label missing 'openfda' section")

    # Should have at least one section
    standard_sections = (
        "indications_and_usage",
        "contraindications",
        "warnings_and_precautions",
        "adverse_reactions",
        "dosage_and_administration",
    )
    if label.keys().isdisjoint(standard_sections):
        logger.warning("Drug label has no standard sections")

    return True
def sanitize_response(response: dict[str, Any]) -> dict[str, Any]:
    """
    Sanitize FDA response to handle common issues.

    Some record fields are returned either as a bare value or as a list.
    For every dict in ``response["results"]`` those fields are coerced to a
    list: a non-list truthy value is wrapped in a one-element list, and a
    non-list falsy value (``None``, ``""``, ...) becomes an empty list.

    The response is modified in place and the same object is returned; a
    falsy ``response`` yields a new empty dict.

    Args:
        response: Raw FDA response

    Returns:
        Sanitized response
    """
    if not response:
        return {}

    # Fields that can be string or list. Built once per call instead of once
    # per result record, since the set never changes inside the loop.
    polymorphic_fields = (
        "source_type",
        "remedial_action",
        "medical_specialty_description",
        "manufacturer_name",
        "brand_name",
        "generic_name",
    )

    if "results" in response and isinstance(response["results"], list):
        for result in response["results"]:
            # Non-dict entries are left untouched.
            if not isinstance(result, dict):
                continue
            for field in polymorphic_fields:
                if field not in result:
                    continue
                value = result[field]
                # Ensure consistent list format
                if not isinstance(value, list):
                    result[field] = [value] if value else []

    return response
def sanitize_input(
    value: str | None, max_length: int = MAX_GENERAL_QUERY_LENGTH
) -> str | None:
    """
    Clean a user-supplied string before it is used in a query.

    The value is stringified and stripped, cut to ``max_length`` characters,
    purged of every match of ``INJECTION_CHARS``, and finally has its runs of
    whitespace collapsed to single spaces. Truncation and character removal
    are each logged as a warning.

    Args:
        value: Input string to sanitize
        max_length: Maximum allowed length

    Returns:
        Sanitized string or None if input is invalid
    """
    if not value:
        return None

    # Convert to string and strip whitespace
    text = str(value).strip()

    # Check length
    original_length = len(text)
    if original_length > max_length:
        logger.warning(
            f"Input truncated from {original_length} to {max_length} characters"
        )
        text = text[:max_length]

    # Remove potential injection characters
    stripped = INJECTION_CHARS.sub("", text)
    if stripped != text:
        logger.warning("Removed potentially dangerous characters from input")

    # Normalize whitespace; an empty result is reported as None.
    collapsed = " ".join(stripped.split())
    return collapsed or None
Args: date_str: Date string in YYYY-MM-DD format Returns: Validated date string or None """ if not date_str: return None sanitized = sanitize_input(date_str, MAX_DATE_LENGTH) if not sanitized: return None # Check date format if not DATE_PATTERN.match(sanitized): logger.warning(f"Invalid date format: {sanitized}") return None # Basic date validation try: year, month, day = map(int, sanitized.split("-")) if not (1900 <= year <= 2100 and 1 <= month <= 12 and 1 <= day <= 31): logger.warning(f"Date out of valid range: {sanitized}") return None except (ValueError, IndexError): logger.warning(f"Cannot parse date: {sanitized}") return None return sanitized def validate_limit(limit: int | None, max_limit: int = 100) -> int: """ Validate and constrain limit parameter. Args: limit: Requested limit max_limit: Maximum allowed limit Returns: Valid limit value """ if limit is None: return 25 # Default try: limit = int(limit) except (ValueError, TypeError): logger.warning(f"Invalid limit value: {limit}") return 25 if limit < 1: return 1 elif limit > max_limit: logger.warning(f"Limit {limit} exceeds maximum {max_limit}") return max_limit return limit def validate_skip(skip: int | None, max_skip: int = 10000) -> int: """ Validate and constrain skip/offset parameter. Args: skip: Requested skip/offset max_skip: Maximum allowed skip Returns: Valid skip value """ if skip is None: return 0 try: skip = int(skip) except (ValueError, TypeError): logger.warning(f"Invalid skip value: {skip}") return 0 if skip < 0: return 0 elif skip > max_skip: logger.warning(f"Skip {skip} exceeds maximum {max_skip}") return max_skip return skip def validate_classification(classification: str | None) -> str | None: """ Validate recall classification. 
Args: classification: Classification string (Class I, II, or III) Returns: Validated classification or None """ if not classification: return None sanitized = sanitize_input(classification, 20) if not sanitized: return None # Normalize classification format sanitized = sanitized.upper() # Check valid classifications valid_classes = [ "CLASS I", "CLASS II", "CLASS III", "I", "II", "III", "1", "2", "3", ] if sanitized not in valid_classes: logger.warning(f"Invalid classification: {sanitized}") return None # Normalize to standard format if sanitized in ["I", "1"]: return "Class I" elif sanitized in ["II", "2"]: return "Class II" elif sanitized in ["III", "3"]: return "Class III" return sanitized.title() # "CLASS I" -> "Class I" def validate_status(status: str | None) -> str | None: """ Validate status parameter. Args: status: Status string Returns: Validated status or None """ if not status: return None sanitized = sanitize_input(status, 50) if not sanitized: return None # Normalize status sanitized = sanitized.lower() # Check valid statuses valid_statuses = [ "ongoing", "terminated", "completed", "current", "resolved", ] if sanitized not in valid_statuses: logger.warning(f"Invalid status: {sanitized}") return None return sanitized.title() # "ongoing" -> "Ongoing" def validate_boolean(value: Any) -> bool | None: """ Validate boolean parameter. Args: value: Boolean-like value Returns: Boolean value or None """ if value is None: return None if isinstance(value, bool): return value if isinstance(value, str): value = value.lower().strip() if value in ["true", "1", "yes", "y"]: return True elif value in ["false", "0", "no", "n"]: return False return None def validate_api_key(api_key: str | None) -> str | None: """ Validate API key format. 
def build_safe_query(params: dict[str, Any]) -> dict[str, Any]:
    """
    Build a safe query dictionary with validated parameters.

    Entries whose value is ``None`` are dropped. Keys must be strings that
    look like identifiers (a letter or underscore followed by letters, digits
    or underscores); any other key is skipped with a warning. Each remaining
    value is passed through ``_validate_parameter`` and kept only when the
    validated result is not ``None``.

    Args:
        params: Raw parameters dictionary

    Returns:
        Dictionary with validated parameters
    """
    safe_params: dict[str, Any] = {}

    for key, value in params.items():
        if value is None:
            continue

        # Validate key name. ``fullmatch`` is used rather than ``match`` with
        # a ``$`` anchor because ``$`` also matches just before a trailing
        # newline, which would let a key such as "drug\n" through. The
        # isinstance guard keeps a non-string key from raising TypeError.
        if not isinstance(key, str) or not re.fullmatch(
            r"[a-zA-Z_][a-zA-Z0-9_]*", key
        ):
            logger.warning(f"Skipping invalid parameter key: {key}")
            continue

        # Validate parameter value
        validated = _validate_parameter(key, value)
        if validated is not None:
            safe_params[key] = validated

    return safe_params
""" from unittest.mock import patch import pytest from biomcp.openfda.device_events import get_device_event, search_device_events @pytest.mark.asyncio async def test_search_device_events_by_device(): """Test searching device events by device name.""" mock_response = { "meta": {"results": {"total": 3}}, "results": [ { "event_type": "M", "date_received": "2024-01-15", "device": [ { "brand_name": "FoundationOne CDx", "manufacturer_d_name": "Foundation Medicine", "model_number": "F1CDX", "device_problem_text": ["False negative result"], "openfda": { "device_class": "2", "medical_specialty_description": ["Pathology"], "product_code": "PQP", }, } ], "event_description": "Device failed to detect known mutation", "mdr_report_key": "MDR123456", } ], } with patch( "biomcp.openfda.device_events.make_openfda_request" ) as mock_request: mock_request.return_value = (mock_response, None) result = await search_device_events(device="FoundationOne", limit=10) # Verify request mock_request.assert_called_once() call_args = mock_request.call_args assert "FoundationOne" in call_args[0][1]["search"] # When searching for a specific device, genomic filter is not needed # The device search itself is sufficient # Check output assert "FDA Device Adverse Event Reports" in result assert "FoundationOne CDx" in result assert "Foundation Medicine" in result assert "False negative result" in result assert "Malfunction" in result assert "MDR123456" in result @pytest.mark.asyncio async def test_search_device_events_genomics_filter(): """Test that genomics filter is applied by default.""" mock_response = {"meta": {"results": {"total": 5}}, "results": []} with patch( "biomcp.openfda.device_events.make_openfda_request" ) as mock_request: mock_request.return_value = (mock_response, None) await search_device_events(manufacturer="Illumina", genomics_only=True) # Verify genomic device codes are in search call_args = mock_request.call_args search_query = call_args[0][1]["search"] # Should contain at least 
@pytest.mark.asyncio
async def test_search_device_events_no_genomics_filter():
    """Test searching without genomics filter."""
    mock_response = {"meta": {"results": {"total": 10}}, "results": []}

    with patch(
        "biomcp.openfda.device_events.make_openfda_request"
    ) as mock_request:
        mock_request.return_value = (mock_response, None)

        await search_device_events(device="pacemaker", genomics_only=False)

        # The search string is the "search" entry of the second positional
        # argument passed to make_openfda_request.
        call_args = mock_request.call_args
        search_query = call_args[0][1]["search"]

        # Should not contain genomic product codes. Same code list as
        # test_search_device_events_genomics_filter, so the positive and
        # negative checks cover identical codes.
        assert not any(
            code in search_query for code in ["OOI", "PQP", "OYD", "NYE"]
        )


@pytest.mark.asyncio
async def test_search_device_events_by_problem():
    """Test searching device events by problem description."""
    mock_response = {
        "meta": {"results": {"total": 8}},
        "results": [
            {
                "event_type": "IN",
                "device": [
                    {
                        "brand_name": "Test Device",
                        "device_problem_text": [
                            "Software malfunction",
                            "Data loss",
                        ],
                    }
                ],
                "mdr_report_key": "MDR789",
            }
        ],
    }

    with patch(
        "biomcp.openfda.device_events.make_openfda_request"
    ) as mock_request:
        mock_request.return_value = (mock_response, None)

        result = await search_device_events(problem="software malfunction")

        # Verify request (case-insensitive: only the presence of the
        # problem text in the search string is checked)
        call_args = mock_request.call_args
        assert "software malfunction" in call_args[0][1]["search"].lower()

        # Check output
        assert "Software malfunction" in result
        assert "Data loss" in result
        assert "Injury" in result  # IN = Injury


@pytest.mark.asyncio
async def test_search_device_events_no_params():
    """Test that searching without parameters returns helpful message."""
    # No patching here: the test only inspects the returned guidance text.
    result = await search_device_events()

    assert "Please specify" in result
    assert "device name, manufacturer, or problem" in result
    assert "Examples:" in result


@pytest.mark.asyncio
async def test_get_device_event_detail():
    """Test getting detailed device event report."""
    mock_response = {
        "results": [
            {
                "mdr_report_key": "MDR999888",
                "event_type": "D",
                "date_received": "2024-02-01",
                "date_of_event": "2024-01-20",
                "source_type": "M",
                "device": [
                    {
                        "brand_name": "Genomic Sequencer X",
                        "manufacturer_d_name": "GenTech Corp",
                        "model_number": "GSX-2000",
                        "catalog_number": "CAT123",
                        "lot_number": "LOT456",
                        "expiration_date_of_device": "2025-12-31",
                        "device_problem_text": [
                            "Critical failure",
                            "Sample contamination",
                        ],
                        "device_evaluated_by_manufacturer": "Y",
                        "openfda": {
                            "device_class": "3",
                            "medical_specialty_description": [
                                "Clinical Chemistry"
                            ],
                            "product_code": "OOI",
                        },
                    }
                ],
                "event_description": "Device failure led to incorrect cancer diagnosis",
                "manufacturer_narrative": "Investigation revealed component failure",
                "patient": [
                    {
                        "patient_age": "65",
                        "patient_sex": "F",
                        "date_of_death": "2024-01-25",
                        "life_threatening": "Y",
                    }
                ],
                "remedial_action": "Device recall initiated",
            }
        ]
    }

    with patch(
        "biomcp.openfda.device_events.make_openfda_request"
    ) as mock_request:
        mock_request.return_value = (mock_response, None)

        result = await get_device_event("MDR999888")

        # Verify request
        mock_request.assert_called_once()
        call_args = mock_request.call_args
        assert "MDR999888" in call_args[0][1]["search"]

        # Check detailed output: raw codes in the fixture ("D", "3", "F",
        # "65", "Y") are expected to appear as readable text.
        assert "MDR999888" in result
        assert "Death" in result
        assert "Genomic Sequencer X" in result
        assert "GenTech Corp" in result
        assert "GSX-2000" in result
        assert "Critical failure" in result
        assert "Sample contamination" in result
        assert "Class III" in result
        assert "65 years" in result
        assert "Female" in result
        assert "2024-01-25" in result
        assert "Life-threatening" in result
        assert "Device recall initiated" in result
        assert "Investigation revealed component failure" in result
assert "NOTFOUND789" in result assert "not found" in result @pytest.mark.asyncio async def test_search_device_events_error(): """Test error handling in device event search.""" with patch( "biomcp.openfda.device_events.make_openfda_request" ) as mock_request: mock_request.return_value = (None, "Network timeout") result = await search_device_events(device="test") assert "Error searching device events" in result assert "Network timeout" in result ``` -------------------------------------------------------------------------------- /docs/reference/quick-reference.md: -------------------------------------------------------------------------------- ```markdown # BioMCP Quick Reference ## Command Cheat Sheet ### Installation ```bash # Install BioMCP uv tool install biomcp # Update to latest version uv tool install biomcp --force # Check version biomcp --version ``` ### Article Search Commands ```bash # Basic gene search biomcp article search --gene BRAF # Multiple filters biomcp article search \ --gene EGFR --disease "lung cancer" \ --chemical erlotinib # Exclude preprints biomcp article search --gene TP53 --no-preprints # OR logic in keywords biomcp article search --gene PTEN \ --keyword "R173|Arg173|p.R173" # Get specific article biomcp article get 38768446 # PMID biomcp article get "10.1101/2024.01.20.23288905" # DOI ``` ### Trial Search Commands ```bash # Basic disease search biomcp trial search \ --condition melanoma --status RECRUITING # Location-based search (requires coordinates) biomcp trial search --condition cancer \ --latitude 40.7128 --longitude -74.0060 --distance 50 # Phase-specific search biomcp trial search \ --condition "breast cancer" --phase PHASE3 # Using NCI source (requires API key) biomcp trial search --condition melanoma --source nci \ --required-mutations "BRAF V600E" --api-key $NCI_API_KEY ``` ### Variant Commands ```bash # Search by gene biomcp variant search \ --gene BRCA1 --significance pathogenic # Search by HGVS biomcp variant search --hgvs 
"NM_007294.4:c.5266dupC" # Search by frequency biomcp variant search --gene TP53 \ --max-frequency 0.01 --min-cadd 20 # Get variant details biomcp variant get rs121913529 biomcp variant get "NM_007294.4:c.5266dupC" # Predict effects (requires AlphaGenome key) biomcp variant predict chr7 140753336 A T --tissue UBERON:0002367 ``` ### Gene/Drug/Disease Commands ```bash # Get gene information biomcp gene get TP53 biomcp gene get BRAF # Get drug information biomcp drug get imatinib biomcp drug get pembrolizumab # Get disease information biomcp disease get melanoma biomcp disease get "non-small cell lung cancer" ``` ### NCI Commands (Require API Key) ```bash # Search organizations biomcp organization search --name "MD Anderson" \ --city Houston --state TX --api-key $NCI_API_KEY # Search interventions biomcp intervention search --name pembrolizumab \ --intervention-type Drug --api-key $NCI_API_KEY # Search biomarkers biomcp biomarker search --gene EGFR \ --biomarker-type mutation --api-key $NCI_API_KEY ``` ### Health Check ```bash # Full health check biomcp health check # Check APIs only biomcp health check --apis-only # Verbose output biomcp health check --verbose ``` ## Common Parameter Reference ### Search Parameters | Parameter | Description | Example | | ---------- | ------------- | --------------- | | `--limit` | Max results | `--limit 20` | | `--page` | Page number | `--page 2` | | `--format` | Output format | `--format json` | ### Trial Status Values | Status | Description | | ----------------------- | ---------------------- | | `RECRUITING` | Currently enrolling | | `ACTIVE_NOT_RECRUITING` | Ongoing, not enrolling | | `NOT_YET_RECRUITING` | Will start recruiting | | `COMPLETED` | Trial has ended | | `SUSPENDED` | Temporarily halted | | `TERMINATED` | Stopped early | ### Trial Phase Values | Phase | Description | | -------------- | ------------- | | `EARLY_PHASE1` | Early Phase 1 | | `PHASE1` | Phase 1 | | `PHASE2` | Phase 2 | | `PHASE3` | Phase 3 | | `PHASE4` | 
Phase 4 | ### Clinical Significance | Value | Description | | ------------------------ | ----------------------- | | `pathogenic` | Causes disease | | `likely_pathogenic` | Probably causes disease | | `uncertain_significance` | Unknown impact | | `likely_benign` | Probably harmless | | `benign` | Does not cause disease | ## Gene Symbol Quick Lookup ### Common Gene Aliases | Common Name | Official Symbol | | ----------- | --------------- | | HER2 | ERBB2 | | HER3 | ERBB3 | | EGFR | EGFR | | ALK | ALK | | c-MET | MET | | PD-1 | PDCD1 | | PD-L1 | CD274 | | CTLA-4 | CTLA4 | ## Location Coordinates ### Major US Cities | City | Latitude | Longitude | | ------------- | -------- | --------- | | New York | 40.7128 | -74.0060 | | Los Angeles | 34.0522 | -118.2437 | | Chicago | 41.8781 | -87.6298 | | Houston | 29.7604 | -95.3698 | | Philadelphia | 39.9526 | -75.1652 | | Boston | 42.3601 | -71.0589 | | Atlanta | 33.7490 | -84.3880 | | Miami | 25.7617 | -80.1918 | | Seattle | 47.6062 | -122.3321 | | San Francisco | 37.7749 | -122.4194 | ## Environment Variables ```bash # API Keys export NCI_API_KEY="your-nci-key" export ALPHAGENOME_API_KEY="your-alphagenome-key" export CBIO_TOKEN="your-cbioportal-token" # Configuration export BIOMCP_LOG_LEVEL="DEBUG" export BIOMCP_CACHE_DIR="/path/to/cache" export BIOMCP_TIMEOUT=300 export BIOMCP_MAX_CONCURRENT=5 ``` ## Output Format Examples ### JSON Output ```bash biomcp article search --gene BRAF --format json | jq '.articles[0]' ``` ### Extract Specific Fields ```bash # Get PMIDs only biomcp article search --gene TP53 --format json | \ jq -r '.articles[].pmid' # Get trial NCT IDs biomcp trial search --condition melanoma --format json | \ jq -r '.trials[].nct_id' ``` ### Save to File ```bash biomcp article search --gene BRCA1 --format json > results.json ``` ## MCP Tool Names ### Core Tools - `search` - Unified search - `fetch` - Get details - `think` - Sequential thinking ### Article Tools - `article_searcher` - `article_getter` ### Trial 
Tools - `trial_searcher` - `trial_getter` - `trial_protocol_getter` - `trial_references_getter` - `trial_outcomes_getter` - `trial_locations_getter` ### Variant Tools - `variant_searcher` - `variant_getter` - `alphagenome_predictor` ### BioThings Tools - `gene_getter` - `disease_getter` - `drug_getter` ### NCI Tools - `nci_organization_searcher` - `nci_organization_getter` - `nci_intervention_searcher` - `nci_intervention_getter` - `nci_biomarker_searcher` - `nci_disease_searcher` ## Query Language Syntax ### Unified Search Examples ``` gene:BRAF AND disease:melanoma gene:EGFR AND (mutation OR variant) drugs.tradename:gleevec diseases.name:"lung cancer" chemicals.mesh:D000069439 ``` ### Field Prefixes - `gene:` - Gene symbol - `disease:` - Disease/condition - `chemical:` - Drug/chemical - `variant:` - Genetic variant - `pmid:` - PubMed ID - `doi:` - Digital Object Identifier ## Common Workflows ### Find Articles About a Mutation ```bash # Step 1: Search articles biomcp article search --gene BRAF --keyword "V600E|p.V600E" # Step 2: Get full article biomcp article get [PMID] ``` ### Check Trial Eligibility ```bash # Step 1: Search trials biomcp trial search --condition melanoma --status RECRUITING # Step 2: Get trial details biomcp trial get NCT03006926 ``` ### Variant Analysis ```bash # Step 1: Search variant biomcp variant search --gene BRCA1 --significance pathogenic # Step 2: Get variant details biomcp variant get rs80357906 # Step 3: Search related articles biomcp article search --gene BRCA1 --variant rs80357906 ``` ## Error Code Quick Reference ### Common HTTP Codes - `400` - Bad request (check parameters) - `401` - Unauthorized (check API key) - `404` - Not found (verify ID) - `429` - Rate limited (wait and retry) - `500` - Server error (retry later) ### BioMCP Error Patterns - `1xxx` - Article errors - `2xxx` - Trial errors - `3xxx` - Variant errors - `4xxx` - Gene/drug/disease errors - `5xxx` - Authentication errors - `6xxx` - Rate limit errors - `7xxx` - Validation
errors ## Tips and Tricks ### 1. Use Official Gene Symbols ```bash # Wrong biomcp article search --gene HER2 # ❌ # Right biomcp article search --gene ERBB2 # ✅ ``` ### 2. Combine Multiple Searches ```bash # Search multiple databases in parallel ( biomcp article search --gene BRAF --format json > articles.json & biomcp trial search --condition melanoma --format json > trials.json & biomcp variant search --gene BRAF --format json > variants.json & wait ) ``` ### 3. Process Large Results ```bash # Paginate through results for page in {1..10}; do biomcp article search --gene TP53 --page $page --limit 100 done ``` ### 4. Debug API Issues ```bash # Enable debug logging export BIOMCP_LOG_LEVEL=DEBUG biomcp article search --gene BRAF --verbose ``` ## Getting Help ```bash # General help biomcp --help # Command help biomcp article search --help # Check documentation open https://biomcp.org/ # Report issues open https://github.com/genomoncology/biomcp/issues ``` ``` -------------------------------------------------------------------------------- /tests/tdd/test_retry.py: -------------------------------------------------------------------------------- ```python """Tests for retry logic with exponential backoff.""" import asyncio from unittest.mock import AsyncMock, MagicMock, patch import httpx import pytest from biomcp.retry import ( RetryableHTTPError, RetryConfig, calculate_delay, is_retryable_exception, is_retryable_status, retry_with_backoff, with_retry, ) def test_calculate_delay_exponential_backoff(): """Test that delay increases exponentially.""" config = RetryConfig(initial_delay=1.0, exponential_base=2.0, jitter=False) # Test exponential increase assert calculate_delay(0, config) == 1.0 # 1 * 2^0 assert calculate_delay(1, config) == 2.0 # 1 * 2^1 assert calculate_delay(2, config) == 4.0 # 1 * 2^2 assert calculate_delay(3, config) == 8.0 # 1 * 2^3 def test_calculate_delay_max_cap(): """Test that delay is capped at max_delay.""" config = RetryConfig( initial_delay=1.0, 
def test_calculate_delay_with_jitter():
    """Test that jitter adds randomness to delay."""
    config = RetryConfig(initial_delay=10.0, jitter=True)

    # Generate multiple delays and check they're different
    delays = [calculate_delay(1, config) for _ in range(10)]

    # All should be around 20.0 (10 * 2^1) with jitter
    for delay in delays:
        assert 18.0 <= delay <= 22.0  # Within 10% jitter range

    # Should have some variation
    assert len(set(delays)) > 1


def test_is_retryable_exception():
    """Test exception retryability check."""
    config = RetryConfig(retryable_exceptions=(ConnectionError, TimeoutError))

    # Retryable exceptions
    assert is_retryable_exception(ConnectionError("test"), config)
    assert is_retryable_exception(TimeoutError("test"), config)

    # Non-retryable exceptions
    assert not is_retryable_exception(ValueError("test"), config)
    assert not is_retryable_exception(KeyError("test"), config)


def test_is_retryable_status():
    """Test HTTP status code retryability check."""
    config = RetryConfig(retryable_status_codes=(429, 502, 503, 504))

    # Retryable status codes
    for status in (429, 502, 503, 504):
        assert is_retryable_status(status, config)

    # Non-retryable status codes (500 is not in the configured tuple)
    for status in (200, 404, 500):
        assert not is_retryable_status(status, config)


@pytest.mark.asyncio
async def test_with_retry_decorator_success():
    """Test retry decorator with successful call."""
    call_count = 0

    @with_retry(RetryConfig(max_attempts=3))
    async def test_func():
        nonlocal call_count
        call_count += 1
        return "success"

    result = await test_func()
    assert result == "success"
    assert call_count == 1  # Should succeed on first try


@pytest.mark.asyncio
async def test_with_retry_decorator_eventual_success():
    """Test retry decorator with eventual success."""
    call_count = 0

    @with_retry(
        RetryConfig(
            max_attempts=3,
            initial_delay=0.01,  # Fast for testing
            retryable_exceptions=(ValueError,),
        )
    )
    async def test_func():
        nonlocal call_count
        call_count += 1
        if call_count < 3:
            raise ValueError("Transient error")
        return "success"

    result = await test_func()
    assert result == "success"
    assert call_count == 3


@pytest.mark.asyncio
async def test_with_retry_decorator_max_attempts_exceeded():
    """Test retry decorator when max attempts exceeded."""
    call_count = 0

    @with_retry(
        RetryConfig(
            max_attempts=3,
            initial_delay=0.01,
            retryable_exceptions=(ConnectionError,),
        )
    )
    async def test_func():
        nonlocal call_count
        call_count += 1
        raise ConnectionError("Persistent error")

    with pytest.raises(ConnectionError, match="Persistent error"):
        await test_func()

    assert call_count == 3


@pytest.mark.asyncio
async def test_with_retry_non_retryable_exception():
    """Test retry decorator with non-retryable exception."""
    call_count = 0

    @with_retry(
        RetryConfig(max_attempts=3, retryable_exceptions=(ConnectionError,))
    )
    async def test_func():
        nonlocal call_count
        call_count += 1
        raise ValueError("Non-retryable error")

    with pytest.raises(ValueError, match="Non-retryable error"):
        await test_func()

    assert call_count == 1  # Should not retry


@pytest.mark.asyncio
async def test_retry_with_backoff_function():
    """Test retry_with_backoff function."""
    call_count = 0

    async def test_func(value):
        nonlocal call_count
        call_count += 1
        if call_count < 2:
            raise TimeoutError("Timeout")
        return f"result: {value}"

    config = RetryConfig(
        max_attempts=3,
        initial_delay=0.01,
        retryable_exceptions=(TimeoutError,),
    )

    result = await retry_with_backoff(test_func, "test", config=config)
    assert result == "result: test"
    assert call_count == 2


def test_retryable_http_error():
    """Test RetryableHTTPError."""
    error = RetryableHTTPError(503, "Service Unavailable")
    assert error.status_code == 503
    assert error.message == "Service Unavailable"
    assert str(error) == "HTTP 503: Service Unavailable"


@pytest.mark.asyncio
async def test_retry_with_delay_progression():
    """Test that retries happen with correct delay progression."""
    call_times = []

    @with_retry(
        RetryConfig(
            max_attempts=3,
            initial_delay=0.1,
            exponential_base=2.0,
            jitter=False,
            retryable_exceptions=(ValueError,),
        )
    )
    async def test_func():
        # Record the loop's monotonic clock at every attempt.
        # get_running_loop() is the documented call from inside a coroutine.
        call_times.append(asyncio.get_running_loop().time())
        if len(call_times) < 3:
            raise ValueError("Retry me")
        return "success"

    result = await test_func()

    assert result == "success"
    assert len(call_times) == 3

    # Check delays between attempts (allowing some tolerance)
    first_delay = call_times[1] - call_times[0]
    second_delay = call_times[2] - call_times[1]

    assert 0.08 <= first_delay <= 0.12  # ~0.1s
    assert 0.18 <= second_delay <= 0.22  # ~0.2s
) assert status == 200 assert content == '{"result": "success"}' assert call_count == 3 # Test 2: Timeout error retry with patch( "biomcp.http_client_simple.httpx.AsyncClient" ) as mock_client_class: mock_client = AsyncMock() mock_client_class.return_value = mock_client mock_client.aclose = AsyncMock() # Mock aclose method # Simulate timeout errors mock_client.get.side_effect = httpx.TimeoutException( "Request timed out" ) config = RetryConfig( max_attempts=2, initial_delay=0.01, ) # This should raise TimeoutError after retries fail with pytest.raises(TimeoutError): await call_http( "GET", "https://api.example.com/test", {}, retry_config=config ) assert mock_client.get.call_count == 2 ``` -------------------------------------------------------------------------------- /src/biomcp/circuit_breaker.py: -------------------------------------------------------------------------------- ```python """Circuit breaker pattern implementation for fault tolerance.""" import asyncio import enum import logging from collections.abc import Callable from dataclasses import dataclass, field from datetime import datetime from typing import Any logger = logging.getLogger(__name__) class CircuitState(enum.Enum): """Circuit breaker states.""" CLOSED = "closed" # Normal operation, requests pass through OPEN = "open" # Circuit tripped, requests fail fast HALF_OPEN = "half_open" # Testing if service recovered @dataclass class CircuitBreakerConfig: """Configuration for circuit breaker behavior.""" failure_threshold: int = 5 """Number of failures before opening circuit""" recovery_timeout: float = 60.0 """Seconds to wait before attempting recovery""" success_threshold: int = 2 """Successes needed in half-open state to close circuit""" expected_exception: type[Exception] | tuple[type[Exception], ...] = ( Exception ) """Exception types that count as failures""" exclude_exceptions: tuple[type[Exception], ...] 
= () """Exception types that don't count as failures""" @dataclass class CircuitBreakerState: """Mutable state for a circuit breaker.""" state: CircuitState = CircuitState.CLOSED failure_count: int = 0 success_count: int = 0 last_failure_time: datetime | None = None last_state_change: datetime = field(default_factory=datetime.now) _lock: asyncio.Lock = field(default_factory=asyncio.Lock) class CircuitBreakerError(Exception): """Raised when circuit breaker is open.""" def __init__( self, message: str, last_failure_time: datetime | None = None ): super().__init__(message) self.last_failure_time = last_failure_time class CircuitBreaker: """Circuit breaker implementation.""" def __init__( self, name: str, config: CircuitBreakerConfig | None = None, ): """Initialize circuit breaker. Args: name: Circuit breaker name for logging config: Configuration (uses defaults if not provided) """ self.name = name self.config = config or CircuitBreakerConfig() self._state = CircuitBreakerState() async def call( self, func: Callable[..., Any], *args: Any, **kwargs: Any, ) -> Any: """Execute function through circuit breaker. 
Args: func: Async function to execute *args: Positional arguments for func **kwargs: Keyword arguments for func Returns: Result of function call Raises: CircuitBreakerError: If circuit is open Exception: If function raises exception """ async with self._state._lock: # Check if we should transition from open to half-open if self._state.state == CircuitState.OPEN: if self._should_attempt_reset(): self._state.state = CircuitState.HALF_OPEN self._state.success_count = 0 self._state.last_state_change = datetime.now() logger.info( f"Circuit breaker '{self.name}' entering half-open state" ) else: raise CircuitBreakerError( f"Circuit breaker '{self.name}' is open", self._state.last_failure_time, ) # Execute the function try: result = await func(*args, **kwargs) await self._on_success() return result except Exception as exc: if await self._on_failure(exc): raise # If exception doesn't count as failure, re-raise it raise async def _on_success(self) -> None: """Handle successful call.""" async with self._state._lock: if self._state.state == CircuitState.HALF_OPEN: self._state.success_count += 1 if self._state.success_count >= self.config.success_threshold: self._state.state = CircuitState.CLOSED self._state.failure_count = 0 self._state.success_count = 0 self._state.last_state_change = datetime.now() logger.info( f"Circuit breaker '{self.name}' closed after recovery" ) elif self._state.state == CircuitState.CLOSED: # Reset failure count on success self._state.failure_count = 0 async def _on_failure(self, exc: Exception) -> bool: """Handle failed call. 
Args: exc: The exception that was raised Returns: True if exception counts as failure """ # Check if exception should be counted if not self._is_counted_exception(exc): return False async with self._state._lock: self._state.failure_count += 1 self._state.last_failure_time = datetime.now() if self._state.state == CircuitState.HALF_OPEN: # Single failure in half-open state reopens circuit self._state.state = CircuitState.OPEN self._state.last_state_change = datetime.now() logger.warning( f"Circuit breaker '{self.name}' reopened due to failure in half-open state" ) elif ( self._state.state == CircuitState.CLOSED and self._state.failure_count >= self.config.failure_threshold ): # Threshold exceeded, open circuit self._state.state = CircuitState.OPEN self._state.last_state_change = datetime.now() logger.error( f"Circuit breaker '{self.name}' opened after {self._state.failure_count} failures" ) return True def _should_attempt_reset(self) -> bool: """Check if enough time has passed to attempt reset.""" if self._state.last_failure_time is None: return True time_since_failure = datetime.now() - self._state.last_failure_time return ( time_since_failure.total_seconds() >= self.config.recovery_timeout ) def _is_counted_exception(self, exc: Exception) -> bool: """Check if exception should count as failure.""" # Check excluded exceptions first if isinstance(exc, self.config.exclude_exceptions): return False # Check expected exceptions return isinstance(exc, self.config.expected_exception) @property def state(self) -> CircuitState: """Get current circuit state.""" return self._state.state @property def is_open(self) -> bool: """Check if circuit is open.""" return self._state.state == CircuitState.OPEN @property def is_closed(self) -> bool: """Check if circuit is closed.""" return self._state.state == CircuitState.CLOSED async def reset(self) -> None: """Manually reset circuit to closed state.""" async with self._state._lock: self._state.state = CircuitState.CLOSED 
self._state.failure_count = 0 self._state.success_count = 0 self._state.last_failure_time = None self._state.last_state_change = datetime.now() logger.info(f"Circuit breaker '{self.name}' manually reset") # Global registry of circuit breakers _circuit_breakers: dict[str, CircuitBreaker] = {} def get_circuit_breaker( name: str, config: CircuitBreakerConfig | None = None, ) -> CircuitBreaker: """Get or create a circuit breaker. Args: name: Circuit breaker name config: Configuration (used only on creation) Returns: Circuit breaker instance """ if name not in _circuit_breakers: _circuit_breakers[name] = CircuitBreaker(name, config) return _circuit_breakers[name] def circuit_breaker( name: str | None = None, config: CircuitBreakerConfig | None = None, ): """Decorator to apply circuit breaker to function. Args: name: Circuit breaker name (defaults to function name) config: Circuit breaker configuration Returns: Decorated function """ def decorator(func): breaker_name = name or f"{func.__module__}.{func.__name__}" breaker = get_circuit_breaker(breaker_name, config) async def wrapper(*args, **kwargs): return await breaker.call(func, *args, **kwargs) # Preserve function metadata wrapper.__name__ = func.__name__ wrapper.__doc__ = func.__doc__ wrapper._circuit_breaker = breaker # Expose breaker for testing return wrapper return decorator ``` -------------------------------------------------------------------------------- /src/biomcp/articles/search.py: -------------------------------------------------------------------------------- ```python import asyncio import json from collections.abc import Generator from typing import Annotated, Any, get_args from pydantic import BaseModel, Field, computed_field from .. 
import http_client, render from ..constants import PUBTATOR3_SEARCH_URL, SYSTEM_PAGE_SIZE from ..core import PublicationState from .autocomplete import Concept, EntityRequest, autocomplete from .fetch import call_pubtator_api concepts: list[Concept] = sorted(get_args(Concept)) fields: list[str] = [concept + "s" for concept in concepts] class PubmedRequest(BaseModel): chemicals: list[str] = Field( default_factory=list, description="List of chemicals for filtering results.", ) diseases: list[str] = Field( default_factory=list, description="Diseases such as Hypertension, Lung Adenocarcinoma, etc.", ) genes: list[str] = Field( default_factory=list, description="List of genes for filtering results.", ) keywords: list[str] = Field( default_factory=list, description="List of other keywords for filtering results.", ) variants: list[str] = Field( default_factory=list, description="List of variants for filtering results.", ) def iter_concepts(self) -> Generator[tuple[Concept, str], None, None]: for concept in concepts: field = concept + "s" values = getattr(self, field, []) or [] for value in values: yield concept, value class PubtatorRequest(BaseModel): text: str size: int = 50 class ResultItem(BaseModel): pmid: int | None = None pmcid: str | None = None title: str | None = None journal: str | None = None authors: list[str] | None = None date: str | None = None doi: str | None = None abstract: str | None = None publication_state: PublicationState = PublicationState.PEER_REVIEWED source: str | None = Field( None, description="Source database (e.g., PubMed, bioRxiv, Europe PMC)" ) @computed_field def pubmed_url(self) -> str | None: url = None if self.pmid: url = f"https://pubmed.ncbi.nlm.nih.gov/{self.pmid}/" return url @computed_field def pmc_url(self) -> str | None: """Generates the PMC URL if PMCID exists.""" url = None if self.pmcid: url = f"https://www.ncbi.nlm.nih.gov/pmc/articles/{self.pmcid}/" return url @computed_field def doi_url(self) -> str | None: """Generates 
the DOI URL if DOI exists.""" url = None if self.doi: url = f"https://doi.org/{self.doi}" return url class SearchResponse(BaseModel): results: list[ResultItem] page_size: int current: int count: int total_pages: int async def convert_request(request: PubmedRequest) -> PubtatorRequest: query_parts = [] # Process keywords with OR logic support for keyword in request.keywords: if "|" in keyword: # Handle OR within a keyword (e.g., "R173|Arg173|p.R173") or_terms = [term.strip() for term in keyword.split("|")] or_query = "(" + " OR ".join(or_terms) + ")" query_parts.append(or_query) else: query_parts.append(keyword) # Create all autocomplete tasks in parallel autocomplete_tasks = [] concept_values = [] for concept, value in request.iter_concepts(): task = autocomplete( request=EntityRequest(concept=concept, query=value), ) autocomplete_tasks.append(task) concept_values.append((concept, value)) # Execute all autocomplete calls in parallel if autocomplete_tasks: entities = await asyncio.gather(*autocomplete_tasks) # Process results for (_concept, value), entity in zip( concept_values, entities, strict=False ): if entity: query_parts.append(entity.entity_id) else: query_parts.append(value) query_text = " AND ".join(query_parts) return PubtatorRequest(text=query_text, size=SYSTEM_PAGE_SIZE) async def add_abstracts(response: SearchResponse) -> None: pmids = [pr.pmid for pr in response.results if pr.pmid] abstract_response, _ = await call_pubtator_api(pmids, full=False) if abstract_response: for result in response.results: result.abstract = abstract_response.get_abstract(result.pmid) def clean_authors(record): """Keep only the first and last author if > 4 authors.""" authors = record.get("authors") if authors and len(authors) > 4: record["authors"] = [authors[0], "...", authors[-1]] return record async def search_articles( request: PubmedRequest, output_json: bool = False, ) -> str: pubtator_request = await convert_request(request) # Start the search request search_task = 
http_client.request_api( url=PUBTATOR3_SEARCH_URL, request=pubtator_request, response_model_type=SearchResponse, domain="article", ) # Execute search first response, error = await search_task if response: # Now fetch abstracts (still sequential but could be parallelized with other operations) await add_abstracts(response) # Add source field to PubMed results for result in response.results: result.source = "PubMed" # noinspection DuplicatedCode if error: data: list[dict[str, Any]] = [ {"error": f"Error {error.code}: {error.message}"} ] else: data = list( map( clean_authors, [ result.model_dump(mode="json", exclude_none=True) for result in (response.results if response else []) ], ) ) if data and not output_json: return render.to_markdown(data) else: return json.dumps(data, indent=2) async def _article_searcher( call_benefit: Annotated[ str, "Define and summarize why this function is being called and the intended benefit", ], chemicals: Annotated[ list[str] | str | None, "List of chemicals for filtering results" ] = None, diseases: Annotated[ list[str] | str | None, "Diseases such as Hypertension, Lung Adenocarcinoma, etc.", ] = None, genes: Annotated[ list[str] | str | None, "List of genes for filtering results" ] = None, keywords: Annotated[ list[str] | str | None, "List of other keywords for filtering results" ] = None, variants: Annotated[ list[str] | str | None, "List of variants for filtering results" ] = None, include_preprints: Annotated[ bool, "Include preprint articles from bioRxiv/medRxiv and Europe PMC" ] = True, include_cbioportal: Annotated[ bool, "Include cBioPortal cancer genomics summary when searching by gene", ] = True, ) -> str: """ Searches for articles across PubMed and preprint servers. Parameters: - call_benefit: Define and summarize why this function is being called and the intended benefit - chemicals: List of chemicals for filtering results - diseases: Diseases such as Hypertension, Lung Adenocarcinoma, etc. 
- genes: List of genes for filtering results - keywords: List of other keywords for filtering results - variants: List of variants for filtering results - include_preprints: Include results from preprint servers (default: True) - include_cbioportal: Include cBioPortal summaries for gene searches (default: True) Notes: - Use full terms ("Non-small cell lung carcinoma") over abbreviations ("NSCLC") - Use keywords to specify terms that don't fit in disease, gene ("EGFR"), chemical ("Cisplatin"), or variant ("BRAF V600E") categories - Parameters can be provided as lists or comma-separated strings - Results include both peer-reviewed and preprint articles by default - Keywords support OR logic using the pipe (|) separator: - Example: "R173|Arg173|p.R173" finds articles with any of these notations - Multiple keywords are still combined with AND logic Returns: Markdown formatted list of matching articles, sorted by date (newest first), with peer-reviewed articles listed before preprints. Limited to max 20 results (10 from each source) by default to optimize token usage. """ # Import here to avoid circular dependency from .search_optimized import article_searcher_optimized # Use the optimized version with caching return await article_searcher_optimized( call_benefit=call_benefit, chemicals=chemicals, diseases=diseases, genes=genes, keywords=keywords, variants=variants, include_preprints=include_preprints, include_cbioportal=include_cbioportal, ) ``` -------------------------------------------------------------------------------- /docs/FDA_SECURITY.md: -------------------------------------------------------------------------------- ```markdown # FDA Integration Security Documentation ## Overview This document outlines the security measures implemented in the BioMCP FDA integration to ensure safe handling of medical data and protection against common vulnerabilities. ## Security Features ### 1. 
Input Validation & Sanitization All user inputs are validated and sanitized before being sent to the FDA API: - **Injection Prevention**: Removes characters that could be used for SQL injection, XSS, or command injection (`<>\"';&|\\`) - **Length Limits**: Enforces maximum lengths on all input fields - **Type Validation**: Ensures parameters match expected types (dates, numbers, etc.) - **Format Validation**: Validates specific formats (e.g., YYYY-MM-DD for dates) **Implementation**: `src/biomcp/openfda/input_validation.py` ```python # Example usage from biomcp.openfda.input_validation import sanitize_input, validate_drug_name safe_drug = validate_drug_name("Aspirin<script>") # Returns "Aspirin" safe_input = sanitize_input("'; DROP TABLE;") # SQL injection blocked ``` ### 2. API Key Protection API keys are protected at multiple levels: - **Cache Key Exclusion**: API keys are removed before generating cache keys - **No Logging**: API keys are never logged, even in debug mode - **Environment Variables**: Keys stored in environment variables, not in code - **Validation**: API key format is validated before use **Implementation**: `src/biomcp/openfda/cache.py`, `src/biomcp/openfda/utils.py` ### 3. Rate Limiting Client-side rate limiting prevents API quota exhaustion: - **Token Bucket Algorithm**: Allows bursts while maintaining average rate - **Configurable Limits**: 40 requests/minute without key, 240 with key - **Concurrent Request Limiting**: Maximum 10 concurrent requests via semaphore - **Automatic Backoff**: Delays requests when approaching limits **Implementation**: `src/biomcp/openfda/rate_limiter.py` ### 4. 
Circuit Breaker Pattern Prevents cascading failures when FDA API is unavailable: - **Failure Threshold**: Opens after 5 consecutive failures - **Recovery Timeout**: Waits 60 seconds before retry attempts - **Half-Open State**: Tests recovery with limited requests - **Automatic Recovery**: Returns to normal operation when API recovers **States**: - **CLOSED**: Normal operation - **OPEN**: Blocking all requests (API is down) - **HALF_OPEN**: Testing if API has recovered ### 5. Memory Protection Prevents memory exhaustion from large responses: - **Response Size Limits**: Maximum 1MB per cached response - **Cache Size Limits**: Maximum 100 entries in cache - **FIFO Eviction**: Oldest entries removed when cache is full - **Size Validation**: Large responses rejected before caching **Configuration**: ```bash export BIOMCP_FDA_MAX_RESPONSE_SIZE=1048576 # 1MB export BIOMCP_FDA_MAX_CACHE_SIZE=100 ``` ### 6. File Operation Security Secure handling of cache files: - **File Locking**: Uses `fcntl` for exclusive/shared locks - **Atomic Operations**: Writes to temp files then renames - **Race Condition Prevention**: Locks prevent concurrent modifications - **Permission Control**: Files created without world-write permissions **Implementation**: `src/biomcp/openfda/drug_shortages.py` ## Security Best Practices ### For Developers 1. **Never Log Sensitive Data** ```python # BAD logger.debug(f"API key: {api_key}") # GOOD logger.debug("API key configured" if api_key else "No API key") ``` 2. **Always Validate Input** ```python from biomcp.openfda.input_validation import validate_drug_name # Always validate before using safe_drug = validate_drug_name(user_input) if safe_drug: # Use safe_drug, not user_input await search_adverse_events(drug=safe_drug) ``` 3. **Use Rate Limiting** ```python from biomcp.openfda.rate_limiter import rate_limited_request # Wrap API calls with rate limiting result = await rate_limited_request(make_api_call, params) ``` ### For System Administrators 1. 
**API Key Management** - Store API keys in environment variables - Rotate keys regularly (recommended: every 90 days) - Use different keys for dev/staging/production - Monitor key usage for anomalies 2. **Monitoring** - Set up alerts for circuit breaker state changes - Monitor rate limit consumption - Track cache hit/miss ratios - Log validation failures (potential attacks) 3. **Resource Limits** ```bash # Configure limits based on your environment export BIOMCP_FDA_CACHE_TTL=15 # Minutes export BIOMCP_FDA_MAX_CACHE_SIZE=100 export BIOMCP_FDA_MAX_RESPONSE_SIZE=1048576 # 1MB ``` ## Threat Model ### Threats Addressed | Threat | Mitigation | Implementation | | ------------------- | --------------------------- | ---------------------- | | SQL Injection | Input sanitization | `input_validation.py` | | XSS Attacks | HTML/JS character removal | `sanitize_input()` | | Command Injection | Shell metacharacter removal | `sanitize_input()` | | API Key Exposure | Exclusion from logs/cache | `cache.py`, `utils.py` | | DoS via Rate Limits | Client-side rate limiting | `rate_limiter.py` | | Cascading Failures | Circuit breaker pattern | `CircuitBreaker` class | | Memory Exhaustion | Response size limits | `MAX_RESPONSE_SIZE` | | Race Conditions | File locking | `fcntl` usage | | Cache Poisoning | Input validation | `build_safe_query()` | ### Residual Risks 1. **API Key Compromise**: If environment is compromised, keys are accessible - **Mitigation**: Use secret management systems in production 2. **Zero-Day FDA API Vulnerabilities**: Unknown vulnerabilities in FDA API - **Mitigation**: Monitor FDA security advisories 3. **Distributed DoS**: Multiple clients could still overwhelm FDA API - **Mitigation**: Implement global rate limiting at gateway level ## Compliance Considerations ### HIPAA (If Applicable) While FDA's public APIs don't contain PHI, if extended to include patient data: 1. **Encryption**: Use TLS for all API communications 2. 
**Audit Logging**: Log all data access (but not the data itself) 3. **Access Controls**: Implement user authentication/authorization 4. **Data Retention**: Define and enforce retention policies ### FDA Data Usage 1. **Attribution**: Always include FDA disclaimers in responses 2. **Data Currency**: Warn users that data may not be real-time 3. **Medical Decisions**: Explicitly state data is not for clinical decisions 4. **Rate Limits**: Respect FDA's terms of service ## Security Testing ### Automated Tests Run security tests with: ```bash pytest tests/tdd/openfda/test_security.py -v ``` Tests cover: - Input validation - Cache key security - Rate limiting - Circuit breaker - File operations ### Manual Security Review Checklist for security review: - [ ] No sensitive data in logs - [ ] All inputs validated - [ ] Rate limiting functional - [ ] Circuit breaker triggers correctly - [ ] Cache size limited - [ ] File operations are atomic - [ ] API keys not in cache keys - [ ] Error messages don't leak information ## Incident Response ### If API Key is Compromised 1. **Immediate**: Revoke compromised key at FDA portal 2. **Generate**: Create new API key 3. **Update**: Update environment variables 4. **Restart**: Restart services to load new key 5. **Audit**: Review logs for unauthorized usage ### If Rate Limits Exceeded 1. **Check**: Verify circuit breaker state 2. **Wait**: Allow circuit breaker recovery timeout 3. **Reduce**: Lower request rate if needed 4. **Monitor**: Check for abnormal usage patterns ### If Security Vulnerability Found 1. **Assess**: Determine severity and exploitability 2. **Patch**: Develop and test fix 3. **Deploy**: Roll out fix with monitoring 4. **Document**: Update this security documentation 5. 
**Notify**: Inform users if data was at risk ## Configuration Reference ### Environment Variables | Variable | Default | Description | | ------------------------------ | ------- | ---------------------------------- | | `OPENFDA_API_KEY` | None | FDA API key for higher rate limits | | `BIOMCP_FDA_CACHE_TTL` | 15 | Cache TTL in minutes | | `BIOMCP_FDA_MAX_CACHE_SIZE` | 100 | Maximum cache entries | | `BIOMCP_FDA_MAX_RESPONSE_SIZE` | 1048576 | Maximum response size in bytes | | `BIOMCP_SHORTAGE_CACHE_TTL` | 24 | Drug shortage cache TTL in hours | ### Security Headers When deploying as a web service, add these headers: ```python headers = { "X-Content-Type-Options": "nosniff", "X-Frame-Options": "DENY", "X-XSS-Protection": "1; mode=block", "Strict-Transport-Security": "max-age=31536000; includeSubDomains", "Content-Security-Policy": "default-src 'self'" } ``` ## Contact For security issues, contact: [email protected] (create this address) For FDA API issues, see: https://open.fda.gov/apis/ --- _Last Updated: 2025-08-07_ _Version: 1.0_ ``` -------------------------------------------------------------------------------- /tests/tdd/variants/test_cbioportal_search.py: -------------------------------------------------------------------------------- ```python """Test cBioPortal search enhancements.""" import asyncio import pytest from biomcp.variants.cbioportal_search import ( CBioPortalSearchClient, CBioPortalSearchSummary, format_cbioportal_search_summary, ) from biomcp.variants.search import VariantQuery, search_variants from .constants import API_RETRY_DELAY_SECONDS, DEFAULT_MAX_STUDIES class TestCBioPortalSearch: """Test cBioPortal search functionality.""" @pytest.mark.asyncio @pytest.mark.integration async def test_gene_search_summary(self): """Test getting gene search summary from cBioPortal.""" client = CBioPortalSearchClient() # Test with BRAF summary = await client.get_gene_search_summary("BRAF", max_studies=5) assert summary is not None assert summary.gene == 
"BRAF" # Handle case where cBioPortal API returns empty data if summary.total_mutations == 0: # API might be down or returning empty results # This is acceptable for integration tests assert summary.total_mutations == 0 assert summary.total_samples_tested == 0 assert summary.mutation_frequency == 0.0 assert len(summary.hotspots) == 0 else: # Normal case - data is available assert summary.total_mutations > 0 assert summary.total_samples_tested > 0 assert summary.mutation_frequency > 0 assert len(summary.hotspots) > 0 # Check that V600E is a top hotspot v600e_found = any( "V600E" in hs.amino_acid_change for hs in summary.hotspots ) assert v600e_found, "BRAF V600E should be a top hotspot" # Check cancer distribution if summary.total_mutations > 0: assert len(summary.cancer_distribution) > 0 assert any( "melanoma" in cancer.lower() for cancer in summary.cancer_distribution ), "BRAF should be found in melanoma" else: # When no mutations found, cancer distribution should be empty assert len(summary.cancer_distribution) == 0 @pytest.mark.asyncio @pytest.mark.integration async def test_format_search_summary(self): """Test formatting of search summary.""" # Create a mock summary summary = CBioPortalSearchSummary( gene="BRAF", total_mutations=1000, total_samples_tested=10000, mutation_frequency=0.1, hotspots=[ { "position": 600, "amino_acid_change": "V600E", "count": 800, "frequency": 0.8, "cancer_types": ["Melanoma", "Colorectal Cancer"], } ], cancer_distribution={"Melanoma": 600, "Colorectal Cancer": 200}, study_coverage={ "total_studies": 50, "queried_studies": 10, "studies_with_data": 8, }, ) formatted = format_cbioportal_search_summary(summary) assert "BRAF" in formatted assert "10.0%" in formatted # Mutation frequency assert "V600E" in formatted assert "Melanoma" in formatted assert "600 mutations" in formatted @pytest.mark.asyncio @pytest.mark.integration async def test_search_with_cbioportal_summary(self): """Test variant search with cBioPortal summary included.""" 
query = VariantQuery(gene="BRAF", size=5) result = await search_variants(query, include_cbioportal=True) # Should include cBioPortal summary section assert "cBioPortal Summary for BRAF" in result assert "Mutation Frequency" in result # Top Hotspots only appears when mutations are found # Check for either Top Hotspots or 0 mutations message assert "Top Hotspots" in result or "0 mutations" in result # Should still include variant results assert "# Record" in result or "No variants found" in result @pytest.mark.asyncio @pytest.mark.integration async def test_search_without_gene(self): """Test that cBioPortal summary is not included without gene parameter.""" query = VariantQuery(rsid="rs113488022", size=5) result = await search_variants(query, include_cbioportal=True) # Should not include cBioPortal summary assert "cBioPortal Summary" not in result @pytest.mark.asyncio @pytest.mark.integration async def test_tp53_search_summary(self): """Test TP53 gene search summary.""" client = CBioPortalSearchClient() # Clear any caches to ensure fresh data from biomcp.utils.request_cache import clear_cache await clear_cache() summary = await client.get_gene_search_summary("TP53", max_studies=5) assert summary is not None assert summary.gene == "TP53" # If we got no mutations, it might be a temporary API issue if summary.total_mutations == 0 and summary.total_samples_tested == 0: # Try one more time with a small delay await asyncio.sleep(API_RETRY_DELAY_SECONDS) summary = await client.get_gene_search_summary( "TP53", max_studies=5 ) # If still no data, skip the test rather than fail if summary.total_mutations == 0: pytest.skip( "cBioPortal returned no mutation data for TP53 - possible API issue" ) # Basic checks that should pass when data is available assert ( summary.total_mutations > 0 ), f"TP53 should have mutations. 
Got: {summary}" # More flexible checks if summary.hotspots: # Just verify structure if we have hotspots hotspot_changes = [hs.amino_acid_change for hs in summary.hotspots] print(f"TP53 hotspots found: {hotspot_changes[:5]}") assert ( len(hotspot_changes) >= 1 ), "Should find at least one TP53 hotspot" @pytest.mark.asyncio @pytest.mark.integration async def test_kras_search_summary(self): """Test KRAS gene search summary. This test verifies basic functionality rather than specific hotspots, which can change as cBioPortal data is updated. """ client = CBioPortalSearchClient() # Clear any caches to ensure fresh data from biomcp.utils.request_cache import clear_cache await clear_cache() summary = await client.get_gene_search_summary( "KRAS", max_studies=DEFAULT_MAX_STUDIES ) assert summary is not None, "Failed to get summary for KRAS" assert summary.gene == "KRAS" # If we got no mutations, it might be a temporary API issue if summary.total_mutations == 0 and summary.total_samples_tested == 0: # Try one more time with a small delay await asyncio.sleep(API_RETRY_DELAY_SECONDS) summary = await client.get_gene_search_summary( "KRAS", max_studies=DEFAULT_MAX_STUDIES ) # If still no data, skip the test rather than fail if summary.total_mutations == 0: pytest.skip( "cBioPortal returned no mutation data for KRAS - possible API issue" ) # Basic checks that should pass when data is available assert ( summary.total_mutations > 0 ), f"KRAS should have mutations. 
Got: {summary}" # More flexible checks if summary.hotspots: # Just verify structure if we have hotspots for hotspot in summary.hotspots[:3]: assert hasattr(hotspot, "amino_acid_change") assert hasattr(hotspot, "count") print( f"Top KRAS hotspots: {[hs.amino_acid_change for hs in summary.hotspots[:5]]}" ) # Cancer distribution check - only if we have data if summary.total_mutations > 0: assert ( len(summary.cancer_distribution) > 0 ), "Should have cancer type distribution" @pytest.mark.asyncio @pytest.mark.integration async def test_invalid_gene(self): """Test handling of invalid gene name.""" client = CBioPortalSearchClient() summary = await client.get_gene_search_summary("INVALID_GENE") assert summary is None @pytest.mark.asyncio @pytest.mark.integration async def test_json_output_with_cbioportal(self): """Test JSON output includes cBioPortal summary.""" query = VariantQuery(gene="BRAF", size=2) result = await search_variants( query, output_json=True, include_cbioportal=True ) # Parse JSON import json data = json.loads(result) # Should have both summary and variants assert "cbioportal_summary" in data assert "variants" in data assert "BRAF" in data["cbioportal_summary"] ``` -------------------------------------------------------------------------------- /tests/tdd/articles/test_unified.py: -------------------------------------------------------------------------------- ```python """Tests for unified article search functionality.""" import json from unittest.mock import AsyncMock, patch import pytest from biomcp.articles.search import PubmedRequest from biomcp.articles.unified import ( _deduplicate_articles, _parse_search_results, search_articles_unified, ) class TestUnifiedSearch: """Test unified search functionality.""" @pytest.fixture def pubmed_results(self): """Sample PubMed results in JSON format.""" return json.dumps([ { "pmid": 12345, "title": "BRAF mutations in cancer", "doi": "10.1234/test1", "date": "2024-01-15", "publication_state": "peer_reviewed", }, 
{ "pmid": 12346, "title": "Another cancer study", "doi": "10.1234/test2", "date": "2024-01-10", "publication_state": "peer_reviewed", }, ]) @pytest.fixture def preprint_results(self): """Sample preprint results in JSON format.""" return json.dumps([ { "title": "BRAF preprint study", "doi": "10.1101/2024.01.20.123456", "date": "2024-01-20", "publication_state": "preprint", "source": "bioRxiv", }, { "title": "Duplicate study", "doi": "10.1234/test1", # Same DOI as PubMed result "date": "2024-01-14", "publication_state": "preprint", "source": "Europe PMC", }, ]) @pytest.mark.asyncio async def test_search_articles_unified_both_sources( self, pubmed_results, preprint_results ): """Test searching with both PubMed and preprints enabled.""" request = PubmedRequest(genes=["BRAF"]) mock_pubmed = AsyncMock(return_value=pubmed_results) mock_preprints = AsyncMock(return_value=preprint_results) with ( patch("biomcp.articles.unified.search_articles", mock_pubmed), patch("biomcp.articles.unified.search_preprints", mock_preprints), patch( "biomcp.variants.cbioportal_search.CBioPortalSearchClient" ) as mock_cbio, ): # Mock cBioPortal client to return None (no summary) mock_cbio.return_value.get_gene_search_summary = AsyncMock( return_value=None ) result = await search_articles_unified( request, include_pubmed=True, include_preprints=True, output_json=True, ) # Parse result data = json.loads(result) # When gene is specified but cBioPortal returns no data, # we should just get the articles list if isinstance(data, dict): articles = data.get("articles", data) else: articles = data # Should have 3 articles (one duplicate removed) assert len(articles) == 3 # Check ordering - peer reviewed should come first # Sort is by (publication_state priority, date DESC) # The test data has preprint with newer date, so it might come first # Let's just check we have the right mix states = [a["publication_state"] for a in articles] assert states.count("peer_reviewed") == 2 assert 
states.count("preprint") == 1 # Check deduplication worked dois = [a.get("doi") for a in articles if a.get("doi")] assert len(dois) == len(set(dois)) # No duplicate DOIs @pytest.mark.asyncio async def test_search_articles_unified_pubmed_only(self, pubmed_results): """Test searching with only PubMed enabled.""" request = PubmedRequest( keywords=["cancer"] ) # No gene, so no cBioPortal with ( patch("biomcp.articles.unified.search_articles") as mock_pubmed, patch( "biomcp.articles.unified.search_preprints" ) as mock_preprints, ): mock_pubmed.return_value = pubmed_results result = await search_articles_unified( request, include_pubmed=True, include_preprints=False, output_json=True, ) # Preprints should not be called mock_preprints.assert_not_called() # Parse result articles = json.loads(result) assert len(articles) == 2 assert all( a["publication_state"] == "peer_reviewed" for a in articles ) @pytest.mark.asyncio async def test_search_articles_unified_preprints_only( self, preprint_results ): """Test searching with only preprints enabled.""" request = PubmedRequest( keywords=["cancer"] ) # No gene, so no cBioPortal with ( patch("biomcp.articles.unified.search_articles") as mock_pubmed, patch( "biomcp.articles.unified.search_preprints" ) as mock_preprints, ): mock_preprints.return_value = preprint_results result = await search_articles_unified( request, include_pubmed=False, include_preprints=True, output_json=True, ) # PubMed should not be called mock_pubmed.assert_not_called() # Parse result articles = json.loads(result) assert len(articles) == 2 assert all(a["publication_state"] == "preprint" for a in articles) @pytest.mark.asyncio async def test_search_articles_unified_error_handling(self): """Test error handling when one source fails.""" request = PubmedRequest( keywords=["cancer"] ) # No gene, so no cBioPortal with ( patch("biomcp.articles.unified.search_articles") as mock_pubmed, patch( "biomcp.articles.unified.search_preprints" ) as mock_preprints, ): # PubMed 
succeeds mock_pubmed.return_value = json.dumps([{"title": "Success"}]) # Preprints fails mock_preprints.side_effect = Exception("API Error") result = await search_articles_unified( request, include_pubmed=True, include_preprints=True, output_json=True, ) # Should still get PubMed results articles = json.loads(result) assert len(articles) == 1 assert articles[0]["title"] == "Success" @pytest.mark.asyncio async def test_search_articles_unified_markdown_output( self, pubmed_results ): """Test markdown output format.""" request = PubmedRequest(genes=["BRAF"]) mock_pubmed = AsyncMock(return_value=pubmed_results) with patch("biomcp.articles.unified.search_articles", mock_pubmed): result = await search_articles_unified( request, include_pubmed=True, include_preprints=False, output_json=False, ) # Should return markdown assert isinstance(result, str) assert "BRAF mutations in cancer" in result assert "# Record" in result # Markdown headers def test_deduplicate_articles(self): """Test article deduplication logic.""" articles = [ {"title": "Article 1", "doi": "10.1234/test1"}, {"title": "Article 2", "doi": "10.1234/test2"}, {"title": "Duplicate of 1", "doi": "10.1234/test1"}, {"title": "No DOI article"}, {"title": "Another no DOI"}, ] deduped = _deduplicate_articles(articles) # Should have 4 articles (one duplicate removed) assert len(deduped) == 4 # Check DOIs are unique dois = [a.get("doi") for a in deduped if a.get("doi")] assert len(dois) == len(set(dois)) # Articles without DOI should be preserved no_doi_count = sum(1 for a in deduped if not a.get("doi")) assert no_doi_count == 2 def test_parse_search_results(self): """Test parsing of search results from multiple sources.""" results = [ json.dumps([{"title": "Article 1"}, {"title": "Article 2"}]), json.dumps([{"title": "Article 3"}]), Exception("Failed source"), # Should be skipped "[invalid json", # Should be skipped ] parsed = _parse_search_results(results) # Should have 3 articles (2 + 1, skipping errors) assert 
len(parsed) == 3 assert parsed[0]["title"] == "Article 1" assert parsed[1]["title"] == "Article 2" assert parsed[2]["title"] == "Article 3" def test_parse_search_results_empty(self): """Test parsing with all empty/failed results.""" results = [ Exception("Failed"), "[invalid", json.dumps([]), # Empty list ] parsed = _parse_search_results(results) assert parsed == [] ``` -------------------------------------------------------------------------------- /src/biomcp/openfda/device_events.py: -------------------------------------------------------------------------------- ```python """ OpenFDA Device Adverse Events (MAUDE) integration. Focus on genomic/diagnostic devices relevant to precision oncology. """ import logging from .constants import ( GENOMIC_DEVICE_PRODUCT_CODES, OPENFDA_DEFAULT_LIMIT, OPENFDA_DEVICE_EVENTS_URL, OPENFDA_DISCLAIMER, OPENFDA_MAX_LIMIT, ) from .device_events_helpers import ( analyze_device_problems, format_detailed_device_info, format_device_detail_header, format_device_distribution, format_device_report_summary, format_patient_details, format_top_problems, ) from .utils import clean_text, format_count, make_openfda_request logger = logging.getLogger(__name__) def _build_device_search_query( device: str | None, manufacturer: str | None, problem: str | None, product_code: str | None, genomics_only: bool, ) -> str: """Build the search query for device events.""" search_parts = [] if device: # Build flexible search queries device_queries = [] # First try exact match device_queries.extend([ f'device.brand_name:"{device}"', f'device.generic_name:"{device}"', f'device.openfda.device_name:"{device}"', ]) # For multi-word terms, also search for key words with wildcards # This helps match "FoundationOne CDx" to "F1CDX" or similar variations words = device.split() # If it's a multi-word query, add wildcard searches for significant words for word in words: # Skip common words and very short ones if len(word) > 3 and word.lower() not in [ "test", "system", 
"device", ]: # Use prefix wildcard for better performance device_queries.append(f"device.brand_name:{word}*") device_queries.append(f"device.generic_name:{word}*") # Also try searching by removing spaces (e.g., "Foundation One" -> "FoundationOne") if len(words) > 1: combined = "".join(words) device_queries.append(f'device.brand_name:"{combined}"') device_queries.append(f'device.generic_name:"{combined}"') search_parts.append(f"({' OR '.join(device_queries)})") if manufacturer: # Search manufacturer field with both exact and wildcard matching mfr_queries = [ f'device.manufacturer_d_name:"{manufacturer}"', f"device.manufacturer_d_name:*{manufacturer}*", ] search_parts.append(f"({' OR '.join(mfr_queries)})") if problem: search_parts.append(f'device.device_problem_text:"{problem}"') if product_code: search_parts.append(f'device.openfda.product_code:"{product_code}"') elif ( genomics_only and not device ): # Only apply genomics filter if no specific device is named # Filter to genomic device product codes code_parts = [ f'device.openfda.product_code:"{code}"' for code in GENOMIC_DEVICE_PRODUCT_CODES ] if code_parts: search_parts.append(f"({' OR '.join(code_parts)})") return " AND ".join(search_parts) def _format_search_summary( device: str | None, manufacturer: str | None, problem: str | None, genomics_only: bool, total: int, ) -> list[str]: """Format the search summary section.""" output = [] search_desc = [] if device: search_desc.append(f"**Device**: {device}") if manufacturer: search_desc.append(f"**Manufacturer**: {manufacturer}") if problem: search_desc.append(f"**Problem**: {problem}") if genomics_only: search_desc.append("**Type**: Genomic/Diagnostic Devices") if search_desc: output.append(" | ".join(search_desc)) output.append( f"**Total Reports Found**: {format_count(total, 'report')}\n" ) return output async def search_device_events( device: str | None = None, manufacturer: str | None = None, problem: str | None = None, product_code: str | None = None, 
async def search_device_events(
    device: str | None = None,
    manufacturer: str | None = None,
    problem: str | None = None,
    product_code: str | None = None,
    genomics_only: bool = True,
    limit: int = OPENFDA_DEFAULT_LIMIT,
    skip: int = 0,
    api_key: str | None = None,
) -> str:
    """
    Search FDA device adverse event reports (MAUDE).

    Args:
        device: Device name to search for
        manufacturer: Manufacturer name
        problem: Device problem description
        product_code: FDA product code
        genomics_only: Filter to genomic/diagnostic devices only. The
            filter is only part of the query when neither ``device`` nor
            ``product_code`` is given, and the output only mentions the
            filter in that case.
        limit: Maximum number of results (capped at OPENFDA_MAX_LIMIT)
        skip: Number of results to skip
        api_key: Optional OpenFDA API key (overrides OPENFDA_API_KEY env var)

    Returns:
        Formatted string with device event information, or a warning /
        "no results" message when nothing can be shown.
    """
    if not device and not manufacturer and not product_code and not problem:
        return (
            "⚠️ Please specify a device name, manufacturer, or problem to search.\n\n"
            "Examples:\n"
            "- Search by device: --device 'FoundationOne'\n"
            "- Search by manufacturer: --manufacturer 'Illumina'\n"
            "- Search by problem: --problem 'false positive'"
        )

    # Build and execute search
    search_query = _build_device_search_query(
        device, manufacturer, problem, product_code, genomics_only
    )

    # The query builder adds the genomic product-code filter only when no
    # device name and no explicit product code were supplied. Report the
    # filter to the user only when it was really part of the query.
    genomics_filter_applied = bool(
        genomics_only and not device and not product_code
    )

    params = {
        "search": search_query,
        "limit": min(limit, OPENFDA_MAX_LIMIT),
        "skip": skip,
    }

    response, error = await make_openfda_request(
        OPENFDA_DEVICE_EVENTS_URL, params, "openfda_device_events", api_key
    )

    if error:
        return f"⚠️ Error searching device events: {error}"

    if not response or not response.get("results"):
        return _format_no_results(
            device, manufacturer, problem, genomics_filter_applied
        )

    results = response["results"]
    # Fall back to the page size when the response carries no total.
    total = (
        response.get("meta", {}).get("results", {}).get("total", len(results))
    )

    # Build output
    output = ["## FDA Device Adverse Event Reports\n"]
    output.extend(
        _format_search_summary(
            device, manufacturer, problem, genomics_filter_applied, total
        )
    )

    # Analyze and format problems
    all_problems, all_device_names, _ = analyze_device_problems(results)
    output.extend(format_top_problems(all_problems, results))

    # Show device distribution if searching by problem
    if problem:
        output.extend(format_device_distribution(all_device_names, results))

    # Display sample reports (at most three)
    output.append(
        f"### Sample Reports (showing {min(len(results), 3)} of {total}):\n"
    )
    for i, result in enumerate(results[:3], 1):
        output.extend(format_device_report_summary(result, i))

    # Add tips
    if genomics_filter_applied:
        output.append(
            "\n💡 **Note**: Results filtered to genomic/diagnostic devices. "
            "Use --no-genomics-only to search all medical devices."
        )

    output.append(f"\n{OPENFDA_DISCLAIMER}")

    return "\n".join(output)
output.append( f"### Sample Reports (showing {min(len(results), 3)} of {total}):\n" ) for i, result in enumerate(results[:3], 1): output.extend(format_device_report_summary(result, i)) # Add tips if genomics_only: output.append( "\n💡 **Note**: Results filtered to genomic/diagnostic devices. " "Use --no-genomics-only to search all medical devices." ) output.append(f"\n{OPENFDA_DISCLAIMER}") return "\n".join(output) def _format_no_results( device: str | None, manufacturer: str | None, problem: str | None, genomics_only: bool, ) -> str: """Format no results message.""" search_desc = [] if device: search_desc.append(f"device '{device}'") if manufacturer: search_desc.append(f"manufacturer '{manufacturer}'") if problem: search_desc.append(f"problem '{problem}'") desc = " and ".join(search_desc) if genomics_only: desc += " (filtered to genomic/diagnostic devices)" return f"No device adverse event reports found for {desc}." async def get_device_event( mdr_report_key: str, api_key: str | None = None ) -> str: """ Get detailed information for a specific device event report. Args: mdr_report_key: MDR report key api_key: Optional OpenFDA API key (overrides OPENFDA_API_KEY env var) Returns: Formatted string with detailed report information """ params = { "search": f'mdr_report_key:"{mdr_report_key}"', "limit": 1, } response, error = await make_openfda_request( OPENFDA_DEVICE_EVENTS_URL, params, "openfda_device_event_detail", api_key, ) if error: return f"⚠️ Error retrieving device event report: {error}" if not response or not response.get("results"): return f"Device event report '{mdr_report_key}' not found." 
result = response["results"][0] # Build detailed output output = format_device_detail_header(result, mdr_report_key) # Device details if devices := result.get("device", []): output.extend(format_detailed_device_info(devices)) # Event narrative if event_desc := result.get("event_description"): output.append("### Event Description") output.append(clean_text(event_desc)) output.append("") # Manufacturer narrative if mfr_narrative := result.get("manufacturer_narrative"): output.append("### Manufacturer's Analysis") output.append(clean_text(mfr_narrative)) output.append("") # Patient information if patient := result.get("patient", []): output.extend(format_patient_details(patient)) # Remedial action if remedial := result.get("remedial_action"): output.append("### Remedial Action") if isinstance(remedial, list): output.append(", ".join(remedial)) else: output.append(remedial) output.append("") output.append(f"\n{OPENFDA_DISCLAIMER}") return "\n".join(output) ``` -------------------------------------------------------------------------------- /docs/troubleshooting.md: -------------------------------------------------------------------------------- ```markdown # Troubleshooting Guide This guide helps you resolve common issues with BioMCP installation, configuration, and usage. ## Installation Issues ### Prerequisites Not Met **macOS:** ```bash # Install uv (recommended) brew install uv # Or using the official installer curl -LsSf https://astral.sh/uv/install.sh | sh # Install Node.js for npx (if needed) brew install node ``` **Linux:** ```bash # Install uv curl -LsSf https://astral.sh/uv/install.sh | sh # Install Node.js curl -fsSL https://deb.nodesource.com/setup_lts.x | sudo -E bash - sudo apt-get install -y nodejs ``` **Windows:** ```powershell # Install uv powershell -c "irm https://astral.sh/uv/install.ps1 | iex" # Install Node.js from https://nodejs.org ``` ### "Command not found" Error After installing BioMCP, if you get "command not found": 1. 
**Restart your terminal** - PATH updates require a new session 2. **Check installation location:** ```bash # For uv tool install ls ~/.local/bin/biomcp # For pip install which biomcp ``` 3. **Add to PATH manually:** ```bash # Add to ~/.bashrc or ~/.zshrc export PATH="$HOME/.local/bin:$PATH" ``` 4. **Reinstall with force:** ```bash uv tool install biomcp --force ``` 5. **Use full path:** ```bash ~/.local/bin/biomcp --version ``` ### Python Version Issues BioMCP requires Python 3.10 or higher: ```bash # Check Python version python --version # If too old, install newer version # macOS brew install python@3.11 # Linux sudo apt update sudo apt install python3.11 # Use pyenv for version management pyenv install 3.11.8 pyenv local 3.11.8 ``` ## Configuration Issues ### API Key Not Working **Environment Variable Not Set:** ```bash # Check if set echo $NCI_API_KEY # Set temporarily export NCI_API_KEY="your-key-here" # Set permanently in ~/.bashrc or ~/.zshrc echo 'export NCI_API_KEY="your-key-here"' >> ~/.bashrc source ~/.bashrc ``` **Wrong API Key Format:** - NCI keys: Should be 36 characters (UUID format) - AlphaGenome: Alphanumeric string - cBioPortal: JWT token format **API Key Permissions:** ```bash # Test NCI API key biomcp health check --verbose # Test specific API curl -H "X-API-KEY: $NCI_API_KEY" \ "https://clinicaltrialsapi.cancer.gov/api/v2/trials?size=1" ``` ### SSL Certificate Errors **Update certificates:** ```bash # Python certificates pip install --upgrade certifi # System certificates (macOS) brew install ca-certificates # System certificates (Linux) sudo apt-get update sudo apt-get install ca-certificates ``` **Corporate proxy issues:** ```bash # Set proxy environment variables export HTTP_PROXY="http://proxy.company.com:8080" export HTTPS_PROXY="http://proxy.company.com:8080" export NO_PROXY="localhost,127.0.0.1" # Configure pip for proxy pip config set global.proxy http://proxy.company.com:8080 ``` ## Search Issues ### No Results Found **1. 
Check gene symbol:** ```bash # Wrong: common names biomcp article search --gene HER2 # ❌ # Correct: official HGNC symbol biomcp article search --gene ERBB2 # ✅ # Find correct symbol biomcp gene get HER2 # Will suggest ERBB2 ``` **2. Too restrictive filters:** ```bash # Too specific - may return nothing biomcp article search --gene BRAF --disease "stage IV melanoma" \ --chemical "dabrafenib and trametinib combination" # Better - broader search biomcp article search --gene BRAF --disease melanoma \ --keyword "dabrafenib trametinib" ``` **3. Check data availability:** ```bash # Test if gene exists in database biomcp gene get YOUR_GENE # Test if disease term is recognized biomcp disease get "your disease term" ``` ### Location Search Not Working Location searches require coordinates: ```bash # Wrong - city name only biomcp trial search --condition cancer --city "New York" # ❌ # Correct - with coordinates biomcp trial search --condition cancer \ --latitude 40.7128 --longitude -74.0060 --distance 50 # ✅ ``` Common coordinates: - New York: 40.7128, -74.0060 - Los Angeles: 34.0522, -118.2437 - Chicago: 41.8781, -87.6298 - Houston: 29.7604, -95.3698 - Boston: 42.3601, -71.0589 ### Preprint Search Issues **Preprints not appearing:** ```bash # Check if preprints are being excluded biomcp article search --gene BRAF --no-preprints # Excludes preprints # Include preprints (default) biomcp article search --gene BRAF # Includes preprints ``` **DOI not found:** ```bash # Ensure correct DOI format biomcp article get "10.1101/2024.01.20.23288905" # bioRxiv format # Not all preprints are indexed immediately # Try searching by title/keywords instead ``` ## Performance Issues ### Slow Searches **1. Reduce result count:** ```bash # Default may be too high biomcp article search --gene TP53 --limit 100 # Slow # Reduce for faster results biomcp article search --gene TP53 --limit 10 # Fast ``` **2. 
Use specific filters:** ```bash # Broad search - slow biomcp trial search --condition cancer # Specific search - faster biomcp trial search --condition "melanoma" --phase PHASE3 \ --status RECRUITING --country "United States" ``` **3. Check API health:** ```bash # See which APIs are slow biomcp health check --verbose # Check specific API biomcp health check --apis-only ``` ### Timeout Errors **Increase timeout for slow networks:** ```bash # Set environment variable export BIOMCP_TIMEOUT=300 # 5 minutes # Or use configuration file echo "timeout: 300" > ~/.biomcp/config.yml ``` **For specific operations:** ```python # In Python scripts import asyncio asyncio.set_event_loop_policy(asyncio.WindowsSelectorEventLoopPolicy()) ``` ### Memory Issues **Large result sets:** ```bash # Process in batches for i in {1..10}; do biomcp article search --gene BRCA1 --page $i --limit 100 done # Use streaming where available biomcp article search --gene TP53 --format jsonl | \ while read line; do echo "$line" | jq '.pmid' done ``` ## MCP Server Issues ### Testing Server Connectivity **1. Test with MCP Inspector:** ```bash npx @modelcontextprotocol/inspector uv run --with biomcp-python biomcp run ``` Open http://127.0.0.1:6274 and verify: - Tools list loads - Can invoke a simple tool like `gene_getter` **2. Test with curl (HTTP mode):** ```bash # Start server in HTTP mode biomcp run --mode http --port 8000 # Test health endpoint curl http://localhost:8000/health # Test MCP endpoint curl -X POST http://localhost:8000/mcp \ -H "Content-Type: application/json" \ -d '{"method": "tools/list"}' ``` ### Claude Desktop Integration Issues **Server not appearing:** 1. Check configuration file location: - macOS: `~/Library/Application Support/Claude/claude_desktop_config.json` - Windows: `%APPDATA%\Claude\claude_desktop_config.json` 2. Validate JSON syntax: ```bash # macOS cat ~/Library/Application\ Support/Claude/claude_desktop_config.json | jq . ``` 3. 
Check server starts correctly: ```bash # Test the exact command from config uv run --with biomcp-python biomcp run ``` **Server crashes:** Check logs: ```bash # Enable debug logging export BIOMCP_LOG_LEVEL=DEBUG uv run --with biomcp-python biomcp run ``` Common fixes: - Update to latest version: `uv tool install biomcp --force` - Clear cache: `rm -rf ~/.biomcp/cache` - Check port conflicts: `lsof -i :8000` ## Data Quality Issues ### Outdated Results **Check data freshness:** ```bash # See when databases were last updated biomcp health check --verbose | grep "Last updated" ``` **Clear cache if needed:** ```bash # Remove cached results rm -rf ~/.biomcp/cache # Or set cache TTL export BIOMCP_CACHE_TTL=900 # 15 minutes ``` ### Missing Annotations **PubTator3 annotations missing:** - Some newer articles may not be fully annotated yet - Try searching by PMID directly - Check if article is indexed: search by title **Variant annotations incomplete:** - Not all variants have all annotation types - Rare variants may lack population frequencies - Novel variants won't have ClinVar data ## Error Messages ### Common Error Codes **HTTP 429 - Rate Limit Exceeded:** ```bash # Add delay between requests biomcp article search --gene BRAF --delay 1000 # 1 second # Or reduce parallel requests export BIOMCP_MAX_CONCURRENT=2 ``` **HTTP 404 - Not Found:** - Check identifier format (PMID, NCT ID, etc.) - Verify record exists in source database - Try alternative identifiers **HTTP 500 - Server Error:** - External API may be down - Check status: `biomcp health check` - Try again later ### Debugging **Enable verbose logging:** ```bash # Set log level export BIOMCP_LOG_LEVEL=DEBUG # Run with verbose output biomcp article search --gene BRAF --verbose # Check log files tail -f ~/.biomcp/logs/biomcp.log ``` **Report bugs:** Include when reporting issues: 1. BioMCP version: `biomcp --version` 2. Full error message and stack trace 3. Command that caused the error 4. 
class TestTCGAIntegration:
    """Integration tests for TCGA/GDC API.

    These tests call the live service; when it returns no data the test is
    skipped rather than silently passing.
    """

    @pytest.mark.asyncio
    @pytest.mark.integration
    async def test_braf_v600e_variant(self):
        """Test fetching BRAF V600E data from TCGA."""
        client = TCGAClient()

        # Try different formats; the first one that yields data is checked.
        variants_to_test = [
            "BRAF V600E",  # Gene AA change format that TCGA supports
            "chr7:g.140453136A>T",
            "7:g.140453136A>T",
        ]

        for variant in variants_to_test:
            result = await client.get_variant_data(variant)
            if result:
                # BRAF V600E is common in melanoma and thyroid cancer
                assert result.tumor_types is not None
                assert len(result.tumor_types) > 0
                # Should have affected cases if data found
                if result.affected_cases:
                    assert result.affected_cases > 0
                break
        else:
            # Note: TCGA might not have data for all variants
            pytest.skip("TCGA API did not return data for BRAF V600E variants")

    @pytest.mark.asyncio
    @pytest.mark.integration
    async def test_tp53_variant(self):
        """Test fetching TP53 variant data from TCGA."""
        client = TCGAClient()

        # TP53 R273H - common tumor suppressor mutation
        result = await client.get_variant_data("chr17:g.7577120G>A")

        if not result:
            # Skip instead of passing with nothing verified.
            pytest.skip("TCGA API did not return data for TP53 R273H")

        # TP53 mutations are very common in cancer
        assert result.tumor_types is not None
        assert len(result.tumor_types) > 0

    @pytest.mark.asyncio
    @pytest.mark.integration
    async def test_nonexistent_variant(self):
        """Test TCGA response for non-existent variant."""
        client = TCGAClient()

        # Made-up variant that shouldn't exist
        result = await client.get_variant_data("chr99:g.999999999A>T")

        assert result is None


class TestThousandGenomesIntegration:
    """Integration tests for 1000 Genomes via Ensembl REST API."""

    @pytest.mark.asyncio
    @pytest.mark.integration
    async def test_common_variant_with_rsid(self):
        """Test fetching common variant data by rsID."""
        client = ThousandGenomesClient()

        # rs113488022 is BRAF V600E
        result = await client.get_variant_data("rs113488022")

        if not result:
            # Skip instead of passing with nothing verified.
            pytest.skip("Ensembl API did not return data for rs113488022")

        # This is a rare variant, so MAF should be low or None
        if result.global_maf is not None:
            assert result.global_maf < 0.01  # Less than 1%
        # Consequence information might not be available for all variants
        # Just verify the data structure is correct
        assert hasattr(result, "most_severe_consequence")

    @pytest.mark.asyncio
    @pytest.mark.integration
    async def test_variant_population_frequencies(self):
        """Test population frequency data retrieval."""
        client = ThousandGenomesClient()

        # Use a more common variant for testing population frequencies
        # rs1800734 - common variant in MLH1 promoter
        result = await client.get_variant_data("rs1800734")

        if not result:
            # Skip instead of passing with nothing verified.
            pytest.skip("Ensembl API did not return data for rs1800734")

        # Should have at least global MAF
        assert result.global_maf is not None
        assert 0 <= result.global_maf <= 1

        # Check that we get population-specific frequencies
        pop_freqs = [
            result.afr_maf,
            result.amr_maf,
            result.eas_maf,
            result.eur_maf,
            result.sas_maf,
        ]

        # At least some populations should have data
        non_null_freqs = [f for f in pop_freqs if f is not None]
        assert len(non_null_freqs) > 0

    @pytest.mark.asyncio
    @pytest.mark.integration
    async def test_invalid_variant_id(self):
        """Test 1000 Genomes response for invalid variant."""
        client = ThousandGenomesClient()

        # Invalid rsID
        result = await client.get_variant_data("rs999999999999")

        assert result is None


class TestCBioPortalIntegration:
    """Integration tests for cBioPortal API."""

    @pytest.mark.asyncio
    @pytest.mark.integration
    async def test_braf_v600e_variant(self):
        """Test fetching BRAF V600E data from cBioPortal."""
        client = CBioPortalExternalClient()

        result = await client.get_variant_data("BRAF V600E")

        if not result:
            pytest.skip("cBioPortal API did not return data for BRAF V600E")

        # BRAF V600E is common in melanoma and other cancers
        assert result.total_cases is not None
        assert result.total_cases > 0
        assert len(result.studies) > 0
        # Should have data from various studies
        print(
            f"Found {result.total_cases} cases in {len(result.studies)} studies: {result.studies}"
        )

        # Check enhanced fields
        assert result.cancer_type_distribution is not None
        assert len(result.cancer_type_distribution) > 0
        print(
            f"Cancer types: {list(result.cancer_type_distribution.keys())}"
        )

        assert result.mutation_types is not None
        assert "Missense_Mutation" in result.mutation_types

        assert result.mean_vaf is not None
        print(f"Mean VAF: {result.mean_vaf}")

    @pytest.mark.asyncio
    @pytest.mark.integration
    async def test_kras_g12d_variant(self):
        """Test fetching KRAS G12D data from cBioPortal."""
        client = CBioPortalExternalClient()

        result = await client.get_variant_data("KRAS G12D")

        if not result:
            pytest.skip("cBioPortal API did not return data for KRAS G12D")

        # KRAS G12D is a common mutation in multiple cancer types
        assert result.total_cases is not None
        assert result.total_cases > 0
        assert len(result.studies) > 0

    @pytest.mark.asyncio
    @pytest.mark.integration
    async def test_invalid_variant(self):
        """Test cBioPortal response for invalid variant."""
        client = CBioPortalExternalClient()

        # Invalid gene name
        result = await client.get_variant_data("FAKEGENE V600E")

        assert result is None


class TestExternalVariantAggregatorIntegration:
    """Integration tests for the external variant aggregator."""

    @pytest.mark.asyncio
    @pytest.mark.integration
    async def test_aggregate_all_sources(self):
        """Test aggregating data from all available sources."""
        aggregator = ExternalVariantAggregator()

        # Use rs1045642 which is a common variant that should have 1000 Genomes data
        # Also provide variant data for cBioPortal
        variant_data = {
            "cadd": {"gene": {"genename": "ABCB1"}},
            "docm": {"aa_change": "p.I1145I"},
        }

        result = await aggregator.get_enhanced_annotations(
            "rs1045642",
            include_tcga=True,
            include_1000g=True,
            include_cbioportal=True,
            variant_data=variant_data,
        )

        assert result.variant_id == "rs1045642"

        # Check which sources returned data
        sources_with_data = []
        if result.tcga:
            sources_with_data.append("tcga")
        if result.thousand_genomes:
            sources_with_data.append("1000g")
        if result.cbioportal:
            sources_with_data.append("cbioportal")

        # This common variant should have at least 1000 Genomes data
        assert len(sources_with_data) > 0

        # Specifically, it should have 1000 Genomes data
        assert result.thousand_genomes is not None

        # No errors should be reported for successful queries
        # (though some sources might not have data, which is different from errors)
        assert len(result.error_sources) == 0

    @pytest.mark.asyncio
    @pytest.mark.integration
    async def test_selective_source_inclusion(self):
        """Test including only specific sources."""
        aggregator = ExternalVariantAggregator()

        # Only request 1000 Genomes data
        result = await aggregator.get_enhanced_annotations(
            "rs1800734",  # Common variant
            include_tcga=False,
            include_1000g=True,
        )

        # TCGA was excluded, so it must not be populated
        assert result.tcga is None
        # 1000 Genomes might have data for this common variant
        # (but it's okay if it doesn't)

    @pytest.mark.asyncio
    @pytest.mark.integration
    async def test_error_handling_resilience(self):
        """Test that aggregator handles individual source failures gracefully."""
        aggregator = ExternalVariantAggregator()

        # Use an invalid variant format that might cause errors
        result = await aggregator.get_enhanced_annotations(
            "INVALID_VARIANT_FORMAT_12345",
            include_tcga=True,
            include_1000g=True,
        )

        # Should still return a result even if all sources fail
        assert result is not None
        assert result.variant_id == "INVALID_VARIANT_FORMAT_12345"

        # Sources should return None or be in error_sources
        assert result.tcga is None
        assert result.thousand_genomes is None
``` **Expected tool usage**: `gene_getter("7157")` → Returns TP53 information ### Gene Context for Research ``` "I need to understand the KRAS gene before searching for mutations" "What type of protein does BRAF encode?" "Give me the official name and aliases for MYC" ``` ## Disease Information Retrieval ### Basic Disease Lookup ``` "What is GIST?" "Tell me about melanoma" "Define non-small cell lung cancer" "What is Erdheim-Chester disease?" ``` **Expected tool usage**: `disease_getter("GIST")` → Returns definition, synonyms, ontology IDs ### Disease by Ontology ID ``` "Look up disease MONDO:0018076" "What is DOID:1909?" ``` **Expected tool usage**: `disease_getter("MONDO:0018076")` → Returns disease information ### Disease Synonyms for Research ``` "What are all the names for gastrointestinal stromal tumor?" "Find synonyms for NSCLC" "What other terms are used for melanoma?" ``` ## Variant Information Retrieval (MyVariant.info) MyVariant.info is part of the BioThings suite and provides comprehensive variant annotations. BioMCP has extensive integration with specialized features: ### Basic Variant Lookup ``` "Get information about rs7412" "What is the BRAF V600E variant?" "Look up variant chr7:140453136-140453136" ``` **Expected tool usage**: `variant_getter("rs7412")` → Returns variant annotations with external database links ### Variant Search with Filters ``` "Find pathogenic variants in TP53" "Search for BRCA1 variants with high impact" "Get all loss-of-function variants in KRAS" ``` **Expected tool usage**: `variant_searcher(gene="TP53", significance="pathogenic")` → Returns filtered variant list ### Variant with Cancer Context ``` "What cancer types have BRAF V600E mutations?" 
"Get TCGA data for TP53 R273H" ``` **Expected tool usage**: Variant tools automatically integrate cBioPortal, TCGA, and 1000 Genomes data when available ## Drug Information Retrieval (MyChem.info) MyChem.info is part of the BioThings suite and provides comprehensive drug/chemical information. ### Basic Drug Lookup ``` "What is imatinib?" "Tell me about aspirin" "Get information on pembrolizumab" "What does metformin do?" ``` **Expected tool usage**: `drug_getter("imatinib")` → Returns drug information with database links ### Drug by ID ``` "Look up DrugBank ID DB00619" "What is CHEMBL941?" "Get details for PubChem CID 5291" ``` **Expected tool usage**: `drug_getter("DB00619")` → Returns drug details by identifier ### Drug Properties and Mechanism ``` "What is the mechanism of action of imatinib?" "Find the chemical formula for aspirin" "What are the trade names for adalimumab?" "How does pembrolizumab work?" ``` **Expected tool usage**: `drug_getter("pembrolizumab")` → Returns mechanism, indications, and properties ## Integrated Research Workflows ### Variant Analysis with Gene Context ``` "Analyze the BRAF V600E mutation - first tell me about the gene, then find pathogenic variants" ``` **Expected tool sequence**: 1. `think(thought="Analyzing BRAF V600E mutation", thoughtNumber=1)` 2. `gene_getter("BRAF")` → Gene context 3. `variant_searcher(gene="BRAF", hgvsp="V600E", significance="pathogenic")` → Variant details ### Clinical Trial Search with Disease Expansion ``` "Find clinical trials for GIST patients" "Search for trials treating gastrointestinal stromal tumors" ``` **Expected tool usage**: - `trial_searcher(conditions=["GIST"], expand_synonyms=True)` - Automatically searches for: GIST OR "gastrointestinal stromal tumor" OR "GI stromal tumor" ### Comprehensive Gene-Disease Research ``` "I'm researching EGFR mutations in lung cancer. Start with the gene, then the disease, then find relevant trials" ``` **Expected tool sequence**: 1. 
`think(thought="Researching EGFR in lung cancer", thoughtNumber=1)` 2. `gene_getter("EGFR")` → Gene information 3. `disease_getter("lung cancer")` → Disease context and synonyms 4. `trial_searcher(conditions=["lung cancer"], interventions=["EGFR inhibitor"])` → Trials with synonym expansion ### Multi-Gene Analysis ``` "Compare TP53, BRAF, and KRAS genes" "Tell me about the RAS family genes: KRAS, NRAS, HRAS" ``` **Expected tool usage**: Multiple `gene_getter()` calls for each gene ## Advanced Use Cases ### Gene Alias Resolution ``` "What is the official name for the p53 gene?" "Is TRP53 the same as TP53?" ``` **Expected tool usage**: `gene_getter("p53")` → Will resolve to TP53 ### Disease Name Disambiguation ``` "Is GIST the same as gastrointestinal stromal tumor?" "What's the MONDO ID for melanoma?" ``` **Expected tool usage**: `disease_getter("GIST")` → Shows all synonyms and IDs ### Trial Search Without Synonym Expansion ``` "Find trials specifically mentioning 'GIST' not other names" ``` **Expected tool usage**: `trial_searcher(conditions=["GIST"], expand_synonyms=False)` ### Integrated Literature and Gene Search ``` "Find recent papers about TP53 mutations - first tell me about the gene" ``` **Expected tool sequence**: 1. `gene_getter("TP53")` → Gene context 2. `article_searcher(genes=["TP53"], keywords=["mutation"])` → Literature ### Drug-Target Research ``` "I'm researching imatinib for CML treatment. Get drug info, then find trials" "What targets does pembrolizumab hit? Then find related articles" ``` **Expected tool sequence**: 1. `think(thought="Researching imatinib for CML", thoughtNumber=1)` 2. `drug_getter("imatinib")` → Drug information and mechanism 3. `trial_searcher(interventions=["imatinib"], conditions=["chronic myeloid leukemia"])` ## Tips for AI Assistants 1. **Always use think() first** for complex biomedical queries 2. **Gene context helps interpretation**: Get gene info before analyzing variants 3. 
**Disease synonyms improve search**: Use expand_synonyms=True (default) for comprehensive results 4. **Drug mechanisms matter**: Get drug info before searching trials to understand targets 5. **Real-time data**: All BioThings data is fetched live, ensuring current information 6. **Combine tools**: Gene + disease + variant + drug tools work together for comprehensive analysis ## Common Patterns ### Pattern 1: Gene → Variant → Clinical Impact ``` gene_getter("BRAF") → variant_searcher(gene="BRAF", significance="pathogenic") → article_searcher(genes=["BRAF"], diseases=["melanoma"]) ``` ### Pattern 2: Disease → Trials → Locations ``` disease_getter("NSCLC") → trial_searcher(conditions=["NSCLC"], expand_synonyms=True) → trial_locations_getter(nct_id="NCT...") ``` ### Pattern 3: Multi-Gene Pathway Analysis ``` gene_getter("EGFR") → gene_getter("KRAS") → gene_getter("BRAF") → article_searcher(genes=["EGFR", "KRAS", "BRAF"], keywords=["pathway"]) ``` ## Unified Search with BioThings Domains BioMCP's unified search now supports gene, drug, and disease domains alongside articles, trials, and variants: ### Domain-Specific Search ``` "Search for BRAF in the gene domain" "Find imatinib in drugs" "Look up melanoma in diseases" ``` **Expected tool usage**: - `search(domain="gene", keywords=["BRAF"])` - `search(domain="drug", keywords=["imatinib"])` - `search(domain="disease", keywords=["melanoma"])` ### Unified Query Language with BioThings ``` "genes.symbol:BRAF AND genes.type:protein-coding" "drugs.tradename:gleevec" "diseases.name:melanoma OR diseases.synonym:malignant melanoma" ``` **Expected tool usage**: Query parser automatically routes to appropriate domains ### Cross-Domain Gene Searches ``` "gene:BRAF" # Searches articles, variants, genes, and trials "Search everything about TP53" ``` **Expected behavior**: - Gene queries trigger searches across multiple domains - Results include gene info, variants, articles, and related trials ### Cross-Domain Disease Searches ``` 
"disease:melanoma" # Searches articles, trials, and diseases "Find all information about NSCLC" ``` **Expected behavior**: - Disease queries search articles, trials, and disease databases - Disease synonyms are automatically expanded in trial searches ### Combined Domain Queries ``` "gene:BRAF AND disease:melanoma" "drugs.indication:leukemia AND trials.phase:3" "genes.symbol:EGFR AND articles.year:>2023" ``` ### Unified Fetch ``` "Fetch BRAF from gene domain" "Get imatinib details from drugs" "Retrieve melanoma information from diseases" ``` **Expected tool usage**: - `fetch(id="BRAF", domain="gene")` - `fetch(id="imatinib", domain="drug")` - `fetch(id="melanoma", domain="disease")` ## Error Handling If a gene/disease is not found: - Check for typos or alternative names - Try searching with partial names - Use official symbols for genes (e.g., "TP53" not "p53 gene") - For diseases, try both common and medical names ``` -------------------------------------------------------------------------------- /src/biomcp/constants.py: -------------------------------------------------------------------------------- ```python """ Central constants file for BioMCP. This module contains all constants used throughout the BioMCP codebase, including API URLs, default values, limits, and domain configurations. 
""" # ============================================================================ # API Base URLs # ============================================================================ # PubTator3 API # https://www.ncbi.nlm.nih.gov/research/pubtator3/api PUBTATOR3_BASE_URL = "https://www.ncbi.nlm.nih.gov/research/pubtator3-api" PUBTATOR3_SEARCH_URL = f"{PUBTATOR3_BASE_URL}/search/" PUBTATOR3_FULLTEXT_URL = f"{PUBTATOR3_BASE_URL}/publications/export/biocjson" PUBTATOR3_AUTOCOMPLETE_URL = f"{PUBTATOR3_BASE_URL}/entity/autocomplete/" # ClinicalTrials.gov API # https://clinicaltrials.gov/data-api/api CLINICAL_TRIALS_BASE_URL = "https://clinicaltrials.gov/api/v2/studies" CLINICAL_TRIALS_STUDY_URL = "https://clinicaltrials.gov/study/" # NCI Clinical Trials Search API # https://clinicaltrialsapi.cancer.gov/api/v2 NCI_CTS_BASE_URL = "https://clinicaltrialsapi.cancer.gov/api/v2" NCI_TRIALS_URL = f"{NCI_CTS_BASE_URL}/trials" NCI_ORGANIZATIONS_URL = f"{NCI_CTS_BASE_URL}/organizations" NCI_DISEASES_URL = f"{NCI_CTS_BASE_URL}/diseases" NCI_INTERVENTIONS_URL = f"{NCI_CTS_BASE_URL}/interventions" NCI_BIOMARKERS_URL = f"{NCI_CTS_BASE_URL}/biomarkers" NCI_API_KEY_ENV = "NCI_API_KEY" # MyVariant.info API # https://docs.myvariant.info/ MYVARIANT_BASE_URL = "https://myvariant.info/v1" MYVARIANT_QUERY_URL = f"{MYVARIANT_BASE_URL}/query" MYVARIANT_GET_URL = f"{MYVARIANT_BASE_URL}/variant" # Preprint Server APIs BIORXIV_BASE_URL = "https://api.biorxiv.org/details/biorxiv" MEDRXIV_BASE_URL = "https://api.biorxiv.org/details/medrxiv" EUROPE_PMC_BASE_URL = "https://www.ebi.ac.uk/europepmc/webservices/rest/search" # External Variant APIs GDC_BASE_URL = "https://api.gdc.cancer.gov" GDC_SSMS_ENDPOINT_URL = f"{GDC_BASE_URL}/ssms" # Simple Somatic Mutations GDC_SSM_OCCURRENCES_URL = f"{GDC_BASE_URL}/ssm_occurrences" ENSEMBL_REST_BASE_URL = "https://rest.ensembl.org" ENSEMBL_VARIATION_URL = f"{ENSEMBL_REST_BASE_URL}/variation/human" CBIOPORTAL_BASE_URL = "https://www.cbioportal.org/api" # External 
Resource URLs PUBMED_BASE_URL = "https://pubmed.ncbi.nlm.nih.gov/" PMC_BASE_URL = "https://www.ncbi.nlm.nih.gov/pmc/articles/" DOI_BASE_URL = "https://doi.org/" DBSNP_BASE_URL = "https://www.ncbi.nlm.nih.gov/snp/" CLINVAR_BASE_URL = "https://www.ncbi.nlm.nih.gov/clinvar/variation/" COSMIC_BASE_URL = "https://cancer.sanger.ac.uk/cosmic/mutation/overview?id=" CIVIC_BASE_URL = "https://civicdb.org/variants/" ENSEMBL_VARIANT_BASE_URL = ( "https://ensembl.org/Homo_sapiens/Variation/Explore?v=" ) GENENAMES_BASE_URL = ( "https://www.genenames.org/data/gene-symbol-report/#!/symbol/" ) UCSC_GENOME_BROWSER_URL = "https://genome.ucsc.edu/cgi-bin/hgTracks?db=hg19&" # ============================================================================ # Default Values and Limits # ============================================================================ # Caching DEFAULT_CACHE_TIMEOUT = 60 * 60 * 24 * 7 # 1 week in seconds # Pagination SYSTEM_PAGE_SIZE = ( 10 # Default page size for all searches (reduced for token efficiency) ) DEFAULT_PAGE_SIZE = 10 # Default page size for unified search MIN_PAGE_SIZE = 1 MAX_PAGE_SIZE = 100 DEFAULT_PAGE_NUMBER = 1 # Search limits MAX_RESULTS_PER_DOMAIN_DEFAULT = ( 10 # Default max results per domain in unified search ) ESTIMATED_ADDITIONAL_RESULTS = ( 100 # Estimate for additional results when full page returned ) DEFAULT_AUTOCOMPLETE_LIMIT = 1 MAX_AUTOCOMPLETE_LIMIT = 100 # Text display MAX_WIDTH = 72 # Maximum width for text wrapping in console output SNIPPET_LENGTH = 200 # Maximum length for text snippets in search results # Genome Assembly DEFAULT_ASSEMBLY = "hg19" # Default genome assembly for MyVariant.info API # Rate Limiting DEFAULT_RATE_LIMIT_PER_SECOND = 10.0 DEFAULT_BURST_SIZE = 20 SLIDING_WINDOW_MINUTE_LIMIT = 60 SLIDING_WINDOW_HOUR_LIMIT = 1000 # Retry Configuration DEFAULT_MAX_RETRY_ATTEMPTS = 3 DEFAULT_INITIAL_RETRY_DELAY = 1.0 DEFAULT_MAX_RETRY_DELAY = 60.0 DEFAULT_EXPONENTIAL_BASE = 2.0 AGGRESSIVE_MAX_RETRY_ATTEMPTS = 5 
AGGRESSIVE_INITIAL_RETRY_DELAY = 2.0
AGGRESSIVE_MAX_RETRY_DELAY = 30.0

# Circuit Breaker Configuration
DEFAULT_FAILURE_THRESHOLD = 10
DEFAULT_RECOVERY_TIMEOUT = 30.0
DEFAULT_SUCCESS_THRESHOLD = 3

# Metrics Configuration
MAX_METRIC_SAMPLES = 1000
METRIC_PERCENTILE_50 = 0.50
METRIC_PERCENTILE_95 = 0.95
METRIC_PERCENTILE_99 = 0.99
METRIC_JITTER_RANGE = 0.1  # 10% jitter

# HTTP Client Configuration
HTTP_TIMEOUT_SECONDS = 120.0
HTTP_ERROR_CODE_NETWORK = 599
HTTP_ERROR_CODE_UNSUPPORTED_METHOD = 405

# Batching and Pagination Configuration
DEFAULT_BATCH_SIZE = 10
DEFAULT_BATCH_TIMEOUT = 0.1
CBIOPORTAL_BATCH_SIZE = 5
EUROPE_PMC_PAGE_SIZE = 25
BIORXIV_MAX_PAGES = 3
BIORXIV_RESULTS_PER_PAGE = 30
BIORXIV_DEFAULT_DAYS_BACK = 365

# Prefetching Configuration
PREFETCH_TOP_GENES = 5
PREFETCH_TOP_DISEASES = 3
PREFETCH_TOP_CHEMICALS = 3
PREFETCH_TIMEOUT = 2.0

# Cache Configuration
REQUEST_CACHE_MAX_SIZE = 1000
CACHE_KEY_SAMPLE_SIZE = 100

# Connection Pool Configuration
CONNECTION_POOL_MAX_KEEPALIVE = 20
CONNECTION_POOL_MAX_CONNECTIONS = 100
CONNECTION_POOL_KEEPALIVE_EXPIRY = 30

# ============================================================================
# Domain Configuration
# ============================================================================

# Domain mappings for unified search (singular -> plural).
# The three tables below are derived from this mapping, so its insertion
# order also fixes their order: core domains, NCI domains, OpenFDA domains.
DOMAIN_TO_PLURAL = {
    "article": "articles",
    "trial": "trials",
    "variant": "variants",
    "gene": "genes",
    "drug": "drugs",
    "disease": "diseases",
    "nci_organization": "nci_organizations",
    "nci_intervention": "nci_interventions",
    "nci_biomarker": "nci_biomarkers",
    "nci_disease": "nci_diseases",
    # OpenFDA domains
    "fda_adverse": "fda_adverse_events",
    "fda_label": "fda_labels",
    "fda_device": "fda_device_events",
    "fda_approval": "fda_approvals",
    "fda_recall": "fda_recalls",
    "fda_shortage": "fda_shortages",
}

# Reverse lookup (plural -> singular)
PLURAL_TO_DOMAIN = {
    plural: singular for singular, plural in DOMAIN_TO_PLURAL.items()
}

# Valid domains for search
VALID_DOMAINS = list(DOMAIN_TO_PLURAL)
VALID_DOMAINS_PLURAL = list(DOMAIN_TO_PLURAL.values())

# Trial detail sections
TRIAL_DETAIL_SECTIONS = [
    "protocol",
    "locations",
    "outcomes",
    "references",
    "all",
    "full",
]

# ============================================================================
# Field Names and Enums
# ============================================================================

# Autocomplete concept types
AUTOCOMPLETE_CONCEPTS = ["variant", "chemical", "disease", "gene"]

# HTTP methods
VALID_HTTP_METHODS = ["GET", "POST"]

# Trial search defaults
DEFAULT_TRIAL_FORMAT = "csv"
DEFAULT_TRIAL_MARKUP = "markdown"

# ============================================================================
# Error Messages
# ============================================================================

ERROR_THOUGHT_NUMBER_MIN = "Error: thoughtNumber must be >= 1"
ERROR_TOTAL_THOUGHTS_MIN = "Error: totalThoughts must be >= 1"
ERROR_DOMAIN_REQUIRED = "Either 'query' or 'domain' parameter must be provided"
ERROR_THOUGHT_REQUIRED = (
    "'thought' parameter is required when domain='thinking'"
)
ERROR_THOUGHT_NUMBER_REQUIRED = (
    "'thoughtNumber' parameter is required when domain='thinking'"
)
ERROR_TOTAL_THOUGHTS_REQUIRED = ( "'totalThoughts' parameter is required when domain='thinking'" ) ERROR_NEXT_THOUGHT_REQUIRED = ( "'nextThoughtNeeded' parameter is required when domain='thinking'" ) # ============================================================================ # API Response Formatting # ============================================================================ # Default values for missing data DEFAULT_TITLE = "Untitled" DEFAULT_GENE = "Unknown" DEFAULT_SIGNIFICANCE = "Unknown" # Metadata field names METADATA_YEAR = "year" METADATA_JOURNAL = "journal" METADATA_AUTHORS = "authors" METADATA_STATUS = "status" METADATA_PHASE = "phase" METADATA_START_DATE = "start_date" METADATA_COMPLETION_DATE = "primary_completion_date" METADATA_GENE = "gene" METADATA_RSID = "rsid" METADATA_SIGNIFICANCE = "clinical_significance" METADATA_CONSEQUENCE = "consequence" METADATA_SOURCE = "source" # Result field names RESULT_ID = "id" RESULT_TITLE = "title" RESULT_SNIPPET = "snippet" # Internal use for domain handlers RESULT_TEXT = "text" # OpenAI MCP compliant field name RESULT_URL = "url" RESULT_METADATA = "metadata" RESULT_DATA = "data" RESULT_PAGE = "page" RESULT_PAGE_SIZE = "page_size" RESULT_TOTAL = "total" RESULT_NEXT_PAGE = "next_page" ``` -------------------------------------------------------------------------------- /docs/backend-services-reference/05-nci-cts-api.md: -------------------------------------------------------------------------------- ```markdown # NCI Clinical Trials Search API Reference The National Cancer Institute's Clinical Trials Search (CTS) API provides advanced search capabilities for cancer clinical trials with enhanced filtering options beyond ClinicalTrials.gov. 
## Overview

The NCI CTS API offers:

- Advanced biomarker and mutation filtering
- Comprehensive organization database
- Intervention and drug vocabularies
- Disease terminology with NCI Thesaurus integration
- Prior therapy and eligibility criteria

**Base URL:** `https://clinicaltrialsapi.cancer.gov/api/v2/`

## Authentication

An API key is required for all endpoints.

### Obtaining an API Key

1. Visit [https://clinicaltrialsapi.cancer.gov/](https://clinicaltrialsapi.cancer.gov/)
2. Click "Get API Key"
3. Complete registration
4. The key is emailed immediately

### Using the API Key

Include it in the request headers:

```
X-API-KEY: your-api-key-here
```

Or pass it as a query parameter:

```
?api_key=your-api-key-here
```

## Core Endpoints

### 1. Trial Search

```
GET /trials
```

Search for clinical trials with advanced filtering.

#### Parameters

**Basic Search:**

- `keyword`: General text search
- `nct_id`: Specific NCT identifiers
- `diseases`: Disease/condition names
- `interventions`: Treatment names

**Advanced Filters:**

- `biomarkers`: Required biomarkers/mutations
- `prior_therapy_required`: true/false
- `accepts_brain_mets`: true/false
- `min_age`: Minimum age in years
- `max_age`: Maximum age in years

**Pagination:**

- `size`: Results per page (max 50)
- `from`: Starting index (offset)

#### Example Request

`-G` makes curl append the data to the URL as a query string (a plain `-d` would send it as a request body), and `--data-urlencode` encodes values such as the space in `BRAF V600E`.

```bash
curl -G "https://clinicaltrialsapi.cancer.gov/api/v2/trials" \
  -H "X-API-KEY: your-key" \
  --data-urlencode "diseases=melanoma" \
  --data-urlencode "biomarkers=BRAF V600E" \
  --data-urlencode "accepts_brain_mets=true" \
  --data-urlencode "size=10"
```

#### Response Format

```json
{
  "total": 42,
  "trials": [
    {
      "nct_id": "NCT04280705",
      "brief_title": "BRAF/MEK Inhibitor Combination",
      "current_trial_status": "Active",
      "phase": "Phase II",
      "biomarker_eligibility": [
        {
          "gene": "BRAF",
          "variant": "V600E",
          "required": true
        }
      ],
      "sites": [...]
    }
  ]
}
```

### 2. Trial Details

```
GET /trials/{nct_id}
```

Get comprehensive information about a specific trial.
#### Example Request

```bash
curl -X GET "https://clinicaltrialsapi.cancer.gov/api/v2/trials/NCT04280705" \
  -H "X-API-KEY: your-key"
```

### 3. Organization Search

```
GET /organizations
```

Search for cancer research organizations and treatment centers.

#### Parameters

- `name`: Organization name
- `org_city`: City location
- `org_state_or_province`: State/province
- `org_country`: Country
- `org_type`: Type (e.g., "NCI-designated", "academic")

**Important:** Always use city AND state together to avoid Elasticsearch errors.

#### Example Request

```bash
curl -G "https://clinicaltrialsapi.cancer.gov/api/v2/organizations" \
  -H "X-API-KEY: your-key" \
  --data-urlencode "org_city=Houston" \
  --data-urlencode "org_state_or_province=TX"
```

### 4. Organization Details

```
GET /organizations/{org_id}
```

Get details about a specific organization.

### 5. Intervention Search

```
GET /interventions
```

Search for drugs, devices, and procedures used in trials.

#### Parameters

- `name`: Intervention name
- `type`: Drug, Device, Procedure, etc.
- `synonyms`: Include synonym matches (default: true)

#### Example Request

```bash
curl -G "https://clinicaltrialsapi.cancer.gov/api/v2/interventions" \
  -H "X-API-KEY: your-key" \
  --data-urlencode "name=pembrolizumab" \
  --data-urlencode "type=Drug"
```

### 6. Intervention Details

```
GET /interventions/{intervention_id}
```

Get details about a specific intervention.

### 7. Biomarker Search

```
GET /biomarkers
```

Search for biomarkers used in trial eligibility criteria.

#### Parameters

- `name`: Biomarker name
- `type`: mutation, expression, etc.
- `gene`: Associated gene symbol

### 8. Disease Search

```
GET /diseases
```

Search NCI's controlled vocabulary of cancer conditions.
#### Parameters - `name`: Disease name - `include_synonyms`: Include synonym matches - `category`: Disease category ## Advanced Features ### Biomarker-Based Trial Search Find trials requiring specific mutations: ```python params = { "diseases": "non-small cell lung cancer", "biomarkers": ["EGFR L858R", "EGFR exon 19 deletion"], "prior_therapy_required": False, "accepts_brain_mets": True } response = requests.get( "https://clinicaltrialsapi.cancer.gov/api/v2/trials", headers={"X-API-KEY": api_key}, params=params ) ``` ### Complex Eligibility Queries ```python # Find trials with specific eligibility params = { "diseases": "melanoma", "biomarkers": "BRAF V600E", "min_age": 18, "max_age": 75, "prior_therapy": "vemurafenib", # Exclude if prior vemurafenib "performance_status": "0-1" # ECOG 0 or 1 } ``` ### Organization Network Analysis ```python # Find all NCI-designated centers in a region params = { "org_type": "NCI-designated", "org_state_or_province": ["CA", "OR", "WA"] # West Coast } orgs = requests.get( "https://clinicaltrialsapi.cancer.gov/api/v2/organizations", headers={"X-API-KEY": api_key}, params=params ) # Get trials at each center for org in orgs.json()["organizations"]: trials = requests.get( f"https://clinicaltrialsapi.cancer.gov/api/v2/trials", headers={"X-API-KEY": api_key}, params={"site_org_id": org["id"]} ) ``` ## Data Models ### Trial Object ```json { "nct_id": "NCT04280705", "brief_title": "Study Title", "official_title": "Full Protocol Title", "current_trial_status": "Active", "phase": "Phase II", "study_type": "Interventional", "primary_purpose": "Treatment", "diseases": [ { "name": "Melanoma", "nci_thesaurus_id": "C0025202" } ], "biomarker_eligibility": [ { "gene": "BRAF", "variant": "V600E", "required": true, "inclusion": true } ], "arms": [...], "sites": [...] 
} ``` ### Organization Object ```json { "org_id": "NCI-2021-00123", "name": "MD Anderson Cancer Center", "type": "NCI-designated", "address": { "city": "Houston", "state": "TX", "country": "United States", "postal_code": "77030" }, "contact": { "name": "Clinical Trials Office", "phone": "1-800-392-1611", "email": "[email protected]" }, "active_trials_count": 1250 } ``` ## Error Handling ### Common Errors #### 401 Unauthorized ```json { "error": "Invalid or missing API key" } ``` #### 400 Bad Request ```json { "error": "Invalid parameter combination", "details": "Must specify both city AND state for location search" } ``` #### 429 Rate Limited ```json { "error": "Rate limit exceeded", "retry_after": 3600 } ``` ### Best Practices 1. **Always use city AND state together** for location searches 2. **Handle missing totals** - the API may not return total counts with size parameter 3. **Use specific searches** - broad queries may timeout 4. **Implement retry logic** for rate limits ## Rate Limits - **With API Key**: 1,000 requests/day - **Burst Rate**: 10 requests/second - **Without Key**: Not supported ## Differences from ClinicalTrials.gov ### Enhanced Features - **Biomarker search**: Mutation-specific queries - **Prior therapy**: Exclude based on previous treatments - **Brain metastases**: Specific acceptance criteria - **Performance status**: ECOG/Karnofsky filtering ### Limitations - **Cancer trials only**: Limited to oncology studies - **No offset pagination**: Must use size parameter carefully - **Location parameters**: Different naming (org\_ prefix) ## Integration Examples ### Example 1: Precision Medicine Search ```python async def find_precision_trials(mutation, cancer_type, location): """Find trials for specific mutation in cancer type near location""" # Search for trials trial_params = { "diseases": cancer_type, "biomarkers": mutation, "accepts_brain_mets": True, "size": 50 } trials = await fetch_nci_api("trials", trial_params) # Filter by location if 
provided if location: nearby_trials = [] for trial in trials["trials"]: for site in trial.get("sites", []): distance = calculate_distance(location, site["coordinates"]) if distance < 100: # 100 miles nearby_trials.append(trial) break return nearby_trials return trials["trials"] ``` ### Example 2: Biomarker-Driven Pipeline ```python def biomarker_trial_pipeline(gene, variant): """Complete pipeline from variant to trials""" # 1. Search biomarkers biomarkers = requests.get( "https://clinicaltrialsapi.cancer.gov/api/v2/biomarkers", headers={"X-API-KEY": api_key}, params={"gene": gene, "name": variant} ).json() # 2. Get associated trials all_trials = [] for biomarker in biomarkers.get("biomarkers", []): trials = requests.get( "https://clinicaltrialsapi.cancer.gov/api/v2/trials", headers={"X-API-KEY": api_key}, params={"biomarker_id": biomarker["id"]} ).json() all_trials.extend(trials.get("trials", [])) # 3. Deduplicate and sort by phase unique_trials = {t["nct_id"]: t for t in all_trials}.values() return sorted(unique_trials, key=lambda x: x.get("phase", "")) ``` ## Support Resources - **API Documentation**: [https://clinicaltrialsapi.cancer.gov/](https://clinicaltrialsapi.cancer.gov/) - **Support Email**: [email protected] - **Status Page**: [https://status.cancer.gov/](https://status.cancer.gov/) - **Terms of Use**: [https://clinicaltrialsapi.cancer.gov/terms](https://clinicaltrialsapi.cancer.gov/terms) ``` -------------------------------------------------------------------------------- /src/biomcp/openfda/drug_approvals.py: -------------------------------------------------------------------------------- ```python """ OpenFDA drug approvals (Drugs@FDA) integration. 
""" import logging from typing import Any from .constants import ( OPENFDA_DEFAULT_LIMIT, OPENFDA_DISCLAIMER, OPENFDA_DRUGSFDA_URL, ) from .utils import ( format_count, make_openfda_request, ) logger = logging.getLogger(__name__) async def search_drug_approvals( drug: str | None = None, application_number: str | None = None, approval_year: str | None = None, limit: int = OPENFDA_DEFAULT_LIMIT, skip: int = 0, api_key: str | None = None, ) -> str: """ Search FDA drug approval records from Drugs@FDA. Args: drug: Drug name (brand or generic) to search for application_number: NDA or BLA application number approval_year: Year of approval (YYYY format) limit: Maximum number of results to return skip: Number of results to skip (for pagination) api_key: Optional OpenFDA API key (overrides OPENFDA_API_KEY env var) Returns: Formatted string with drug approval information """ # Build search query search_params = {} if drug: # Search both brand and generic names search_params["search"] = ( f'(openfda.brand_name:"{drug}" OR ' f'openfda.generic_name:"{drug}" OR ' f'openfda.substance_name:"{drug}")' ) elif application_number: search_params["search"] = f'application_number:"{application_number}"' elif approval_year: # Search for approvals in a specific year search_params["search"] = ( f"products.marketing_status_date:[{approval_year}-01-01 TO {approval_year}-12-31]" ) # Add pagination search_params["limit"] = str(min(limit, 100)) search_params["skip"] = str(skip) # Sort by submission date (most recent first) search_params["sort"] = "submissions.submission_status_date:desc" # Make the request response, error = await make_openfda_request( OPENFDA_DRUGSFDA_URL, search_params, "openfda_approvals", api_key ) if error: return f"⚠️ Error searching drug approvals: {error}" if not response or not response.get("results"): return "No drug approval records found matching your criteria." 
# Format the results results = response["results"] total = ( response.get("meta", {}).get("results", {}).get("total", len(results)) ) output = ["## FDA Drug Approval Records\n"] if drug: output.append(f"**Drug**: {drug}") if application_number: output.append(f"**Application**: {application_number}") if approval_year: output.append(f"**Approval Year**: {approval_year}") output.append( f"**Total Records Found**: {format_count(total, 'record')}\n" ) # Show results output.append(f"### Results (showing {len(results)} of {total}):\n") for i, record in enumerate(results, 1): output.extend(_format_approval_summary(record, i)) output.append(f"\n{OPENFDA_DISCLAIMER}") return "\n".join(output) async def get_drug_approval( application_number: str, api_key: str | None = None, ) -> str: """ Get detailed drug approval information for a specific application. Args: application_number: NDA or BLA application number api_key: Optional OpenFDA API key (overrides OPENFDA_API_KEY env var) Returns: Formatted string with detailed approval information """ # Search for the specific application search_params = { "search": f'application_number:"{application_number}"', "limit": 1, } response, error = await make_openfda_request( OPENFDA_DRUGSFDA_URL, search_params, "openfda_approvals", api_key ) if error: return f"⚠️ Error retrieving drug approval: {error}" if not response or not response.get("results"): return f"No approval record found for application {application_number}" record = response["results"][0] # Format detailed approval information output = [f"## Drug Approval Details: {application_number}\n"] # Basic information output.extend(_format_approval_header(record)) # Products if products := record.get("products"): output.extend(_format_products(products)) # Submissions history if submissions := record.get("submissions"): output.extend(_format_submissions(submissions)) # OpenFDA metadata if openfda := record.get("openfda"): output.extend(_format_openfda_metadata(openfda)) 
output.append(f"\n{OPENFDA_DISCLAIMER}") return "\n".join(output) def _format_approval_summary(record: dict[str, Any], num: int) -> list[str]: """Format a single approval record summary.""" output = [ f"#### {num}. Application {record.get('application_number', 'Unknown')}" ] # Get sponsor/applicant if sponsor := record.get("sponsor_name"): output.append(f"**Sponsor**: {sponsor}") # Get drug names from OpenFDA data openfda = record.get("openfda", {}) if brand_names := openfda.get("brand_name"): output.append(f"**Brand Name(s)**: {', '.join(brand_names[:3])}") if generic_names := openfda.get("generic_name"): output.append(f"**Generic Name(s)**: {', '.join(generic_names[:3])}") # Get products and their approval dates if products := record.get("products"): output.append("\n**Products**:") for prod in products[:3]: prod_num = prod.get("product_number", "?") dosage = prod.get("dosage_form", "") strength = prod.get("strength", "") status = prod.get("marketing_status", "") prod_line = f"- Product {prod_num}: {dosage}" if strength: prod_line += f" ({strength})" if status: prod_line += f" - {status}" output.append(prod_line) # Get most recent submission if submissions := record.get("submissions"): # Sort by date to get most recent recent = submissions[0] sub_type = recent.get("submission_type", "") sub_status = recent.get("submission_status", "") sub_date = recent.get("submission_status_date", "") if sub_date: output.append( f"\n**Latest Activity**: {sub_type} - {sub_status} ({sub_date})" ) output.append("") return output def _format_approval_header(record: dict[str, Any]) -> list[str]: """Format the header section of detailed approval.""" output = ["### Application Information"] output.append( f"**Application Number**: {record.get('application_number', 'Unknown')}" ) if sponsor := record.get("sponsor_name"): output.append(f"**Sponsor**: {sponsor}") # OpenFDA names openfda = record.get("openfda", {}) if brand_names := openfda.get("brand_name"): output.append(f"**Brand 
Names**: {', '.join(brand_names)}") if generic_names := openfda.get("generic_name"): output.append(f"**Generic Names**: {', '.join(generic_names)}") if substances := openfda.get("substance_name"): output.append(f"**Active Substances**: {', '.join(substances)}") output.append("") return output def _format_products(products: list[dict[str, Any]]) -> list[str]: """Format product information.""" output = ["### Products"] for prod in products: prod_num = prod.get("product_number", "Unknown") output.append(f"\n#### Product {prod_num}") if dosage := prod.get("dosage_form"): output.append(f"**Dosage Form**: {dosage}") if strength := prod.get("strength"): output.append(f"**Strength**: {strength}") if route := prod.get("route"): output.append(f"**Route**: {route}") if status := prod.get("marketing_status"): output.append(f"**Marketing Status**: {status}") if status_date := prod.get("marketing_status_date"): output.append(f"**Status Date**: {status_date}") if te_code := prod.get("te_code"): output.append(f"**Therapeutic Equivalence**: {te_code}") output.append("") return output def _format_submissions(submissions: list[dict[str, Any]]) -> list[str]: """Format submission history.""" output = ["### Submission History"] # Show most recent 5 submissions for sub in submissions[:5]: sub_num = sub.get("submission_number", "?") sub_type = sub.get("submission_type", "Unknown") sub_status = sub.get("submission_status", "") sub_date = sub.get("submission_status_date", "") output.append(f"\n**Submission {sub_num}**: {sub_type}") if sub_status: output.append(f"- Status: {sub_status}") if sub_date: output.append(f"- Date: {sub_date}") # Review priority if present if priority := sub.get("review_priority"): output.append(f"- Review Priority: {priority}") # Submission class if present if sub_class := sub.get("submission_class_code"): class_desc = sub.get("submission_class_code_description", "") output.append(f"- Class: {sub_class} - {class_desc}") output.append("") return output def 
_format_openfda_metadata(openfda: dict[str, Any]) -> list[str]: """Format OpenFDA metadata.""" output = ["### Additional Information"] if nui := openfda.get("nui"): output.append(f"**NUI Codes**: {', '.join(nui[:5])}") if pharm_class := openfda.get("pharm_class_epc"): output.append(f"**Pharmacologic Class**: {', '.join(pharm_class[:3])}") if moa := openfda.get("pharm_class_moa"): output.append(f"**Mechanism of Action**: {', '.join(moa[:3])}") if unii := openfda.get("unii"): output.append(f"**UNII Codes**: {', '.join(unii[:5])}") output.append("") return output ``` -------------------------------------------------------------------------------- /tests/tdd/variants/test_alphagenome_comprehensive.py: -------------------------------------------------------------------------------- ```python """Comprehensive tests for AlphaGenome integration.""" from unittest.mock import MagicMock, patch import pandas as pd import pytest from biomcp.variants.alphagenome import ( _validate_inputs, predict_variant_effects, ) class TestInputValidation: """Test input validation for AlphaGenome.""" def test_valid_chromosomes(self): """Test validation accepts valid chromosome formats.""" valid_chroms = ["chr1", "chr22", "chrX", "chrY", "chrM", "chrMT"] for chrom in valid_chroms: # Should not raise _validate_inputs(chrom, 100, "A", "T") def test_invalid_chromosomes(self): """Test validation rejects invalid chromosome formats.""" invalid_chroms = ["1", "chr23", "chrZ", "chromosome1", "Chr1", ""] for chrom in invalid_chroms: with pytest.raises(ValueError, match="Invalid chromosome format"): _validate_inputs(chrom, 100, "A", "T") def test_invalid_position(self): """Test validation rejects invalid positions.""" with pytest.raises(ValueError, match="Position must be >= 1"): _validate_inputs("chr1", 0, "A", "T") with pytest.raises(ValueError, match="Position must be >= 1"): _validate_inputs("chr1", -10, "A", "T") def test_valid_nucleotides(self): """Test validation accepts valid nucleotides.""" 
valid_cases = [ ("A", "T"), ("C", "G"), ("ACGT", "TGCA"), ("a", "t"), ("acgt", "tgca"), # lowercase should work ] for ref, alt in valid_cases: # Should not raise _validate_inputs("chr1", 100, ref, alt) def test_invalid_nucleotides(self): """Test validation rejects invalid nucleotides.""" invalid_cases = [("N", "A"), ("A", "U"), ("AXG", "T"), ("A", "123")] for ref, alt in invalid_cases: with pytest.raises(ValueError, match="Invalid nucleotides"): _validate_inputs("chr1", 100, ref, alt) def test_empty_alleles(self): """Test validation rejects empty alleles.""" with pytest.raises( ValueError, match="Reference allele cannot be empty" ): _validate_inputs("chr1", 100, "", "A") with pytest.raises( ValueError, match="Alternate allele cannot be empty" ): _validate_inputs("chr1", 100, "A", "") class TestIntervalSizeCalculation: """Test interval size selection logic.""" @pytest.mark.asyncio async def test_interval_size_edge_cases(self): """Test interval size selection for edge cases.""" with patch.dict("os.environ", {}, clear=True): # Without API key, we should get early return result = await predict_variant_effects( chromosome="chr1", position=100, reference="A", alternate="T", interval_size=2000000, # Larger than max ) assert "AlphaGenome API key required" in result class TestCaching: """Test caching behavior.""" @pytest.mark.asyncio async def test_skip_cache_parameter(self): """Test that skip_cache parameter works.""" with patch.dict("os.environ", {}, clear=True): # First call result1 = await predict_variant_effects( chromosome="chr1", position=100, reference="A", alternate="T", skip_cache=True, ) # Second call with skip_cache result2 = await predict_variant_effects( chromosome="chr1", position=100, reference="A", alternate="T", skip_cache=True, ) # Both should show API key error assert "AlphaGenome API key required" in result1 assert "AlphaGenome API key required" in result2 class TestErrorHandling: """Test error handling and context.""" @pytest.mark.asyncio async def 
class TestThresholdParameter:
    """Test significance threshold parameter."""

    @pytest.mark.asyncio
    async def test_custom_threshold(self):
        """A caller-supplied threshold is accepted as a valid parameter."""
        with patch.dict("os.environ", {}, clear=True):
            result = await predict_variant_effects(
                chromosome="chr1",
                position=100,
                reference="A",
                alternate="T",
                significance_threshold=0.8,
            )

        # Reaching the API key error (rather than a parameter error)
        # shows the argument itself was accepted.
        assert "AlphaGenome API key required" in result

    @pytest.mark.asyncio
    async def test_default_threshold(self):
        """Omitting the threshold is accepted as well."""
        with patch.dict("os.environ", {}, clear=True):
            result = await predict_variant_effects(
                chromosome="chr1",
                position=100,
                reference="A",
                alternate="T",
            )

        assert "AlphaGenome API key required" in result


class TestIntegration:
    """Integration tests with mocked AlphaGenome."""

    @pytest.mark.asyncio
    async def test_successful_prediction_mock(self):
        """Test successful prediction with mocked AlphaGenome."""
        position = 140753336
        window = 131072

        # Stand-ins for the three AlphaGenome modules the code imports.
        fake_genome = MagicMock()
        fake_dna_client = MagicMock()
        fake_scorers = MagicMock()

        # The client factory hands back a model whose score_variant
        # returns a single opaque score object.
        fake_model = MagicMock()
        fake_dna_client.create.return_value = fake_model
        fake_model.score_variant.return_value = [MagicMock()]

        # Scorer selection and the tidy scores table.
        fake_scorers.get_recommended_scorers.return_value = ["scorer1"]
        fake_scorers.tidy_scores.return_value = pd.DataFrame({
            "output_type": ["RNA_SEQ"],
            "raw_score": [1.0],
            "gene_name": ["GENE1"],
            "track_name": ["tissue1"],
        })

        fake_modules = {
            "alphagenome.data.genome": fake_genome,
            "alphagenome.models.dna_client": fake_dna_client,
            "alphagenome.models.variant_scorers": fake_scorers,
            "alphagenome.data": MagicMock(genome=fake_genome),
            "alphagenome.models": MagicMock(
                dna_client=fake_dna_client,
                variant_scorers=fake_scorers,
            ),
        }

        with patch.dict("os.environ", {"ALPHAGENOME_API_KEY": "test-key"}):
            with patch.dict("sys.modules", fake_modules):
                result = await predict_variant_effects(
                    chromosome="chr7",
                    position=position,
                    reference="A",
                    alternate="T",
                    interval_size=window,
                    skip_cache=True,
                )

        # Check model was created with API key
        fake_dna_client.create.assert_called_once_with("test-key")

        # Check interval was created correctly
        fake_genome.Interval.assert_called_once()
        interval_kwargs = fake_genome.Interval.call_args.kwargs
        expected_start = position - window // 2 - 1  # 0-based
        assert interval_kwargs["start"] == expected_start
        assert interval_kwargs["end"] == interval_kwargs["start"] + window

        # Check variant was created
        fake_genome.Variant.assert_called_once_with(
            chromosome="chr7",
            position=position,
            reference_bases="A",
            alternate_bases="T",
        )

        # Check result contains expected formatting
        for expected_text in (
            "AlphaGenome Variant Effect Predictions",
            "Gene Expression",
            "GENE1",
        ):
            assert expected_text in result
import json
import logging
from ssl import TLSVersion
from typing import Annotated, Any

from .. import StrEnum, http_client, render
from ..constants import CLINICAL_TRIALS_BASE_URL

logger = logging.getLogger(__name__)


class Module(StrEnum):
    """Named groups of ClinicalTrials.gov fields that `get_trial` can fetch."""

    PROTOCOL = "Protocol"
    LOCATIONS = "Locations"
    REFERENCES = "References"
    OUTCOMES = "Outcomes"
    ALL = "All"


# Field names requested for each view. They are kept as separate lists so
# that the "All" view can be derived from them instead of being a second,
# hand-maintained copy.
_PROTOCOL_FIELDS: list[str] = [
    "IdentificationModule",
    "StatusModule",
    "SponsorCollaboratorsModule",
    "OversightModule",
    "DescriptionModule",
    "ConditionsModule",
    "DesignModule",
    "ArmsInterventionsModule",
    "EligibilityModule",
]
_LOCATIONS_FIELDS: list[str] = ["ContactsLocationsModule"]
_REFERENCES_FIELDS: list[str] = ["ReferencesModule"]
_OUTCOMES_FIELDS: list[str] = ["OutcomesModule", "ResultsSection"]

modules: dict[Module, list[str]] = {
    Module.PROTOCOL: _PROTOCOL_FIELDS,
    Module.LOCATIONS: _LOCATIONS_FIELDS,
    Module.REFERENCES: _REFERENCES_FIELDS,
    Module.OUTCOMES: _OUTCOMES_FIELDS,
    # Same order as the individual views: protocol, locations, references,
    # outcomes.
    Module.ALL: [
        *_PROTOCOL_FIELDS,
        *_LOCATIONS_FIELDS,
        *_REFERENCES_FIELDS,
        *_OUTCOMES_FIELDS,
    ],
}


def _study_url(nct_id: str) -> str:
    """Return the public ClinicalTrials.gov page URL for a trial."""
    return f"https://clinicaltrials.gov/study/{nct_id}"


def _study_from_response(parsed_data: Any, nct_id: str) -> dict[str, Any]:
    """Turn a non-empty API payload into the dict that gets rendered.

    A payload carrying a "studies" array yields its first study; any other
    dict is passed through. In both cases a "URL" entry is added to a
    shallow copy, so the object handed back by the HTTP layer is not
    modified (NOTE(review): whether that layer reuses response objects is
    not visible from this module; the copy is defensive). Payloads that are
    not dicts produce an error dict instead of raising.
    """
    if isinstance(parsed_data, dict) and "studies" in parsed_data:
        studies = parsed_data.get("studies") or []
        if not studies:
            logger.warning("No studies found in response for %s", nct_id)
            return {
                "error": f"No studies found for {nct_id}",
                "details": "API returned empty studies array",
            }
        # From here on only the first study is considered.
        parsed_data = studies[0]
    else:
        logger.debug(
            "Unexpected response format for %s: %s", nct_id, type(parsed_data)
        )

    if not isinstance(parsed_data, dict):
        # Item assignment below would fail on a list or scalar payload.
        logger.warning(
            "Unusable response type for %s: %s", nct_id, type(parsed_data)
        )
        return {
            "error": f"Unexpected response format for {nct_id}",
            "details": f"Expected a JSON object, got {type(parsed_data).__name__}",
        }

    study = dict(parsed_data)
    study["URL"] = _study_url(nct_id)
    return study


async def get_trial(
    nct_id: str,
    module: Module = Module.PROTOCOL,
    output_json: bool = False,
) -> str:
    """Get details of a clinical trial by module.

    Args:
        nct_id: Trial identifier appended to CLINICAL_TRIALS_BASE_URL.
        module: Which group of fields to request (see `modules`).
        output_json: When True return indented JSON, otherwise Markdown
            produced by `render.to_markdown`.

    Returns:
        The trial data, or a dict with "error" and "details" keys describing
        what went wrong, serialized in the requested format.
    """
    fields = ",".join(modules[module])
    params = {"fields": fields}
    url = f"{CLINICAL_TRIALS_BASE_URL}/{nct_id}"

    logger.debug("Fetching trial %s with module %s", nct_id, module.value)
    logger.debug("URL: %s, Params: %s", url, params)

    parsed_data: dict[str, Any] | None
    error_obj: http_client.RequestError | None
    parsed_data, error_obj = await http_client.request_api(
        url=url,
        request=params,
        method="GET",
        tls_version=TLSVersion.TLSv1_2,
        response_model_type=None,
        domain="clinicaltrials",
    )

    data_to_return: dict[str, Any]
    if error_obj:
        logger.error(
            "API Error for %s: %s - %s",
            nct_id,
            error_obj.code,
            error_obj.message,
        )
        data_to_return = {
            "error": f"API Error {error_obj.code}",
            "details": error_obj.message,
        }
    elif parsed_data:
        data_to_return = _study_from_response(parsed_data, nct_id)
    else:
        logger.warning(
            "No data received for %s with module %s", nct_id, module.value
        )
        data_to_return = {
            "error": f"No data found for {nct_id} with module {module.value}",
            "details": "API returned no data",
        }

    if output_json:
        return json.dumps(data_to_return, indent=2)
    return render.to_markdown(data_to_return)


async def _trial_protocol(
    call_benefit: Annotated[
        str,
        "Define and summarize why this function is being called and the intended benefit",
    ],
    nct_id: str,
) -> str:
    """
    Retrieves core protocol information for a single clinical
    trial identified by its NCT ID.

    Parameters:
    - call_benefit: Define and summarize why this function is being called and the intended benefit
    - nct_id: A single NCT ID (string, e.g., "NCT04280705")

    Process: Fetches standard "Protocol" view modules (like ID,
             Status, Sponsor, Design, Eligibility) from the
             ClinicalTrials.gov v2 API.
    Output: A Markdown formatted string detailing title, status,
            sponsor, purpose, study design, phase, interventions,
            eligibility criteria, etc. Returns error if invalid.
    """
    return await get_trial(nct_id, Module.PROTOCOL)


async def _trial_locations(
    call_benefit: Annotated[
        str,
        "Define and summarize why this function is being called and the intended benefit",
    ],
    nct_id: str,
) -> str:
    """
    Retrieves contact and location details for a single
    clinical trial identified by its NCT ID.

    Parameters:
    - call_benefit: Define and summarize why this function is being called and the intended benefit
    - nct_id: A single NCT ID (string, e.g., "NCT04280705")

    Process: Fetches the `ContactsLocationsModule` from the
             ClinicalTrials.gov v2 API for the given NCT ID.
    Output: A Markdown formatted string detailing facility names,
            addresses (city, state, country), and contact info.
            Returns an error message if the NCT ID is invalid.
    """
    return await get_trial(nct_id, Module.LOCATIONS)


async def _trial_outcomes(
    call_benefit: Annotated[
        str,
        "Define and summarize why this function is being called and the intended benefit",
    ],
    nct_id: str,
) -> str:
    """
    Retrieves outcome measures, results (if available), and
    adverse event data for a single clinical trial.

    Parameters:
    - call_benefit: Define and summarize why this function is being called and the intended benefit
    - nct_id: A single NCT ID (string, e.g., "NCT04280705")

    Process: Fetches the `OutcomesModule` and `ResultsSection`
             from the ClinicalTrials.gov v2 API for the NCT ID.
    Output: A Markdown formatted string detailing primary/secondary
            outcomes, participant flow, results tables (if posted),
            and adverse event summaries. Returns an error if invalid.
    """
    return await get_trial(nct_id, Module.OUTCOMES)


async def _trial_references(
    call_benefit: Annotated[
        str,
        "Define and summarize why this function is being called and the intended benefit",
    ],
    nct_id: str,
) -> str:
    """
    Retrieves publications and other references associated with
    a single clinical trial identified by its NCT ID.

    Parameters:
    - call_benefit: Define and summarize why this function is being called and the intended benefit
    - nct_id: A single NCT ID (string, e.g., "NCT04280705")

    Process: Fetches the `ReferencesModule` from the
             ClinicalTrials.gov v2 API for the NCT ID.
    Output: A Markdown formatted string listing citations,
            associated PubMed IDs (PMIDs), and reference types
            (e.g., result publication). Returns error if invalid.
    """
    return await get_trial(nct_id, Module.REFERENCES)


# Section name -> (fetcher, call_benefit template) for get_trial_unified.
_SECTION_FETCHERS = {
    "protocol": (
        _trial_protocol,
        "Getting protocol information for trial {nct_id}",
    ),
    "locations": (
        _trial_locations,
        "Getting locations for trial {nct_id}",
    ),
    "outcomes": (
        _trial_outcomes,
        "Getting outcomes for trial {nct_id}",
    ),
    "references": (
        _trial_references,
        "Getting references for trial {nct_id}",
    ),
}


async def get_trial_unified(
    nct_id: str,
    source: str = "clinicaltrials",
    api_key: str | None = None,
    sections: list[str] | None = None,
) -> str:
    """
    Get trial details from either ClinicalTrials.gov or NCI CTS API.

    Args:
        nct_id: NCT identifier (e.g., "NCT04280705")
        source: Data source - "clinicaltrials" (default) or "nci"
        api_key: API key for NCI (required if source="nci")
        sections: List of sections to include (for clinicaltrials.gov)
                 Options: ["protocol", "locations", "outcomes", "references", "all"]
                 Names are matched case-insensitively. Unrecognized names
                 are logged and skipped; if nothing recognized remains,
                 the protocol section is returned.

    Returns:
        Formatted markdown string with trial details
    """
    if source == "nci":
        # Import here to avoid circular imports
        from .nci_getter import format_nci_trial_details, get_trial_nci

        trial_data = await get_trial_nci(nct_id, api_key)
        return await format_nci_trial_details(trial_data, api_key)

    # Everything below targets ClinicalTrials.gov.
    if isinstance(sections, str):
        # A bare string would otherwise be iterated character by character.
        sections = [sections]

    requested = [
        name.strip().lower() if isinstance(name, str) else name
        for name in (sections or [])
    ]

    if "all" in requested:
        return await get_trial(nct_id, Module.ALL)

    results: list[str] = []
    for name in requested:
        entry = _SECTION_FETCHERS.get(name) if isinstance(name, str) else None
        if entry is None:
            logger.warning(
                "Ignoring unknown trial section %r for %s", name, nct_id
            )
            continue
        fetch, benefit_template = entry
        results.append(
            await fetch(
                call_benefit=benefit_template.format(nct_id=nct_id),
                nct_id=nct_id,
            )
        )

    if results:
        return "\n\n---\n\n".join(results)

    # No sections requested, or none recognized: default to protocol only.
    return await _trial_protocol(
        call_benefit=f"Getting trial protocol details for {nct_id}",
        nct_id=nct_id,
    )