This is page 6 of 15. Use http://codebase.md/genomoncology/biomcp?page={x} to view the full context. # Directory Structure ``` ├── .github │ ├── actions │ │ └── setup-python-env │ │ └── action.yml │ ├── dependabot.yml │ └── workflows │ ├── ci.yml │ ├── deploy-docs.yml │ ├── main.yml.disabled │ ├── on-release-main.yml │ └── validate-codecov-config.yml ├── .gitignore ├── .pre-commit-config.yaml ├── BIOMCP_DATA_FLOW.md ├── CHANGELOG.md ├── CNAME ├── codecov.yaml ├── docker-compose.yml ├── Dockerfile ├── docs │ ├── apis │ │ ├── error-codes.md │ │ ├── overview.md │ │ └── python-sdk.md │ ├── assets │ │ ├── biomcp-cursor-locations.png │ │ ├── favicon.ico │ │ ├── icon.png │ │ ├── logo.png │ │ ├── mcp_architecture.txt │ │ └── remote-connection │ │ ├── 00_connectors.png │ │ ├── 01_add_custom_connector.png │ │ ├── 02_connector_enabled.png │ │ ├── 03_connect_to_biomcp.png │ │ ├── 04_select_google_oauth.png │ │ └── 05_success_connect.png │ ├── backend-services-reference │ │ ├── 01-overview.md │ │ ├── 02-biothings-suite.md │ │ ├── 03-cbioportal.md │ │ ├── 04-clinicaltrials-gov.md │ │ ├── 05-nci-cts-api.md │ │ ├── 06-pubtator3.md │ │ └── 07-alphagenome.md │ ├── blog │ │ ├── ai-assisted-clinical-trial-search-analysis.md │ │ ├── images │ │ │ ├── deep-researcher-video.png │ │ │ ├── researcher-announce.png │ │ │ ├── researcher-drop-down.png │ │ │ ├── researcher-prompt.png │ │ │ ├── trial-search-assistant.png │ │ │ └── what_is_biomcp_thumbnail.png │ │ └── researcher-persona-resource.md │ ├── changelog.md │ ├── CNAME │ ├── concepts │ │ ├── 01-what-is-biomcp.md │ │ ├── 02-the-deep-researcher-persona.md │ │ └── 03-sequential-thinking-with-the-think-tool.md │ ├── developer-guides │ │ ├── 01-server-deployment.md │ │ ├── 02-contributing-and-testing.md │ │ ├── 03-third-party-endpoints.md │ │ ├── 04-transport-protocol.md │ │ ├── 05-error-handling.md │ │ ├── 06-http-client-and-caching.md │ │ ├── 07-performance-optimizations.md │ │ └── generate_endpoints.py │ ├── faq-condensed.md │ ├── 
FDA_SECURITY.md │ ├── genomoncology.md │ ├── getting-started │ │ ├── 01-quickstart-cli.md │ │ ├── 02-claude-desktop-integration.md │ │ └── 03-authentication-and-api-keys.md │ ├── how-to-guides │ │ ├── 01-find-articles-and-cbioportal-data.md │ │ ├── 02-find-trials-with-nci-and-biothings.md │ │ ├── 03-get-comprehensive-variant-annotations.md │ │ ├── 04-predict-variant-effects-with-alphagenome.md │ │ ├── 05-logging-and-monitoring-with-bigquery.md │ │ └── 06-search-nci-organizations-and-interventions.md │ ├── index.md │ ├── policies.md │ ├── reference │ │ ├── architecture-diagrams.md │ │ ├── quick-architecture.md │ │ ├── quick-reference.md │ │ └── visual-architecture.md │ ├── robots.txt │ ├── stylesheets │ │ ├── announcement.css │ │ └── extra.css │ ├── troubleshooting.md │ ├── tutorials │ │ ├── biothings-prompts.md │ │ ├── claude-code-biomcp-alphagenome.md │ │ ├── nci-prompts.md │ │ ├── openfda-integration.md │ │ ├── openfda-prompts.md │ │ ├── pydantic-ai-integration.md │ │ └── remote-connection.md │ ├── user-guides │ │ ├── 01-command-line-interface.md │ │ ├── 02-mcp-tools-reference.md │ │ └── 03-integrating-with-ides-and-clients.md │ └── workflows │ └── all-workflows.md ├── example_scripts │ ├── mcp_integration.py │ └── python_sdk.py ├── glama.json ├── LICENSE ├── lzyank.toml ├── Makefile ├── mkdocs.yml ├── package-lock.json ├── package.json ├── pyproject.toml ├── README.md ├── scripts │ ├── check_docs_in_mkdocs.py │ ├── check_http_imports.py │ └── generate_endpoints_doc.py ├── smithery.yaml ├── src │ └── biomcp │ ├── __init__.py │ ├── __main__.py │ ├── articles │ │ ├── __init__.py │ │ ├── autocomplete.py │ │ ├── fetch.py │ │ ├── preprints.py │ │ ├── search_optimized.py │ │ ├── search.py │ │ └── unified.py │ ├── biomarkers │ │ ├── __init__.py │ │ └── search.py │ ├── cbioportal_helper.py │ ├── circuit_breaker.py │ ├── cli │ │ ├── __init__.py │ │ ├── articles.py │ │ ├── biomarkers.py │ │ ├── diseases.py │ │ ├── health.py │ │ ├── interventions.py │ │ ├── main.py │ │ ├── 
openfda.py │ │ ├── organizations.py │ │ ├── server.py │ │ ├── trials.py │ │ └── variants.py │ ├── connection_pool.py │ ├── constants.py │ ├── core.py │ ├── diseases │ │ ├── __init__.py │ │ ├── getter.py │ │ └── search.py │ ├── domain_handlers.py │ ├── drugs │ │ ├── __init__.py │ │ └── getter.py │ ├── exceptions.py │ ├── genes │ │ ├── __init__.py │ │ └── getter.py │ ├── http_client_simple.py │ ├── http_client.py │ ├── individual_tools.py │ ├── integrations │ │ ├── __init__.py │ │ ├── biothings_client.py │ │ └── cts_api.py │ ├── interventions │ │ ├── __init__.py │ │ ├── getter.py │ │ └── search.py │ ├── logging_filter.py │ ├── metrics_handler.py │ ├── metrics.py │ ├── openfda │ │ ├── __init__.py │ │ ├── adverse_events_helpers.py │ │ ├── adverse_events.py │ │ ├── cache.py │ │ ├── constants.py │ │ ├── device_events_helpers.py │ │ ├── device_events.py │ │ ├── drug_approvals.py │ │ ├── drug_labels_helpers.py │ │ ├── drug_labels.py │ │ ├── drug_recalls_helpers.py │ │ ├── drug_recalls.py │ │ ├── drug_shortages_detail_helpers.py │ │ ├── drug_shortages_helpers.py │ │ ├── drug_shortages.py │ │ ├── exceptions.py │ │ ├── input_validation.py │ │ ├── rate_limiter.py │ │ ├── utils.py │ │ └── validation.py │ ├── organizations │ │ ├── __init__.py │ │ ├── getter.py │ │ └── search.py │ ├── parameter_parser.py │ ├── prefetch.py │ ├── query_parser.py │ ├── query_router.py │ ├── rate_limiter.py │ ├── render.py │ ├── request_batcher.py │ ├── resources │ │ ├── __init__.py │ │ ├── getter.py │ │ ├── instructions.md │ │ └── researcher.md │ ├── retry.py │ ├── router_handlers.py │ ├── router.py │ ├── shared_context.py │ ├── thinking │ │ ├── __init__.py │ │ ├── sequential.py │ │ └── session.py │ ├── thinking_tool.py │ ├── thinking_tracker.py │ ├── trials │ │ ├── __init__.py │ │ ├── getter.py │ │ ├── nci_getter.py │ │ ├── nci_search.py │ │ └── search.py │ ├── utils │ │ ├── __init__.py │ │ ├── cancer_types_api.py │ │ ├── cbio_http_adapter.py │ │ ├── endpoint_registry.py │ │ ├── gene_validator.py │ 
│ ├── metrics.py │ │ ├── mutation_filter.py │ │ ├── query_utils.py │ │ ├── rate_limiter.py │ │ └── request_cache.py │ ├── variants │ │ ├── __init__.py │ │ ├── alphagenome.py │ │ ├── cancer_types.py │ │ ├── cbio_external_client.py │ │ ├── cbioportal_mutations.py │ │ ├── cbioportal_search_helpers.py │ │ ├── cbioportal_search.py │ │ ├── constants.py │ │ ├── external.py │ │ ├── filters.py │ │ ├── getter.py │ │ ├── links.py │ │ └── search.py │ └── workers │ ├── __init__.py │ ├── worker_entry_stytch.js │ ├── worker_entry.js │ └── worker.py ├── tests │ ├── bdd │ │ ├── cli_help │ │ │ ├── help.feature │ │ │ └── test_help.py │ │ ├── conftest.py │ │ ├── features │ │ │ └── alphagenome_integration.feature │ │ ├── fetch_articles │ │ │ ├── fetch.feature │ │ │ └── test_fetch.py │ │ ├── get_trials │ │ │ ├── get.feature │ │ │ └── test_get.py │ │ ├── get_variants │ │ │ ├── get.feature │ │ │ └── test_get.py │ │ ├── search_articles │ │ │ ├── autocomplete.feature │ │ │ ├── search.feature │ │ │ ├── test_autocomplete.py │ │ │ └── test_search.py │ │ ├── search_trials │ │ │ ├── search.feature │ │ │ └── test_search.py │ │ ├── search_variants │ │ │ ├── search.feature │ │ │ └── test_search.py │ │ └── steps │ │ └── test_alphagenome_steps.py │ ├── config │ │ └── test_smithery_config.py │ ├── conftest.py │ ├── data │ │ ├── ct_gov │ │ │ ├── clinical_trials_api_v2.yaml │ │ │ ├── trials_NCT04280705.json │ │ │ └── trials_NCT04280705.txt │ │ ├── myvariant │ │ │ ├── myvariant_api.yaml │ │ │ ├── myvariant_field_descriptions.csv │ │ │ ├── variants_full_braf_v600e.json │ │ │ ├── variants_full_braf_v600e.txt │ │ │ └── variants_part_braf_v600_multiple.json │ │ ├── openfda │ │ │ ├── drugsfda_detail.json │ │ │ ├── drugsfda_search.json │ │ │ ├── enforcement_detail.json │ │ │ └── enforcement_search.json │ │ └── pubtator │ │ ├── pubtator_autocomplete.json │ │ └── pubtator3_paper.txt │ ├── integration │ │ ├── test_openfda_integration.py │ │ ├── test_preprints_integration.py │ │ ├── test_simple.py │ │ └── 
test_variants_integration.py │ ├── tdd │ │ ├── articles │ │ │ ├── test_autocomplete.py │ │ │ ├── test_cbioportal_integration.py │ │ │ ├── test_fetch.py │ │ │ ├── test_preprints.py │ │ │ ├── test_search.py │ │ │ └── test_unified.py │ │ ├── conftest.py │ │ ├── drugs │ │ │ ├── __init__.py │ │ │ └── test_drug_getter.py │ │ ├── openfda │ │ │ ├── __init__.py │ │ │ ├── test_adverse_events.py │ │ │ ├── test_device_events.py │ │ │ ├── test_drug_approvals.py │ │ │ ├── test_drug_labels.py │ │ │ ├── test_drug_recalls.py │ │ │ ├── test_drug_shortages.py │ │ │ └── test_security.py │ │ ├── test_biothings_integration_real.py │ │ ├── test_biothings_integration.py │ │ ├── test_circuit_breaker.py │ │ ├── test_concurrent_requests.py │ │ ├── test_connection_pool.py │ │ ├── test_domain_handlers.py │ │ ├── test_drug_approvals.py │ │ ├── test_drug_recalls.py │ │ ├── test_drug_shortages.py │ │ ├── test_endpoint_documentation.py │ │ ├── test_error_scenarios.py │ │ ├── test_europe_pmc_fetch.py │ │ ├── test_mcp_integration.py │ │ ├── test_mcp_tools.py │ │ ├── test_metrics.py │ │ ├── test_nci_integration.py │ │ ├── test_nci_mcp_tools.py │ │ ├── test_network_policies.py │ │ ├── test_offline_mode.py │ │ ├── test_openfda_unified.py │ │ ├── test_pten_r173_search.py │ │ ├── test_render.py │ │ ├── test_request_batcher.py.disabled │ │ ├── test_retry.py │ │ ├── test_router.py │ │ ├── test_shared_context.py.disabled │ │ ├── test_unified_biothings.py │ │ ├── thinking │ │ │ ├── __init__.py │ │ │ └── test_sequential.py │ │ ├── trials │ │ │ ├── test_backward_compatibility.py │ │ │ ├── test_getter.py │ │ │ └── test_search.py │ │ ├── utils │ │ │ ├── test_gene_validator.py │ │ │ ├── test_mutation_filter.py │ │ │ ├── test_rate_limiter.py │ │ │ └── test_request_cache.py │ │ ├── variants │ │ │ ├── constants.py │ │ │ ├── test_alphagenome_api_key.py │ │ │ ├── test_alphagenome_comprehensive.py │ │ │ ├── test_alphagenome.py │ │ │ ├── test_cbioportal_mutations.py │ │ │ ├── test_cbioportal_search.py │ │ │ ├── 
test_external_integration.py │ │ │ ├── test_external.py │ │ │ ├── test_extract_gene_aa_change.py │ │ │ ├── test_filters.py │ │ │ ├── test_getter.py │ │ │ ├── test_links.py │ │ │ └── test_search.py │ │ └── workers │ │ └── test_worker_sanitization.js │ └── test_pydantic_ai_integration.py ├── THIRD_PARTY_ENDPOINTS.md ├── tox.ini ├── uv.lock └── wrangler.toml ``` # Files -------------------------------------------------------------------------------- /src/biomcp/biomarkers/search.py: -------------------------------------------------------------------------------- ```python """Search functionality for biomarkers via NCI CTS API. Note: Biomarker data availability may be limited in CTRP. This module focuses on biomarkers used in trial eligibility criteria. """ import logging from typing import Any from ..constants import NCI_BIOMARKERS_URL from ..integrations.cts_api import CTSAPIError, make_cts_request from ..utils import parse_or_query logger = logging.getLogger(__name__) def _build_biomarker_params( name: str | None, eligibility_criterion: str | None, biomarker_type: str | None, codes: list[str] | None, assay_purpose: str | None, include: list[str] | None, sort: str | None, order: str | None, page_size: int, ) -> dict[str, Any]: """Build query parameters for biomarker search.""" params: dict[str, Any] = {"size": page_size} # Add search filters with correct API parameter names if name: params["name"] = name if eligibility_criterion: params["eligibility_criterion"] = eligibility_criterion if biomarker_type: params["type"] = biomarker_type if codes: params["codes"] = ",".join(codes) if isinstance(codes, list) else codes if assay_purpose: params["assay_purpose"] = assay_purpose if include: params["include"] = ( ",".join(include) if isinstance(include, list) else include ) if sort: params["sort"] = sort if order: params["order"] = order.lower() return params def _process_biomarker_response( response: dict[str, Any], page: int, page_size: int, ) -> dict[str, Any]: """Process 
biomarker API response.""" biomarkers = response.get("data", response.get("biomarkers", [])) total = response.get("total", len(biomarkers)) result = { "biomarkers": biomarkers, "total": total, "page": page, "page_size": page_size, } # Add note about data limitations if response indicates it if response.get("limited_data") or not biomarkers: result["note"] = ( "Biomarker data availability is limited in CTRP. " "Results show biomarkers referenced in trial eligibility criteria. " "For detailed variant annotations, use variant_searcher with MyVariant.info." ) return result async def search_biomarkers( name: str | None = None, eligibility_criterion: str | None = None, biomarker_type: str | None = None, codes: list[str] | None = None, assay_purpose: str | None = None, include: list[str] | None = None, sort: str | None = None, order: str | None = None, page_size: int = 20, page: int = 1, api_key: str | None = None, ) -> dict[str, Any]: """ Search for biomarkers in the NCI CTS database. Note: Biomarker data availability may be limited per CTRP documentation. Results focus on biomarkers used in clinical trial eligibility criteria. 
Args: name: Biomarker name to search for (e.g., "PD-L1", "EGFR mutation") eligibility_criterion: Eligibility criterion text biomarker_type: Type of biomarker ("reference_gene" or "branch") codes: List of biomarker codes assay_purpose: Purpose of the assay include: Fields to include in response sort: Sort field order: Sort order ('asc' or 'desc') page_size: Number of results per page page: Page number api_key: Optional API key (if not provided, uses NCI_API_KEY env var) Returns: Dictionary with search results containing: - biomarkers: List of biomarker records - total: Total number of results - page: Current page - page_size: Results per page - note: Any limitations about the data Raises: CTSAPIError: If the API request fails """ # Build query parameters params = _build_biomarker_params( name, eligibility_criterion, biomarker_type, codes, assay_purpose, include, sort, order, page_size, ) try: # Make API request response = await make_cts_request( url=NCI_BIOMARKERS_URL, params=params, api_key=api_key, ) # Process response return _process_biomarker_response(response, page, page_size) except CTSAPIError: raise except Exception as e: logger.error(f"Failed to search biomarkers: {e}") raise CTSAPIError(f"Biomarker search failed: {e!s}") from e def _format_biomarker_header(total: int, note: str) -> list[str]: """Format the header section of biomarker results.""" lines = [ f"## Biomarker Search Results ({total} found)", "", ] if note: lines.extend([ f"*Note: {note}*", "", ]) return lines def _format_single_biomarker(biomarker: dict[str, Any]) -> list[str]: """Format a single biomarker record.""" bio_id = biomarker.get("id", biomarker.get("biomarker_id", "Unknown")) name = biomarker.get("name", "Unknown Biomarker") gene = biomarker.get("gene", biomarker.get("gene_symbol", "")) bio_type = biomarker.get("type", biomarker.get("category", "")) lines = [ f"### {name}", f"- **ID**: {bio_id}", ] if gene: lines.append(f"- **Gene**: {gene}") if bio_type: lines.append(f"- **Type**: 
{bio_type}") # Add assay information if available if biomarker.get("assay_type"): lines.append(f"- **Assay**: {biomarker['assay_type']}") # Add criteria examples if available if biomarker.get("criteria_examples"): examples = biomarker["criteria_examples"] if isinstance(examples, list) and examples: lines.append("- **Example Criteria**:") for ex in examples[:3]: # Show up to 3 examples lines.append(f" - {ex}") if len(examples) > 3: lines.append(f" *(and {len(examples) - 3} more)*") # Add trial count if available if biomarker.get("trial_count"): lines.append( f"- **Trials Using This Biomarker**: {biomarker['trial_count']}" ) lines.append("") return lines async def search_biomarkers_with_or( name_query: str, eligibility_criterion: str | None = None, biomarker_type: str | None = None, codes: list[str] | None = None, assay_purpose: str | None = None, include: list[str] | None = None, sort: str | None = None, order: str | None = None, page_size: int = 20, page: int = 1, api_key: str | None = None, ) -> dict[str, Any]: """ Search for biomarkers with OR query support. This function handles OR queries by making multiple API calls and combining results. For example: "PD-L1 OR CD274 OR programmed death ligand 1" will search for each term. 
Args: name_query: Name query that may contain OR operators Other args same as search_biomarkers Returns: Combined results from all searches with duplicates removed """ # Check if this is an OR query if " OR " in name_query or " or " in name_query: search_terms = parse_or_query(name_query) logger.info(f"Parsed OR query into terms: {search_terms}") else: # Single term search search_terms = [name_query] # Collect all unique biomarkers all_biomarkers = {} total_found = 0 # Search for each term for term in search_terms: logger.info(f"Searching biomarkers for term: {term}") try: results = await search_biomarkers( name=term, eligibility_criterion=eligibility_criterion, biomarker_type=biomarker_type, codes=codes, assay_purpose=assay_purpose, include=include, sort=sort, order=order, page_size=page_size, # Get full page size for each term page=page, api_key=api_key, ) # Add unique biomarkers (deduplicate by ID) for biomarker in results.get("biomarkers", []): bio_id = biomarker.get("id", biomarker.get("biomarker_id")) if bio_id and bio_id not in all_biomarkers: all_biomarkers[bio_id] = biomarker total_found += results.get("total", 0) except Exception as e: logger.warning(f"Failed to search for term '{term}': {e}") # Continue with other terms # Convert back to list and apply pagination unique_biomarkers = list(all_biomarkers.values()) # Sort if requested (by name by default for consistent results) if sort == "name" or sort is None: unique_biomarkers.sort(key=lambda x: x.get("name", "").lower()) # Apply pagination to combined results start_idx = (page - 1) * page_size end_idx = start_idx + page_size paginated_biomarkers = unique_biomarkers[start_idx:end_idx] return { "biomarkers": paginated_biomarkers, "total": len(unique_biomarkers), "page": page, "page_size": page_size, "search_terms": search_terms, # Include what we searched for "total_found_across_terms": total_found, # Total before deduplication } def format_biomarker_results(results: dict[str, Any]) -> str: """ Format 
biomarker search results as markdown. Args: results: Search results dictionary Returns: Formatted markdown string """ biomarkers = results.get("biomarkers", []) total = results.get("total", 0) note = results.get("note", "") if not biomarkers: msg = "No biomarkers found matching the search criteria." if note: msg += f"\n\n*Note: {note}*" return msg # Build markdown output lines = _format_biomarker_header(total, note) for biomarker in biomarkers: lines.extend(_format_single_biomarker(biomarker)) return "\n".join(lines) ``` -------------------------------------------------------------------------------- /docs/tutorials/nci-prompts.md: -------------------------------------------------------------------------------- ```markdown # NCI Tools Example Prompts This guide provides example prompts for AI assistants to effectively use the NCI (National Cancer Institute) Clinical Trials Search API tools in BioMCP. ## Overview of NCI Tools BioMCP integrates with the NCI Clinical Trials Search API to provide: - **Organization Search & Lookup** - Find cancer research centers, hospitals, and trial sponsors - **Intervention Search & Lookup** - Search for drugs, devices, procedures, and other interventions These tools require an NCI API key from: https://clinicaltrialsapi.cancer.gov/ ## Best Practices ### API Key Required All example prompts in this guide should include your NCI API key. Add this to the end of each prompt: ``` "... my NCI API key is YOUR_API_KEY" ``` ### Location Searches **ALWAYS use city AND state together** when searching organizations by location. The NCI API has Elasticsearch limitations that cause errors with broad searches. 
✅ **Good**: `nci_organization_searcher(city="Cleveland", state="OH")` ❌ **Bad**: `nci_organization_searcher(city="Cleveland")` or `nci_organization_searcher(state="OH")` ### API Parameter Notes - The NCI APIs do not support offset-based pagination (`from` parameter) - Organization location parameters use `org_` prefix (e.g., `org_city`, `org_state_or_province`) - When using `size` parameter, the API may not return a `total` count ### Avoiding API Errors - Use specific organization names when possible - Combine multiple filters (name + type, city + state) - Start with more specific searches, then broaden if needed ## Organization Tools ### Organization Search #### Basic Organization Search ``` "Find cancer centers in California, my NCI API key is YOUR_API_KEY" "Search for MD Anderson Cancer Center, my NCI API key is YOUR_API_KEY" "List academic cancer research centers in New York, my NCI API key is YOUR_API_KEY" "Find all NCI-designated cancer centers, my NCI API key is YOUR_API_KEY" ``` **Expected tool usage**: `nci_organization_searcher(state="CA", organization_type="Academic")` #### Organization by Location **IMPORTANT**: Always use city AND state together to avoid API errors! 
``` "Show me cancer treatment centers in Boston, MA, my NCI API key is YOUR_API_KEY" "Find clinical trial sites in Houston, Texas, my NCI API key is YOUR_API_KEY" "List all cancer research organizations in Cleveland, OH, my NCI API key is YOUR_API_KEY" "Search for industry sponsors in San Francisco, CA, my NCI API key is YOUR_API_KEY" ``` **Expected tool usage**: `nci_organization_searcher(city="Boston", state="MA")` ✓ **Never use**: `nci_organization_searcher(city="Boston")` ✗ or `nci_organization_searcher(state="MA")` ✗ #### Organization by Type ``` "Find all government cancer research facilities, my NCI API key is YOUR_API_KEY" "List pharmaceutical companies running cancer trials, my NCI API key is YOUR_API_KEY" "Show me academic medical centers conducting trials, my NCI API key is YOUR_API_KEY" "Find community hospitals participating in cancer research, my NCI API key is YOUR_API_KEY" ``` **Expected tool usage**: `nci_organization_searcher(organization_type="Industry")` ### Organization Details ``` "Get details about organization NCI-2011-03337, my NCI API key is YOUR_API_KEY" "Show me contact information for this cancer center, my NCI API key is YOUR_API_KEY" "What trials is this organization conducting? 
My NCI API key is YOUR_API_KEY" "Give me the full profile of this research institution, my NCI API key is YOUR_API_KEY" ``` **Expected tool usage**: `organization_getter(organization_id="NCI-2011-03337")` ## Intervention Tools ### Intervention Search #### Drug Search ``` "Find all trials using pembrolizumab, my NCI API key is YOUR_API_KEY" "Search for PD-1 inhibitor drugs in trials, my NCI API key is YOUR_API_KEY" "List all immunotherapy drugs being tested, my NCI API key is YOUR_API_KEY" "Find trials using Keytruda or similar drugs, my NCI API key is YOUR_API_KEY" ``` **Expected tool usage**: `nci_intervention_searcher(name="pembrolizumab", intervention_type="Drug")` #### Device Search ``` "Search for medical devices in cancer trials, my NCI API key is YOUR_API_KEY" "Find trials using surgical robots, my NCI API key is YOUR_API_KEY" "List radiation therapy devices being tested, my NCI API key is YOUR_API_KEY" "Show me trials with diagnostic devices, my NCI API key is YOUR_API_KEY" ``` **Expected tool usage**: `nci_intervention_searcher(intervention_type="Device")` #### Procedure Search ``` "Find surgical procedures in cancer trials, my NCI API key is YOUR_API_KEY" "Search for minimally invasive surgery trials, my NCI API key is YOUR_API_KEY" "List trials with radiation therapy procedures, my NCI API key is YOUR_API_KEY" "Show me trials testing new biopsy techniques, my NCI API key is YOUR_API_KEY" ``` **Expected tool usage**: `nci_intervention_searcher(intervention_type="Procedure")` #### Other Interventions ``` "Find behavioral interventions for cancer patients, my NCI API key is YOUR_API_KEY" "Search for dietary interventions in trials, my NCI API key is YOUR_API_KEY" "List genetic therapy trials, my NCI API key is YOUR_API_KEY" "Show me trials with exercise interventions, my NCI API key is YOUR_API_KEY" ``` **Expected tool usage**: `nci_intervention_searcher(intervention_type="Behavioral")` ### Intervention Details ``` "Get full details about intervention 
INT123456, my NCI API key is YOUR_API_KEY" "Show me the mechanism of action for this drug, my NCI API key is YOUR_API_KEY" "Is this intervention FDA approved? My NCI API key is YOUR_API_KEY" "What trials are using this intervention? My NCI API key is YOUR_API_KEY" ``` **Expected tool usage**: `intervention_getter(intervention_id="INT123456")` ## Biomarker Tools ### Biomarker Search #### Basic Biomarker Search ``` "Find PD-L1 expression biomarkers, my NCI API key is YOUR_API_KEY" "Search for EGFR mutations used in trials, my NCI API key is YOUR_API_KEY" "List biomarkers tested by IHC, my NCI API key is YOUR_API_KEY" "Find HER2 positive biomarkers, my NCI API key is YOUR_API_KEY" ``` **Expected tool usage**: `nci_biomarker_searcher(name="PD-L1")` #### Biomarker by Type ``` "Show me all reference gene biomarkers, my NCI API key is YOUR_API_KEY" "Find branch biomarkers, my NCI API key is YOUR_API_KEY" "List all biomarkers of type reference_gene, my NCI API key is YOUR_API_KEY" ``` **Expected tool usage**: `nci_biomarker_searcher(biomarker_type="reference_gene")` #### Important Note on Biomarker Types The NCI API only supports two biomarker types: - `reference_gene`: Gene-based biomarkers - `branch`: Branch/pathway biomarkers Note: The API does NOT support searching by gene symbol or assay type directly. 
## NCI Disease Tools ### Disease Search #### Basic Disease Search ``` "Find melanoma in NCI vocabulary, my NCI API key is YOUR_API_KEY" "Search for lung cancer types, my NCI API key is YOUR_API_KEY" "List breast cancer subtypes, my NCI API key is YOUR_API_KEY" "Find official name for GIST, my NCI API key is YOUR_API_KEY" ``` **Expected tool usage**: `nci_disease_searcher(name="melanoma")` #### Disease with Synonyms ``` "Find all names for gastrointestinal stromal tumor, my NCI API key is YOUR_API_KEY" "Search for NSCLC and all its synonyms, my NCI API key is YOUR_API_KEY" "List all terms for triple-negative breast cancer, my NCI API key is YOUR_API_KEY" "Find alternative names for melanoma, my NCI API key is YOUR_API_KEY" ``` **Expected tool usage**: `nci_disease_searcher(name="GIST", include_synonyms=True)` ## Combined Workflows ### Finding Trials at Specific Centers ``` "First find cancer centers in California, then show me their trials, my NCI API key is YOUR_API_KEY" ``` **Expected workflow**: 1. `nci_organization_searcher(state="CA")` 2. For each organization, search trials with that sponsor ### Drug Development Pipeline ``` "Search for CAR-T cell therapies and show me which organizations are developing them, my NCI API key is YOUR_API_KEY" ``` **Expected workflow**: 1. `nci_intervention_searcher(name="CAR-T", intervention_type="Biological")` 2. For each intervention, get details to see associated trials 3. Extract organization information from trial data ### Regional Cancer Research ``` "What cancer drugs are being tested in Boston area hospitals? My NCI API key is YOUR_API_KEY" ``` **Expected workflow**: 1. `nci_organization_searcher(city="Boston", state="MA")` 2. `trial_searcher(location="Boston, MA", source="nci")` with organization filters 3. Extract intervention information from trials ## Important Notes ### API Key Handling All NCI tools require an API key. The tools will check for: 1. API key provided in the function call 2. 
`NCI_API_KEY` environment variable 3. User-provided key in their message (e.g., "my NCI API key is...") ### Synonym Support The intervention searcher accepts a `synonyms` parameter (default: True) that was intended to match: - Drug trade names (e.g., "Keytruda" finds "pembrolizumab") - Alternative spellings - Related terms **Note**: `search_interventions` currently keeps `synonyms` only for backward compatibility and ignores it, so changing its value does not alter the results. ### Pagination The search tools accept pagination parameters: - `page`: Page number (1-based) - `page_size`: Results per page (max 100) **Note**: The NCI APIs do not support offset-based pagination (see "API Parameter Notes" above), and when `page_size` is sent the API may not return a `total` count. ### Organization Types Valid organization types include: - Academic - Industry - Government - Community - Network - Other ### Intervention Types Valid intervention types include: - Drug - Device - Biological - Procedure - Radiation - Behavioral - Genetic - Dietary - Diagnostic Test - Other ## Error Handling Common errors and solutions: 1. **"NCI API key required"**: User needs to provide an API key 2. **"No results found"**: Try broader search terms or remove filters 3. **"Invalid organization/intervention ID"**: Verify the ID format 4. **Rate limiting**: The API has rate limits; wait before retrying 5.
**"Search Too Broad" (Elasticsearch error)**: The search returns too many results - This happens when searching with broad criteria - **Prevention**: Always use city AND state together for location searches - Add organization name (even partial) to narrow results - Avoid searching by state alone or organization type alone ``` -------------------------------------------------------------------------------- /src/biomcp/interventions/search.py: -------------------------------------------------------------------------------- ```python """Search functionality for interventions via NCI CTS API.""" import logging from typing import Any from ..constants import NCI_INTERVENTIONS_URL from ..integrations.cts_api import CTSAPIError, make_cts_request from ..utils import parse_or_query logger = logging.getLogger(__name__) # Intervention types based on ClinicalTrials.gov categories INTERVENTION_TYPES = [ "Drug", "Device", "Biological", "Procedure", "Radiation", "Behavioral", "Genetic", "Dietary", "Diagnostic Test", "Other", ] def _build_intervention_params( name: str | None, intervention_type: str | None, category: str | None, codes: list[str] | None, include: list[str] | None, sort: str | None, order: str | None, page_size: int | None, ) -> dict[str, Any]: """Build query parameters for intervention search.""" params: dict[str, Any] = {} if name: params["name"] = name if intervention_type: params["type"] = intervention_type.lower() if category: params["category"] = category if codes: params["codes"] = ",".join(codes) if isinstance(codes, list) else codes if include: params["include"] = ( ",".join(include) if isinstance(include, list) else include ) if sort: params["sort"] = sort if order: params["order"] = order.lower() # Only add size if explicitly requested and > 0 if page_size and page_size > 0: params["size"] = page_size return params def _process_intervention_response( response: Any, page: int, page_size: int | None, ) -> dict[str, Any]: """Process intervention search 
async def search_interventions(
    name: str | None = None,
    intervention_type: str | None = None,
    category: str | None = None,
    codes: list[str] | None = None,
    include: list[str] | None = None,
    sort: str | None = None,
    order: str | None = None,
    synonyms: bool = True,  # Kept for backward compatibility but ignored
    page_size: int | None = None,
    page: int = 1,
    api_key: str | None = None,
) -> dict[str, Any]:
    """
    Query the NCI CTS interventions endpoint.

    Args:
        name: Intervention name to search for (partial match)
        intervention_type: Type of intervention (Drug, Device, Procedure, etc.)
        category: Category filter (agent, agent category, other)
        codes: Intervention codes to search for (e.g., ["C82416", "C171257"])
        include: Fields to include in the response
        sort: Sort field (default: 'name', also supports 'count')
        order: Sort order ('asc' or 'desc', required when using sort)
        synonyms: [Deprecated] Accepted for backward compatibility; never read
        page_size: Number of results per page (when used, the API omits 'total')
        page: Page number; only echoed back in the result, it is not sent
            to the API
        api_key: Optional API key, forwarded to the request helper

    Returns:
        Dictionary with ``interventions``, ``total``, ``page`` and
        ``page_size`` keys.

    Raises:
        CTSAPIError: If the request fails; any other exception raised while
            requesting or processing is logged and re-raised as CTSAPIError.
    """
    query = _build_intervention_params(
        name=name,
        intervention_type=intervention_type,
        category=category,
        codes=codes,
        include=include,
        sort=sort,
        order=order,
        page_size=page_size,
    )

    logger.info(
        f"Searching interventions at {NCI_INTERVENTIONS_URL} with params: {query}"
    )

    try:
        payload = await make_cts_request(
            url=NCI_INTERVENTIONS_URL,
            params=query,
            api_key=api_key,
        )
        logger.debug(f"Response type: {type(payload)}")
        return _process_intervention_response(payload, page, page_size)
    except Exception as exc:
        # API errors already carry the right type; pass them through as-is
        if isinstance(exc, CTSAPIError):
            raise
        logger.error(f"Failed to search interventions: {exc}")
        raise CTSAPIError(f"Intervention search failed: {exc!s}") from exc
async def search_interventions_with_or(
    name_query: str,
    intervention_type: str | None = None,
    category: str | None = None,
    codes: list[str] | None = None,
    include: list[str] | None = None,
    sort: str | None = None,
    order: str | None = None,
    synonyms: bool = True,
    page_size: int | None = None,
    page: int = 1,
    api_key: str | None = None,
) -> dict[str, Any]:
    """
    Search for interventions with OR query support.

    An OR query such as "pembrolizumab OR nivolumab" is split into terms,
    each term is searched separately, and the results are merged. A term
    whose search fails is logged and skipped so the remaining terms still
    contribute (best effort).

    Args:
        name_query: Name query that may contain OR operators
        intervention_type: Forwarded to search_interventions
        category: Forwarded to search_interventions
        codes: Forwarded to search_interventions
        include: Forwarded to search_interventions
        sort: Forwarded to search_interventions
        order: Forwarded to search_interventions
        synonyms: Forwarded to search_interventions
        page_size: Forwarded to each search and also used to slice the
            merged list
        page: 1-based page of the merged list to return
        api_key: Forwarded to search_interventions

    Returns:
        Dictionary with:
            - interventions: Requested page of the merged, name-sorted list
            - total: Number of unique interventions after merging
            - page / page_size: Echo of the pagination arguments
            - search_terms: The individual terms that were searched
            - total_found_across_terms: Sum of per-term totals before
              deduplication
    """
    # Only involve the OR parser when an OR operator is actually present
    if " OR " in name_query or " or " in name_query:
        search_terms = parse_or_query(name_query)
        logger.info(f"Parsed OR query into terms: {search_terms}")
    else:
        search_terms = [name_query]

    # Records with an ID are deduplicated by ID (first occurrence wins).
    # Records without one are kept as well, deduplicated by equality, rather
    # than being silently dropped from the merged result.
    by_id: dict[Any, dict[str, Any]] = {}
    without_id: list[dict[str, Any]] = []
    total_found = 0

    for term in search_terms:
        logger.info(f"Searching interventions for term: {term}")
        try:
            results = await search_interventions(
                name=term,
                intervention_type=intervention_type,
                category=category,
                codes=codes,
                include=include,
                sort=sort,
                order=order,
                synonyms=synonyms,
                page_size=page_size,
                page=page,
                api_key=api_key,
            )

            for intervention in results.get("interventions", []):
                # Fall back to "intervention_id" when "id" is absent or empty
                int_id = intervention.get("id") or intervention.get(
                    "intervention_id"
                )
                if int_id:
                    by_id.setdefault(int_id, intervention)
                elif intervention not in without_id:
                    without_id.append(intervention)

            total_found += results.get("total", 0)

        except Exception as e:
            logger.warning(f"Failed to search for term '{term}': {e}")
            # Continue with other terms

    unique_interventions = [*by_id.values(), *without_id]

    # Sort by name for consistent results; a missing or null name sorts first
    # instead of raising AttributeError on None.lower()
    unique_interventions.sort(
        key=lambda item: str(item.get("name") or "").lower()
    )

    # Apply pagination to the combined results
    if page_size:
        start_idx = (page - 1) * page_size
        end_idx = start_idx + page_size
        paginated_interventions = unique_interventions[start_idx:end_idx]
    else:
        paginated_interventions = unique_interventions

    return {
        "interventions": paginated_interventions,
        "total": len(unique_interventions),
        "page": page,
        "page_size": page_size,
        "search_terms": search_terms,  # Include what we searched for
        "total_found_across_terms": total_found,  # Total before deduplication
    }
-------------------------------------------------------------------------------- /docs/developer-guides/01-server-deployment.md: -------------------------------------------------------------------------------- ```markdown # Server Deployment Guide This guide covers various deployment options for BioMCP, from local development to production cloud deployments with authentication. ## Deployment Options Overview | Mode | Use Case | Transport | Authentication | Scalability | | --------------------- | ------------- | --------------- | -------------- | ----------- | | **Local STDIO** | Development | STDIO | None | Single user | | **HTTP Server** | Small teams | Streamable HTTP | Optional | Moderate | | **Docker** | Containerized | Streamable HTTP | Optional | Moderate | | **Cloudflare Worker** | Production | SSE/HTTP | OAuth optional | High | ## Local Development (STDIO) The simplest deployment for development and testing. ### Setup ```bash # Install BioMCP uv tool install biomcp # Run in STDIO mode (default) biomcp run ``` ### Configuration For Claude Desktop integration: ```json { "mcpServers": { "biomcp": { "command": "biomcp", "args": ["run"] } } } ``` ### Use Cases - Local development - Single-user research - Testing new features ## HTTP Server Deployment Modern deployment using Streamable HTTP transport. 
### Basic Setup ```bash # Run HTTP server biomcp run --mode http --host 0.0.0.0 --port 8000 ``` ### With Environment Variables ```bash # Create .env file cat > .env << EOF BIOMCP_HOST=0.0.0.0 BIOMCP_PORT=8000 NCI_API_KEY=your-key ALPHAGENOME_API_KEY=your-key EOF # Run with env file biomcp run --mode http ``` ### Systemd Service (Linux) Create `/etc/systemd/system/biomcp.service`: ```ini [Unit] Description=BioMCP Server After=network.target [Service] Type=simple User=biomcp WorkingDirectory=/opt/biomcp Environment="PATH=/usr/local/bin:/usr/bin" EnvironmentFile=/opt/biomcp/.env ExecStart=/usr/local/bin/biomcp run --mode http Restart=always RestartSec=10 [Install] WantedBy=multi-user.target ``` Enable and start: ```bash sudo systemctl enable biomcp sudo systemctl start biomcp ``` ### Nginx Reverse Proxy ```nginx server { listen 443 ssl; server_name biomcp.example.com; ssl_certificate /etc/ssl/certs/biomcp.crt; ssl_certificate_key /etc/ssl/private/biomcp.key; location /mcp { proxy_pass http://localhost:8000; proxy_http_version 1.1; proxy_set_header Upgrade $http_upgrade; proxy_set_header Connection "upgrade"; proxy_set_header Host $host; proxy_set_header X-Real-IP $remote_addr; proxy_buffering off; } } ``` ## Docker Deployment Containerized deployment for consistency and portability. ### Basic Dockerfile ```dockerfile FROM python:3.11-slim # Install BioMCP RUN pip install biomcp-python # Add API keys (use secrets in production!) ENV NCI_API_KEY="" ENV ALPHAGENOME_API_KEY="" # Expose port EXPOSE 8000 # Run server CMD ["biomcp", "run", "--mode", "http", "--host", "0.0.0.0"] ``` ### With AlphaGenome Support ```dockerfile FROM python:3.11-slim # Install system dependencies RUN apt-get update && apt-get install -y git # Install BioMCP RUN pip install biomcp-python # Install AlphaGenome RUN git clone https://github.com/google-deepmind/alphagenome.git && \ cd alphagenome && \ pip install . 
# Configure ENV MCP_MODE=http ENV BIOMCP_HOST=0.0.0.0 ENV BIOMCP_PORT=8000 EXPOSE 8000 CMD ["biomcp", "run"] ``` ### Docker Compose ```yaml version: "3.8" services: biomcp: build: . ports: - "8000:8000" environment: - MCP_MODE=http - NCI_API_KEY=${NCI_API_KEY} - ALPHAGENOME_API_KEY=${ALPHAGENOME_API_KEY} volumes: - ./logs:/app/logs restart: unless-stopped healthcheck: test: ["CMD", "curl", "-f", "http://localhost:8000/health"] interval: 30s timeout: 10s retries: 3 ``` ### Running ```bash # Build and run docker-compose up -d # View logs docker-compose logs -f # Scale horizontally docker-compose up -d --scale biomcp=3 ``` ## Cloudflare Worker Deployment Enterprise-grade deployment with global edge distribution. ### Prerequisites 1. Cloudflare account 2. Wrangler CLI installed 3. Remote BioMCP server running ### Architecture ``` Claude Desktop → Cloudflare Worker (Edge) → BioMCP Server (Origin) ``` ### Setup Worker 1. **Install dependencies:** ```bash npm install @modelcontextprotocol/sdk itty-router ``` 2. **Create `wrangler.toml`:** ```toml name = "biomcp-worker" main = "src/index.js" compatibility_date = "2024-01-01" [vars] REMOTE_MCP_SERVER_URL = "https://your-biomcp-server.com/mcp" MCP_SERVER_API_KEY = "your-secret-key" [[kv_namespaces]] binding = "AUTH_TOKENS" id = "your-kv-namespace-id" ``` 3. **Deploy:** ```bash wrangler deploy ``` ### With OAuth Authentication (Stytch) 1. **Configure Stytch:** ```toml [vars] STYTCH_PROJECT_ID = "project-test-..." STYTCH_SECRET = "secret-test-..." STYTCH_PUBLIC_TOKEN = "public-token-test-..." JWT_SECRET = "your-jwt-secret" ``` 2. **OAuth Endpoints:** The worker automatically provides: - `/.well-known/oauth-authorization-server` - `/authorize` - `/callback` - `/token` 3. 
**Client Configuration:** ```json { "mcpServers": { "biomcp": { "transport": { "type": "sse", "url": "https://your-worker.workers.dev" }, "auth": { "type": "oauth", "client_id": "mcp-client", "authorization_endpoint": "https://your-worker.workers.dev/authorize", "token_endpoint": "https://your-worker.workers.dev/token", "scope": "mcp:access" } } } } ``` ## Production Considerations ### Security 1. **API Key Management:** ```bash # Use environment variables export NCI_API_KEY="$(vault kv get -field=key secret/biomcp/nci)" # Or use secrets management docker run --secret biomcp_keys biomcp:latest ``` 2. **Network Security:** - Use HTTPS everywhere - Implement rate limiting - Set up CORS properly - Use authentication for public endpoints 3. **Access Control:** ```python # Example middleware async def auth_middleware(request, call_next): token = request.headers.get("Authorization") if not validate_token(token): return JSONResponse({"error": "Unauthorized"}, status_code=401) return await call_next(request) ``` ### Monitoring 1. **Health Checks:** ```python # Built-in health endpoint GET /health # Custom health check @app.get("/health/detailed") async def health_detailed(): return { "status": "healthy", "version": __version__, "apis": check_api_status(), "timestamp": datetime.utcnow() } ``` 2. **Metrics:** ```python # Prometheus metrics from prometheus_client import Counter, Histogram request_count = Counter('biomcp_requests_total', 'Total requests') request_duration = Histogram('biomcp_request_duration_seconds', 'Request duration') ``` 3. **Logging:** ```python # Structured logging import structlog logger = structlog.get_logger() logger.info("request_processed", tool="article_searcher", duration=0.234, user_id="user123" ) ``` ### Scaling 1. 
**Horizontal Scaling:** ```yaml # Kubernetes deployment apiVersion: apps/v1 kind: Deployment metadata: name: biomcp spec: replicas: 3 selector: matchLabels: app: biomcp template: metadata: labels: app: biomcp spec: containers: - name: biomcp image: biomcp:latest ports: - containerPort: 8000 resources: requests: memory: "512Mi" cpu: "500m" limits: memory: "1Gi" cpu: "1000m" ``` 2. **Caching:** ```python # Redis caching import redis from functools import wraps redis_client = redis.Redis() def cache_result(ttl=3600): def decorator(func): @wraps(func) async def wrapper(*args, **kwargs): key = f"{func.__name__}:{str(args)}:{str(kwargs)}" cached = redis_client.get(key) if cached: return json.loads(cached) result = await func(*args, **kwargs) redis_client.setex(key, ttl, json.dumps(result)) return result return wrapper return decorator ``` ### Performance Optimization 1. **Connection Pooling:** ```python # Reuse HTTP connections import httpx client = httpx.AsyncClient( limits=httpx.Limits(max_keepalive_connections=20), timeout=httpx.Timeout(30.0) ) ``` 2. **Async Processing:** ```python # Process requests concurrently async def handle_batch(requests): tasks = [process_request(req) for req in requests] return await asyncio.gather(*tasks) ``` 3. **Response Compression:** ```python # Enable gzip compression from fastapi.middleware.gzip import GZipMiddleware app.add_middleware(GZipMiddleware, minimum_size=1000) ``` ## Migration Path ### From STDIO to HTTP 1. Update server startup: ```bash # Old biomcp run # New biomcp run --mode http ``` 2. Update client configuration: ```json { "mcpServers": { "biomcp": { "url": "http://localhost:8000/mcp" } } } ``` ### From SSE to Streamable HTTP 1. Update worker code to use `/mcp` endpoint 2. Update client to use new transport: ```json { "transport": { "type": "http", "url": "https://biomcp.example.com/mcp" } } ``` ## Troubleshooting ### Common Issues 1. 
**Port Already in Use:** ```bash # Find process using port lsof -i :8000 # Kill process kill -9 <PID> ``` 2. **API Key Errors:** ```bash # Verify environment variables env | grep -E "(NCI|ALPHAGENOME|CBIO)" # Test API key curl -H "X-API-KEY: $NCI_API_KEY" https://api.cancer.gov/v2/trials ``` 3. **Connection Timeouts:** - Increase timeout values - Check firewall rules - Verify network connectivity ### Debug Mode ```bash # Enable debug logging BIOMCP_LOG_LEVEL=DEBUG biomcp run --mode http # Or in Docker docker run -e BIOMCP_LOG_LEVEL=DEBUG biomcp:latest ``` ## Next Steps - Set up [monitoring](../how-to-guides/05-logging-and-monitoring-with-bigquery.md) - Configure [authentication](../getting-started/03-authentication-and-api-keys.md) - Review [security policies](../policies.md) - Implement [CI/CD pipeline](02-contributing-and-testing.md) ``` -------------------------------------------------------------------------------- /src/biomcp/openfda/utils.py: -------------------------------------------------------------------------------- ```python """ Utility functions for OpenFDA API integration. 
async def make_openfda_request(  # noqa: C901
    endpoint: str,
    params: dict[str, Any],
    domain: str = "openfda",
    api_key: str | None = None,
    max_retries: int = 3,
    initial_delay: float = 1.0,
) -> tuple[dict[str, Any] | None, str | None]:
    """
    Make a request to the OpenFDA API with retry logic and caching.

    The parameters are first passed through ``build_safe_query``. A truthy
    cached response for a cacheable request is returned immediately.
    Otherwise up to ``max_retries + 1`` GET attempts are made through
    ``request_api``; the wait between attempts starts at ``initial_delay``
    and doubles after every retry.

    Args:
        endpoint: Full URL to the OpenFDA endpoint
        params: Query parameters
        domain: Domain name passed through to ``request_api``
        api_key: Optional API key; when falsy, ``get_api_key()`` is consulted
        max_retries: Maximum number of retry attempts (default 3)
        initial_delay: Initial delay in seconds for exponential backoff (default 1.0)

    Returns:
        Tuple of (response_data, error_message). On success the error is
        None. Non-retryable API errors, an open circuit breaker, a response
        that fails validation, and unexpected exceptions are all reported as
        ``(None, message)`` rather than raised.

    Raises:
        OpenFDARateLimitError: A rate-limit error ("429" / "rate limit") is
            still returned on the final attempt.
        OpenFDATimeoutError: ``asyncio.TimeoutError`` on the final attempt.
        OpenFDAConnectionError: ``ConnectionError`` on the final attempt.
    """
    # Validate and sanitize input parameters
    safe_params = build_safe_query(params)

    # Check cache first (with safe params)
    if is_cacheable_request(endpoint, safe_params):
        cached_response = get_cached_response(endpoint, safe_params)
        if cached_response:
            return cached_response, None

    # Use provided API key or get from environment
    if not api_key:
        api_key = get_api_key()
    # NOTE(review): the key is added to safe_params after the cache lookup
    # above but before the cache store below, so lookup and store receive
    # different dicts unless the cache layer ignores "api_key" — confirm
    # against .cache.
    if api_key:
        safe_params["api_key"] = api_key

    last_error = None
    delay = initial_delay

    # One initial attempt plus up to max_retries retries
    for attempt in range(max_retries + 1):
        try:
            # Apply rate limiting and circuit breaker
            async with FDA_SEMAPHORE:
                await FDA_RATE_LIMITER.acquire()

                # Check circuit breaker state
                if FDA_CIRCUIT_BREAKER.is_open:
                    state = FDA_CIRCUIT_BREAKER.get_state()
                    return None, f"FDA API circuit breaker is open: {state}"

                response, error = await request_api(
                    url=endpoint,
                    request=safe_params,
                    method="GET",
                    domain=domain,
                )

            if error:
                # Error objects may or may not expose a .message attribute
                error_msg = (
                    error.message if hasattr(error, "message") else str(error)
                )

                # Check for specific error types
                if "429" in error_msg or "rate limit" in error_msg.lower():
                    if attempt < max_retries:
                        logger.warning(
                            f"Rate limit hit (attempt {attempt + 1}/{max_retries + 1}). "
                            f"Retrying in {delay:.1f} seconds..."
                        )
                        await asyncio.sleep(delay)
                        delay *= 2  # Exponential backoff
                        continue
                    else:
                        # Out of retries: rate limiting is raised, not returned
                        raise OpenFDARateLimitError(error_msg)

                # Check if error is retryable
                if _is_retryable_error(error_msg) and attempt < max_retries:
                    logger.warning(
                        f"OpenFDA API error (attempt {attempt + 1}/{max_retries + 1}): {error_msg}. "
                        f"Retrying in {delay:.1f} seconds..."
                    )
                    await asyncio.sleep(delay)
                    delay *= 2  # Exponential backoff
                    continue

                # Non-retryable, or retries exhausted: report via the tuple
                logger.error(f"OpenFDA API error: {error_msg}")
                return None, error_msg

            # Validate and sanitize response
            if response:
                try:
                    validate_fda_response(response, response_type="search")
                    response = sanitize_response(response)
                except OpenFDAValidationError as e:
                    logger.error(f"Invalid FDA response: {e}")
                    return None, str(e)

                # Cache successful response
                if is_cacheable_request(endpoint, safe_params):
                    set_cached_response(endpoint, safe_params, response)

            return response, None

        except asyncio.TimeoutError:
            last_error = "Request timeout"
            if attempt < max_retries:
                logger.warning(
                    f"OpenFDA request timeout (attempt {attempt + 1}/{max_retries + 1}). "
                    f"Retrying in {delay:.1f} seconds..."
                )
                await asyncio.sleep(delay)
                delay *= 2
                continue
            logger.error(
                f"OpenFDA request failed after {max_retries + 1} attempts: {last_error}"
            )
            # "from None" drops the original exception from the raised context
            raise OpenFDATimeoutError(last_error) from None

        except ConnectionError as e:
            last_error = f"Connection error: {e}"
            if attempt < max_retries:
                logger.warning(
                    f"OpenFDA connection error (attempt {attempt + 1}/{max_retries + 1}): {e}. "
                    f"Retrying in {delay:.1f} seconds..."
                )
                await asyncio.sleep(delay)
                delay *= 2
                continue
            logger.error(
                f"OpenFDA request failed after {max_retries + 1} attempts: {last_error}"
            )
            raise OpenFDAConnectionError(last_error) from None

        except (
            OpenFDARateLimitError,
            OpenFDATimeoutError,
            OpenFDAConnectionError,
        ):
            # Re-raise our custom exceptions
            # (keeps the generic handler below from turning them into a tuple)
            raise

        except Exception as e:
            # Handle unexpected errors gracefully
            logger.error(f"Unexpected OpenFDA request error: {e}")
            return None, str(e)

    # Fallback if the loop ends without an attempt returning or raising
    return None, last_error
Args: error_msg: Error message string Returns: True if the error is retryable """ retryable_patterns = [ "rate limit", "timeout", "connection", "503", # Service unavailable "502", # Bad gateway "504", # Gateway timeout "429", # Too many requests "temporary", "try again", ] error_lower = error_msg.lower() return any(pattern in error_lower for pattern in retryable_patterns) def format_count(count: int, label: str) -> str: """Format a count with appropriate singular/plural label.""" if count == 1: return f"1 {label}" return f"{count:,} {label}s" def truncate_text(text: str, max_length: int = 500) -> str: """Truncate text to a maximum length with ellipsis.""" if len(text) <= max_length: return text return text[: max_length - 3] + "..." def clean_text(text: str | None) -> str: """Clean and normalize text from FDA data.""" if not text: return "" # Remove extra whitespace and newlines text = " ".join(text.split()) # Remove common FDA formatting artifacts text = text.replace("\\n", " ") text = text.replace("\\r", " ") text = text.replace("\\t", " ") return text.strip() def build_search_query( field_map: dict[str, str], operator: str = "AND" ) -> str: """ Build an OpenFDA search query from field mappings. 
def format_drug_list(drugs: list[str], max_items: int = 5) -> str:
    """
    Render drug names as a comma-separated string for display.

    Args:
        drugs: Drug names to show
        max_items: How many names to spell out before summarising the rest

    Returns:
        "None specified" for an empty list, the full joined list when it
        fits within ``max_items``, otherwise the first ``max_items`` names
        followed by "(+N more)".
    """
    if not drugs:
        return "None specified"

    overflow = len(drugs) - max_items
    if overflow <= 0:
        # Everything fits: no summary suffix
        return ", ".join(drugs)

    head = ", ".join(drugs[:max_items])
    return f"{head} (+{overflow} more)"
async def search_drug_recalls(
    drug: str | None = None,
    recall_class: str | None = None,
    status: str | None = None,
    reason: str | None = None,
    since_date: str | None = None,
    limit: int = OPENFDA_DEFAULT_LIMIT,
    skip: int = 0,
    api_key: str | None = None,
) -> str:
    """
    Search the FDA Enforcement database for drug recalls.

    Args:
        drug: Drug name (brand or generic) to search for
        recall_class: Classification (1, 2, or 3)
        status: Recall status (ongoing, completed, terminated)
        reason: Search text in recall reason (used in the query only; it is
            not echoed in the report header)
        since_date: Only show recalls after this date (YYYYMMDD format)
        limit: Maximum number of results to return
        skip: Number of results to skip (for pagination)
        api_key: Optional OpenFDA API key (overrides OPENFDA_API_KEY env var)

    Returns:
        Markdown report of the matching recalls, or a one-line message when
        the request fails or nothing matches.
    """
    query = build_recall_search_params(
        drug, recall_class, status, reason, since_date, limit, skip
    )
    response, error = await make_openfda_request(
        OPENFDA_DRUG_ENFORCEMENT_URL, query, "openfda_recalls", api_key
    )

    if error:
        return f"⚠️ Error searching drug recalls: {error}"
    if not response or not response.get("results"):
        return "No drug recall records found matching your criteria."

    recalls = response["results"]
    # Fall back to the page length when the API reports no total
    total = (
        response.get("meta", {}).get("results", {}).get("total", len(recalls))
    )

    report = ["## FDA Drug Recall Records\n"]

    # Echo only the filters the caller actually supplied
    applied_filters = (
        ("**Drug**: ", drug),
        ("**Classification**: Class ", recall_class),
        ("**Status**: ", status),
        ("**Since**: ", since_date),
    )
    report.extend(
        f"{label}{value}" for label, value in applied_filters if value
    )

    report.append(
        f"**Total Recalls Found**: {format_count(total, 'recall')}\n"
    )

    # A class breakdown is only worth showing for more than one record
    if len(recalls) > 1:
        report.extend(_format_recall_class_summary(recalls))

    report.append(f"### Recalls (showing {len(recalls)} of {total}):\n")
    for position, recall in enumerate(recalls, start=1):
        report.extend(_format_recall_summary(recall, position))

    report.append(f"\n{OPENFDA_DISCLAIMER}")
    return "\n".join(report)
output.append(f"\n{OPENFDA_DISCLAIMER}") return "\n".join(output) def _format_recall_class_summary(results: list[dict[str, Any]]) -> list[str]: """Format summary of recall classifications.""" output = [] # Count by classification class_counts = {"Class I": 0, "Class II": 0, "Class III": 0} for recall in results: classification = recall.get("classification", "") if classification in class_counts: class_counts[classification] += 1 if any(class_counts.values()): output.append("### Classification Summary:") if class_counts["Class I"]: output.append( f"- **Class I** (most serious): {class_counts['Class I']} recalls" ) if class_counts["Class II"]: output.append( f"- **Class II** (moderate): {class_counts['Class II']} recalls" ) if class_counts["Class III"]: output.append( f"- **Class III** (least serious): {class_counts['Class III']} recalls" ) output.append("") return output def _format_recall_summary(recall: dict[str, Any], num: int) -> list[str]: """Format a single recall summary.""" output = [f"#### {num}. 
Recall {recall.get('recall_number', 'Unknown')}"] # Classification and status classification = recall.get("classification", "Unknown") status = recall.get("status", "Unknown") # Add severity indicator severity_emoji = { "Class I": "🔴", # Most serious "Class II": "🟡", # Moderate "Class III": "🟢", # Least serious }.get(classification, "⚪") output.append(f"{severity_emoji} **{classification}** - {status}") # Date if init_date := recall.get("recall_initiation_date"): formatted_date = f"{init_date[:4]}-{init_date[4:6]}-{init_date[6:]}" output.append(f"**Initiated**: {formatted_date}") # Product description if product_desc := recall.get("product_description"): cleaned = truncate_text(clean_text(product_desc), 200) output.append(f"**Product**: {cleaned}") # OpenFDA names openfda = recall.get("openfda", {}) if brand_names := openfda.get("brand_name"): output.append(f"**Brand**: {', '.join(brand_names[:3])}") # Reason for recall if reason := recall.get("reason_for_recall"): cleaned_reason = truncate_text(clean_text(reason), 300) output.append(f"\n**Reason**: {cleaned_reason}") # Firm name if firm := recall.get("recalling_firm"): output.append(f"\n**Recalling Firm**: {firm}") output.append("") return output def _format_recall_header(recall: dict[str, Any]) -> list[str]: """Format the header section of detailed recall.""" output = ["### Recall Information"] output.append( f"**Recall Number**: {recall.get('recall_number', 'Unknown')}" ) output.append( f"**Classification**: {recall.get('classification', 'Unknown')}" ) output.append(f"**Status**: {recall.get('status', 'Unknown')}") if event_id := recall.get("event_id"): output.append(f"**Event ID**: {event_id}") # Dates if init_date := recall.get("recall_initiation_date"): formatted = f"{init_date[:4]}-{init_date[4:6]}-{init_date[6:]}" output.append(f"**Initiation Date**: {formatted}") if report_date := recall.get("report_date"): formatted = f"{report_date[:4]}-{report_date[4:6]}-{report_date[6:]}" output.append(f"**Report 
Date**: {formatted}") if term_date := recall.get("termination_date"): formatted = f"{term_date[:4]}-{term_date[4:6]}-{term_date[6:]}" output.append(f"**Termination Date**: {formatted}") output.append("") return output def _format_recall_details(recall: dict[str, Any]) -> list[str]: """Format recall details and reason.""" output = ["### Product and Reason"] if product_desc := recall.get("product_description"): output.append(f"**Product Description**:\n{clean_text(product_desc)}") if reason := recall.get("reason_for_recall"): output.append(f"\n**Reason for Recall**:\n{clean_text(reason)}") if quantity := recall.get("product_quantity"): output.append(f"\n**Product Quantity**: {quantity}") if code_info := recall.get("code_info"): output.append(f"\n**Code Information**:\n{clean_text(code_info)}") output.append("") return output def _format_distribution_info(recall: dict[str, Any]) -> list[str]: """Format distribution information.""" output = ["### Distribution Information"] if firm := recall.get("recalling_firm"): output.append(f"**Recalling Firm**: {firm}") if city := recall.get("city"): state = recall.get("state", "") country = recall.get("country", "") location = city if state: location += f", {state}" if country: location += f", {country}" output.append(f"**Location**: {location}") if dist_pattern := recall.get("distribution_pattern"): output.append( f"\n**Distribution Pattern**:\n{clean_text(dist_pattern)}" ) if action := recall.get("voluntary_mandated"): output.append(f"\n**Action Type**: {action}") output.append("") return output def _format_recall_openfda(openfda: dict[str, Any]) -> list[str]: """Format OpenFDA metadata for recall.""" output = ["### Drug Information"] if brand_names := openfda.get("brand_name"): output.append(f"**Brand Names**: {', '.join(brand_names)}") if generic_names := openfda.get("generic_name"): output.append(f"**Generic Names**: {', '.join(generic_names)}") if manufacturers := openfda.get("manufacturer_name"): 
async def literature_review(gene, disease, focus_terms):
    """
    Run one article search per focus term and print a short summary.

    Args:
        gene: Gene symbol used for every search
        disease: Disease name used for every search
        focus_terms: Keywords; each one gets its own search (limit 30)

    Returns:
        Dict mapping each focus term to the articles found for it.
    """
    async with BioMCPClient() as client:
        # 1. Gene context lookup (the result itself is not used further)
        await client.genes.get(gene)

        # 2. One search per topic
        by_topic = {}
        for term in focus_terms:
            found = await client.articles.search(
                genes=[gene],
                diseases=[disease],
                keywords=[term],
                limit=30,
            )
            by_topic[term] = found.articles

        # 3. Summary: overall count, then the first three titles per topic
        overall = sum(len(hits) for hits in by_topic.values())
        print(f"\n{gene} in {disease}: Found {overall} articles")
        for topic, hits in by_topic.items():
            print(f"\n{topic}: {len(hits)} articles")
            for article in hits[:3]:
                print(f" - {article.title[:80]}... ({article.year})")

    return by_topic
async def interpret_variant(gene, variant_notation, cancer_type):
    """Summarise a variant: annotation, literature and recruiting trials.

    Args:
        gene: Gene symbol used for the literature and trial searches.
        variant_notation: Variant identifier passed to ``client.variants.get``
            and to the article search.
        cancer_type: Disease name used to scope articles and trials.

    Returns:
        Dict with ``significance``, ``frequency``, and the number of
        ``articles`` and ``trials`` found.
    """
    async with BioMCPClient() as client:
        # 1. Get variant details. Only the lookup itself is guarded: a bare
        # ``except:`` would also swallow KeyboardInterrupt / task cancellation.
        try:
            variant = await client.variants.get(variant_notation)
        except Exception:
            variant = None

        if variant is None:
            significance = "Not found"
            frequency = None
        else:
            significance = getattr(variant, "clinical_significance", "Not found")
            # A missing frequency must not discard the significance we found.
            frequencies = getattr(variant, "frequencies", None)
            frequency = getattr(frequencies, "gnomad", None)

        # 2. Search literature
        articles = await client.articles.search(
            genes=[gene],
            variants=[variant_notation],
            diseases=[cancer_type],
            limit=10
        )

        # 3. Find trials
        trials = await client.trials.search(
            conditions=[cancer_type],
            other_terms=[f"{gene} mutation"],
            status="RECRUITING",
            limit=5
        )

        # 4. Generate interpretation
        # 0.0 is a real allele frequency, so test for None rather than falsiness.
        frequency_text = "Unknown" if frequency is None else frequency
        print(f"\nVariant: {gene} {variant_notation}")
        print(f"Significance: {significance}")
        print(f"Population Frequency: {frequency_text}")
        print(f"Literature: {len(articles.articles)} relevant papers")
        print(f"Clinical Trials: {len(trials.trials)} active trials")

        # Actionability assessment
        if significance in ["Pathogenic", "Likely pathogenic"]:
            if trials.trials:
                print("✓ ACTIONABLE - Clinical trials available")
            else:
                print("⚠ Pathogenic but no targeted trials")

        return {
            'significance': significance,
            'frequency': frequency,
            'articles': len(articles.articles),
            'trials': len(trials.trials)
        }
# Simple file-based cache
def cache_results(filename):
    """Decorate an async function so its results are cached in a JSON file.

    Results are stored per call: the cache key is built from the function
    name and its arguments, so different arguments never share an entry.
    A missing, unreadable-as-JSON, or non-object cache file is treated as an
    empty cache. Results must be JSON-serialisable.

    Args:
        filename: Path of the JSON file holding the cached entries.

    Returns:
        A decorator for ``async def`` functions.
    """
    import functools  # local import: the snippet only imports lru_cache

    def load_entries():
        try:
            with open(filename, 'r', encoding='utf-8') as f:
                data = json.load(f)
        except (FileNotFoundError, json.JSONDecodeError):
            return {}
        return data if isinstance(data, dict) else {}

    def decorator(func):
        @functools.wraps(func)
        async def wrapper(*args, **kwargs):
            # Key on the call itself; repr() covers non-JSON argument types.
            key = json.dumps(
                [func.__qualname__, args, kwargs],
                sort_keys=True,
                default=repr,
            )

            # Check cache
            entries = load_entries()
            if key in entries:
                return entries[key]

            # Fetch and cache
            result = await func(*args, **kwargs)
            entries[key] = result
            with open(filename, 'w', encoding='utf-8') as f:
                json.dump(entries, f)
            return result
        return wrapper
    return decorator
diseases=[patient['cancer_type']], keywords=['therapy', 'treatment'], limit=5 ) # Find trials trials = await client.trials.search( conditions=[patient['cancer_type']], other_terms=[f"{variant['gene']} mutation"], status="RECRUITING", limit=3 ) report['variants'].append({ 'variant': variant, 'annotation': var_info, 'relevant_articles': len(articles.articles), 'available_trials': len(trials.trials) }) report['trials'].extend(trials.trials) # Generate summary print(f"\nPrecision Medicine Report - {patient['id']}") print(f"Cancer Type: {patient['cancer_type']}") print(f"Variants Analyzed: {len(report['variants'])}") print(f"Clinical Trials Found: {len(report['trials'])}") # Prioritize actionable findings actionable = [v for v in report['variants'] if v['available_trials'] > 0] if actionable: print(f"\n✓ {len(actionable)} ACTIONABLE variants with trial options") return report # Example usage patient = { 'id': 'PT001', 'cancer_type': 'lung adenocarcinoma', 'variants': [ {'gene': 'EGFR', 'hgvs': 'p.L858R'}, {'gene': 'TP53', 'hgvs': 'p.R273H'} ] } report = asyncio.run(generate_precision_medicine_report(patient)) ``` --- ## Tips for All Workflows 1. **Always start with the think tool** (for AI assistants) 2. **Use official gene symbols** - check genenames.org 3. **Batch API calls** when possible 4. **Handle errors gracefully** - APIs can be unavailable 5. **Cache frequently accessed data** - respect rate limits 6. 
**Document your process** - for reproducibility ## Next Steps - [Command Reference](../reference/quick-reference.md) - [API Documentation](../apis/python-sdk.md) - [Troubleshooting](../troubleshooting.md) ``` -------------------------------------------------------------------------------- /src/biomcp/trials/nci_search.py: -------------------------------------------------------------------------------- ```python """NCI Clinical Trials Search API integration for trial searches.""" import logging from typing import Any from ..constants import NCI_TRIALS_URL from ..diseases.search import search_diseases from ..integrations.cts_api import CTSAPIError, make_cts_request from ..interventions.search import search_interventions from .search import TrialQuery logger = logging.getLogger(__name__) async def _expand_disease_terms( conditions: list[str], expand_synonyms: bool, ) -> list[str]: """Expand disease terms with synonyms if requested.""" if not expand_synonyms: return conditions disease_terms = [] for condition in conditions: try: results = await search_diseases( name=condition, include_synonyms=True, page_size=5, ) # Add the original term plus any exact matches disease_terms.append(condition) for disease in results.get("diseases", [])[:3]: if disease.get("name"): disease_terms.append(disease["name"]) # Add top synonyms synonyms = disease.get("synonyms", []) if isinstance(synonyms, list): disease_terms.extend(synonyms[:2]) except Exception as e: logger.warning(f"Failed to expand disease term {condition}: {e}") disease_terms.append(condition) # Remove duplicates while preserving order seen = set() unique_diseases = [] for term in disease_terms: if term.lower() not in seen: seen.add(term.lower()) unique_diseases.append(term) return unique_diseases async def _normalize_interventions(interventions: list[str]) -> list[str]: """Normalize intervention names to IDs where possible.""" intervention_ids = [] for intervention in interventions: try: results = await 
search_interventions( name=intervention, page_size=1, ) interventions_data = results.get("interventions", []) if interventions_data: # Use the ID if available, otherwise the name int_id = interventions_data[0].get("id", intervention) intervention_ids.append(int_id) else: intervention_ids.append(intervention) except Exception: intervention_ids.append(intervention) return intervention_ids def _map_phase_to_nci(phase: Any) -> str | None: """Map TrialPhase enum to NCI phase values.""" if not phase: return None phase_map = { "EARLY_PHASE1": "I", "PHASE1": "I", "PHASE2": "II", "PHASE3": "III", "PHASE4": "IV", "NOT_APPLICABLE": "NA", } return phase_map.get(phase.value, phase.value) def _map_status_to_nci(recruiting_status: Any) -> list[str] | None: """Map RecruitingStatus enum to NCI status values.""" if not recruiting_status: return None status_map = { "OPEN": ["recruiting", "enrolling_by_invitation"], "CLOSED": ["active_not_recruiting", "completed", "terminated"], "ANY": None, } return status_map.get(recruiting_status.value) def _map_sort_to_nci(sort: Any) -> str | None: """Map SortOrder enum to NCI sort values.""" if not sort: return None sort_map = { "RELEVANCE": "relevance", "LAST_UPDATE": "last_update_date", "START_DATE": "start_date", "COMPLETION_DATE": "completion_date", } return sort_map.get(sort.value) def _add_location_params(params: dict[str, Any], query: TrialQuery) -> None: """Add location parameters if present.""" if query.lat is not None and query.long is not None: params["latitude"] = query.lat params["longitude"] = query.long params["distance"] = query.distance or 50 def _add_eligibility_params(params: dict[str, Any], query: TrialQuery) -> None: """Add advanced eligibility criteria parameters.""" if query.prior_therapies: params["prior_therapy"] = query.prior_therapies if query.required_mutations: params["biomarkers"] = query.required_mutations if query.allow_brain_mets is not None: params["accepts_brain_mets"] = query.allow_brain_mets async def 
convert_query_to_nci(query: TrialQuery) -> dict[str, Any]: """ Convert a TrialQuery object to NCI CTS API parameters. Maps BioMCP's TrialQuery fields to NCI's parameter structure. """ params: dict[str, Any] = {} # Basic search terms if query.terms: params["_fulltext"] = " ".join(query.terms) # Conditions/diseases with synonym expansion if query.conditions: disease_terms = await _expand_disease_terms( query.conditions, query.expand_synonyms, ) if disease_terms: params["diseases"] = disease_terms # Interventions if query.interventions: params["interventions"] = await _normalize_interventions( query.interventions ) # NCT IDs if query.nct_ids: params["nct_ids"] = query.nct_ids # Phase and status mappings nci_phase = _map_phase_to_nci(query.phase) if nci_phase: params["phase"] = nci_phase statuses = _map_status_to_nci(query.recruiting_status) if statuses: params["recruitment_status"] = statuses # Location and eligibility _add_location_params(params, query) _add_eligibility_params(params, query) # Pagination params["size"] = query.page_size if query.page_size else 20 # Sort order sort_value = _map_sort_to_nci(query.sort) if sort_value: params["sort"] = sort_value return params async def search_trials_nci( query: TrialQuery, api_key: str | None = None, ) -> dict[str, Any]: """ Search for clinical trials using NCI CTS API. 
Returns: Dictionary with: - trials: List of trial records - total: Total number of results - next_page: Token for next page (if available) - source: "nci" to indicate data source """ try: # Convert query to NCI parameters params = await convert_query_to_nci(query) # Make API request response = await make_cts_request( url=NCI_TRIALS_URL, params=params, api_key=api_key, ) # Process response trials = response.get("data", response.get("trials", [])) total = response.get("total", len(trials)) next_page = response.get("next_page_token") return { "trials": trials, "total": total, "next_page": next_page, "source": "nci", } except CTSAPIError: raise except Exception as e: logger.error(f"NCI trial search failed: {e}") raise CTSAPIError(f"Trial search failed: {e!s}") from e def _format_trial_header(trial: dict[str, Any]) -> list[str]: """Format trial header with basic info.""" nct_id = trial.get("nct_id", trial.get("protocol_id", "Unknown")) title = trial.get("title", trial.get("brief_title", "Untitled")) phase = trial.get("phase", "Not specified") status = trial.get("overall_status", trial.get("status", "Unknown")) return [ f"### [{nct_id}] {title}", f"- **Phase**: {phase}", f"- **Status**: {status}", ] def _format_trial_summary_text(trial: dict[str, Any]) -> list[str]: """Format trial summary text if available.""" summary = trial.get("brief_summary", trial.get("description", "")) if not summary: return [] if len(summary) > 200: summary = summary[:197] + "..." 
return [f"- **Summary**: {summary}"] def _format_trial_conditions(trial: dict[str, Any]) -> list[str]: """Format trial conditions/diseases.""" conditions = trial.get("diseases", trial.get("conditions", [])) if not conditions: return [] lines = [] if isinstance(conditions, list): lines.append(f"- **Conditions**: {', '.join(conditions[:3])}") if len(conditions) > 3: lines.append(f" *(and {len(conditions) - 3} more)*") else: lines.append(f"- **Conditions**: {conditions}") return lines def _format_trial_interventions(trial: dict[str, Any]) -> list[str]: """Format trial interventions.""" interventions = trial.get("interventions", []) if not interventions: return [] int_names = [] for intervention in interventions[:3]: if isinstance(intervention, dict): int_names.append(intervention.get("name", "Unknown")) else: int_names.append(str(intervention)) if not int_names: return [] lines = [f"- **Interventions**: {', '.join(int_names)}"] if len(interventions) > 3: lines.append(f" *(and {len(interventions) - 3} more)*") return lines def _format_trial_metadata(trial: dict[str, Any]) -> list[str]: """Format trial metadata (sponsor, eligibility notes).""" lines = [] lead_org = trial.get("lead_org", trial.get("sponsor", "")) if lead_org: lines.append(f"- **Lead Organization**: {lead_org}") if trial.get("accepts_brain_mets"): lines.append("- **Note**: Accepts patients with brain metastases") return lines def _format_trial_summary(trial: dict[str, Any]) -> list[str]: """Format a single trial summary.""" lines = [] # Add header info lines.extend(_format_trial_header(trial)) # Add summary text lines.extend(_format_trial_summary_text(trial)) # Add conditions lines.extend(_format_trial_conditions(trial)) # Add interventions lines.extend(_format_trial_interventions(trial)) # Add metadata lines.extend(_format_trial_metadata(trial)) lines.append("") return lines def format_nci_trial_results(results: dict[str, Any]) -> str: """ Format NCI trial search results as markdown. 
""" trials = results.get("trials", []) total = results.get("total", 0) if not trials: return "No trials found matching the search criteria in NCI database." lines = [ f"## NCI Clinical Trials Search Results ({total} found)", "", "*Source: NCI Clinical Trials Search API*", "", ] for trial in trials: lines.extend(_format_trial_summary(trial)) return "\n".join(lines) ``` -------------------------------------------------------------------------------- /src/biomcp/variants/alphagenome.py: -------------------------------------------------------------------------------- ```python """AlphaGenome integration for variant effect prediction.""" import logging import os import re from typing import Any, TypedDict from ..utils.request_cache import request_cache logger = logging.getLogger(__name__) # Default threshold for significant changes DEFAULT_SIGNIFICANCE_THRESHOLD = 0.5 # Chromosome pattern for validation CHROMOSOME_PATTERN = re.compile(r"^chr([1-9]|1[0-9]|2[0-2]|X|Y|M|MT)$") # Valid nucleotide characters VALID_NUCLEOTIDES = set("ACGT") class VariantPrediction(TypedDict): """Type definition for variant prediction results.""" gene_expression: dict[str, float] chromatin_accessibility: dict[str, float] splicing_effects: list[str] summary_stats: dict[str, int] @request_cache(ttl=1800) # Cache for 30 minutes async def predict_variant_effects( chromosome: str, position: int, reference: str, alternate: str, interval_size: int = 131_072, tissue_types: list[str] | None = None, significance_threshold: float = DEFAULT_SIGNIFICANCE_THRESHOLD, api_key: str | None = None, ) -> str: """ Predict variant effects using AlphaGenome. 
Args: chromosome: Chromosome (e.g., 'chr7') position: 1-based genomic position reference: Reference allele(s) alternate: Alternate allele(s) interval_size: Size of genomic context window (max 1,000,000) tissue_types: Optional UBERON ontology terms for tissue-specific predictions significance_threshold: Threshold for significant changes (default 0.5) api_key: Optional API key (if not provided, uses ALPHAGENOME_API_KEY env var) Returns: Formatted markdown string with predictions Raises: ValueError: If input parameters are invalid """ # Validate inputs _validate_inputs(chromosome, position, reference, alternate) # Check for API key (prefer parameter over environment variable) if not api_key: api_key = os.getenv("ALPHAGENOME_API_KEY") if not api_key: return ( "❌ **AlphaGenome API key required**\n\n" "I need an API key to use AlphaGenome. Please provide it by either:\n\n" "**Option 1: Include your key in your request**\n" 'Say: "My AlphaGenome API key is YOUR_KEY_HERE" and I\'ll use it for this prediction.\n\n' "**Option 2: Set it as an environment variable (for persistent use)**\n" "```bash\n" "export ALPHAGENOME_API_KEY='your-key'\n" "```\n\n" "Get a free API key at: https://deepmind.google.com/science/alphagenome\n\n" "**ACTION REQUIRED**: Please provide your API key using Option 1 above to continue." ) # Try to import AlphaGenome try: # Suppress protobuf version warnings import warnings warnings.filterwarnings( "ignore", category=UserWarning, module="google.protobuf.runtime_version", ) from alphagenome.data import genome from alphagenome.models import dna_client, variant_scorers except ImportError: return ( "❌ **AlphaGenome not installed**\n\n" "To install:\n" "```bash\n" "git clone https://github.com/google-deepmind/alphagenome.git\n" "cd alphagenome && pip install .\n" "```\n\n" "Standard variant annotations are still available via `variant_searcher`." 
) try: # Create client model = dna_client.create(api_key) # Calculate interval boundaries (ensure within supported sizes) # Supported sizes: 2048, 16384, 131072, 524288, 1048576 supported_sizes = [2048, 16384, 131072, 524288, 1048576] # Find smallest supported size that's >= requested size valid_sizes = [s for s in supported_sizes if s >= interval_size] if not valid_sizes: # If requested size is larger than max, use max interval_size = supported_sizes[-1] else: interval_size = min(valid_sizes) half_size = interval_size // 2 interval_start = max(0, position - half_size - 1) # Convert to 0-based interval_end = interval_start + interval_size # Create interval and variant objects interval = genome.Interval( chromosome=chromosome, start=interval_start, end=interval_end ) variant = genome.Variant( chromosome=chromosome, position=position, reference_bases=reference, alternate_bases=alternate, ) # Get recommended scorers for human scorers = variant_scorers.get_recommended_scorers(organism="human") # Make prediction scores = model.score_variant( interval=interval, variant=variant, variant_scorers=scorers ) # Format results return _format_predictions( variant, scores, interval_size, significance_threshold ) except Exception as e: logger.error(f"AlphaGenome prediction failed: {e}", exc_info=True) error_context = ( f"❌ **AlphaGenome prediction failed**\n\n" f"Error: {e!s}\n\n" f"**Context:**\n" f"- Variant: {chromosome}:{position} {reference}>{alternate}\n" f"- Interval size: {interval_size:,} bp\n" f"- Tissue types: {tissue_types or 'None specified'}" ) return error_context def _format_predictions( variant: Any, scores: list[Any], interval_size: int, significance_threshold: float = DEFAULT_SIGNIFICANCE_THRESHOLD, ) -> str: """Format AlphaGenome predictions into markdown. 
Args: variant: The variant object from AlphaGenome scores: List of prediction scores interval_size: Size of the genomic context window significance_threshold: Threshold for significant changes Returns: Formatted markdown string """ try: from alphagenome.models import variant_scorers # Convert scores to DataFrame scores_df = variant_scorers.tidy_scores(scores) # Start building the output lines = [ "## AlphaGenome Variant Effect Predictions\n", f"**Variant**: {variant.chromosome}:{variant.position} {variant.reference_bases}>{variant.alternate_bases}", f"**Analysis window**: {interval_size:,} bp\n", ] # Group scores by output type if not scores_df.empty: # Gene expression effects expr_scores = scores_df[ scores_df["output_type"].str.contains("RNA_SEQ", na=False) ] if not expr_scores.empty: top_expr = expr_scores.loc[ expr_scores["raw_score"].abs().idxmax() ] gene = top_expr.get("gene_name", "Unknown") score = top_expr["raw_score"] direction = "↓ decreases" if score < 0 else "↑ increases" lines.append("\n### Gene Expression") lines.append( f"- **{gene}**: {score:+.2f} log₂ fold change ({direction} expression)" ) # Chromatin accessibility chrom_scores = scores_df[ scores_df["output_type"].str.contains("ATAC|DNASE", na=False) ] if not chrom_scores.empty: top_chrom = chrom_scores.loc[ chrom_scores["raw_score"].abs().idxmax() ] score = top_chrom["raw_score"] track = top_chrom.get("track_name", "tissue") direction = "↓ decreases" if score < 0 else "↑ increases" lines.append("\n### Chromatin Accessibility") lines.append( f"- **{track}**: {score:+.2f} log₂ change ({direction} accessibility)" ) # Splicing effects splice_scores = scores_df[ scores_df["output_type"].str.contains("SPLICE", na=False) ] if not splice_scores.empty: lines.append("\n### Splicing") lines.append("- Potential splicing alterations detected") # Summary statistics total_tracks = len(scores_df) significant = len( scores_df[ scores_df["raw_score"].abs() > significance_threshold ] ) lines.append("\n### 
def _validate_inputs(
    chromosome: str, position: int, reference: str, alternate: str
) -> None:
    """Validate input parameters for variant prediction.

    Alleles are checked case-insensitively; they are not modified.

    Args:
        chromosome: Chromosome identifier
        position: Genomic position
        reference: Reference allele(s)
        alternate: Alternate allele(s)

    Raises:
        ValueError: If any input is invalid
    """

    def describe(invalid: set[str]) -> str:
        # Same shape as a set repr, but sorted so the message is stable
        # from run to run (set iteration order is not).
        return "{" + ", ".join(repr(base) for base in sorted(invalid)) + "}"

    # Validate chromosome format
    if not CHROMOSOME_PATTERN.match(chromosome):
        raise ValueError(
            f"Invalid chromosome format: {chromosome}. "
            "Expected format: chr1-22, chrX, chrY, chrM, or chrMT"
        )

    # Validate position
    if position < 1:
        raise ValueError(f"Position must be >= 1, got {position}")

    # Validate nucleotides
    ref_upper = reference.upper()
    alt_upper = alternate.upper()

    if not ref_upper:
        raise ValueError("Reference allele cannot be empty")

    if not alt_upper:
        raise ValueError("Alternate allele cannot be empty")

    invalid_ref = set(ref_upper) - VALID_NUCLEOTIDES
    if invalid_ref:
        raise ValueError(
            f"Invalid nucleotides in reference allele: {describe(invalid_ref)}. "
            f"Only A, C, G, T are allowed"
        )

    invalid_alt = set(alt_upper) - VALID_NUCLEOTIDES
    if invalid_alt:
        raise ValueError(
            f"Invalid nucleotides in alternate allele: {describe(invalid_alt)}. "
            f"Only A, C, G, T are allowed"
        )
Example | | ------------- | ---------------------- | --------------------------------------- | | `symbol` | Official gene symbol | "BRAF" | | `name` | Full gene name | "B-Raf proto-oncogene" | | `entrezgene` | NCBI Entrez ID | 673 | | `summary` | Functional description | "This gene encodes..." | | `genomic_pos` | Chromosomal location | {"chr": "7", "start": 140433812} | | `pathway` | Pathway memberships | {"kegg": [...], "reactome": [...]} | | `go` | Gene Ontology terms | {"BP": [...], "MF": [...], "CC": [...]} | ## MyVariant.info ### Base URL `https://myvariant.info/v1/` ### Key Endpoints #### Variant Query ``` GET /query?q={query} ``` **Query syntax:** - Gene + variant: `dbnsfp.genename:BRAF AND dbnsfp.hgvsp:p.V600E` - rsID: `dbsnp.rsid:rs121913529` - Genomic: `_id:chr7:g.140453136A>T` **Example:** ```bash curl "https://myvariant.info/v1/query?q=dbnsfp.genename:TP53&fields=_id,clinvar,gnomad_exome" ``` #### Variant Annotation ``` GET /variant/{variant_id} ``` **ID formats:** - HGVS genomic: `chr7:g.140453136A>T` - dbSNP: `rs121913529` ### Important Fields | Field | Description | Example | | -------------- | ---------------------- | --------------------------------------- | | `clinvar` | Clinical significance | {"clinical_significance": "Pathogenic"} | | `dbsnp` | dbSNP annotations | {"rsid": "rs121913529"} | | `cadd` | CADD scores | {"phred": 35} | | `gnomad_exome` | Population frequency | {"af": {"af": 0.00001}} | | `dbnsfp` | Functional predictions | {"polyphen2": "probably_damaging"} | ### Query Filters ```python # Clinical significance q = "clinvar.clinical_significance:pathogenic" # Frequency filters q = "gnomad_exome.af.af:<0.01" # Rare variants # Gene-specific q = "dbnsfp.genename:BRCA1 AND cadd.phred:>20" ``` ## MyDisease.info ### Base URL `https://mydisease.info/v1/` ### Key Endpoints #### Disease Query ``` GET /query?q={query} ``` **Example:** ```bash curl "https://mydisease.info/v1/query?q=melanoma&fields=mondo,disease_ontology,synonyms" ``` #### 
Disease Annotation ``` GET /disease/{disease_id} ``` **ID formats:** - MONDO: `MONDO:0007254` - DOID: `DOID:1909` - OMIM: `OMIM:155600` ### Important Fields | Field | Description | Example | | ------------------ | ----------------- | -------------------------------------------- | | `mondo` | MONDO ontology | {"id": "MONDO:0007254", "label": "melanoma"} | | `disease_ontology` | Disease Ontology | {"id": "DOID:1909"} | | `synonyms` | Alternative names | ["malignant melanoma", "MM"] | | `xrefs` | Cross-references | {"omim": ["155600"], "mesh": ["D008545"]} | | `phenotypes` | HPO terms | [{"hpo_id": "HP:0002861"}] | ## MyChem.info ### Base URL `https://mychem.info/v1/` ### Key Endpoints #### Drug Query ``` GET /query?q={query} ``` **Example:** ```bash curl "https://mychem.info/v1/query?q=imatinib&fields=drugbank,chembl,chebi" ``` #### Drug Annotation ``` GET /drug/{drug_id} ``` **ID formats:** - DrugBank: `DB00619` - ChEMBL: `CHEMBL941` - Name: `imatinib` ### Important Fields | Field | Description | Example | | -------------- | -------------- | -------------------------------------------- | | `drugbank` | DrugBank data | {"id": "DB00619", "name": "Imatinib"} | | `chembl` | ChEMBL data | {"molecule_chembl_id": "CHEMBL941"} | | `chebi` | ChEBI ontology | {"id": "CHEBI:45783"} | | `drugcentral` | Indications | {"indications": [...]} | | `pharmacology` | Mechanism | {"mechanism_of_action": "BCR-ABL inhibitor"} | ## Common Query Patterns ### 1. Gene to Variant Pipeline ```python # Step 1: Get gene info gene_response = requests.get( "https://mygene.info/v1/gene/BRAF", params={"fields": "symbol,genomic_pos"} ) # Step 2: Find variants in gene variant_response = requests.get( "https://myvariant.info/v1/query", params={ "q": "dbnsfp.genename:BRAF", "fields": "clinvar.clinical_significance,gnomad_exome.af", "size": 100 } ) ``` ### 2. 
Disease Synonym Expansion ```python # Get all synonyms for a disease disease_response = requests.get( "https://mydisease.info/v1/query", params={ "q": "melanoma", "fields": "mondo,synonyms,xrefs" } ) # Extract all names all_names = ["melanoma"] for hit in disease_response.json()["hits"]: if "synonyms" in hit: all_names.extend(hit["synonyms"]) ``` ### 3. Drug Target Lookup ```python # Find drugs targeting a gene drug_response = requests.get( "https://mychem.info/v1/query", params={ "q": "drugcentral.targets.gene_symbol:BRAF", "fields": "drugbank.name,chembl.pref_name", "size": 50 } ) ``` ## Rate Limits and Best Practices ### Rate Limits - **Default**: 1,000 requests/hour per IP - **Batch queries**: Up to 1,000 IDs per request - **No authentication**: Public access ### Best Practices #### 1. Use Field Filtering ```python # Good - only request needed fields params = {"fields": "symbol,name,summary"} # Bad - returns all fields params = {} ``` #### 2. Batch Requests ```python # Good - single request for multiple genes response = requests.post( "https://mygene.info/v1/gene", json={"ids": ["BRAF", "KRAS", "EGFR"]} ) # Bad - multiple individual requests for gene in ["BRAF", "KRAS", "EGFR"]: requests.get(f"https://mygene.info/v1/gene/{gene}") ``` #### 3. 
def query_biothings(api_url, query_params, max_retries=3, base_delay=60, timeout=30):
    """Query a BioThings endpoint, retrying when rate limited.

    Args:
        api_url: Endpoint URL.
        query_params: Query-string parameters for the request.
        max_retries: How many times a 429 response is retried.
        base_delay: Seconds to wait before the first retry; the wait
            doubles on each further retry.
        timeout: Per-request timeout in seconds.

    Returns:
        The decoded JSON body, or ``{"error": "Not found", "query": ...}``
        for a 404 response.

    Raises:
        requests.exceptions.HTTPError: For other HTTP errors, or for a 429
            that persists after ``max_retries`` retries.
    """
    attempts = max(0, max_retries) + 1
    for attempt in range(attempts):
        try:
            response = requests.get(api_url, params=query_params, timeout=timeout)
            response.raise_for_status()
            return response.json()
        except requests.exceptions.HTTPError as e:
            status = e.response.status_code
            if status == 404:
                return {"error": "Not found", "query": query_params}
            if status == 429 and attempt < attempts - 1:
                # Exponential backoff; a loop keeps the retry count bounded
                # (recursing on every 429 never terminates while limited).
                time.sleep(base_delay * 2 ** attempt)
                continue
            raise
def normalize_gene_symbol(symbol):
    """Return the symbol of the top MyGene.info hit for *symbol*.

    Falls back to the input unchanged when the query returns no hits or
    the top hit carries no ``symbol`` field.
    """
    # Pass the query through ``params`` so requests URL-encodes it; gene
    # aliases may contain spaces or reserved characters.
    response = requests.get(
        "https://mygene.info/v1/query",
        params={"q": symbol, "fields": "symbol"},
    )
    # Decode the body once and tolerate a missing "hits" key.
    hits = response.json().get("hits") or []
    if hits and hits[0].get("symbol"):
        return hits[0]["symbol"]
    return symbol
class TestDiseaseInfo:
    """Disease lookups through BioThingsClient with the HTTP layer mocked."""

    @pytest.mark.asyncio
    async def test_get_disease_by_name(
        self, biothings_client, mock_http_client
    ):
        """Look a disease up by name using a query reply and a get reply."""
        query_reply = {
            "hits": [
                {
                    "_id": "MONDO:0007959",
                    "name": "melanoma",
                    "mondo": {"mondo": "MONDO:0007959"},
                }
            ]
        }
        get_reply = {
            "_id": "MONDO:0007959",
            "name": "melanoma",
            "mondo": {
                "definition": "A malignant neoplasm composed of melanocytes.",
                "synonym": {
                    "exact": [
                        "malignant melanoma",
                        "naevocarcinoma",
                    ]
                },
            },
        }
        # Replies are handed out in order: query first, then get.
        mock_http_client.request_api = AsyncMock(
            side_effect=[(query_reply, None), (get_reply, None)]
        )

        disease = await biothings_client.get_disease_info("melanoma")

        assert disease is not None
        assert isinstance(disease, DiseaseInfo)
        assert disease.name == "melanoma"
        assert disease.disease_id == "MONDO:0007959"
        assert "malignant melanoma" in disease.synonyms

    @pytest.mark.asyncio
    async def test_get_disease_by_id(self, biothings_client, mock_http_client):
        """Look a disease up by MONDO ID using a single mocked reply."""
        get_reply = {
            "_id": "MONDO:0016575",
            "name": "GIST",
            "mondo": {
                "definition": "Gastrointestinal stromal tumor...",
            },
        }
        mock_http_client.request_api = AsyncMock(
            return_value=(get_reply, None)
        )

        disease = await biothings_client.get_disease_info("MONDO:0016575")

        assert disease is not None
        assert disease.name == "GIST"
        assert disease.disease_id == "MONDO:0016575"

    @pytest.mark.asyncio
    async def test_get_disease_synonyms(
        self, biothings_client, mock_http_client
    ):
        """Collect synonyms for query expansion; at most five are expected."""
        query_reply = {
            "hits": [
                {
                    "_id": "MONDO:0018076",
                    "name": "GIST",
                }
            ]
        }
        get_reply = {
            "_id": "MONDO:0018076",
            "name": "gastrointestinal stromal tumor",
            "mondo": {
                "synonym": {
                    "exact": [
                        "GIST",
                        "gastrointestinal stromal tumour",
                        "GI stromal tumor",
                    ]
                }
            },
        }
        mock_http_client.request_api = AsyncMock(
            side_effect=[(query_reply, None), (get_reply, None)]
        )

        names = await biothings_client.get_disease_synonyms("GIST")

        assert "GIST" in names
        assert "gastrointestinal stromal tumor" in names
        assert len(names) <= 5  # Limited to 5
"gastrointestinal stromal tumor", "GI stromal tumor", ] ) query = TrialQuery( conditions=["GIST"], expand_synonyms=True, ) params = await convert_query(query) # Check that conditions were expanded assert "query.cond" in params cond_value = params["query.cond"][0] assert "GIST" in cond_value assert "gastrointestinal stromal tumor" in cond_value @pytest.mark.asyncio async def test_trial_search_without_synonym_expansion(self): """Test that trial search works without synonym expansion.""" from biomcp.trials.search import TrialQuery, convert_query query = TrialQuery( conditions=["GIST"], expand_synonyms=False, ) params = await convert_query(query) # Check that conditions were not expanded assert "query.cond" in params assert params["query.cond"] == ["GIST"] class TestErrorHandling: """Test error handling in BioThings integration.""" @pytest.mark.asyncio async def test_api_error_handling( self, biothings_client, mock_http_client ): """Test handling of API errors.""" from biomcp.http_client import RequestError mock_http_client.request_api = AsyncMock( return_value=( None, RequestError(code=500, message="Internal server error"), ) ) result = await biothings_client.get_gene_info("TP53") assert result is None @pytest.mark.asyncio async def test_invalid_response_format( self, biothings_client, mock_http_client ): """Test handling of invalid API responses.""" mock_http_client.request_api = AsyncMock( return_value=({"invalid": "response"}, None) ) result = await biothings_client.get_gene_info("TP53") assert result is None ``` -------------------------------------------------------------------------------- /src/biomcp/http_client.py: -------------------------------------------------------------------------------- ```python import csv import json import os import ssl from io import StringIO from ssl import PROTOCOL_TLS_CLIENT, SSLContext, TLSVersion from typing import Literal, TypeVar import certifi from diskcache import Cache from platformdirs import user_cache_dir from pydantic 
def generate_cache_key(method: str, url: str, params: dict) -> str:
    """Build a deterministic cache key for an HTTP request.

    The key must be identical across interpreter runs because it indexes an
    on-disk cache. The built-in ``hash()`` is unsuitable: string hashes are
    salted per process unless ``PYTHONHASHSEED`` is pinned, so a key written
    by one run would never be found by the next.

    Args:
        method: HTTP method; upper-cased before use.
        url: Request URL.
        params: Request parameters; must be JSON-serialisable. Key order
            does not affect the result.

    Returns:
        ``"METHOD:url"`` when ``params`` is empty, otherwise a 16-character
        lowercase hex digest of method, URL and the sorted parameters.
    """
    # Imported locally so this function does not rely on a module-level
    # hashlib import.
    import hashlib

    # Handle simple cases without params
    if not params:
        return f"{method.upper()}:{url}"

    params_str = json.dumps(params, sort_keys=True, separators=(",", ":"))
    key_source = f"{method.upper()}:{url}:{params_str}"

    # An 8-byte BLAKE2b digest keeps the previous 64-bit / 16-hex-char key
    # shape while being stable across processes.
    digest = hashlib.blake2b(key_source.encode("utf-8"), digest_size=8)
    return digest.hexdigest()
= tls_version context.load_verify_locations(cafile=certifi.where()) return context async def call_http( method: str, url: str, params: dict, verify: ssl.SSLContext | str | bool = True, retry_config: RetryConfig | None = None, headers: dict[str, str] | None = None, ) -> tuple[int, str]: """Make HTTP request with optional retry logic. Args: method: HTTP method (GET or POST) url: Target URL params: Request parameters verify: SSL verification settings retry_config: Retry configuration (if None, no retry) Returns: Tuple of (status_code, response_text) """ async def _make_request() -> tuple[int, str]: # Extract domain from URL for metrics tagging from urllib.parse import urlparse parsed = urlparse(url) host = parsed.hostname or "unknown" # Apply circuit breaker for the host breaker_config = CircuitBreakerConfig( failure_threshold=DEFAULT_FAILURE_THRESHOLD, recovery_timeout=DEFAULT_RECOVERY_TIMEOUT, success_threshold=DEFAULT_SUCCESS_THRESHOLD, expected_exception=(ConnectionError, TimeoutError), ) @circuit_breaker(f"http_{host}", breaker_config) async def _execute_with_breaker(): async with Timer( "http_request", tags={"method": method, "host": host} ): return await execute_http_request( method, url, params, verify, headers ) status, text = await _execute_with_breaker() # Check if status code should trigger retry if retry_config and is_retryable_status(status, retry_config): raise RetryableHTTPError(status, text) return status, text # Apply retry logic if configured if retry_config: wrapped_func = with_retry(retry_config)(_make_request) try: return await wrapped_func() except RetryableHTTPError as exc: # Convert retryable HTTP errors back to status/text return exc.status_code, exc.message except Exception: # Let other exceptions bubble up raise else: return await _make_request() def _handle_offline_mode( url: str, method: str, request: BaseModel | dict, cache_ttl: int, response_model_type: type[T] | None, ) -> tuple[T | None, RequestError | None] | None: """Handle offline 
mode logic. Returns None if not in offline mode.""" if os.getenv("BIOMCP_OFFLINE", "").lower() not in ("true", "1", "yes"): return None # In offline mode, only return cached responses if cache_ttl > 0: cache_key = generate_cache_key( method, url, request if isinstance(request, dict) else request.model_dump(exclude_none=True, by_alias=True), ) cached_content = get_cached_response(cache_key) if cached_content: return parse_response(200, cached_content, response_model_type) return None, RequestError( code=503, message=f"Offline mode enabled (BIOMCP_OFFLINE=true). Cannot fetch from {url}", ) def _validate_endpoint(endpoint_key: str | None) -> None: """Validate endpoint key if provided.""" if endpoint_key: registry = get_registry() if endpoint_key not in registry.get_all_endpoints(): raise ValueError( f"Unknown endpoint key: {endpoint_key}. Please register in endpoint_registry.py" ) def _prepare_request_params( request: BaseModel | dict, ) -> tuple[dict, dict | None]: """Convert request to params dict and extract headers.""" if isinstance(request, BaseModel): params = request.model_dump(exclude_none=True, by_alias=True) else: params = request.copy() if isinstance(request, dict) else request # Extract headers if present headers = None if isinstance(params, dict) and "_headers" in params: try: import json headers = json.loads(params.pop("_headers")) except (json.JSONDecodeError, TypeError): pass # Ignore invalid headers return params, headers def _get_retry_config( enable_retry: bool, domain: str | None ) -> RetryConfig | None: """Get retry configuration based on settings.""" if not enable_retry: return None # Use more aggressive retry for certain domains if domain in ["clinicaltrials", "pubmed", "myvariant"]: return RetryConfig( max_attempts=AGGRESSIVE_MAX_RETRY_ATTEMPTS, initial_delay=AGGRESSIVE_INITIAL_RETRY_DELAY, max_delay=AGGRESSIVE_MAX_RETRY_DELAY, ) return RetryConfig() # Default settings async def request_api( url: str, request: BaseModel | dict, 
def parse_response(
    status_code: int,
    content: str,
    response_model_type: type[T] | None = None,
) -> tuple[T | None, RequestError | None]:
    """Turn a raw HTTP response into a ``(result, error)`` pair.

    Args:
        status_code: HTTP status of the response.
        content: Response body as text.
        response_model_type: Model used to validate a JSON body. When
            omitted the body is decoded heuristically: JSON if it starts
            with ``{`` or ``[`` (ignoring leading whitespace), CSV rows if
            it contains a comma, otherwise ``{"text": content}``.

    Returns:
        ``(parsed, None)`` on success, or ``(None, RequestError)`` for a
        non-200 status, an empty body, or a body that fails to parse.
    """
    if status_code != 200:
        return None, RequestError(code=status_code, message=content)

    # Handle empty content
    if not content or content.strip() == "":
        return None, RequestError(
            code=500,
            message="Empty response received from API",
        )

    try:
        if response_model_type is None:
            # Sniff the first non-blank character: a JSON body preceded by
            # a newline or spaces would otherwise be misread as CSV.
            if content.lstrip().startswith(("{", "[")):
                response_dict = json.loads(content)
            elif "," in content:
                buffer = StringIO(content)
                response_dict = list(csv.DictReader(buffer))
            else:
                response_dict = {"text": content}
            return response_dict, None

        parsed: T = response_model_type.model_validate_json(content)
        return parsed, None
    except json.JSONDecodeError as exc:
        # Provide more detailed error message for JSON parsing issues
        return None, RequestError(
            code=500,
            message=f"Invalid JSON response: {exc}. Content preview: {content[:100]}...",
        )
    except Exception as exc:
        return None, RequestError(
            code=500,
            message=f"Failed to parse response: {exc}",
        )
isinstance(ancestor_ids, list) else ancestor_ids ) if include: params["include"] = ( ",".join(include) if isinstance(include, list) else include ) if sort: params["sort"] = sort if order: params["order"] = order.lower() return params async def search_diseases( name: str | None = None, include_synonyms: bool = True, # Deprecated - kept for backward compatibility category: str | None = None, disease_type: str | None = None, codes: list[str] | None = None, parent_ids: list[str] | None = None, ancestor_ids: list[str] | None = None, include: list[str] | None = None, sort: str | None = None, order: str | None = None, page_size: int = 20, page: int = 1, api_key: str | None = None, ) -> dict[str, Any]: """ Search for diseases in the NCI CTS database. This provides access to NCI's controlled vocabulary of cancer conditions used in clinical trials, with official terms and synonyms. Args: name: Disease name to search for (partial match, searches synonyms automatically) include_synonyms: [Deprecated] This parameter is ignored - API always searches synonyms category: Disease category/type filter (deprecated - use disease_type) disease_type: Type of disease (e.g., 'maintype', 'subtype', 'stage') codes: List of disease codes (e.g., ['C3868', 'C5806']) parent_ids: List of parent disease IDs ancestor_ids: List of ancestor disease IDs include: Fields to include in response sort: Sort field order: Sort order ('asc' or 'desc') page_size: Number of results per page page: Page number api_key: Optional API key (if not provided, uses NCI_API_KEY env var) Returns: Dictionary with search results containing: - diseases: List of disease records with names and synonyms - total: Total number of results - page: Current page - page_size: Results per page Raises: CTSAPIError: If the API request fails """ # Build query parameters params = _build_disease_params( name, disease_type, category, codes, parent_ids, ancestor_ids, include, sort, order, page_size, ) try: # Make API request response = await 
async def get_disease_by_id(
    disease_id: str,
    api_key: str | None = None,
) -> dict[str, Any]:
    """
    Fetch a single disease record from the NCI CTS diseases endpoint.

    Args:
        disease_id: Disease ID from NCI CTS; appended to the endpoint URL
        api_key: Optional API key, passed through to the request helper

    Returns:
        The value under "data" when the response has that key, otherwise
        the value under "disease", otherwise the response itself

    Raises:
        CTSAPIError: If the API request fails; any other exception is
            logged and re-raised wrapped in CTSAPIError
    """
    try:
        payload = await make_cts_request(
            url=f"{NCI_DISEASES_URL}/{disease_id}",
            api_key=api_key,
        )

        # Unwrap the record from whichever envelope key is present.
        for envelope_key in ("data", "disease"):
            if envelope_key in payload:
                return payload[envelope_key]
        return payload

    except CTSAPIError:
        raise
    except Exception as e:
        logger.error(f"Failed to get disease {disease_id}: {e}")
        raise CTSAPIError(f"Failed to retrieve disease: {e!s}") from e
async def search_diseases_with_or(
    name_query: str,
    include_synonyms: bool = True,
    category: str | None = None,
    disease_type: str | None = None,
    codes: list[str] | None = None,
    parent_ids: list[str] | None = None,
    ancestor_ids: list[str] | None = None,
    include: list[str] | None = None,
    sort: str | None = None,
    order: str | None = None,
    page_size: int = 20,
    page: int = 1,
    api_key: str | None = None,
) -> dict[str, Any]:
    """
    Search for diseases with OR query support.

    A query such as "melanoma OR lung cancer" is split into terms, each
    term is searched separately, and the results are merged, de-duplicated
    by disease ID, sorted by name and paginated.

    Args:
        name_query: Name query that may contain OR operators
        page_size: Number of merged results per page
        page: 1-based page of the merged results to return
        Other args same as search_diseases

    Returns:
        Dictionary with:
        - diseases: The requested page of merged, de-duplicated records
        - total: Number of unique records collected across all terms
        - page / page_size: Echo of the request
        - search_terms: The individual terms that were searched
        - total_found_across_terms: Sum of per-term totals before
          de-duplication
    """
    # Check if this is an OR query
    if " OR " in name_query or " or " in name_query:
        search_terms = parse_or_query(name_query)
        logger.info(f"Parsed OR query into terms: {search_terms}")
    else:
        # Single term search
        search_terms = [name_query]

    # Pagination happens on the merged list below, and search_diseases only
    # forwards a size (never a page) upstream. Each term must therefore
    # supply enough rows to fill pages 1..page, or every page after the
    # first is sliced from rows that were never fetched.
    effective_page = max(page, 1)
    fetch_size = effective_page * page_size

    # Collect all unique diseases
    all_diseases: dict[Any, dict[str, Any]] = {}
    total_found = 0

    # Search for each term
    for term in search_terms:
        logger.info(f"Searching diseases for term: {term}")
        try:
            results = await search_diseases(
                name=term,
                include_synonyms=include_synonyms,
                category=category,
                disease_type=disease_type,
                codes=codes,
                parent_ids=parent_ids,
                ancestor_ids=ancestor_ids,
                include=include,
                sort=sort,
                order=order,
                page_size=fetch_size,
                page=1,
                api_key=api_key,
            )

            # Add unique diseases (deduplicate by ID)
            for disease in results.get("diseases", []):
                disease_id = disease.get("id", disease.get("disease_id"))
                if disease_id and disease_id not in all_diseases:
                    all_diseases[disease_id] = disease

            total_found += results.get("total", 0)

        except Exception as e:
            # Best effort: one failing term must not discard the others.
            logger.warning(f"Failed to search for term '{term}': {e}")

    # Sort by name for consistent results; ``or`` guards against records
    # whose name is present but None.
    unique_diseases = sorted(
        all_diseases.values(),
        key=lambda x: (x.get("name") or x.get("preferred_name") or "").lower(),
    )

    # Apply pagination to combined results
    start_idx = (effective_page - 1) * page_size
    paginated_diseases = unique_diseases[start_idx : start_idx + page_size]

    return {
        "diseases": paginated_diseases,
        "total": len(unique_diseases),
        "page": page,
        "page_size": page_size,
        "search_terms": search_terms,  # Include what we searched for
        "total_found_across_terms": total_found,  # Total before deduplication
    }
-------------------------------------------------------------------------------- ```markdown # OpenFDA Integration Guide ## Overview BioMCP now integrates with the FDA's openFDA API to provide access to critical drug safety and regulatory information. This integration adds three major data sources to BioMCP's capabilities: 1. **Drug Adverse Events (FAERS)** - FDA Adverse Event Reporting System data 2. **Drug Labels (SPL)** - Official FDA drug product labeling 3. **Device Events (MAUDE)** - Medical device adverse event reports This guide covers how to use these new tools effectively for precision oncology research. ## Quick Start ### Installation & Setup The OpenFDA integration is included in the standard BioMCP installation: ```bash # Install BioMCP pip install biomcp-python # Optional: Set API key for higher rate limits export OPENFDA_API_KEY="your-api-key-here" ``` > **Note**: An API key is optional but recommended. Without one, you're limited to 40 requests/minute. With a key, you get 240 requests/minute. [Get a free API key here](https://open.fda.gov/apis/authentication/). 
### Basic Usage Examples #### Search for drug adverse events ```bash # Find adverse events for a specific drug biomcp openfda adverse search --drug imatinib # Search for specific reactions biomcp openfda adverse search --reaction nausea --serious # Get detailed report biomcp openfda adverse get REPORT123456 ``` #### Search drug labels ```bash # Find drugs for specific indications biomcp openfda label search --indication melanoma # Search for drugs with boxed warnings biomcp openfda label search --boxed-warning # Get complete label biomcp openfda label get SET_ID_HERE ``` #### Search device events ```bash # Search for genomic test device issues biomcp openfda device search --device "FoundationOne" # Search by manufacturer biomcp openfda device search --manufacturer Illumina # Get detailed device event biomcp openfda device get MDR123456 ``` ## MCP Tool Usage ### For AI Agents The OpenFDA tools are available as MCP tools for AI agents. Each tool includes built-in reminders to use the `think` tool first for complex queries. 
#### Available Tools - `openfda_adverse_searcher` - Search drug adverse events - `openfda_adverse_getter` - Get specific adverse event report - `openfda_label_searcher` - Search drug labels - `openfda_label_getter` - Get complete drug label - `openfda_device_searcher` - Search device adverse events - `openfda_device_getter` - Get specific device event report #### Example Tool Usage ```python # Search for adverse events result = await openfda_adverse_searcher( drug="pembrolizumab", serious=True, limit=25 ) # Get drug label label = await openfda_label_getter( set_id="abc-123-def", sections=["indications_and_usage", "warnings_and_precautions"] ) # Search genomic devices devices = await openfda_device_searcher( device="sequencer", genomics_only=True, # Filter to genomic/diagnostic devices problem="false positive" ) ``` ## Data Sources Explained ### Drug Adverse Events (FAERS) The FDA Adverse Event Reporting System contains reports of adverse events and medication errors submitted to FDA. Key features: - **Voluntary reporting**: Reports come from healthcare professionals, patients, and manufacturers - **No causation proof**: Reports don't establish that a drug caused the event - **Rich detail**: Includes patient demographics, drug information, reactions, and outcomes - **Real-world data**: Captures post-market safety signals **Best for**: Understanding potential side effects, safety signals, drug interactions ### Drug Labels (SPL) Structured Product Labeling contains the official FDA-approved prescribing information. 
Includes: - **Indications and usage**: FDA-approved uses - **Dosage and administration**: How to prescribe - **Contraindications**: When not to use - **Warnings and precautions**: Safety information - **Drug interactions**: Known interactions - **Clinical studies**: Trial data supporting approval **Best for**: Official prescribing guidelines, approved indications, contraindications ### Device Events (MAUDE) Manufacturer and User Facility Device Experience database contains medical device adverse events. For BioMCP, we focus on genomic/diagnostic devices: - **Genomic test devices**: Issues with sequencing platforms, diagnostic panels - **In vitro diagnostics**: Problems with biomarker tests - **Device malfunctions**: Technical failures affecting test results - **Patient impact**: How device issues affected patient care **Best for**: Understanding reliability of genomic tests, device-related diagnostic issues ## Advanced Features ### Genomic Device Filtering By default, device searches filter to genomic/diagnostic devices relevant to precision oncology: ```bash # Search only genomic devices (default) biomcp openfda device search --device test # Search ALL medical devices biomcp openfda device search --device test --all-devices ``` The genomic filter includes FDA product codes for: - Next Generation Sequencing panels - Gene mutation detection systems - Tumor profiling tests - Hereditary variant detection systems ### Pagination Support All search tools support pagination for large result sets: ```bash # Get second page of results biomcp openfda adverse search --drug aspirin --page 2 --limit 50 ``` ### Section-Specific Label Retrieval When retrieving drug labels, you can specify which sections to include: ```bash # Get only specific sections biomcp openfda label get SET_ID --sections "indications_and_usage,adverse_reactions" ``` ## Integration with Other BioMCP Tools ### Complementary Data Sources OpenFDA data complements existing BioMCP tools: | Tool | Data Source | 
Best For | | -------------------------- | ------------------ | --------------------------------- | | `drug_getter` | MyChem.info | Chemical properties, mechanisms | | `openfda_label_searcher` | FDA Labels | Official indications, prescribing | | `openfda_adverse_searcher` | FAERS | Safety signals, side effects | | `trial_searcher` | ClinicalTrials.gov | Active trials, eligibility | ### Workflow Examples #### Complete Drug Profile ```python # 1. Get drug chemical info drug_info = await drug_getter("imatinib") # 2. Get FDA label label = await openfda_label_searcher(name="imatinib") # 3. Check adverse events safety = await openfda_adverse_searcher(drug="imatinib", serious=True) # 4. Find current trials trials = await trial_searcher(interventions=["imatinib"]) ``` #### Device Reliability Check ```python # 1. Search for device issues events = await openfda_device_searcher( device="FoundationOne CDx", problem="false" ) # 2. Get specific event details if events: details = await openfda_device_getter("MDR_KEY_HERE") ``` ## Important Considerations ### Data Limitations 1. **Adverse Events**: - Reports don't prove causation - Reporting is voluntary, so not all events are captured - Duplicate reports may exist - Include appropriate disclaimers when presenting data 2. **Drug Labels**: - May not reflect the most recent changes - Off-label uses not included - Generic drugs may have different inactive ingredients 3. **Device Events**: - Not all device problems are reported - User error vs device malfunction can be unclear - Reports may lack complete information ### Rate Limits - **Without API key**: 40 requests/minute per IP - **With API key**: 240 requests/minute per key - **Burst limit**: 4 requests/second ### Best Practices 1. **Always use disclaimers**: Include FDA's disclaimer about adverse events not proving causation 2. **Check multiple sources**: Combine OpenFDA data with other BioMCP tools 3. **Filter appropriately**: Use genomic device filtering for relevant results 4. 
**Handle no results gracefully**: Many specific queries may return no results 5. **Respect rate limits**: Use API key for production use ## Troubleshooting ### Common Issues **No results found** - Try broader search terms - Check spelling of drug/device names - Remove filters to expand search **Rate limit errors** - Add API key to environment - Reduce request frequency - Batch queries when possible **Timeout errors** - OpenFDA API may be slow/down - Retry after a brief wait - Consider caching frequent queries ### Getting Help - OpenFDA documentation: https://open.fda.gov/apis/ - OpenFDA status: https://api.fda.gov/status - BioMCP issues: https://github.com/genomoncology/biomcp/issues ## API Reference ### Environment Variables - `OPENFDA_API_KEY`: Your openFDA API key (optional but recommended) ### CLI Commands ```bash # Adverse Events biomcp openfda adverse search [OPTIONS] --drug TEXT Drug name to search --reaction TEXT Reaction to search --serious/--all Filter serious events --limit INT Results per page (max 100) --page INT Page number biomcp openfda adverse get REPORT_ID # Drug Labels biomcp openfda label search [OPTIONS] --name TEXT Drug name --indication TEXT Indication to search --boxed-warning Has boxed warning --section TEXT Label section --limit INT Results per page --page INT Page number biomcp openfda label get SET_ID [OPTIONS] --sections TEXT Comma-separated sections # Device Events biomcp openfda device search [OPTIONS] --device TEXT Device name --manufacturer TEXT Manufacturer name --problem TEXT Problem description --product-code TEXT FDA product code --genomics-only/--all-devices --limit INT Results per page --page INT Page number biomcp openfda device get MDR_KEY ``` ## Example Outputs ### Adverse Event Search ```markdown ## FDA Adverse Event Reports **Drug**: imatinib | **Serious Events**: Yes **Total Reports Found**: 1,234 reports ### Top Reported Reactions: - **NAUSEA**: 234 reports (19.0%) - **FATIGUE**: 189 reports (15.3%) - **RASH**: 156 
reports (12.6%) ### Sample Reports (showing 3 of 1,234): ... ``` ### Drug Label Search ```markdown ## FDA Drug Labels **Drug**: pembrolizumab **Total Labels Found**: 5 labels ### Results (showing 5 of 5): #### 1. KEYTRUDA **Also known as**: pembrolizumab **FDA Application**: BLA125514 **Manufacturer**: Merck Sharp & Dohme **Route**: INTRAVENOUS ⚠️ **BOXED WARNING**: Immune-mediated adverse reactions... **Indications**: KEYTRUDA is indicated for the treatment of... ``` ### Device Event Search ```markdown ## FDA Device Adverse Event Reports **Device**: FoundationOne | **Type**: Genomic/Diagnostic Devices **Total Reports Found**: 12 reports ### Top Reported Problems: - **False negative result**: 5 reports (41.7%) - **Software malfunction**: 3 reports (25.0%) ### Sample Reports (showing 3 of 12): ... ``` ``` -------------------------------------------------------------------------------- /docs/how-to-guides/02-find-trials-with-nci-and-biothings.md: -------------------------------------------------------------------------------- ```markdown # How to Find Trials with NCI and BioThings This guide demonstrates how to search for clinical trials using BioMCP's dual data sources and automatic disease synonym expansion. ## Overview BioMCP provides access to clinical trials through: - **ClinicalTrials.gov**: Default source with comprehensive U.S. 
and international trials ([API Reference](../backend-services-reference/04-clinicaltrials-gov.md)) - **NCI CTS API**: Advanced cancer trial search with biomarker filtering (requires API key) ([API Reference](../backend-services-reference/05-nci-cts-api.md)) - **BioThings Integration**: Automatic disease synonym expansion for better coverage ([BioThings Reference](../backend-services-reference/02-biothings-suite.md)) ## Basic Trial Search ### Simple Disease Search Find trials for a specific condition: ```bash # CLI biomcp trial search --condition melanoma --status RECRUITING # Python trials = await client.trials.search( conditions=["melanoma"], recruiting_status="RECRUITING" ) # MCP Tool trial_searcher( conditions=["melanoma"], recruiting_status="OPEN" ) ``` ### Search by Intervention Find trials testing specific drugs: ```bash # CLI biomcp trial search --intervention pembrolizumab --phase PHASE3 # Python trials = await client.trials.search( interventions=["pembrolizumab"], phase="PHASE3" ) ``` ## Location-Based Search ### Finding Nearby Trials **Important**: Location searches require latitude and longitude coordinates. 
```python # Find trials near Cleveland, Ohio trials = await trial_searcher( conditions=["lung cancer"], lat=41.4993, long=-81.6944, distance=50 # 50 miles radius ) # Find trials near Boston trials = await trial_searcher( conditions=["breast cancer"], lat=42.3601, long=-71.0589, distance=25 ) ``` ### Getting Coordinates For common locations: - Cleveland: lat=41.4993, long=-81.6944 - Boston: lat=42.3601, long=-71.0589 - New York: lat=40.7128, long=-74.0060 - Los Angeles: lat=34.0522, long=-118.2437 - Houston: lat=29.7604, long=-95.3698 ## Advanced Filtering ### Multiple Criteria Combine multiple filters for precise results: ```python # Complex search example trials = await trial_searcher( conditions=["non-small cell lung cancer", "NSCLC"], interventions=["pembrolizumab", "immunotherapy"], phase="PHASE3", recruiting_status="OPEN", age_group="ADULT", study_type="INTERVENTIONAL", funder_type="INDUSTRY" ) ``` ### Date-Based Filtering Find recently started trials: ```bash # CLI - Trials started in 2024 biomcp trial search \ --condition cancer \ --start-date 2024-01-01 \ --status RECRUITING ``` ## Using NCI API Advanced Features ### Setup NCI API Key Get your key from [api.cancer.gov](https://api.cancer.gov). 
For detailed setup instructions, see [Authentication and API Keys](../getting-started/03-authentication-and-api-keys.md#nci-clinical-trials-api): ```bash export NCI_API_KEY="your-key-here" ``` ### Biomarker-Based Search Find trials for specific mutations: ```python # Search using NCI source trials = await search( domain="trial", source="nci", conditions=["melanoma"], required_mutations=["BRAF V600E"], allow_brain_mets=True, api_key="your-key" ) ``` ### NCI-Specific Parameters ```python # Advanced NCI search trials = await trial_searcher( source="nci", conditions=["lung cancer"], required_mutations=["EGFR L858R", "EGFR exon 19 deletion"], prior_therapy_required=False, allow_brain_mets=True, allow_prior_immunotherapy=False, api_key="your-key" ) ``` ## BioThings Integration for Enhanced Search For technical details on the BioThings APIs, see: - [BioThings Suite Reference](../backend-services-reference/02-biothings-suite.md) ### Automatic Disease Synonym Expansion BioMCP automatically expands disease terms using MyDisease.info: ```python # Searching for "GIST" automatically includes: # - "gastrointestinal stromal tumor" # - "gastrointestinal stromal tumour" # - "GI stromal tumor" trials = await trial_searcher(conditions=["GIST"]) ``` ### Manual Disease Lookup Get all synonyms for a disease: ```python # Get disease information disease_info = await disease_getter("melanoma") # Extract synonyms synonyms = disease_info.synonyms # Returns: ["malignant melanoma", "melanoma, malignant", ...] 
# Use in trial search
trials = await trial_searcher(conditions=synonyms)
```

## Practical Workflows

### Workflow 1: Patient-Centric Trial Search

Find trials for a specific patient profile:

```python
async def find_trials_for_patient(
    disease: str,
    mutations: list[str],
    location: tuple[float, float],
    prior_treatments: list[str]
):
    # Step 1: Think about the search
    await think(
        thought=f"Searching trials for {disease} with {mutations}",
        thoughtNumber=1
    )

    # Step 2: Get disease synonyms
    disease_info = await disease_getter(disease)
    all_conditions = [disease] + disease_info.synonyms

    # Step 3: Search both sources
    # ClinicalTrials.gov
    ctgov_trials = await trial_searcher(
        conditions=all_conditions,
        other_terms=mutations,
        lat=location[0],
        long=location[1],
        distance=100,
        recruiting_status="OPEN"
    )

    # NCI (if API key available); stays None when no key is configured
    nci_trials = None
    if os.getenv("NCI_API_KEY"):
        nci_trials = await trial_searcher(
            source="nci",
            conditions=all_conditions,
            required_mutations=mutations,
            exclude_prior_therapy=prior_treatments,
            api_key=os.getenv("NCI_API_KEY")
        )

    return {
        "clinicaltrials_gov": ctgov_trials,
        "nci": nci_trials
    }

# Example usage
trials = await find_trials_for_patient(
    disease="melanoma",
    mutations=["BRAF V600E"],
    location=(40.7128, -74.0060),  # New York
    prior_treatments=["vemurafenib"]
)
```

### Workflow 2: Research Landscape Analysis

Understand ongoing research in a field:

```python
async def analyze_research_landscape(gene: str, disease: str):
    # Get gene information
    gene_info = await gene_getter(gene)

    # Find all active trials
    all_trials = await trial_searcher(
        conditions=[disease],
        other_terms=[gene, f"{gene} mutation", f"{gene} positive"],
        recruiting_status="OPEN",
        page_size=50
    )

    # Categorize by phase
    phase_distribution = {}
    for trial in all_trials:
        phase = trial.phase or "Not specified"
        phase_distribution[phase] = phase_distribution.get(phase, 0) + 1

    # Extract unique interventions
    interventions = set()
    for trial in all_trials:
        if trial.interventions:
interventions.update(trial.interventions) return { "total_trials": len(all_trials), "phase_distribution": phase_distribution, "unique_interventions": list(interventions), "gene_info": gene_info } # Example landscape = await analyze_research_landscape("ALK", "lung cancer") ``` ### Workflow 3: Biomarker-Driven Search Find trials based on specific biomarkers: ```python async def biomarker_trial_search(biomarkers: list[str], cancer_type: str): # Search NCI biomarker database biomarker_results = [] for biomarker in biomarkers: result = await nci_biomarker_searcher( name=biomarker, api_key=os.getenv("NCI_API_KEY") ) biomarker_results.extend(result) # Extract associated trials trial_ids = set() for bio in biomarker_results: if bio.get("associated_trials"): trial_ids.update(bio["associated_trials"]) # Get trial details trials = [] for nct_id in trial_ids: trial = await trial_getter(nct_id) trials.append(trial) return trials # Example trials = await biomarker_trial_search( biomarkers=["PD-L1", "TMB-high", "MSI-H"], cancer_type="colorectal cancer" ) ``` ## Working with Trial Results ### Extracting Key Information ```python # Process trial results for trial in trials: print(f"NCT ID: {trial.nct_id}") print(f"Title: {trial.title}") print(f"Status: {trial.status}") print(f"Phase: {trial.phase}") # Locations if trial.locations: print("Locations:") for loc in trial.locations: print(f" - {loc.facility}, {loc.city}, {loc.state}") # Eligibility if trial.eligibility: print(f"Age: {trial.eligibility.minimum_age} - {trial.eligibility.maximum_age}") print(f"Sex: {trial.eligibility.sex}") ``` ### Getting Detailed Trial Information ```python # Get complete trial details full_trial = await trial_getter("NCT03006926") # Get specific sections protocol = await trial_protocol_getter("NCT03006926") locations = await trial_locations_getter("NCT03006926") outcomes = await trial_outcomes_getter("NCT03006926") references = await trial_references_getter("NCT03006926") ``` ## Tips for Effective Trial 
Searches ### 1. Use Multiple Search Terms ```python # Cover variations trials = await trial_searcher( conditions=["NSCLC", "non-small cell lung cancer", "lung adenocarcinoma"], interventions=["anti-PD-1", "pembrolizumab", "Keytruda"] ) ``` ### 2. Check Both Data Sources ```python # Some trials may only be in one database ctgov_count = len(await trial_searcher(source="ctgov", conditions=["melanoma"])) nci_count = len(await trial_searcher(source="nci", conditions=["melanoma"])) ``` ### 3. Use Appropriate Filters - **recruiting_status**: Focus on trials accepting patients - **phase**: Later phases for established treatments - **age_group**: Match patient demographics - **study_type**: INTERVENTIONAL vs OBSERVATIONAL ### 4. Leverage Location Search Always include location for patient-specific searches: ```python # Bad - no location trials = await trial_searcher(conditions=["cancer"]) # Good - includes location trials = await trial_searcher( conditions=["cancer"], lat=40.7128, long=-74.0060, distance=50 ) ``` ## Troubleshooting ### No Results Found 1. **Broaden search terms**: Remove specific filters 2. **Check synonyms**: Use disease_getter to find alternatives 3. **Expand location**: Increase distance parameter 4. 
**Try both sources**: Some trials only in NCI or ClinicalTrials.gov ### Location Search Issues - Ensure both latitude AND longitude are provided - Use decimal degrees (not degrees/minutes/seconds) - Check coordinate signs (negative for West/South) ### NCI API Errors - Verify API key is valid - Check rate limits (1000 requests/day with key) - Some features require specific API key permissions ## Next Steps - Learn about [variant annotations](03-get-comprehensive-variant-annotations.md) - Explore [AlphaGenome predictions](04-predict-variant-effects-with-alphagenome.md) - Set up [monitoring and logging](05-logging-and-monitoring-with-bigquery.md) ``` -------------------------------------------------------------------------------- /src/biomcp/variants/search.py: -------------------------------------------------------------------------------- ```python import json import logging from typing import Annotated, Any from pydantic import BaseModel, Field, model_validator from .. import StrEnum, ensure_list, http_client, render from ..constants import MYVARIANT_QUERY_URL, SYSTEM_PAGE_SIZE from .filters import filter_variants from .links import inject_links logger = logging.getLogger(__name__) class ClinicalSignificance(StrEnum): PATHOGENIC = "pathogenic" LIKELY_PATHOGENIC = "likely pathogenic" UNCERTAIN_SIGNIFICANCE = "uncertain significance" LIKELY_BENIGN = "likely benign" BENIGN = "benign" class PolyPhenPrediction(StrEnum): PROBABLY_DAMAGING = "D" POSSIBLY_DAMAGING = "P" BENIGN = "B" class SiftPrediction(StrEnum): DELETERIOUS = "D" TOLERATED = "T" class VariantSources(StrEnum): CADD = "cadd" CGI = "cgi" CIVIC = "civic" CLINVAR = "clinvar" COSMIC = "cosmic" DBNSFP = "dbnsfp" DBSNP = "dbsnp" DOCM = "docm" EMV = "evm" EXAC = "exac" GNOMAD_EXOME = "gnomad_exome" HG19 = "hg19" MUTDB = "mutdb" SNPEFF = "snpeff" VCF = "vcf" MYVARIANT_FIELDS = [ "_id", "chrom", "vcf.position", "vcf.ref", "vcf.alt", "cadd.phred", "civic.id", "civic.openCravatUrl", 
class VariantQuery(BaseModel):
    """Search parameters for querying variant data from MyVariant.info."""

    # --- identity filters -------------------------------------------------
    gene: str | None = Field(
        description="Gene symbol to search for (e.g. BRAF, TP53)",
        default=None,
    )
    hgvsp: str | None = Field(
        description="Protein change notation (e.g., p.V600E, p.Arg557His)",
        default=None,
    )
    hgvsc: str | None = Field(
        description="cDNA notation (e.g., c.1799T>A)",
        default=None,
    )
    rsid: str | None = Field(
        description="dbSNP rsID (e.g., rs113488022)",
        default=None,
    )
    region: str | None = Field(
        description="Genomic region as chr:start-end (e.g. chr1:12345-67890)",
        default=None,
    )

    # --- annotation / score filters --------------------------------------
    significance: ClinicalSignificance | None = Field(
        description="ClinVar clinical significance",
        default=None,
    )
    max_frequency: float | None = Field(
        description="Maximum population allele frequency threshold",
        default=None,
    )
    min_frequency: float | None = Field(
        description="Minimum population allele frequency threshold",
        default=None,
    )
    cadd: float | None = Field(
        description="Minimum CADD phred score",
        default=None,
    )
    polyphen: PolyPhenPrediction | None = Field(
        description="PolyPhen-2 prediction",
        default=None,
    )
    sift: SiftPrediction | None = Field(
        description="SIFT prediction",
        default=None,
    )

    # --- output shaping ---------------------------------------------------
    sources: list[VariantSources] = Field(
        default_factory=list,
        description="Include only specific data sources",
    )
    size: int = Field(
        description="Number of results to return",
        default=SYSTEM_PAGE_SIZE,
    )
    offset: int = Field(
        description="Result offset for pagination",
        default=0,
    )

    @model_validator(mode="after")
    def validate_query_params(self) -> "VariantQuery":
        # A field counts as "supplied" when it is neither None nor equal to
        # its default, so a non-default size/offset/sources also satisfies
        # this check.
        supplied = self.model_dump(exclude_none=True, exclude_defaults=True)
        if supplied:
            return self
        raise ValueError("At least one search parameter is required")
def _construct_query_part( field: str, val: Any | None, operator: str | None = None, quoted: bool = False, ) -> str | None: if val is not None: val = str(val) val = f'"{val}"' if quoted else val operator = operator or "" val = f"{field}:{operator}{val}" return val def build_query_string(query: VariantQuery) -> str: query_parts: list[str] = list(filter(None, [query.region, query.rsid])) query_params = [ ("dbnsfp.genename", query.gene, None, True), ("dbnsfp.hgvsp", query.hgvsp, None, True), ("dbnsfp.hgvsc", query.hgvsc, None, True), ("dbsnp.rsid", query.rsid, None, True), ("clinvar.rcv.clinical_significance", query.significance, None, True), ("gnomad_exome.af.af", query.max_frequency, "<=", False), ("gnomad_exome.af.af", query.min_frequency, ">=", False), ("cadd.phred", query.cadd, ">=", False), ("dbnsfp.polyphen2.hdiv.pred", query.polyphen, None, True), ("dbnsfp.sift.pred", query.sift, None, True), ] for field, val, operator, quoted in query_params: part = _construct_query_part(field, val, operator, quoted) if part is not None: query_parts.append(part) return " AND ".join(query_parts) if query_parts else "*" async def convert_query(query: VariantQuery) -> dict[str, Any]: """Convert a VariantQuery to parameters for the MyVariant.info API.""" fields = MYVARIANT_FIELDS[:] + [f"{s}.*" for s in query.sources] # Optimize common queries to prevent timeouts query_string = build_query_string(query) # Special handling for common BRAF V600E query if query.gene == "BRAF" and query.hgvsp == "V600E": # Use a more specific query that performs better query_string = 'dbnsfp.genename:"BRAF" AND (dbnsfp.aaref:"V" AND dbnsfp.aapos:600 AND dbnsfp.aaalt:"E")' return { "q": query_string, "size": query.size, "from": query.offset, "fields": ",".join(fields), } async def search_variants( query: VariantQuery, output_json: bool = False, include_cbioportal: bool = True, ) -> str: """Search variants using the MyVariant.info API with optional cBioPortal summary.""" params = await 
async def search_variants(
    query: VariantQuery,
    output_json: bool = False,
    include_cbioportal: bool = True,
) -> str:
    """Search variants using the MyVariant.info API with optional cBioPortal summary.

    Args:
        query: Search criteria; converted to request parameters by
            ``convert_query``.
        output_json: Return a JSON string instead of markdown.
        include_cbioportal: When the query names a gene and the request
            succeeded, also try to prepend/attach a cBioPortal summary.

    Returns:
        Markdown (default) or JSON text. On an API error the payload is a
        single ``{"error": ...}`` record rather than an exception.
    """
    request_params = await convert_query(query)
    response, error = await http_client.request_api(
        url=MYVARIANT_QUERY_URL,
        request=request_params,
        method="GET",
        domain="myvariant",
    )

    data: list = response.get("hits", []) if response else []

    if error:
        # Provide more specific error messages for common issues
        timed_out = "timed out" in error.message.lower()
        if timed_out:
            error_msg = (
                "MyVariant.info API request timed out. This can happen with complex queries. "
                "Try narrowing your search criteria or searching by specific identifiers (rsID, HGVS)."
            )
        else:
            error_msg = f"Error {error.code}: {error.message}"
        data = [{"error": error_msg}]
    else:
        data = filter_variants(inject_links(data))

    # The cBioPortal summary is best-effort: it is only attempted for gene
    # searches that succeeded, and any failure is logged and ignored.
    cbioportal_summary = None
    wants_summary = include_cbioportal and query.gene and not error
    if wants_summary:
        try:
            from .cbioportal_search import (
                CBioPortalSearchClient,
                format_cbioportal_search_summary,
            )

            portal = CBioPortalSearchClient()
            gene_summary = await portal.get_gene_search_summary(query.gene)
            if gene_summary:
                cbioportal_summary = format_cbioportal_search_summary(
                    gene_summary
                )
        except Exception as e:
            logger.warning(f"Failed to get cBioPortal summary: {e}")

    if output_json:
        if cbioportal_summary:
            payload = {
                "cbioportal_summary": cbioportal_summary,
                "variants": data,
            }
            return json.dumps(payload, indent=2)
        return json.dumps(data, indent=2)

    markdown = render.to_markdown(data)
    if cbioportal_summary:
        markdown = cbioportal_summary + "\n\n" + markdown
    return markdown


async def _variant_searcher(
    call_benefit: Annotated[
        str,
        "Define and summarize why this function is being called and the intended benefit",
    ],
    gene: Annotated[
        str | None, "Gene symbol to search for (e.g. BRAF, TP53)"
    ] = None,
    hgvsp: Annotated[
        str | None, "Protein change notation (e.g., p.V600E, p.Arg557His)"
    ] = None,
    hgvsc: Annotated[str | None, "cDNA notation (e.g., c.1799T>A)"] = None,
    rsid: Annotated[str | None, "dbSNP rsID (e.g., rs113488022)"] = None,
    region: Annotated[
        str | None, "Genomic region as chr:start-end (e.g. chr1:12345-67890)"
    ] = None,
    significance: Annotated[
        ClinicalSignificance | str | None, "ClinVar clinical significance"
    ] = None,
    max_frequency: Annotated[
        float | None, "Maximum population allele frequency threshold"
    ] = None,
    min_frequency: Annotated[
        float | None, "Minimum population allele frequency threshold"
    ] = None,
    cadd: Annotated[float | None, "Minimum CADD phred score"] = None,
    polyphen: Annotated[
        PolyPhenPrediction | str | None, "PolyPhen-2 prediction"
    ] = None,
    sift: Annotated[SiftPrediction | str | None, "SIFT prediction"] = None,
    sources: Annotated[
        list[VariantSources] | list[str] | str | None,
        "Include only specific data sources (list or comma-separated string)",
    ] = None,
    size: Annotated[int, "Number of results to return"] = SYSTEM_PAGE_SIZE,
    offset: Annotated[int, "Result offset for pagination"] = 0,
) -> str:
    """
    Searches for genetic variants based on specified criteria.

    Parameters:
    - call_benefit: Define and summarize why this function is being called and the intended benefit
    - gene: Gene symbol to search for (e.g. BRAF, TP53)
    - hgvsp: Protein change notation (e.g., p.V600E, p.Arg557His)
    - hgvsc: cDNA notation (e.g., c.1799T>A)
    - rsid: dbSNP rsID (e.g., rs113488022)
    - region: Genomic region as chr:start-end (e.g. chr1:12345-67890)
    - significance: ClinVar clinical significance
    - max_frequency: Maximum population allele frequency threshold
    - min_frequency: Minimum population allele frequency threshold
    - cadd: Minimum CADD phred score
    - polyphen: PolyPhen-2 prediction
    - sift: SIFT prediction
    - sources: Include only specific data sources (list or comma-separated string)
    - size: Number of results to return (default: 10)
    - offset: Result offset for pagination (default: 0)

    Returns:
    Markdown formatted list of matching variants with key annotations
    """
    # call_benefit is accepted for the caller's benefit only; it is not read
    # anywhere in this function.

    # sources may arrive as a list or a comma-separated string; normalise it
    # to a list before validation.
    source_list = ensure_list(sources, split_strings=True)

    # Validation (including the "at least one parameter" rule) happens when
    # the VariantQuery model is constructed.
    criteria = VariantQuery(
        gene=gene,
        hgvsp=hgvsp,
        hgvsc=hgvsc,
        rsid=rsid,
        region=region,
        significance=significance,
        max_frequency=max_frequency,
        min_frequency=min_frequency,
        cadd=cadd,
        polyphen=polyphen,
        sift=sift,
        sources=source_list,
        size=size,
        offset=offset,
    )
    return await search_variants(
        criteria, output_json=False, include_cbioportal=True
    )
@pytest.mark.asyncio
class TestMCPIntegration:
    """Integration tests for the MCP server."""

    async def test_mcp_server_tools_registered(self):
        """Every expected tool name is registered, and the total is 35."""
        registered = await mcp_app.list_tools()

        # Should have 35 tools (2 unified + 1 think + 32 individual including OpenFDA)
        assert len(registered) == 35

        tool_names = [tool.name for tool in registered]

        expected_names = (
            # Unified tools
            "search",
            "fetch",
            "think",
            # Individual tools
            "article_searcher",
            "article_getter",
            "trial_searcher",
            "trial_getter",
            "trial_protocol_getter",
            "trial_references_getter",
            "trial_outcomes_getter",
            "trial_locations_getter",
            "variant_searcher",
            "variant_getter",
            "alphagenome_predictor",
            "gene_getter",
            "drug_getter",
            "disease_getter",
            # OpenFDA tools
            "openfda_adverse_searcher",
            "openfda_adverse_getter",
            "openfda_label_searcher",
            "openfda_label_getter",
            "openfda_device_searcher",
            "openfda_device_getter",
            "openfda_approval_searcher",
            "openfda_approval_getter",
            "openfda_recall_searcher",
            "openfda_recall_getter",
            "openfda_shortage_searcher",
            "openfda_shortage_getter",
        )
        for expected in expected_names:
            assert expected in tool_names

    async def test_mcp_search_tool_schema(self):
        """The search tool exposes query/domain/call_benefit with the right optionality."""
        registered = await mcp_app.list_tools()
        search_tool = next(t for t in registered if t.name == "search")
        schema = search_tool.inputSchema
        properties = schema["properties"]

        # Check required parameters
        for param in ("query", "domain", "call_benefit"):
            assert param in properties

        # Verify query is required (no default value)
        assert "query" in schema.get("required", [])

        # Verify call_benefit is optional
        assert "call_benefit" not in schema.get("required", [])

        # Check domain enum values; the enum is nested in anyOf
        enum_values = properties["domain"]["anyOf"][0]["enum"]
        for domain in ("article", "trial", "variant"):
            assert domain in enum_values
        # The "thinking" domain was removed from the search tool, so it is
        # deliberately not asserted here.

    async def test_mcp_fetch_tool_schema(self):
        """The fetch tool requires only ``id`` and has no thinking domain."""
        registered = await mcp_app.list_tools()
        fetch_tool = next(t for t in registered if t.name == "fetch")
        schema = fetch_tool.inputSchema
        properties = schema["properties"]

        # Check required parameters - only id should be required
        required = schema["required"]
        assert "id" in required
        assert len(required) == 1  # Only id should be required

        # Check optional parameters are present
        for param in ("domain", "call_benefit", "detail"):
            assert param in properties

        # Check domain enum values (no thinking for fetch). The enum may sit
        # directly on the property or be nested inside anyOf.
        domain_schema = properties["domain"]
        if "enum" in domain_schema:
            enum_values = domain_schema["enum"]
        else:
            enum_values = domain_schema.get("anyOf", [{}])[0].get("enum", [])

        for domain in ("article", "trial", "variant"):
            assert domain in enum_values
        assert "thinking" not in enum_values

    async def test_mcp_search_article_integration(self):
        """Article search returns the mocked article as its single result."""
        canned = json.dumps([
            {
                "pmid": "12345",
                "title": "Test Article",
                "abstract": "Test abstract",
            }
        ])

        with patch(
            "biomcp.articles.unified.search_articles_unified"
        ) as mock_search:
            mock_search.return_value = canned

            # Import search function directly since we can't test through MCP without Context
            from biomcp.router import search

            result = await search(
                query="",
                domain="article",
                genes="BRAF",
                page_size=10,
            )

            # Verify the result structure
            assert "results" in result
            # May include thinking reminder as first result
            real_results = [
                r for r in result["results"] if r["id"] != "thinking-reminder"
            ]
            assert len(real_results) == 1
            assert real_results[0]["id"] == "12345"

    async def test_mcp_fetch_variant_integration(self):
        """Variant fetch returns a document with the standard keys."""
        canned = json.dumps([
            {
                "_id": "rs121913529",
                "gene": {"symbol": "BRAF"},
                "clinvar": {"clinical_significance": "Pathogenic"},
            }
        ])

        with patch("biomcp.variants.getter.get_variant") as mock_get:
            mock_get.return_value = canned

            from biomcp.router import fetch

            result = await fetch(
                domain="variant",
                id="rs121913529",
            )

            # Verify the result structure
            assert result["id"] == "rs121913529"
            for key in ("title", "text", "url", "metadata"):
                assert key in result

    async def test_mcp_unified_query_integration(self):
        """A unified query yields results from more than one domain."""
        with patch("biomcp.query_router.execute_routing_plan") as mock_execute:
            mock_execute.return_value = {
                "articles": json.dumps([
                    {"pmid": "111", "title": "Article 1"}
                ]),
                "variants": json.dumps([
                    {"_id": "rs222", "gene": {"symbol": "TP53"}}
                ]),
            }

            from biomcp.router import search

            result = await search(
                query="gene:BRAF AND disease:cancer",
                max_results_per_domain=10,
            )

            # Should get results from multiple domains
            assert "results" in result
            # May include thinking reminder
            real_results = [
                r for r in result["results"] if r["id"] != "thinking-reminder"
            ]
            assert len(real_results) >= 2

    async def test_mcp_thinking_integration(self):
        """The think tool echoes the thought bookkeeping fields."""
        with patch(
            "biomcp.thinking.sequential._sequential_thinking"
        ) as mock_think:
            mock_think.return_value = {
                "thought": "Processed thought",
                "analysis": "Test analysis",
            }

            from biomcp.thinking_tool import think

            # Call the think tool directly
            result = await think(
                thought="Test thought",
                thoughtNumber=1,
                totalThoughts=3,
                nextThoughtNeeded=True,
            )

            # Verify thinking result
            assert result["domain"] == "thinking"
            assert result["thoughtNumber"] == 1
            assert result["nextThoughtNeeded"] is True

    async def test_mcp_error_handling(self):
        """An unknown domain raises InvalidDomainError."""
        from biomcp.exceptions import InvalidDomainError
        from biomcp.router import search

        with pytest.raises(InvalidDomainError) as exc_info:
            await search(
                query="",
                domain="invalid_domain",
            )

        assert "Unknown domain" in str(exc_info.value)

    async def test_mcp_fetch_all_trial_sections(self):
        """Fetching a trial with detail="all" merges every section into metadata."""
        protocol_payload = {"title": "Test Trial", "nct_id": "NCT123"}
        locations_payload = {"locations": [{"city": "Boston"}]}

        with (
            patch("biomcp.trials.getter._trial_protocol") as mock_p,
            patch("biomcp.trials.getter._trial_locations") as mock_l,
            patch("biomcp.trials.getter._trial_outcomes") as mock_o,
            patch("biomcp.trials.getter._trial_references") as mock_r,
        ):
            mock_p.return_value = json.dumps(protocol_payload)
            mock_l.return_value = json.dumps(locations_payload)
            mock_o.return_value = json.dumps({"outcomes": {}})
            mock_r.return_value = json.dumps({"references": []})

            from biomcp.router import fetch

            result = await fetch(
                domain="trial",
                id="NCT123",
                detail="all",
            )

            # Verify all sections are included
            assert result["id"] == "NCT123"
            for section in ("locations", "outcomes", "references"):
                assert section in result["metadata"]

    async def test_mcp_parameter_parsing(self):
        """JSON-string, comma-separated and list parameters all parse to lists."""
        canned = json.dumps([])

        with patch(
            "biomcp.articles.unified.search_articles_unified"
        ) as mock_search:
            mock_search.return_value = canned

            from biomcp.router import search

            # Test with various parameter formats
            await search(
                query="",
                domain="article",
                genes='["BRAF", "KRAS"]',  # JSON string
                diseases="cancer,melanoma",  # Comma-separated
                keywords=["test1", "test2"],  # Already a list
            )

            # Verify parameters were parsed correctly: inspect the first
            # positional argument the mocked search function received.
            request = mock_search.call_args[0][0]
            assert request.genes == ["BRAF", "KRAS"]
            assert request.diseases == ["cancer", "melanoma"]
            assert request.keywords == ["test1", "test2"]
API - calls real APIs.""" import pytest from biomcp.integrations import BioThingsClient @pytest.mark.integration class TestRealBioThingsAPIs: """Integration tests that call real BioThings APIs.""" @pytest.fixture def client(self): """Create a real BioThings client.""" return BioThingsClient() @pytest.mark.asyncio async def test_mygene_tp53(self, client): """Test real MyGene.info API with TP53.""" result = await client.get_gene_info("TP53") assert result is not None assert result.symbol == "TP53" assert result.name == "tumor protein p53" assert result.entrezgene in ["7157", 7157] assert "tumor suppressor" in result.summary.lower() # Check for either lowercase or uppercase P53 in aliases assert any("p53" in alias.lower() for alias in result.alias) @pytest.mark.asyncio async def test_mygene_braf(self, client): """Test real MyGene.info API with BRAF.""" result = await client.get_gene_info("BRAF") assert result is not None assert result.symbol == "BRAF" assert "proto-oncogene" in result.name.lower() assert result.type_of_gene == "protein-coding" @pytest.mark.asyncio async def test_mygene_by_entrez_id(self, client): """Test real MyGene.info API with Entrez ID.""" result = await client.get_gene_info("673") # BRAF assert result is not None assert result.symbol == "BRAF" assert result.gene_id == "673" @pytest.mark.asyncio async def test_mydisease_melanoma(self, client): """Test real MyDisease.info API with melanoma.""" result = await client.get_disease_info("melanoma") if result is None: # API might be down or melanoma might not be found directly # Try a more specific search result = await client.get_disease_info( "MONDO:0005105" ) # MONDO ID for melanoma assert result is not None, "Disease info should be returned" # The API may return subtypes of melanoma if result.name: assert "melanoma" in result.name.lower() or ( result.definition and "melanoma" in result.definition.lower() ) assert result.disease_id is not None # Synonyms might be empty for specific subtypes assert 
result.synonyms is not None @pytest.mark.asyncio async def test_mydisease_gist(self, client): """Test real MyDisease.info API with GIST.""" result = await client.get_disease_info("GIST") if result is None: # API might be down or GIST might not be found directly # Try the full name result = await client.get_disease_info( "gastrointestinal stromal tumor" ) assert result is not None, "Disease info should be returned" # GIST might return as a variant name if result.name: assert ( "gist" in result.name.lower() or "stromal" in result.name.lower() ) assert result.disease_id is not None # GIST should have synonyms including full name if available assert result.synonyms is not None @pytest.mark.asyncio async def test_mydisease_by_mondo_id(self, client): """Test real MyDisease.info API with MONDO ID.""" result = await client.get_disease_info("MONDO:0005105") # melanoma assert result is not None assert result.disease_id == "MONDO:0005105" # The result should have mondo data assert result.mondo is not None assert result.mondo.get("mondo") == "MONDO:0005105" # Name field might come from different sources in the API if result.name: assert "melanoma" in result.name.lower() @pytest.mark.asyncio async def test_disease_synonyms_expansion(self, client): """Test disease synonym expansion.""" synonyms = await client.get_disease_synonyms("lung cancer") assert len(synonyms) >= 1 # At least includes the original term assert "lung cancer" in [s.lower() for s in synonyms] # May or may not include formal terms depending on API results # Just check we got some results back assert synonyms is not None and len(synonyms) > 0 @pytest.mark.asyncio async def test_batch_genes(self, client): """Test batch gene retrieval.""" # Test single gene retrieval as a workaround since batch requires special POST encoding # This validates the gene getter can handle multiple calls efficiently genes = ["TP53", "BRAF", "EGFR"] results = [] for gene in genes: result = await client.get_gene_info(gene) if result: 
results.append(result) assert len(results) == 3 gene_symbols = [r.symbol for r in results] assert "TP53" in gene_symbols assert "BRAF" in gene_symbols assert "EGFR" in gene_symbols @pytest.mark.asyncio async def test_invalid_gene(self, client): """Test handling of invalid gene.""" result = await client.get_gene_info("INVALID_GENE_XYZ123") assert result is None @pytest.mark.asyncio async def test_invalid_disease(self, client): """Test handling of invalid disease.""" result = await client.get_disease_info("INVALID_DISEASE_XYZ123") assert result is None @pytest.mark.asyncio async def test_mychem_aspirin(self, client): """Test real MyChem.info API with aspirin.""" # Use DrugBank ID for reliable results result = await client.get_drug_info("DB00945") assert result is not None # API returns various forms - could be aspirin or acetylsalicylic acid assert result.name is not None assert result.drugbank_id == "DB00945" # Should have at least one identifier assert any([ result.drugbank_id, result.chembl_id, result.chebi_id, result.pubchem_cid, ]) @pytest.mark.asyncio async def test_mychem_imatinib(self, client): """Test real MyChem.info API with imatinib.""" # Use DrugBank ID for reliable results result = await client.get_drug_info("DB00619") assert result is not None assert result.name is not None assert "imatinib" in result.name.lower() assert result.drugbank_id == "DB00619" # Should have at least one identifier assert any([ result.drugbank_id, result.chembl_id, result.chebi_id, result.pubchem_cid, ]) @pytest.mark.asyncio async def test_mychem_by_drugbank_id(self, client): """Test real MyChem.info API with DrugBank ID.""" result = await client.get_drug_info("DB00945") # Aspirin assert result is not None assert result.drugbank_id == "DB00945" assert ( result.name is not None ) # Could be Acetylsalicylic acid or similar @pytest.mark.asyncio async def test_invalid_drug(self, client): """Test handling of invalid drug.""" result = await client.get_drug_info("INVALID_DRUG_XYZ123") 
assert result is None @pytest.mark.asyncio async def test_mychem_pembrolizumab(self, client): """Test real MyChem.info API with pembrolizumab.""" result = await client.get_drug_info("pembrolizumab") assert result is not None assert result.name == "Pembrolizumab" assert result.drugbank_id == "DB09037" assert result.unii == "DPT0O3T46P" assert "PD-1" in result.description assert "antibody" in result.description.lower() @pytest.mark.integration class TestGeneToolIntegration: """Test the gene getter tool with real APIs.""" @pytest.mark.asyncio async def test_gene_getter_tool(self): """Test the gene_getter tool function.""" from biomcp.genes.getter import get_gene result = await get_gene("TP53", output_json=False) assert "TP53" in result assert "tumor protein p53" in result assert "tumor suppressor" in result.lower() # Links might be formatted differently assert "ncbi" in result.lower() or "gene" in result.lower() @pytest.mark.asyncio async def test_gene_getter_json(self): """Test gene_getter with JSON output.""" import json from biomcp.genes.getter import get_gene result = await get_gene("BRAF", output_json=True) data = json.loads(result) assert data["symbol"] == "BRAF" assert "_links" in data assert "NCBI Gene" in data["_links"] @pytest.mark.integration class TestDiseaseToolIntegration: """Test the disease getter tool with real APIs.""" @pytest.mark.asyncio async def test_disease_getter_tool(self): """Test the disease_getter tool function.""" from biomcp.diseases.getter import get_disease result = await get_disease("melanoma", output_json=False) assert "melanoma" in result.lower() assert "MONDO:" in result # In markdown format, links are shown as "MONDO Browser:" not "_links" assert "Browser:" in result or "https://" in result @pytest.mark.asyncio async def test_disease_getter_json(self): """Test disease_getter with JSON output.""" import json from biomcp.diseases.getter import get_disease result = await get_disease("GIST", output_json=True) data = json.loads(result) 
# API might return error or different structure if "error" in data: pytest.skip("Disease not found in API") else: # Check for key fields assert "disease_id" in data or "id" in data or "_id" in data assert "MONDO:" in str(data) @pytest.mark.integration class TestDrugToolIntegration: """Test the drug getter tool with real APIs.""" @pytest.mark.asyncio async def test_drug_getter_tool(self): """Test the drug_getter tool function.""" from biomcp.drugs.getter import get_drug result = await get_drug("DB00945", output_json=False) # Aspirin assert "Drug:" in result assert "DrugBank ID" in result assert "DB00945" in result assert "External Links" in result @pytest.mark.asyncio async def test_drug_getter_json(self): """Test drug_getter with JSON output.""" import json from biomcp.drugs.getter import get_drug result = await get_drug("DB00619", output_json=True) # Imatinib data = json.loads(result) # Check for basic fields assert "drug_id" in data assert "drugbank_id" in data assert data["drugbank_id"] == "DB00619" assert "_links" in data # Should have at least one database link assert any( key in data["_links"] for key in ["DrugBank", "ChEMBL", "PubChem", "ChEBI"] ) ```