This is page 2 of 19. Use http://codebase.md/genomoncology/biomcp?lines=true&page={x} to view the full context. # Directory Structure ``` ├── .github │ ├── actions │ │ └── setup-python-env │ │ └── action.yml │ ├── dependabot.yml │ └── workflows │ ├── ci.yml │ ├── deploy-docs.yml │ ├── main.yml.disabled │ ├── on-release-main.yml │ └── validate-codecov-config.yml ├── .gitignore ├── .pre-commit-config.yaml ├── BIOMCP_DATA_FLOW.md ├── CHANGELOG.md ├── CNAME ├── codecov.yaml ├── docker-compose.yml ├── Dockerfile ├── docs │ ├── apis │ │ ├── error-codes.md │ │ ├── overview.md │ │ └── python-sdk.md │ ├── assets │ │ ├── biomcp-cursor-locations.png │ │ ├── favicon.ico │ │ ├── icon.png │ │ ├── logo.png │ │ ├── mcp_architecture.txt │ │ └── remote-connection │ │ ├── 00_connectors.png │ │ ├── 01_add_custom_connector.png │ │ ├── 02_connector_enabled.png │ │ ├── 03_connect_to_biomcp.png │ │ ├── 04_select_google_oauth.png │ │ └── 05_success_connect.png │ ├── backend-services-reference │ │ ├── 01-overview.md │ │ ├── 02-biothings-suite.md │ │ ├── 03-cbioportal.md │ │ ├── 04-clinicaltrials-gov.md │ │ ├── 05-nci-cts-api.md │ │ ├── 06-pubtator3.md │ │ └── 07-alphagenome.md │ ├── blog │ │ ├── ai-assisted-clinical-trial-search-analysis.md │ │ ├── images │ │ │ ├── deep-researcher-video.png │ │ │ ├── researcher-announce.png │ │ │ ├── researcher-drop-down.png │ │ │ ├── researcher-prompt.png │ │ │ ├── trial-search-assistant.png │ │ │ └── what_is_biomcp_thumbnail.png │ │ └── researcher-persona-resource.md │ ├── changelog.md │ ├── CNAME │ ├── concepts │ │ ├── 01-what-is-biomcp.md │ │ ├── 02-the-deep-researcher-persona.md │ │ └── 03-sequential-thinking-with-the-think-tool.md │ ├── developer-guides │ │ ├── 01-server-deployment.md │ │ ├── 02-contributing-and-testing.md │ │ ├── 03-third-party-endpoints.md │ │ ├── 04-transport-protocol.md │ │ ├── 05-error-handling.md │ │ ├── 06-http-client-and-caching.md │ │ ├── 07-performance-optimizations.md │ │ └── generate_endpoints.py │ ├── faq-condensed.md │ 
├── FDA_SECURITY.md │ ├── genomoncology.md │ ├── getting-started │ │ ├── 01-quickstart-cli.md │ │ ├── 02-claude-desktop-integration.md │ │ └── 03-authentication-and-api-keys.md │ ├── how-to-guides │ │ ├── 01-find-articles-and-cbioportal-data.md │ │ ├── 02-find-trials-with-nci-and-biothings.md │ │ ├── 03-get-comprehensive-variant-annotations.md │ │ ├── 04-predict-variant-effects-with-alphagenome.md │ │ ├── 05-logging-and-monitoring-with-bigquery.md │ │ └── 06-search-nci-organizations-and-interventions.md │ ├── index.md │ ├── policies.md │ ├── reference │ │ ├── architecture-diagrams.md │ │ ├── quick-architecture.md │ │ ├── quick-reference.md │ │ └── visual-architecture.md │ ├── robots.txt │ ├── stylesheets │ │ ├── announcement.css │ │ └── extra.css │ ├── troubleshooting.md │ ├── tutorials │ │ ├── biothings-prompts.md │ │ ├── claude-code-biomcp-alphagenome.md │ │ ├── nci-prompts.md │ │ ├── openfda-integration.md │ │ ├── openfda-prompts.md │ │ ├── pydantic-ai-integration.md │ │ └── remote-connection.md │ ├── user-guides │ │ ├── 01-command-line-interface.md │ │ ├── 02-mcp-tools-reference.md │ │ └── 03-integrating-with-ides-and-clients.md │ └── workflows │ └── all-workflows.md ├── example_scripts │ ├── mcp_integration.py │ └── python_sdk.py ├── glama.json ├── LICENSE ├── lzyank.toml ├── Makefile ├── mkdocs.yml ├── package-lock.json ├── package.json ├── pyproject.toml ├── README.md ├── scripts │ ├── check_docs_in_mkdocs.py │ ├── check_http_imports.py │ └── generate_endpoints_doc.py ├── smithery.yaml ├── src │ └── biomcp │ ├── __init__.py │ ├── __main__.py │ ├── articles │ │ ├── __init__.py │ │ ├── autocomplete.py │ │ ├── fetch.py │ │ ├── preprints.py │ │ ├── search_optimized.py │ │ ├── search.py │ │ └── unified.py │ ├── biomarkers │ │ ├── __init__.py │ │ └── search.py │ ├── cbioportal_helper.py │ ├── circuit_breaker.py │ ├── cli │ │ ├── __init__.py │ │ ├── articles.py │ │ ├── biomarkers.py │ │ ├── diseases.py │ │ ├── health.py │ │ ├── interventions.py │ │ ├── main.py │ │ 
├── openfda.py │ │ ├── organizations.py │ │ ├── server.py │ │ ├── trials.py │ │ └── variants.py │ ├── connection_pool.py │ ├── constants.py │ ├── core.py │ ├── diseases │ │ ├── __init__.py │ │ ├── getter.py │ │ └── search.py │ ├── domain_handlers.py │ ├── drugs │ │ ├── __init__.py │ │ └── getter.py │ ├── exceptions.py │ ├── genes │ │ ├── __init__.py │ │ └── getter.py │ ├── http_client_simple.py │ ├── http_client.py │ ├── individual_tools.py │ ├── integrations │ │ ├── __init__.py │ │ ├── biothings_client.py │ │ └── cts_api.py │ ├── interventions │ │ ├── __init__.py │ │ ├── getter.py │ │ └── search.py │ ├── logging_filter.py │ ├── metrics_handler.py │ ├── metrics.py │ ├── openfda │ │ ├── __init__.py │ │ ├── adverse_events_helpers.py │ │ ├── adverse_events.py │ │ ├── cache.py │ │ ├── constants.py │ │ ├── device_events_helpers.py │ │ ├── device_events.py │ │ ├── drug_approvals.py │ │ ├── drug_labels_helpers.py │ │ ├── drug_labels.py │ │ ├── drug_recalls_helpers.py │ │ ├── drug_recalls.py │ │ ├── drug_shortages_detail_helpers.py │ │ ├── drug_shortages_helpers.py │ │ ├── drug_shortages.py │ │ ├── exceptions.py │ │ ├── input_validation.py │ │ ├── rate_limiter.py │ │ ├── utils.py │ │ └── validation.py │ ├── organizations │ │ ├── __init__.py │ │ ├── getter.py │ │ └── search.py │ ├── parameter_parser.py │ ├── prefetch.py │ ├── query_parser.py │ ├── query_router.py │ ├── rate_limiter.py │ ├── render.py │ ├── request_batcher.py │ ├── resources │ │ ├── __init__.py │ │ ├── getter.py │ │ ├── instructions.md │ │ └── researcher.md │ ├── retry.py │ ├── router_handlers.py │ ├── router.py │ ├── shared_context.py │ ├── thinking │ │ ├── __init__.py │ │ ├── sequential.py │ │ └── session.py │ ├── thinking_tool.py │ ├── thinking_tracker.py │ ├── trials │ │ ├── __init__.py │ │ ├── getter.py │ │ ├── nci_getter.py │ │ ├── nci_search.py │ │ └── search.py │ ├── utils │ │ ├── __init__.py │ │ ├── cancer_types_api.py │ │ ├── cbio_http_adapter.py │ │ ├── endpoint_registry.py │ │ ├── 
gene_validator.py │ │ ├── metrics.py │ │ ├── mutation_filter.py │ │ ├── query_utils.py │ │ ├── rate_limiter.py │ │ └── request_cache.py │ ├── variants │ │ ├── __init__.py │ │ ├── alphagenome.py │ │ ├── cancer_types.py │ │ ├── cbio_external_client.py │ │ ├── cbioportal_mutations.py │ │ ├── cbioportal_search_helpers.py │ │ ├── cbioportal_search.py │ │ ├── constants.py │ │ ├── external.py │ │ ├── filters.py │ │ ├── getter.py │ │ ├── links.py │ │ └── search.py │ └── workers │ ├── __init__.py │ ├── worker_entry_stytch.js │ ├── worker_entry.js │ └── worker.py ├── tests │ ├── bdd │ │ ├── cli_help │ │ │ ├── help.feature │ │ │ └── test_help.py │ │ ├── conftest.py │ │ ├── features │ │ │ └── alphagenome_integration.feature │ │ ├── fetch_articles │ │ │ ├── fetch.feature │ │ │ └── test_fetch.py │ │ ├── get_trials │ │ │ ├── get.feature │ │ │ └── test_get.py │ │ ├── get_variants │ │ │ ├── get.feature │ │ │ └── test_get.py │ │ ├── search_articles │ │ │ ├── autocomplete.feature │ │ │ ├── search.feature │ │ │ ├── test_autocomplete.py │ │ │ └── test_search.py │ │ ├── search_trials │ │ │ ├── search.feature │ │ │ └── test_search.py │ │ ├── search_variants │ │ │ ├── search.feature │ │ │ └── test_search.py │ │ └── steps │ │ └── test_alphagenome_steps.py │ ├── config │ │ └── test_smithery_config.py │ ├── conftest.py │ ├── data │ │ ├── ct_gov │ │ │ ├── clinical_trials_api_v2.yaml │ │ │ ├── trials_NCT04280705.json │ │ │ └── trials_NCT04280705.txt │ │ ├── myvariant │ │ │ ├── myvariant_api.yaml │ │ │ ├── myvariant_field_descriptions.csv │ │ │ ├── variants_full_braf_v600e.json │ │ │ ├── variants_full_braf_v600e.txt │ │ │ └── variants_part_braf_v600_multiple.json │ │ ├── openfda │ │ │ ├── drugsfda_detail.json │ │ │ ├── drugsfda_search.json │ │ │ ├── enforcement_detail.json │ │ │ └── enforcement_search.json │ │ └── pubtator │ │ ├── pubtator_autocomplete.json │ │ └── pubtator3_paper.txt │ ├── integration │ │ ├── test_openfda_integration.py │ │ ├── test_preprints_integration.py │ │ ├── 
test_simple.py │ │ └── test_variants_integration.py │ ├── tdd │ │ ├── articles │ │ │ ├── test_autocomplete.py │ │ │ ├── test_cbioportal_integration.py │ │ │ ├── test_fetch.py │ │ │ ├── test_preprints.py │ │ │ ├── test_search.py │ │ │ └── test_unified.py │ │ ├── conftest.py │ │ ├── drugs │ │ │ ├── __init__.py │ │ │ └── test_drug_getter.py │ │ ├── openfda │ │ │ ├── __init__.py │ │ │ ├── test_adverse_events.py │ │ │ ├── test_device_events.py │ │ │ ├── test_drug_approvals.py │ │ │ ├── test_drug_labels.py │ │ │ ├── test_drug_recalls.py │ │ │ ├── test_drug_shortages.py │ │ │ └── test_security.py │ │ ├── test_biothings_integration_real.py │ │ ├── test_biothings_integration.py │ │ ├── test_circuit_breaker.py │ │ ├── test_concurrent_requests.py │ │ ├── test_connection_pool.py │ │ ├── test_domain_handlers.py │ │ ├── test_drug_approvals.py │ │ ├── test_drug_recalls.py │ │ ├── test_drug_shortages.py │ │ ├── test_endpoint_documentation.py │ │ ├── test_error_scenarios.py │ │ ├── test_europe_pmc_fetch.py │ │ ├── test_mcp_integration.py │ │ ├── test_mcp_tools.py │ │ ├── test_metrics.py │ │ ├── test_nci_integration.py │ │ ├── test_nci_mcp_tools.py │ │ ├── test_network_policies.py │ │ ├── test_offline_mode.py │ │ ├── test_openfda_unified.py │ │ ├── test_pten_r173_search.py │ │ ├── test_render.py │ │ ├── test_request_batcher.py.disabled │ │ ├── test_retry.py │ │ ├── test_router.py │ │ ├── test_shared_context.py.disabled │ │ ├── test_unified_biothings.py │ │ ├── thinking │ │ │ ├── __init__.py │ │ │ └── test_sequential.py │ │ ├── trials │ │ │ ├── test_backward_compatibility.py │ │ │ ├── test_getter.py │ │ │ └── test_search.py │ │ ├── utils │ │ │ ├── test_gene_validator.py │ │ │ ├── test_mutation_filter.py │ │ │ ├── test_rate_limiter.py │ │ │ └── test_request_cache.py │ │ ├── variants │ │ │ ├── constants.py │ │ │ ├── test_alphagenome_api_key.py │ │ │ ├── test_alphagenome_comprehensive.py │ │ │ ├── test_alphagenome.py │ │ │ ├── test_cbioportal_mutations.py │ │ │ ├── 
test_cbioportal_search.py │ │ │ ├── test_external_integration.py │ │ │ ├── test_external.py │ │ │ ├── test_extract_gene_aa_change.py │ │ │ ├── test_filters.py │ │ │ ├── test_getter.py │ │ │ ├── test_links.py │ │ │ └── test_search.py │ │ └── workers │ │ └── test_worker_sanitization.js │ └── test_pydantic_ai_integration.py ├── THIRD_PARTY_ENDPOINTS.md ├── tox.ini ├── uv.lock └── wrangler.toml ``` # Files -------------------------------------------------------------------------------- /src/biomcp/core.py: -------------------------------------------------------------------------------- ```python 1 | """Core module for BioMCP containing shared resources.""" 2 | 3 | from contextlib import asynccontextmanager 4 | from enum import Enum 5 | from typing import Any 6 | 7 | from mcp.server.fastmcp import FastMCP 8 | from mcp.server.fastmcp.utilities.logging import get_logger 9 | 10 | from .logging_filter import setup_logging_filters 11 | 12 | # Set up logger first 13 | logger = get_logger(__name__) 14 | 15 | # Set up logging filters to suppress non-critical ASGI errors 16 | setup_logging_filters() 17 | 18 | 19 | # Define a lifespan function for startup tasks 20 | @asynccontextmanager 21 | async def lifespan(mcp): 22 | """Lifespan context manager for startup/shutdown tasks.""" 23 | # Startup 24 | try: 25 | from .prefetch import start_prefetching 26 | 27 | await start_prefetching() 28 | except Exception as e: 29 | # Don't fail startup if prefetching fails 30 | logger.warning(f"Prefetching failed: {e}") 31 | 32 | yield 33 | 34 | # Shutdown (if needed) 35 | 36 | 37 | # Initialize the MCP app with lifespan 38 | # Note: stateless_http=True is needed for proper streamable HTTP support 39 | mcp_app = FastMCP( 40 | name="BioMCP - Biomedical Model Context Protocol Server", 41 | lifespan=lifespan, 42 | stateless_http=True, # Enable stateless HTTP for streamable transport 43 | ) 44 | 45 | 46 | class StrEnum(str, Enum): 47 | def __str__(self): 48 | return self.value 49 | 50 | @classmethod 
51 | def _missing_(cls, value): 52 | if isinstance(value, str): 53 | for member in cls: 54 | if member.lower() == value.lower(): 55 | return member 56 | m = member.lower().replace(" ", "_") 57 | v = value.lower().replace(" ", "_") 58 | if m == v: 59 | return member 60 | return None 61 | 62 | 63 | class PublicationState(StrEnum): 64 | """Publication state of an article.""" 65 | 66 | PREPRINT = "preprint" 67 | PEER_REVIEWED = "peer_reviewed" 68 | UNKNOWN = "unknown" 69 | 70 | 71 | def ensure_list(value: Any, split_strings: bool = False) -> list[Any]: 72 | """ 73 | Convert a value to a list if it's not already. 74 | 75 | This is particularly useful for handling inputs from LLMs that might 76 | provide comma-separated strings instead of proper lists. 77 | 78 | Args: 79 | value: The value to convert to a list 80 | split_strings: If True, splits string values by comma and strips whitespace. 81 | If False, wraps the string in a list without splitting. 82 | 83 | Returns: 84 | A list containing the value(s) 85 | - If value is None, returns an empty list 86 | - If value is a string and split_strings is True, splits by comma and strips whitespace 87 | - If value is a string and split_strings is False, wraps it in a list 88 | - If value is already a list, returns it unchanged 89 | - For other types, wraps them in a list 90 | """ 91 | if value is None: 92 | return [] 93 | if isinstance(value, str) and split_strings: 94 | # Split by comma and strip whitespace 95 | return [item.strip() for item in value.split(",")] 96 | if isinstance(value, list): 97 | return value 98 | # For any other type, wrap it in a list 99 | return [value] 100 | 101 | 102 | # Set httpx logger to warn level only 103 | httpx_logger = get_logger("httpx") 104 | httpx_logger.setLevel("WARN") 105 | 106 | # Set main logger level 107 | logger.setLevel("INFO") 108 | ``` -------------------------------------------------------------------------------- /src/biomcp/utils/mutation_filter.py: 
-------------------------------------------------------------------------------- ```python 1 | """Mutation filtering utilities.""" 2 | 3 | import re 4 | from collections.abc import Sequence 5 | from typing import Protocol 6 | 7 | 8 | class MutationHitProtocol(Protocol): 9 | """Protocol for mutation hit objects.""" 10 | 11 | protein_change: str 12 | 13 | 14 | class MutationFilter: 15 | """Filter mutations based on specific mutation or pattern.""" 16 | 17 | def __init__( 18 | self, specific_mutation: str | None = None, pattern: str | None = None 19 | ): 20 | """Initialize the filter. 21 | 22 | Args: 23 | specific_mutation: Exact mutation to match (e.g., "V600E") 24 | pattern: Pattern to match (e.g., "V600*" for any V600 mutation) 25 | """ 26 | self.specific_mutation = specific_mutation 27 | self.pattern = pattern 28 | 29 | def matches(self, protein_change: str) -> bool: 30 | """Check if a protein change matches the filter criteria. 31 | 32 | Args: 33 | protein_change: The protein change to check 34 | 35 | Returns: 36 | True if matches, False otherwise 37 | """ 38 | if not protein_change: 39 | return False 40 | 41 | if self.specific_mutation: 42 | return protein_change == self.specific_mutation 43 | 44 | if self.pattern: 45 | return self._matches_pattern(protein_change) 46 | 47 | # No filter specified, match all 48 | return True 49 | 50 | def _matches_pattern(self, protein_change: str) -> bool: 51 | """Check if protein change matches pattern. 52 | 53 | Args: 54 | protein_change: The protein change to check 55 | 56 | Returns: 57 | True if matches pattern, False otherwise 58 | """ 59 | if not self.pattern: 60 | return False 61 | 62 | if self.pattern.endswith("*"): 63 | # Wildcard pattern (e.g., "V600*" matches "V600E", "V600K", etc.) 
64 | prefix = self.pattern[:-1] 65 | return protein_change.startswith(prefix) 66 | 67 | # Try regex match 68 | try: 69 | # Escape special regex characters except * 70 | escaped_pattern = re.escape(self.pattern).replace(r"\*", ".*") 71 | return bool(re.match(f"^{escaped_pattern}$", protein_change)) 72 | except re.error: 73 | # Fallback to simple prefix match 74 | return protein_change.startswith(self.pattern) 75 | 76 | def filter_mutations( 77 | self, mutations: Sequence[MutationHitProtocol] 78 | ) -> list[MutationHitProtocol]: 79 | """Filter a list of mutations. 80 | 81 | Args: 82 | mutations: List of mutation objects with protein_change attribute 83 | 84 | Returns: 85 | Filtered list of mutations 86 | """ 87 | if not self.specific_mutation and not self.pattern: 88 | return list(mutations) 89 | 90 | return [mut for mut in mutations if self.matches(mut.protein_change)] 91 | 92 | def __str__(self) -> str: 93 | """String representation of the filter.""" 94 | if self.specific_mutation: 95 | return f"MutationFilter(specific={self.specific_mutation})" 96 | elif self.pattern: 97 | return f"MutationFilter(pattern={self.pattern})" 98 | else: 99 | return "MutationFilter(no_filter)" 100 | 101 | def __repr__(self) -> str: 102 | """Detailed representation of the filter.""" 103 | return f"MutationFilter(specific_mutation={self.specific_mutation!r}, pattern={self.pattern!r})" 104 | ``` -------------------------------------------------------------------------------- /docs/apis/overview.md: -------------------------------------------------------------------------------- ```markdown 1 | # API Reference Overview 2 | 3 | BioMCP provides multiple interfaces for programmatic access to biomedical data. This reference covers the Python SDK, MCP protocol implementation, and HTTP API endpoints. 4 | 5 | ## Available APIs 6 | 7 | ### 1. 
Python SDK 8 | 9 | The Python SDK provides async/await interfaces for all BioMCP functionality: 10 | 11 | - **Client API**: High-level client for all domains 12 | - **Domain-specific APIs**: Specialized interfaces for articles, trials, variants 13 | - **Streaming API**: For real-time data processing 14 | - **Batch API**: For bulk operations 15 | 16 | See [Python SDK Reference](python-sdk.md) for detailed documentation. 17 | 18 | ### 2. MCP Protocol 19 | 20 | BioMCP implements the Model Context Protocol for AI assistant integration: 21 | 22 | - **24 specialized tools** for biomedical research 23 | - **Unified search** across all domains 24 | - **Sequential thinking** for complex queries 25 | - **Streaming responses** for large datasets 26 | 27 | See [MCP Tools Reference](../user-guides/02-mcp-tools-reference.md) for implementation details. 28 | 29 | ### 3. HTTP REST API 30 | 31 | When running in HTTP mode, BioMCP exposes RESTful endpoints: 32 | 33 | - **Search endpoints** for each domain 34 | - **Fetch endpoints** for detailed records 35 | - **Health monitoring** endpoints 36 | - **WebSocket support** for streaming 37 | 38 | See [Transport Protocol Guide](../developer-guides/04-transport-protocol.md) for endpoint documentation. 39 | 40 | ## Common Patterns 41 | 42 | ### Authentication 43 | 44 | Most endpoints work without authentication. 
API keys enable enhanced features: 45 | 46 | ```python 47 | # Python SDK 48 | client = BioMCPClient( 49 | nci_api_key="your-key", 50 | alphagenome_api_key="your-key" 51 | ) 52 | 53 | # HTTP API 54 | headers = { 55 | "X-NCI-API-Key": "your-key", 56 | "X-AlphaGenome-API-Key": "your-key" 57 | } 58 | ``` 59 | 60 | ### Error Handling 61 | 62 | All APIs use consistent error codes: 63 | 64 | | Code | Meaning | Action | 65 | | ---- | ------------ | ------------------ | 66 | | 400 | Bad Request | Check parameters | 67 | | 401 | Unauthorized | Check API key | 68 | | 404 | Not Found | Verify ID exists | 69 | | 429 | Rate Limited | Retry with backoff | 70 | | 500 | Server Error | Retry later | 71 | 72 | ### Pagination 73 | 74 | Standard pagination across all APIs: 75 | 76 | ```python 77 | # Python SDK 78 | results = await client.search( 79 | domain="article", 80 | page=1, 81 | page_size=20 82 | ) 83 | 84 | # HTTP API 85 | GET /api/articles?page=1&page_size=20 86 | ``` 87 | 88 | ### Response Formats 89 | 90 | All APIs support multiple response formats: 91 | 92 | - **JSON**: Default, structured data 93 | - **JSONL**: Streaming line-delimited JSON 94 | - **Markdown**: Human-readable formatting 95 | - **CSV**: Tabular data export 96 | 97 | ## Rate Limits 98 | 99 | | API | Without Key | With Key | 100 | | ------------------ | ----------- | ------------ | 101 | | PubMed/PubTator3 | 3 req/sec | 10 req/sec | 102 | | ClinicalTrials.gov | 50 req/min | 50 req/min | 103 | | BioThings | 3 req/sec | 10 req/sec | 104 | | NCI | N/A | 1000 req/day | 105 | | AlphaGenome | N/A | 100 req/day | 106 | 107 | ## Next Steps 108 | 109 | - [Python SDK Reference](python-sdk.md) - Detailed Python API documentation 110 | - [MCP Tools Reference](../user-guides/02-mcp-tools-reference.md) - MCP implementation details 111 | - [Transport Protocol Guide](../developer-guides/04-transport-protocol.md) - REST endpoint documentation 112 | - [Error Codes Reference](error-codes.md) - Complete error code listing 113 | 
``` -------------------------------------------------------------------------------- /example_scripts/python_sdk.py: -------------------------------------------------------------------------------- ```python 1 | #!/usr/bin/env -S uv --quiet run --script 2 | # /// script 3 | # requires-python = ">=3.11" 4 | # dependencies = [ 5 | # "biomcp-python", 6 | # ] 7 | # /// 8 | 9 | # Scripts to reproduce this page: 10 | # https://biomcp.org/python_sdk/ 11 | 12 | import asyncio 13 | import json 14 | 15 | from biomcp.trials.search import ( 16 | RecruitingStatus, 17 | TrialPhase, 18 | TrialQuery, 19 | search_trials, 20 | ) 21 | from biomcp.variants.getter import get_variant 22 | from biomcp.variants.search import VariantQuery, search_variants 23 | 24 | 25 | async def find_pathogenic_tp53(): 26 | # noinspection PyTypeChecker 27 | query = VariantQuery(gene="TP53", significance="pathogenic", size=5) 28 | # Get results as Markdown (default) 29 | json_output_str = await search_variants(query, output_json=True) 30 | data = json.loads(json_output_str) 31 | assert len(data) == 5 32 | for item in data: 33 | clinvar = item.get("clinvar") 34 | for rcv in clinvar.get("rcv", []): 35 | assert "pathogenic" in rcv["clinical_significance"].lower() 36 | 37 | 38 | async def get_braf_v600e_details(): 39 | variant_id = "chr7:g.140453136A>T" # BRAF V600E variant 40 | 41 | # Get results as JSON string 42 | json_output_str = await get_variant(variant_id, output_json=True) 43 | data = json.loads(json_output_str) 44 | 45 | # Process the variant data 46 | assert data, "No data returned for BRAF V600E variant" 47 | variant = data[0] 48 | clinvar = variant.get("clinvar", {}) 49 | cosmic = variant.get("cosmic", {}) 50 | docm = variant.get("docm", {}) 51 | 52 | # Verify key variant details 53 | assert clinvar.get("gene", {}).get("symbol") == "BRAF" 54 | assert clinvar.get("chrom") == "7" 55 | assert clinvar.get("cytogenic") == "7q34" 56 | assert cosmic.get("cosmic_id") == "COSM476" 57 | assert 
docm.get("aa_change") == "p.V600E" 58 | 59 | # Verify HGVS coding variants 60 | hgvs_coding = clinvar.get("hgvs", {}).get("coding", []) 61 | assert len(hgvs_coding) >= 13 62 | assert "NM_004333.6:c.1799T>A" in hgvs_coding 63 | 64 | 65 | async def find_melanoma_trials(): 66 | query = TrialQuery( 67 | conditions=["Melanoma"], 68 | interventions=["Pembrolizumab"], 69 | recruiting_status=RecruitingStatus.OPEN, 70 | phase=TrialPhase.PHASE3, 71 | ) 72 | 73 | # Get results as JSON string 74 | json_output_str = await search_trials(query, output_json=True) 75 | data = json.loads(json_output_str) 76 | 77 | # Verify we got results 78 | assert data, "No trials found" 79 | assert len(data) >= 2, "Expected at least 2 melanoma trials" 80 | 81 | # Verify first trial details (NCT05727904) 82 | trial1 = data[0] 83 | assert trial1["NCT Number"] == "NCT05727904" 84 | assert "lifileucel" in trial1["Study Title"].lower() 85 | assert trial1["Study Status"] == "RECRUITING" 86 | assert trial1["Phases"] == "PHASE3" 87 | assert int(trial1["Enrollment"]) == 670 88 | assert "Melanoma" in trial1["Conditions"] 89 | assert "Pembrolizumab" in trial1["Interventions"] 90 | 91 | # Verify second trial details (NCT06697301) 92 | trial2 = data[1] 93 | assert trial2["NCT Number"] == "NCT06697301" 94 | assert "EIK1001" in trial2["Study Title"] 95 | assert trial2["Study Status"] == "RECRUITING" 96 | assert "PHASE3" in trial2["Phases"] 97 | assert int(trial2["Enrollment"]) == 740 98 | assert trial2["Conditions"] == "Advanced Melanoma" 99 | 100 | 101 | def run(): 102 | asyncio.run(find_pathogenic_tp53()) 103 | asyncio.run(get_braf_v600e_details()) 104 | asyncio.run(find_melanoma_trials()) 105 | 106 | 107 | if __name__ == "__main__": 108 | run() 109 | ``` -------------------------------------------------------------------------------- /src/biomcp/genes/getter.py: -------------------------------------------------------------------------------- ```python 1 | """Gene information retrieval from MyGene.info.""" 2 
| 3 | import json 4 | import logging 5 | from typing import Annotated 6 | 7 | from pydantic import Field 8 | 9 | from ..integrations import BioThingsClient 10 | from ..render import to_markdown 11 | 12 | logger = logging.getLogger(__name__) 13 | 14 | 15 | async def get_gene( 16 | gene_id_or_symbol: str, 17 | output_json: bool = False, 18 | ) -> str: 19 | """ 20 | Get gene information from MyGene.info. 21 | 22 | Args: 23 | gene_id_or_symbol: Gene ID (Entrez, Ensembl) or symbol (e.g., "TP53", "7157") 24 | output_json: Return as JSON instead of markdown 25 | 26 | Returns: 27 | Gene information as markdown or JSON string 28 | """ 29 | client = BioThingsClient() 30 | 31 | try: 32 | gene_info = await client.get_gene_info(gene_id_or_symbol) 33 | 34 | if not gene_info: 35 | error_data = { 36 | "error": f"Gene '{gene_id_or_symbol}' not found", 37 | "suggestion": "Please check the gene symbol or ID", 38 | } 39 | return ( 40 | json.dumps(error_data, indent=2) 41 | if output_json 42 | else to_markdown([error_data]) 43 | ) 44 | 45 | # Convert to dict for rendering 46 | result = gene_info.model_dump(exclude_none=True) 47 | 48 | # Add helpful links 49 | if gene_info.entrezgene: 50 | result["_links"] = { 51 | "NCBI Gene": f"https://www.ncbi.nlm.nih.gov/gene/{gene_info.entrezgene}", 52 | "PubMed": f"https://pubmed.ncbi.nlm.nih.gov/?term={gene_info.symbol}", 53 | } 54 | 55 | # Format aliases nicely 56 | if gene_info.alias: 57 | result["alias"] = ", ".join( 58 | gene_info.alias[:10] 59 | ) # Limit to first 10 60 | if len(gene_info.alias) > 10: 61 | result["alias"] += f" (and {len(gene_info.alias) - 10} more)" 62 | 63 | if output_json: 64 | return json.dumps(result, indent=2) 65 | else: 66 | return to_markdown([result]) 67 | 68 | except Exception as e: 69 | logger.error(f"Error fetching gene info for {gene_id_or_symbol}: {e}") 70 | error_data = { 71 | "error": "Failed to retrieve gene information", 72 | "details": str(e), 73 | } 74 | return ( 75 | json.dumps(error_data, indent=2) 76 | 
if output_json 77 | else to_markdown([error_data]) 78 | ) 79 | 80 | 81 | async def _gene_details( 82 | call_benefit: Annotated[ 83 | str, 84 | "Define and summarize why this function is being called and the intended benefit", 85 | ], 86 | gene_id_or_symbol: Annotated[ 87 | str, 88 | Field(description="Gene symbol (e.g., TP53, BRAF) or ID (e.g., 7157)"), 89 | ], 90 | ) -> str: 91 | """ 92 | Retrieves detailed information for a single gene from MyGene.info. 93 | 94 | This tool provides real-time gene annotations including: 95 | - Official gene name and symbol 96 | - Gene summary/description 97 | - Aliases and alternative names 98 | - Gene type (protein-coding, etc.) 99 | - Links to external databases 100 | 101 | Parameters: 102 | - call_benefit: Define why this function is being called 103 | - gene_id_or_symbol: Gene symbol (e.g., "TP53") or Entrez ID (e.g., "7157") 104 | 105 | Process: Queries MyGene.info API for up-to-date gene annotations 106 | Output: Markdown formatted gene information with description and metadata 107 | 108 | Note: For variant information, use variant_searcher. For articles about genes, use article_searcher. 109 | """ 110 | return await get_gene(gene_id_or_symbol, output_json=False) 111 | ``` -------------------------------------------------------------------------------- /src/biomcp/openfda/drug_recalls_helpers.py: -------------------------------------------------------------------------------- ```python 1 | """ 2 | Helper functions for drug recall search to reduce complexity. 
3 | """ 4 | 5 | 6 | def build_drug_search_query(drug: str) -> str: 7 | """Build search query for drug name.""" 8 | return ( 9 | f'(openfda.brand_name:"{drug}" OR ' 10 | f'openfda.generic_name:"{drug}" OR ' 11 | f'product_description:"{drug}")' 12 | ) 13 | 14 | 15 | def build_class_search_query(recall_class: str) -> str | None: 16 | """Build search query for recall classification.""" 17 | # Handle various input formats 18 | recall_class = recall_class.strip() 19 | 20 | # If already in "Class X" format, use it directly 21 | if recall_class.upper().startswith("CLASS "): 22 | return f'classification:"{recall_class.title()}"' 23 | 24 | # Map single digits/numerals to Class format 25 | class_map = { 26 | "1": "Class I", 27 | "I": "Class I", 28 | "2": "Class II", 29 | "II": "Class II", 30 | "3": "Class III", 31 | "III": "Class III", 32 | } 33 | if mapped_class := class_map.get(recall_class.upper()): 34 | return f'classification:"{mapped_class}"' 35 | return None 36 | 37 | 38 | def build_status_search_query(status: str) -> str | None: 39 | """Build search query for recall status.""" 40 | status_lower = status.lower() 41 | if status_lower in ["ongoing", "completed", "terminated"]: 42 | return f'status:"{status_lower.capitalize()}"' 43 | return None 44 | 45 | 46 | def build_date_search_query(since_date: str) -> str | None: 47 | """Build search query for date range.""" 48 | if len(since_date) == 8: 49 | formatted_date = f"{since_date[:4]}-{since_date[4:6]}-{since_date[6:]}" 50 | return f"recall_initiation_date:[{formatted_date} TO *]" 51 | return None 52 | 53 | 54 | def format_recall_search_header( 55 | drug: str | None, 56 | recall_class: str | None, 57 | status: str | None, 58 | since_date: str | None, 59 | total: int, 60 | ) -> list[str]: 61 | """Format header for recall search results.""" 62 | output = [] 63 | 64 | if drug: 65 | output.append(f"**Drug**: {drug}") 66 | if recall_class: 67 | output.append(f"**Classification**: Class {recall_class}") 68 | if status: 69 | 
output.append(f"**Status**: {status}") 70 | if since_date: 71 | output.append(f"**Since**: {since_date}") 72 | 73 | return output 74 | 75 | 76 | def build_recall_search_params( 77 | drug: str | None, 78 | recall_class: str | None, 79 | status: str | None, 80 | reason: str | None, 81 | since_date: str | None, 82 | limit: int, 83 | skip: int, 84 | ) -> dict: 85 | """Build search parameters for recall API.""" 86 | # Build search query 87 | search_parts = [] 88 | 89 | # Default to human drugs only (exclude veterinary) 90 | search_parts.append('product_type:"Human"') 91 | 92 | if drug: 93 | search_parts.append(build_drug_search_query(drug)) 94 | 95 | if recall_class and ( 96 | class_query := build_class_search_query(recall_class) 97 | ): 98 | search_parts.append(class_query) 99 | 100 | if status and (status_query := build_status_search_query(status)): 101 | search_parts.append(status_query) 102 | 103 | if reason: 104 | search_parts.append(f'reason_for_recall:"{reason}"') 105 | 106 | if since_date and (date_query := build_date_search_query(since_date)): 107 | search_parts.append(date_query) 108 | 109 | # Combine search parts 110 | search_params = {} 111 | if search_parts: 112 | search_params["search"] = " AND ".join(search_parts) 113 | 114 | # Add pagination 115 | search_params["limit"] = str(min(limit, 100)) 116 | search_params["skip"] = str(skip) 117 | 118 | # Sort by recall date (most recent first) 119 | search_params["sort"] = "recall_initiation_date:desc" 120 | 121 | return search_params 122 | ``` -------------------------------------------------------------------------------- /src/biomcp/shared_context.py: -------------------------------------------------------------------------------- ```python 1 | """Shared context for search operations to avoid redundant validations. 
2 | 3 | This module provides a context manager that maintains validated entities 4 | (genes, diseases, chemicals) across multiple search operations to improve 5 | performance by eliminating redundant API calls. 6 | 7 | Example: 8 | ```python 9 | from biomcp.shared_context import SearchContextManager 10 | 11 | with SearchContextManager() as context: 12 | # First validation hits the API 13 | is_valid = await context.validate_gene("BRAF") 14 | 15 | # Subsequent validation uses cache 16 | is_valid_again = await context.validate_gene("BRAF") 17 | ``` 18 | """ 19 | 20 | from typing import Any 21 | 22 | 23 | class SearchContext: 24 | """Shared context to avoid redundant operations across searches. 25 | 26 | This class maintains a cache of validated entities to prevent 27 | redundant API calls during a search session. 28 | 29 | Attributes: 30 | validated_genes: Cache of gene validation results 31 | validated_cache: General validation cache for other entities 32 | """ 33 | 34 | def __init__(self): 35 | self.validated_genes: dict[str, bool] = {} 36 | self.gene_summaries: dict[str, Any] = {} 37 | self.cancer_types: dict[str, Any] | None = None 38 | self._validation_cache: dict[str, Any] = {} 39 | 40 | async def validate_gene(self, gene: str) -> bool: 41 | """Validate gene symbol with caching.""" 42 | if gene in self.validated_genes: 43 | return self.validated_genes[gene] 44 | 45 | # Import here to avoid circular imports 46 | from .utils.gene_validator import is_valid_gene_symbol 47 | 48 | is_valid = is_valid_gene_symbol(gene) 49 | self.validated_genes[gene] = is_valid 50 | return is_valid 51 | 52 | def get_gene_summary(self, gene: str) -> Any | None: 53 | """Get cached gene summary if available.""" 54 | return self.gene_summaries.get(gene) 55 | 56 | def set_gene_summary(self, gene: str, summary: Any): 57 | """Cache gene summary.""" 58 | self.gene_summaries[gene] = summary 59 | 60 | def cache_validation(self, key: str, value: Any): 61 | """Cache arbitrary validation 
results.""" 62 | self._validation_cache[key] = value 63 | 64 | def get_cached_validation(self, key: str) -> Any | None: 65 | """Get cached validation result.""" 66 | return self._validation_cache.get(key) 67 | 68 | 69 | # Thread-local context for current search operation 70 | _search_context: SearchContext | None = None 71 | 72 | 73 | def get_search_context() -> SearchContext | None: 74 | """Get the current search context.""" 75 | return _search_context 76 | 77 | 78 | def set_search_context(context: SearchContext | None): 79 | """Set the current search context.""" 80 | global _search_context 81 | _search_context = context 82 | 83 | 84 | class SearchContextManager: 85 | """Context manager for search operations.""" 86 | 87 | _instance = None 88 | 89 | def __init__(self): 90 | self.context = None 91 | self.previous_context = None 92 | 93 | def __enter__(self): 94 | # Use singleton pattern within context 95 | if SearchContextManager._instance is None: 96 | SearchContextManager._instance = SearchContext() 97 | self.context = SearchContextManager._instance 98 | self.previous_context = get_search_context() 99 | set_search_context(self.context) 100 | return self.context 101 | 102 | def __exit__(self, exc_type, exc_val, exc_tb): 103 | set_search_context(self.previous_context) 104 | # Clear singleton when last context exits 105 | if self.previous_context is None: 106 | SearchContextManager._instance = None 107 | return False 108 | ``` -------------------------------------------------------------------------------- /src/biomcp/utils/request_cache.py: -------------------------------------------------------------------------------- ```python 1 | """Simple request-level caching for API calls.""" 2 | 3 | import asyncio 4 | import time 5 | from collections import OrderedDict 6 | from collections.abc import Awaitable, Callable 7 | from functools import wraps 8 | from typing import Any, TypeVar 9 | 10 | 11 | # LRU cache with size limit 12 | class LRUCache: 13 | """Simple LRU cache 
with TTL support.""" 14 | 15 | def __init__(self, max_size: int = 1000): 16 | self.cache: OrderedDict[str, tuple[Any, float]] = OrderedDict() 17 | self.max_size = max_size 18 | self._lock = asyncio.Lock() 19 | 20 | async def get(self, key: str) -> Any | None: 21 | """Get item from cache if not expired.""" 22 | async with self._lock: 23 | if key not in self.cache: 24 | return None 25 | 26 | value, expiry = self.cache[key] 27 | if time.time() > expiry: 28 | del self.cache[key] 29 | return None 30 | 31 | # Move to end (most recently used) 32 | self.cache.move_to_end(key) 33 | return value 34 | 35 | async def set(self, key: str, value: Any, ttl: float): 36 | """Set item in cache with TTL.""" 37 | async with self._lock: 38 | # Remove oldest items if at capacity 39 | while len(self.cache) >= self.max_size: 40 | self.cache.popitem(last=False) 41 | 42 | expiry = time.time() + ttl 43 | self.cache[key] = (value, expiry) 44 | 45 | 46 | # Global LRU cache instance 47 | _cache = LRUCache(max_size=1000) 48 | 49 | # Default TTL in seconds (15 minutes) 50 | DEFAULT_TTL = 900 51 | 52 | # Named caches for different purposes 53 | _named_caches: dict[str, LRUCache] = {} 54 | 55 | 56 | def get_cache( 57 | name: str, ttl_seconds: int = 300, max_size: int = 100 58 | ) -> LRUCache: 59 | """Get or create a named cache.""" 60 | if name not in _named_caches: 61 | _named_caches[name] = LRUCache(max_size=max_size) 62 | return _named_caches[name] 63 | 64 | 65 | T = TypeVar("T") 66 | 67 | 68 | def cache_key(*args, **kwargs) -> str: 69 | """Generate a cache key from function arguments.""" 70 | key_parts = [str(arg) for arg in args] 71 | key_parts.extend(f"{k}={v}" for k, v in sorted(kwargs.items())) 72 | return ":".join(key_parts) 73 | 74 | 75 | async def get_cached(key: str) -> Any | None: 76 | """Get a value from cache if not expired.""" 77 | return await _cache.get(key) 78 | 79 | 80 | async def set_cached(key: str, value: Any, ttl: int = DEFAULT_TTL) -> None: 81 | """Set a value in cache with 
TTL.""" 82 | await _cache.set(key, value, ttl) 83 | 84 | 85 | def request_cache(ttl: int = DEFAULT_TTL) -> Callable: 86 | """Decorator for caching async function results. 87 | 88 | Args: 89 | ttl: Time to live in seconds 90 | 91 | Returns: 92 | Decorated function with caching 93 | """ 94 | 95 | def decorator( 96 | func: Callable[..., Awaitable[T]], 97 | ) -> Callable[..., Awaitable[T]]: 98 | @wraps(func) 99 | async def wrapper(*args, **kwargs) -> T: 100 | # Skip caching if explicitly disabled 101 | if kwargs.pop("skip_cache", False): 102 | return await func(*args, **kwargs) 103 | 104 | # Generate cache key 105 | key = f"{func.__module__}.{func.__name__}:{cache_key(*args, **kwargs)}" 106 | 107 | # Check cache 108 | cached_value = await get_cached(key) 109 | if cached_value is not None: 110 | return cached_value 111 | 112 | # Call function and cache result 113 | result = await func(*args, **kwargs) 114 | if result is not None: # Only cache non-None results 115 | await set_cached(key, result, ttl) 116 | 117 | return result 118 | 119 | return wrapper 120 | 121 | return decorator 122 | 123 | 124 | async def clear_cache() -> None: 125 | """Clear all cached entries.""" 126 | # Use the LRU cache's clear method 127 | _cache.cache.clear() 128 | ``` -------------------------------------------------------------------------------- /src/biomcp/utils/cbio_http_adapter.py: -------------------------------------------------------------------------------- ```python 1 | """Adapter for using centralized HTTP client with cBioPortal API. 2 | 3 | This module provides a thin wrapper around the centralized HTTP client 4 | specifically for cBioPortal API calls. 
It handles:
- Authorization header injection for authenticated requests
- Consistent error handling and response formatting
- Endpoint-specific caching and rate limiting
- Seamless migration from direct httpx usage

Example:
    adapter = CBioHTTPAdapter()
    data, error = await adapter.get("/genes/BRAF")
    if error:
        print(f"Failed to fetch gene: {error}")
    else:
        print(f"Gene ID: {data.get('entrezGeneId')}")
"""

import json
from typing import Any

from ..http_client import RequestError, request_api
from ..variants.constants import CBIO_BASE_URL, CBIO_TOKEN


class CBioHTTPAdapter:
    """Adapter for cBioPortal API calls using centralized HTTP client."""

    def __init__(self):
        # Base URL and auth headers are resolved once at construction time.
        self.base_url = CBIO_BASE_URL
        self.headers = self._build_headers()

    def _build_headers(self) -> dict[str, str]:
        """Build authorization headers if token is available."""
        headers = {}
        if CBIO_TOKEN:
            # Accept tokens supplied either bare or already "Bearer "-prefixed.
            if not CBIO_TOKEN.startswith("Bearer "):
                headers["Authorization"] = f"Bearer {CBIO_TOKEN}"
            else:
                headers["Authorization"] = CBIO_TOKEN
        return headers

    async def get(
        self,
        path: str,
        params: dict[str, Any] | None = None,
        endpoint_key: str = "cbioportal_api",
        cache_ttl: int = 900,  # 15 minutes default
    ) -> tuple[dict[str, Any] | None, RequestError | None]:
        """Make a GET request to cBioPortal API.

        Args:
            path: API path (e.g., "/genes/BRAF")
            params: Query parameters
            endpoint_key: Registry key for endpoint tracking
            cache_ttl: Cache time-to-live in seconds

        Returns:
            Tuple of (response_data, error)
        """
        url = f"{self.base_url}{path}"

        # Prepare request with headers. The centralized client has no
        # separate headers argument, so auth headers are tunneled through
        # the params dict as a JSON-encoded "_headers" entry.
        request_params = params or {}
        if self.headers:
            # Need to pass headers through params for centralized client
            request_params["_headers"] = json.dumps(self.headers)

        result, error = await request_api(
            url=url,
            request=request_params,
            method="GET",
            domain="cbioportal",  # For rate limiting
            endpoint_key=endpoint_key,
            cache_ttl=cache_ttl,
            enable_retry=True,
        )

        return result, error

    async def post(
        self,
        path: str,
        data: dict[str, Any],
        endpoint_key: str = "cbioportal_api",
        cache_ttl: int = 0,  # No caching for POST by default
    ) -> tuple[dict[str, Any] | None, RequestError | None]:
        """Make a POST request to cBioPortal API.

        Args:
            path: API path
            data: Request body data
            endpoint_key: Registry key for endpoint tracking
            cache_ttl: Cache time-to-live in seconds

        Returns:
            Tuple of (response_data, error)
        """
        url = f"{self.base_url}{path}"

        # Add headers to request.
        # NOTE(review): this mutates the caller's *data* dict by injecting
        # a "_headers" key -- confirm callers do not reuse the dict.
        if self.headers:
            data["_headers"] = json.dumps(self.headers)

        result, error = await request_api(
            url=url,
            request=data,
            method="POST",
            domain="cbioportal",
            endpoint_key=endpoint_key,
            cache_ttl=cache_ttl,
            enable_retry=True,
        )

        return result, error
"""Tests for gene validation utilities."""

from biomcp.utils.gene_validator import (
    is_valid_gene_symbol,
    sanitize_gene_symbol,
)


class TestGeneValidator:
    """Test gene symbol validation."""

    def test_valid_gene_symbols(self):
        """Test that valid gene symbols are accepted."""
        # Common oncogene/tumor-suppressor symbols, plus gene+mutation
        # compound forms (e.g. BRAFV600E) which must also pass.
        valid_genes = [
            "BRAF", "TP53", "KRAS", "EGFR", "PIK3CA",
            "BRCA1", "BRCA2", "MYC", "ERBB2", "CDKN2A",
            "VHL", "RB1", "PTEN", "APC", "MLH1",
            "MSH2", "MSH6", "PMS2", "ATM", "CHEK2",
            "PALB2", "RAD51C", "RAD51D", "BRIP1", "CDH1",
            "STK11", "MUTYH", "BMPR1A", "SMAD4", "ALK",
            "ROS1", "RET", "MET", "HER2", "FGFR1",
            "FGFR2", "FGFR3", "FGFR4", "IDH1", "IDH2",
            "TERT", "ATRX", "H3F3A", "HIST1H3B",
            "BRAFV600E",  # With mutation
            "KRASG12D",  # With mutation
            "EGFRL858R",  # With mutation
        ]

        for gene in valid_genes:
            assert is_valid_gene_symbol(
                gene
            ), f"Should accept valid gene: {gene}"

    def test_invalid_gene_symbols(self):
        """Test that invalid gene symbols are rejected."""
        # Covers empty/whitespace input, placeholder words, wrong casing,
        # illegal characters, and length-limit violations.
        invalid_genes = [
            None,
            "",
            " ",
            "  ",
            "123",  # Starts with number
            "A",  # Too short
            "INVALID_GENE_XYZ",  # Known invalid
            "TEST",
            "NULL",
            "NONE",
            "UNKNOWN",
            "gene",  # Lowercase
            "Braf",  # Mixed case
            "GENE-WITH-SPECIAL-CHARS!",
            "GENE WITH SPACES",
            "GENE/WITH/SLASHES",
            "GENE.WITH.DOTS",
            "VERYLONGGENENAMETHATEXCEEDSLIMIT",  # Too long
            "_GENE",  # Starts with underscore
            "-GENE",  # Starts with hyphen
        ]

        for gene in invalid_genes:
            assert not is_valid_gene_symbol(
                gene
            ), f"Should reject invalid gene: {gene}"

    def test_gene_symbols_with_version(self):
        """Test gene symbols with version suffixes."""
        # Hyphenated mitochondrial and HLA symbols must be accepted.
        versioned_genes = [
            "MT-CO1", "MT-CO2", "MT-CO3",
            "HLA-A", "HLA-B", "HLA-C",
            "HLA-DRB1", "HLA-DQB1", "HLA-DPB1",
        ]

        for gene in versioned_genes:
            assert is_valid_gene_symbol(
                gene
            ), f"Should accept versioned gene: {gene}"

    def test_sanitize_gene_symbol(self):
        """Test gene symbol sanitization."""
        # Test uppercase conversion
        assert sanitize_gene_symbol("braf") == "BRAF"
        assert sanitize_gene_symbol("Tp53") == "TP53"
        assert sanitize_gene_symbol("kRaS") == "KRAS"

        # Test whitespace stripping
        assert sanitize_gene_symbol(" BRAF ") == "BRAF"
        assert sanitize_gene_symbol("\tTP53\n") == "TP53"
        assert sanitize_gene_symbol("  KRAS  ") == "KRAS"

        # Test combination
        assert sanitize_gene_symbol(" braf ") == "BRAF"
        assert sanitize_gene_symbol("\ttp53\n") == "TP53"
"""CLI entry points for running the BioMCP server in its transport modes."""

from enum import Enum
from typing import Annotated

import typer
from dotenv import load_dotenv

from .. import logger, mcp_app  # mcp_app is already instantiated in core.py

# Load environment variables from .env file
load_dotenv()

server_app = typer.Typer(help="Server operations")


class ServerMode(str, Enum):
    # Transport selection for the MCP server.
    STDIO = "stdio"
    WORKER = "worker"
    STREAMABLE_HTTP = "streamable_http"


def run_stdio_server():
    """Run server in STDIO mode (local, stdin/stdout transport)."""
    logger.info("Starting MCP server with STDIO transport:")
    mcp_app.run(transport="stdio")


def run_http_server(host: str, port: int, mode: ServerMode):
    """Run server in HTTP-based mode (worker or streamable_http).

    Optional HTTP dependencies are imported lazily so that STDIO-only
    installs do not require them; missing imports exit with guidance.
    """
    try:
        from typing import Any

        import uvicorn

        app: Any  # Type will be either FastAPI or Starlette

        if mode == ServerMode.WORKER:
            logger.info("Starting MCP server with Worker/SSE transport")
            try:
                from ..workers.worker import app
            except ImportError as e:
                logger.error(
                    f"Failed to import worker mode dependencies: {e}\n"
                    "Please install with: pip install biomcp-python[worker]"
                )
                raise typer.Exit(1) from e
        else:  # STREAMABLE_HTTP
            logger.info(
                f"Starting MCP server with Streamable HTTP transport on {host}:{port}"
            )
            logger.info(f"Endpoint: http://{host}:{port}/mcp")
            logger.info("Using FastMCP's native Streamable HTTP support")

            try:
                from starlette.responses import JSONResponse
                from starlette.routing import Route
            except ImportError as e:
                logger.error(
                    f"Failed to import Starlette dependencies: {e}\n"
                    "Please install with: pip install biomcp-python[worker]"
                )
                raise typer.Exit(1) from e

            from .. import mcp_app

            # Get FastMCP's streamable_http_app
            app = mcp_app.streamable_http_app()

            # Add health endpoint to the Starlette app so load balancers
            # and container orchestrators can probe liveness.
            async def health_check(request):
                return JSONResponse({"status": "healthy"})

            health_route = Route("/health", health_check, methods=["GET"])
            app.routes.append(health_route)

        uvicorn.run(
            app,
            host=host,
            port=port,
            log_level="info",
        )
    except ImportError as e:
        logger.error(f"Failed to start {mode.value} mode: {e}")
        raise typer.Exit(1) from e
    except Exception as e:
        logger.error(f"An unexpected error occurred: {e}", exc_info=True)
        raise typer.Exit(1) from e


@server_app.command("run")
def run_server(
    mode: Annotated[
        ServerMode,
        typer.Option(
            help="Server mode: stdio (local), worker (legacy SSE), or streamable_http (MCP spec compliant)",
            case_sensitive=False,
        ),
    ] = ServerMode.STDIO,
    host: Annotated[
        str,
        typer.Option(
            help="Host to bind to (for HTTP modes)",
        ),
    ] = "0.0.0.0",  # noqa: S104 - Required for Docker container networking
    port: Annotated[
        int,
        typer.Option(
            help="Port to bind to (for HTTP modes)",
        ),
    ] = 8000,
):
    """Run the BioMCP server with selected transport mode."""
    if mode == ServerMode.STDIO:
        run_stdio_server()
    else:
        run_http_server(host, port, mode)
async def _sequential_thinking(
    thought: Annotated[
        str, "Current thinking step - be detailed and thorough"
    ],
    nextThoughtNeeded: Annotated[
        bool, "True if more thinking needed, False only when completely done"
    ],
    thoughtNumber: Annotated[int, "Current thought number (start at 1)"],
    totalThoughts: Annotated[
        int, "Best estimate of total thoughts (adjust as needed)"
    ],
    isRevision: Annotated[
        bool, "True when correcting/improving a previous thought"
    ] = False,
    revisesThought: Annotated[
        int | None, "The thought number being revised"
    ] = None,
    branchFromThought: Annotated[
        int | None, "Create alternative path from this thought number"
    ] = None,
    needsMoreThoughts: Annotated[
        bool | None,
        "True when problem is significantly larger than initially estimated",
    ] = None,
) -> str:
    """
    ALWAYS use this tool for complex reasoning, analysis, or problem-solving. This facilitates a detailed, step-by-step thinking process that helps break down problems systematically.

    Use this tool when:
    - Analyzing complex problems or questions
    - Planning multi-step solutions
    - Breaking down tasks into components
    - Reasoning through uncertainties
    - Exploring alternative approaches

    Start with thoughtNumber=1 and totalThoughts as your best estimate. Set nextThoughtNeeded=true to continue thinking, or false when done. You can revise earlier thoughts or branch into alternative paths as needed.

    This is your primary reasoning tool - USE IT LIBERALLY for any non-trivial thinking task.
    """

    # Validate inputs; return human-readable error strings rather than
    # raising, since the caller is an LLM tool invocation.
    if thoughtNumber < 1:
        return "Error: thoughtNumber must be >= 1"

    if totalThoughts < 1:
        return "Error: totalThoughts must be >= 1"

    if isRevision and not revisesThought:
        return "Error: revisesThought must be specified when isRevision=True"
    # NOTE(review): revisesThought values >= thoughtNumber are not
    # rejected here -- confirm whether revising a future thought is valid.

    # Get or create the per-session thought store.
    session = _session_manager.get_or_create_session()

    # Thoughts branched from the same origin share one branch identifier.
    branch_id = f"branch_{branchFromThought}" if branchFromThought else None

    entry = ThoughtEntry(
        thought=thought,
        thought_number=thoughtNumber,
        total_thoughts=totalThoughts,
        next_thought_needed=nextThoughtNeeded,
        is_revision=isRevision,
        revises_thought=revisesThought,
        branch_from_thought=branchFromThought,
        branch_id=branch_id,
        metadata={"needsMoreThoughts": needsMoreThoughts}
        if needsMoreThoughts
        else {},
    )

    # Add thought to session
    session.add_thought(entry)

    # Generate status message describing where the thought landed.
    if branchFromThought:
        status_msg = f"Added thought {thoughtNumber} to branch '{branch_id}'"
    elif isRevision and revisesThought:
        status_msg = (
            f"Revised thought {revisesThought} (now thought {thoughtNumber})"
        )
    else:
        status_msg = f"Added thought {thoughtNumber} to main sequence"

    # Generate progress information for the caller's next step.
    progress_msg = f"Progress: {thoughtNumber}/{totalThoughts} thoughts"
    next_msg = (
        "Next thought needed"
        if nextThoughtNeeded
        else "Thinking sequence complete"
    )

    return f"{status_msg}. {progress_msg}. {next_msg}."
107 | ``` -------------------------------------------------------------------------------- /docs/stylesheets/extra.css: -------------------------------------------------------------------------------- ```css 1 | /* Custom styles for BioMCP documentation */ 2 | 3 | /* Style for main navigation tabs */ 4 | .md-tabs__link { 5 | font-weight: 600; 6 | text-transform: uppercase; 7 | letter-spacing: 0.03em; 8 | } 9 | 10 | /* Bold section headers in sidebar */ 11 | .md-nav__item--section > .md-nav__link { 12 | font-weight: 700 !important; 13 | font-size: 0.9rem !important; 14 | margin-top: 0.8rem; 15 | margin-bottom: 0.4rem; 16 | padding-bottom: 0.4rem; 17 | border-bottom: 1px solid var(--md-default-fg-color--lightest); 18 | display: block; 19 | } 20 | 21 | /* Nested section headers - slightly smaller */ 22 | .md-nav__item--section .md-nav__item--section > .md-nav__link { 23 | font-weight: 600 !important; 24 | font-size: 0.85rem !important; 25 | margin-top: 0.4rem; 26 | margin-bottom: 0.2rem; 27 | } 28 | 29 | /* Regular navigation links */ 30 | .md-nav__link { 31 | font-weight: 400; 32 | } 33 | 34 | /* Active/current page link */ 35 | .md-nav__link--active { 36 | font-weight: 600 !important; 37 | color: var(--md-accent-fg-color) !important; 38 | } 39 | 40 | /* Table of contents header - make it lighter */ 41 | .md-nav--secondary > .md-nav__title { 42 | font-weight: 600 !important; 43 | font-size: 0.8rem !important; 44 | text-transform: none !important; 45 | letter-spacing: normal !important; 46 | color: var(--md-default-fg-color--light) !important; 47 | background-color: transparent !important; 48 | box-shadow: none !important; 49 | border-bottom: 1px solid var(--md-default-fg-color--lightest); 50 | padding-bottom: 0.4rem; 51 | } 52 | 53 | /* Add visual separation between major sections */ 54 | .md-nav--primary > .md-nav__list > .md-nav__item { 55 | margin-bottom: 0.5rem; 56 | } 57 | 58 | /* Improve readability of code blocks */ 59 | .highlight pre { 60 | line-height: 1.5; 
  overflow-x: auto;
  white-space: pre;
}

/* Fix code blocks in grid cards */
.md-typeset .grid.cards code,
.md-typeset .grid.cards pre {
  word-break: break-word;
  white-space: pre-wrap;
  overflow-wrap: break-word;
}

/* Specific fix for grid card code blocks */
.md-typeset .grid.cards .highlight {
  margin: 0.5em 0;
}

.md-typeset .grid.cards .highlight pre {
  padding: 0.5em;
  font-size: 0.8em;
}

/* Prevent horizontal scroll for inline code */
.md-typeset code {
  word-break: break-word;
}

/* Better spacing for admonitions */
.admonition {
  margin: 1.5rem 0;
}

/* Improve table readability */
.md-typeset table {
  font-size: 0.85rem;
}

/* Make external links more visible (superscript arrow after the link) */
.md-content a[href^="http"]:not(.md-button)::after {
  content: " ↗";
  font-size: 0.75em;
  vertical-align: super;
  opacity: 0.7;
}

/* Better spacing for navigation expansion arrows */
.md-nav__icon {
  margin-left: 0.2rem;
}

/* Accessibility improvements */
/* Ensure focus indicators are visible */
a:focus,
button:focus,
input:focus,
select:focus,
textarea:focus {
  outline: 2px solid var(--md-accent-fg-color);
  outline-offset: 2px;
}

/* Skip to main content link (hidden off-screen until focused) */
.md-skip {
  position: fixed;
  top: -40px;
  left: 0;
  background: var(--md-primary-fg-color);
  color: var(--md-primary-bg-color);
  padding: 8px;
  z-index: 100;
  text-decoration: none;
}

.md-skip:focus {
  top: 0;
}

/* Improve readability with better line height */
.md-typeset {
  line-height: 1.6;
}

/* Ensure code blocks have sufficient contrast */
.highlight pre code {
  font-size: 0.85rem;
  line-height:
"""Tests for error scenarios and edge cases - fixed version."""

import asyncio
from unittest.mock import MagicMock, patch

import pytest

from biomcp.exceptions import (
    InvalidDomainError,
)
from biomcp.rate_limiter import RateLimiter
from biomcp.router import format_results


@pytest.fixture(autouse=True)
def enable_metrics_for_concurrent_test(monkeypatch):
    """Enable metrics collection for every test in this module."""
    monkeypatch.setenv("BIOMCP_METRICS_ENABLED", "true")
    # Force reload of the module so it re-reads the env var set above.
    import importlib

    import biomcp.metrics

    importlib.reload(biomcp.metrics)


def test_format_results_invalid_domain():
    """format_results must reject unknown domains with InvalidDomainError."""
    with pytest.raises(InvalidDomainError) as exc_info:
        format_results([], "invalid_domain", 1, 10, 100)

    assert "invalid_domain" in str(exc_info.value)
    assert "Valid domains are:" in str(exc_info.value)


def test_format_results_handler_exception():
    """A result the handler cannot format is skipped, not fatal."""
    # Create a result that will cause formatting to fail
    bad_result = {"missing": "required_fields"}

    with patch(
        "biomcp.domain_handlers.ArticleHandler.format_result"
    ) as mock_format:
        mock_format.side_effect = KeyError("id")

        # Should handle the error gracefully
        result = format_results([bad_result], "article", 1, 10, 100)

        assert result["results"] == []  # Bad result is skipped


@pytest.mark.asyncio
async def test_rate_limiter_basic():
    """Test basic rate limiter functionality."""
    # Test normal operation
    limiter = RateLimiter(requests_per_second=10, burst_size=5)

    # Should allow burst through context manager
    for _ in range(5):
        async with limiter.limit():
            pass  # Should not raise


@pytest.mark.asyncio
async def test_concurrent_operations():
    """Test system behavior under concurrent load."""
    # Clear metrics
    from biomcp.metrics import (
        _metrics_collector,
        get_metric_summary,
        record_metric,
    )

    await _metrics_collector.clear()

    # Simulate concurrent metric recording
    async def record_operation(i):
        await record_metric(
            "concurrent_test",
            duration=0.1 * (i % 5),
            success=i % 10 != 0,  # 10% failure rate
        )

    # Run 100 concurrent operations
    tasks = [record_operation(i) for i in range(100)]
    await asyncio.gather(*tasks)

    # Check metrics
    summary = await get_metric_summary("concurrent_test")
    assert summary is not None
    assert summary.count == 100
    assert summary.error_rate == 0.1  # 10% errors
    assert (
        0.18 <= summary.avg_duration <= 0.22
    )  # Average of 0.1, 0.2, 0.3, 0.4


def test_cache_corruption_handling():
    """Test handling of corrupted cache data."""
    from biomcp.http_client import get_cached_response

    # Simulate corrupted cache entry
    with patch("biomcp.http_client.get_cache") as mock_get_cache:
        mock_cache = MagicMock()
        mock_cache.get.return_value = "corrupted\x00data"  # Invalid data
        mock_get_cache.return_value = mock_cache

        # Should handle corrupted data gracefully
        result = get_cached_response("test_key")
        assert (
            result == "corrupted\x00data"
        )  # Returns as-is, parsing handles it
"""Sequential thinking tool for structured problem-solving.

This module provides a dedicated MCP tool for sequential thinking,
separate from the main search functionality.
"""

from typing import Annotated

from pydantic import Field

from biomcp.core import mcp_app
from biomcp.metrics import track_performance
from biomcp.thinking.sequential import _sequential_thinking
from biomcp.thinking_tracker import mark_thinking_used


@mcp_app.tool()
@track_performance("biomcp.think")
async def think(
    thought: Annotated[
        str,
        Field(description="Current thinking step for analysis"),
    ],
    thoughtNumber: Annotated[
        int,
        Field(
            description="Current thought number, starting at 1",
            ge=1,
        ),
    ],
    totalThoughts: Annotated[
        int,
        Field(
            description="Estimated total thoughts needed for complete analysis",
            ge=1,
        ),
    ],
    nextThoughtNeeded: Annotated[
        bool,
        Field(
            description="Whether more thinking steps are needed after this one",
        ),
    ] = True,
) -> dict:
    """REQUIRED FIRST STEP: Perform structured sequential thinking for ANY biomedical research task.

    🚨 IMPORTANT: You MUST use this tool BEFORE any search or fetch operations when:
    - Researching ANY biomedical topic (genes, diseases, variants, trials)
    - Planning to use multiple BioMCP tools
    - Answering questions that require analysis or synthesis
    - Comparing information from different sources
    - Making recommendations or drawing conclusions

    ⚠️ FAILURE TO USE THIS TOOL FIRST will result in:
    - Incomplete or poorly structured analysis
    - Missing important connections between data
    - Suboptimal search strategies
    - Overlooked critical information

    Sequential thinking ensures you:
    1. Fully understand the research question
    2. Plan an optimal search strategy
    3. Identify all relevant data sources
    4. Structure your analysis properly
    5. Deliver comprehensive, well-reasoned results

    ## Usage Pattern:
    1. Start with thoughtNumber=1 to initiate analysis
    2. Progress through numbered thoughts sequentially
    3. Adjust totalThoughts estimate as understanding develops
    4. Set nextThoughtNeeded=False only when analysis is complete

    ## Example:
    ```python
    # Initial analysis
    await think(
        thought="Breaking down the relationship between BRAF mutations and melanoma treatment resistance...",
        thoughtNumber=1,
        totalThoughts=5,
        nextThoughtNeeded=True
    )

    # Continue analysis
    await think(
        thought="Examining specific BRAF V600E mutation mechanisms...",
        thoughtNumber=2,
        totalThoughts=5,
        nextThoughtNeeded=True
    )

    # Final thought
    await think(
        thought="Synthesizing findings and proposing research directions...",
        thoughtNumber=5,
        totalThoughts=5,
        nextThoughtNeeded=False
    )
    ```

    ## Important Notes:
    - Each thought builds on previous ones within a session
    - State is maintained throughout the MCP session
    - Use thoughtful, detailed analysis in each step
    - Revisions and branching are supported through the underlying implementation
    """
    # Record globally that the client engaged the thinking tool, so other
    # tools can detect whether thinking preceded them.
    mark_thinking_used()

    # Delegate to the sequential-thinking engine, which keeps per-session
    # state (revisions and branching are handled there).
    result = await _sequential_thinking(
        thought=thought,
        thoughtNumber=thoughtNumber,
        totalThoughts=totalThoughts,
        nextThoughtNeeded=nextThoughtNeeded,
    )

    return {
        "domain": "thinking",
        "result": result,
        "thoughtNumber": thoughtNumber,
        "nextThoughtNeeded": nextThoughtNeeded,
    }
VariantQuery(gene="BRAF") 17 | 18 | 19 | @pytest.fixture 20 | def complex_query(): 21 | """Create a complex query with multiple parameters.""" 22 | return VariantQuery( 23 | gene="BRCA1", 24 | significance=ClinicalSignificance.PATHOGENIC, 25 | min_frequency=0.0001, 26 | max_frequency=0.01, 27 | ) 28 | 29 | 30 | def test_query_validation(): 31 | """Test VariantQuery model validation.""" 32 | # Test basic query with gene 33 | query = VariantQuery(gene="BRAF") 34 | assert query.gene == "BRAF" 35 | 36 | # Test query with rsid 37 | query = VariantQuery(rsid="rs113488022") 38 | assert query.rsid == "rs113488022" 39 | 40 | # Test query requires at least one search parameter 41 | with pytest.raises(ValueError): 42 | VariantQuery() 43 | 44 | # Test query with clinical significance enum requires a search parameter 45 | query = VariantQuery( 46 | gene="BRCA1", significance=ClinicalSignificance.PATHOGENIC 47 | ) 48 | assert query.significance == ClinicalSignificance.PATHOGENIC 49 | 50 | # Test query with prediction scores 51 | query = VariantQuery( 52 | gene="TP53", 53 | polyphen=PolyPhenPrediction.PROBABLY_DAMAGING, 54 | sift=SiftPrediction.DELETERIOUS, 55 | ) 56 | assert query.polyphen == PolyPhenPrediction.PROBABLY_DAMAGING 57 | assert query.sift == SiftPrediction.DELETERIOUS 58 | 59 | 60 | def test_build_query_string(): 61 | """Test build_query_string function.""" 62 | # Test single field 63 | query = VariantQuery(gene="BRAF") 64 | q_string = build_query_string(query) 65 | assert 'dbnsfp.genename:"BRAF"' in q_string 66 | 67 | # Test multiple fields 68 | query = VariantQuery(gene="BRAF", rsid="rs113488022") 69 | q_string = build_query_string(query) 70 | assert 'dbnsfp.genename:"BRAF"' in q_string 71 | assert "rs113488022" in q_string 72 | 73 | # Test genomic region 74 | query = VariantQuery(region="chr7:140753300-140753400") 75 | q_string = build_query_string(query) 76 | assert "chr7:140753300-140753400" in q_string 77 | 78 | # Test clinical significance 79 | query = 
VariantQuery(significance=ClinicalSignificance.LIKELY_BENIGN) 80 | q_string = build_query_string(query) 81 | assert 'clinvar.rcv.clinical_significance:"likely benign"' in q_string 82 | 83 | # Test frequency filters 84 | query = VariantQuery(min_frequency=0.0001, max_frequency=0.01) 85 | q_string = build_query_string(query) 86 | assert "gnomad_exome.af.af:>=0.0001" in q_string 87 | assert "gnomad_exome.af.af:<=0.01" in q_string 88 | 89 | 90 | async def test_search_variants_basic(basic_query, anyio_backend): 91 | """Test search_variants function with a basic query.""" 92 | # Use a real API query for a common gene 93 | result = await search_variants(basic_query) 94 | 95 | # Verify we got sensible results 96 | assert "BRAF" in result 97 | assert not result.startswith("Error") 98 | 99 | 100 | async def test_search_variants_complex(complex_query, anyio_backend): 101 | """Test search_variants function with a complex query.""" 102 | # Use a simple common query that will return results 103 | simple_query = VariantQuery(gene="TP53") 104 | result = await search_variants(simple_query) 105 | 106 | # Verify response formatting 107 | assert not result.startswith("Error") 108 | 109 | 110 | async def test_search_variants_no_results(anyio_backend): 111 | """Test search_variants function with a query that returns no results.""" 112 | query = VariantQuery(gene="UNKNOWN_XYZ") 113 | result = await search_variants(query, output_json=True) 114 | assert result == "[]" 115 | 116 | 117 | async def test_search_variants_with_limit(anyio_backend): 118 | """Test search_variants function with size limit.""" 119 | # Query with a small limit 120 | query = VariantQuery(gene="TP53", size=3) 121 | result = await search_variants(query) 122 | 123 | # Result should be valid but limited 124 | assert not result.startswith("Error") 125 | ``` -------------------------------------------------------------------------------- /tests/tdd/test_offline_mode.py: 
-------------------------------------------------------------------------------- ```python 1 | """Tests for offline mode functionality.""" 2 | 3 | import os 4 | from unittest.mock import patch 5 | 6 | import pytest 7 | 8 | from biomcp.http_client import RequestError, request_api 9 | 10 | 11 | @pytest.mark.asyncio 12 | async def test_offline_mode_blocks_requests(): 13 | """Test that offline mode prevents HTTP requests.""" 14 | # Set offline mode 15 | with patch.dict(os.environ, {"BIOMCP_OFFLINE": "true"}): 16 | # Try to make a request 17 | result, error = await request_api( 18 | url="https://api.example.com/test", 19 | request={"test": "data"}, 20 | cache_ttl=0, # Disable caching for this test 21 | ) 22 | 23 | # Should get an error 24 | assert result is None 25 | assert error is not None 26 | assert isinstance(error, RequestError) 27 | assert error.code == 503 28 | assert "Offline mode enabled" in error.message 29 | 30 | 31 | @pytest.mark.asyncio 32 | async def test_offline_mode_allows_cached_responses(): 33 | """Test that offline mode still returns cached responses.""" 34 | # First, cache a response (with offline mode disabled) 35 | with ( 36 | patch.dict(os.environ, {"BIOMCP_OFFLINE": "false"}), 37 | patch("biomcp.http_client.call_http") as mock_call, 38 | ): 39 | mock_call.return_value = (200, '{"data": "cached"}') 40 | 41 | # Make a request to cache it 42 | result, error = await request_api( 43 | url="https://api.example.com/cached", 44 | request={"test": "data"}, 45 | cache_ttl=3600, # Cache for 1 hour 46 | ) 47 | 48 | assert result == {"data": "cached"} 49 | assert error is None 50 | 51 | # Now enable offline mode 52 | with patch.dict(os.environ, {"BIOMCP_OFFLINE": "true"}): 53 | # Try to get the same request - should return cached result 54 | result, error = await request_api( 55 | url="https://api.example.com/cached", 56 | request={"test": "data"}, 57 | cache_ttl=3600, 58 | ) 59 | 60 | # Should get the cached response 61 | assert result == {"data": "cached"} 
62 | assert error is None 63 | 64 | 65 | @pytest.mark.asyncio 66 | async def test_offline_mode_case_insensitive(): 67 | """Test that offline mode environment variable is case insensitive.""" 68 | test_values = ["TRUE", "True", "1", "yes", "YES", "Yes"] 69 | 70 | for value in test_values: 71 | with patch.dict(os.environ, {"BIOMCP_OFFLINE": value}): 72 | result, error = await request_api( 73 | url="https://api.example.com/test", 74 | request={"test": "data"}, 75 | cache_ttl=0, 76 | ) 77 | 78 | assert result is None 79 | assert error is not None 80 | assert error.code == 503 81 | assert "Offline mode enabled" in error.message 82 | 83 | 84 | @pytest.mark.asyncio 85 | async def test_offline_mode_disabled_by_default(): 86 | """Test that offline mode is disabled by default.""" 87 | # Clear the environment variable 88 | with ( 89 | patch.dict(os.environ, {}, clear=True), 90 | patch("biomcp.http_client.call_http") as mock_call, 91 | ): 92 | mock_call.return_value = (200, '{"data": "response"}') 93 | 94 | result, error = await request_api( 95 | url="https://api.example.com/test", 96 | request={"test": "data"}, 97 | cache_ttl=0, 98 | ) 99 | 100 | # Should make the request successfully 101 | assert result == {"data": "response"} 102 | assert error is None 103 | mock_call.assert_called_once() 104 | 105 | 106 | @pytest.mark.asyncio 107 | async def test_offline_mode_with_endpoint_tracking(): 108 | """Test that offline mode works with endpoint tracking.""" 109 | with patch.dict(os.environ, {"BIOMCP_OFFLINE": "true"}): 110 | result, error = await request_api( 111 | url="https://www.ncbi.nlm.nih.gov/research/pubtator3-api/search/", 112 | request={"text": "BRAF"}, 113 | endpoint_key="pubtator3_search", 114 | cache_ttl=0, 115 | ) 116 | 117 | assert result is None 118 | assert error is not None 119 | assert error.code == 503 120 | assert "pubtator3-api/search/" in error.message 121 | ``` -------------------------------------------------------------------------------- 
/src/biomcp/variants/links.py: -------------------------------------------------------------------------------- ```python 1 | """Functions for adding database links to variant data.""" 2 | 3 | from typing import Any 4 | 5 | 6 | def _calculate_vcf_end(variant: dict[str, Any]) -> int: 7 | """Calculate the end position for UCSC Genome Browser link.""" 8 | if "vcf" not in variant: 9 | return 0 10 | 11 | vcf = variant["vcf"] 12 | pos = int(vcf.get("position", 0)) 13 | ref = vcf.get("ref", "") 14 | alt = vcf.get("alt", "") 15 | 16 | # For insertions/deletions, handle special cases 17 | if not ref and alt: # insertion 18 | return pos + 1 19 | elif ref and not alt: # deletion 20 | return pos + len(ref) 21 | else: # substitution 22 | return pos + max(0, ((len(alt) + 1) - len(ref))) 23 | 24 | 25 | def _get_first_value(data: Any) -> Any: 26 | """Get the first value from a list or return the value itself.""" 27 | if isinstance(data, list) and data: 28 | return data[0] 29 | return data 30 | 31 | 32 | def _ensure_url_section(variant: dict[str, Any]) -> None: 33 | """Ensure the URL section exists in the variant.""" 34 | if "url" not in variant: 35 | variant["url"] = {} 36 | 37 | 38 | def _add_dbsnp_links(variant: dict[str, Any]) -> None: 39 | """Add dbSNP and Ensembl links if rsid is present.""" 40 | if "dbsnp" in variant and variant["dbsnp"].get("rsid"): 41 | variant["dbsnp"]["url"] = ( 42 | f"https://www.ncbi.nlm.nih.gov/snp/{variant['dbsnp']['rsid']}" 43 | ) 44 | _ensure_url_section(variant) 45 | variant["url"]["ensembl"] = ( 46 | f"https://ensembl.org/Homo_sapiens/Variation/Explore?v={variant['dbsnp']['rsid']}" 47 | ) 48 | 49 | 50 | def _add_clinvar_link(variant: dict[str, Any]) -> None: 51 | """Add ClinVar link if variant_id is present.""" 52 | if "clinvar" in variant and variant["clinvar"].get("variant_id"): 53 | variant["clinvar"]["url"] = ( 54 | f"https://www.ncbi.nlm.nih.gov/clinvar/variation/{variant['clinvar']['variant_id']}/" 55 | ) 56 | 57 | 58 | def 
_add_cosmic_link(variant: dict[str, Any]) -> None: 59 | """Add COSMIC link if cosmic_id is present.""" 60 | if "cosmic" in variant and variant["cosmic"].get("cosmic_id"): 61 | variant["cosmic"]["url"] = ( 62 | f"https://cancer.sanger.ac.uk/cosmic/mutation/overview?id={variant['cosmic']['cosmic_id']}" 63 | ) 64 | 65 | 66 | def _add_civic_link(variant: dict[str, Any]) -> None: 67 | """Add CIViC link if id is present.""" 68 | if "civic" in variant and variant["civic"].get("id"): 69 | variant["civic"]["url"] = ( 70 | f"https://civicdb.org/variants/{variant['civic']['id']}/summary" 71 | ) 72 | 73 | 74 | def _add_ucsc_link(variant: dict[str, Any]) -> None: 75 | """Add UCSC Genome Browser link if chromosome and position are present.""" 76 | if ( 77 | "chrom" in variant 78 | and "vcf" in variant 79 | and variant["vcf"].get("position") 80 | ): 81 | vcf_end = _calculate_vcf_end(variant) 82 | _ensure_url_section(variant) 83 | variant["url"]["ucsc_genome_browser"] = ( 84 | f"https://genome.ucsc.edu/cgi-bin/hgTracks?db=hg19&" 85 | f"position=chr{variant['chrom']}:{variant['vcf']['position']}-{vcf_end}" 86 | ) 87 | 88 | 89 | def _add_hgnc_link(variant: dict[str, Any]) -> None: 90 | """Add HGNC link if gene name is present.""" 91 | if "dbnsfp" in variant and variant["dbnsfp"].get("genename"): 92 | gene = _get_first_value(variant["dbnsfp"]["genename"]) 93 | if gene: 94 | _ensure_url_section(variant) 95 | variant["url"]["hgnc"] = ( 96 | f"https://www.genenames.org/data/gene-symbol-report/#!/symbol/{gene}" 97 | ) 98 | 99 | 100 | def inject_links(variants: list[dict[str, Any]]) -> list[dict[str, Any]]: 101 | """ 102 | Inject database links into variant data. 
103 | 104 | Args: 105 | variants: List of variant dictionaries from MyVariant.info API 106 | 107 | Returns: 108 | List of variant dictionaries with added URL links in appropriate sections 109 | """ 110 | for variant in variants: 111 | _add_dbsnp_links(variant) 112 | _add_clinvar_link(variant) 113 | _add_cosmic_link(variant) 114 | _add_civic_link(variant) 115 | _add_ucsc_link(variant) 116 | _add_hgnc_link(variant) 117 | 118 | return variants 119 | ``` -------------------------------------------------------------------------------- /src/biomcp/organizations/getter.py: -------------------------------------------------------------------------------- ```python 1 | """Get specific organization details via NCI CTS API.""" 2 | 3 | import logging 4 | from typing import Any 5 | 6 | from ..constants import NCI_ORGANIZATIONS_URL 7 | from ..integrations.cts_api import CTSAPIError, make_cts_request 8 | 9 | logger = logging.getLogger(__name__) 10 | 11 | 12 | async def get_organization( 13 | org_id: str, 14 | api_key: str | None = None, 15 | ) -> dict[str, Any]: 16 | """ 17 | Get detailed information about a specific organization. 
18 | 19 | Args: 20 | org_id: Organization ID 21 | api_key: Optional API key (if not provided, uses NCI_API_KEY env var) 22 | 23 | Returns: 24 | Dictionary with organization details 25 | 26 | Raises: 27 | CTSAPIError: If the API request fails or organization not found 28 | """ 29 | try: 30 | # Make API request 31 | url = f"{NCI_ORGANIZATIONS_URL}/{org_id}" 32 | response = await make_cts_request( 33 | url=url, 34 | api_key=api_key, 35 | ) 36 | 37 | # Return the organization data 38 | # Handle different possible response formats 39 | if "data" in response: 40 | return response["data"] 41 | elif "organization" in response: 42 | return response["organization"] 43 | else: 44 | return response 45 | 46 | except CTSAPIError: 47 | raise 48 | except Exception as e: 49 | logger.error(f"Failed to get organization {org_id}: {e}") 50 | raise CTSAPIError(f"Failed to retrieve organization: {e!s}") from e 51 | 52 | 53 | def _format_address_fields(org: dict[str, Any]) -> list[str]: 54 | """Extract and format address fields from organization data.""" 55 | address_fields = [] 56 | 57 | if org.get("address"): 58 | addr = org["address"] 59 | if isinstance(addr, dict): 60 | fields = [ 61 | addr.get("street", ""), 62 | addr.get("city", ""), 63 | addr.get("state", ""), 64 | addr.get("zip", ""), 65 | ] 66 | address_fields = [f for f in fields if f] 67 | 68 | country = addr.get("country", "") 69 | if country and country != "United States": 70 | address_fields.append(country) 71 | else: 72 | # Try individual fields 73 | city = org.get("city", "") 74 | state = org.get("state", "") 75 | address_fields = [p for p in [city, state] if p] 76 | 77 | return address_fields 78 | 79 | 80 | def _format_contact_info(org: dict[str, Any]) -> list[str]: 81 | """Format contact information lines.""" 82 | lines = [] 83 | if org.get("phone"): 84 | lines.append(f"- **Phone**: {org['phone']}") 85 | if org.get("email"): 86 | lines.append(f"- **Email**: {org['email']}") 87 | if org.get("website"): 88 | 
lines.append(f"- **Website**: {org['website']}") 89 | return lines 90 | 91 | 92 | def format_organization_details(org: dict[str, Any]) -> str: 93 | """ 94 | Format organization details as markdown. 95 | 96 | Args: 97 | org: Organization data dictionary 98 | 99 | Returns: 100 | Formatted markdown string 101 | """ 102 | # Extract fields with defaults 103 | org_id = org.get("id", org.get("org_id", "Unknown")) 104 | name = org.get("name", "Unknown Organization") 105 | org_type = org.get("type", org.get("category", "Unknown")) 106 | 107 | # Build markdown output 108 | lines = [ 109 | f"## Organization: {name}", 110 | "", 111 | "### Basic Information", 112 | f"- **ID**: {org_id}", 113 | f"- **Type**: {org_type}", 114 | ] 115 | 116 | # Add location if available 117 | address_fields = _format_address_fields(org) 118 | if address_fields: 119 | lines.append(f"- **Location**: {', '.join(address_fields)}") 120 | 121 | # Add contact info 122 | lines.extend(_format_contact_info(org)) 123 | 124 | # Add description if available 125 | if org.get("description"): 126 | lines.extend([ 127 | "", 128 | "### Description", 129 | org["description"], 130 | ]) 131 | 132 | # Add parent organization metadata 133 | if org.get("parent_org"): 134 | lines.extend([ 135 | "", 136 | "### Parent Organization", 137 | f"- **Name**: {org['parent_org'].get('name', 'Unknown')}", 138 | f"- **ID**: {org['parent_org'].get('id', 'Unknown')}", 139 | ]) 140 | 141 | return "\n".join(lines) 142 | ``` -------------------------------------------------------------------------------- /tests/tdd/utils/test_request_cache.py: -------------------------------------------------------------------------------- ```python 1 | """Tests for request caching utilities.""" 2 | 3 | import asyncio 4 | 5 | import pytest 6 | 7 | from biomcp.utils.request_cache import ( 8 | clear_cache, 9 | get_cached, 10 | request_cache, 11 | set_cached, 12 | ) 13 | 14 | 15 | class TestRequestCache: 16 | """Test request caching functionality.""" 17 | 18 
| @pytest.fixture(autouse=True) 19 | async def clear_cache_before_test(self): 20 | """Clear cache before each test.""" 21 | await clear_cache() 22 | yield 23 | await clear_cache() 24 | 25 | @pytest.mark.asyncio 26 | async def test_basic_caching(self): 27 | """Test basic cache get/set operations.""" 28 | # Initially should be empty 29 | result = await get_cached("test_key") 30 | assert result is None 31 | 32 | # Set a value 33 | await set_cached("test_key", "test_value", ttl=10) 34 | 35 | # Should retrieve the value 36 | result = await get_cached("test_key") 37 | assert result == "test_value" 38 | 39 | @pytest.mark.asyncio 40 | async def test_cache_expiry(self): 41 | """Test that cached values expire.""" 42 | # Set with very short TTL 43 | await set_cached("test_key", "test_value", ttl=0.1) 44 | 45 | # Should be available immediately 46 | result = await get_cached("test_key") 47 | assert result == "test_value" 48 | 49 | # Wait for expiry 50 | await asyncio.sleep(0.2) 51 | 52 | # Should be expired 53 | result = await get_cached("test_key") 54 | assert result is None 55 | 56 | @pytest.mark.asyncio 57 | async def test_request_cache_decorator(self): 58 | """Test the @request_cache decorator.""" 59 | call_count = 0 60 | 61 | @request_cache(ttl=10) 62 | async def expensive_function(arg1, arg2): 63 | nonlocal call_count 64 | call_count += 1 65 | return f"{arg1}-{arg2}-{call_count}" 66 | 67 | # First call should execute function 68 | result1 = await expensive_function("a", "b") 69 | assert result1 == "a-b-1" 70 | assert call_count == 1 71 | 72 | # Second call with same args should use cache 73 | result2 = await expensive_function("a", "b") 74 | assert result2 == "a-b-1" # Same result 75 | assert call_count == 1 # Function not called again 76 | 77 | # Different args should execute function 78 | result3 = await expensive_function("c", "d") 79 | assert result3 == "c-d-2" 80 | assert call_count == 2 81 | 82 | @pytest.mark.asyncio 83 | async def test_skip_cache_option(self): 84 
| """Test that skip_cache bypasses caching.""" 85 | call_count = 0 86 | 87 | @request_cache(ttl=10) 88 | async def cached_function(): 89 | nonlocal call_count 90 | call_count += 1 91 | return call_count 92 | 93 | # Normal call - cached 94 | result1 = await cached_function() 95 | assert result1 == 1 96 | 97 | # Skip cache - new execution 98 | result2 = await cached_function(skip_cache=True) 99 | assert result2 == 2 100 | 101 | # Normal call again - still cached 102 | result3 = await cached_function() 103 | assert result3 == 1 104 | 105 | @pytest.mark.asyncio 106 | async def test_none_values_not_cached(self): 107 | """Test that None return values are not cached.""" 108 | call_count = 0 109 | 110 | @request_cache(ttl=10) 111 | async def sometimes_none_function(return_none=False): 112 | nonlocal call_count 113 | call_count += 1 114 | return None if return_none else call_count 115 | 116 | # Return None - should not cache 117 | result1 = await sometimes_none_function(return_none=True) 118 | assert result1 is None 119 | assert call_count == 1 120 | 121 | # Call again - should execute again (not cached) 122 | result2 = await sometimes_none_function(return_none=True) 123 | assert result2 is None 124 | assert call_count == 2 125 | 126 | # Return value - should cache 127 | result3 = await sometimes_none_function(return_none=False) 128 | assert result3 == 3 129 | assert call_count == 3 130 | 131 | # Call again - should use cache 132 | result4 = await sometimes_none_function(return_none=False) 133 | assert result4 == 3 134 | assert call_count == 3 135 | ``` -------------------------------------------------------------------------------- /docs/blog/ai-assisted-clinical-trial-search-analysis.md: -------------------------------------------------------------------------------- ```markdown 1 | # AI-Assisted Clinical Trial Search: How BioMCP Transforms Research 2 | 3 | Finding the right clinical trial for a research project has traditionally been 4 | a complex process requiring 
specialized knowledge of database syntax and 5 | medical terminology. BioMCP is changing this landscape by making clinical trial 6 | data accessible through natural language conversation. 7 | 8 | Video Link: 9 | [](https://www.youtube.com/watch?v=jqGXXnVesjg&list=PLu1amIF_MEfPWhhEsXSuBi90S_xtmVJIW&index=2) 10 | 11 | ## Breaking Down the Barriers to Clinical Trial Information 12 | 13 | BioMCP serves as a specialized Model Context Protocol (MCP) server that 14 | empowers AI assistants and agents with tools to interact with critical 15 | biomedical resources. For clinical trials specifically, BioMCP connects to the 16 | ClinicalTrials.gov API, allowing researchers and clinicians to search and 17 | retrieve trial information through simple conversational queries. 18 | 19 | The power of this approach becomes apparent when we look at how it transforms a 20 | complex search requirement. Imagine needing to find active clinical trials for 21 | pembrolizumab (a cancer immunotherapy drug) specifically for non-small cell 22 | lung carcinoma near Cleveland, Ohio. Traditionally, this would require: 23 | 24 | 1. Navigating to ClinicalTrials.gov 25 | 2. Understanding the proper search fields and syntax 26 | 3. Creating multiple filters for intervention (pembrolizumab), condition ( 27 | non-small cell lung carcinoma), status (recruiting), and location (Cleveland 28 | area) 29 | 4. Interpreting the results 30 | 31 | ## From Natural Language to Precise Database Queries 32 | 33 | With BioMCP, this entire process is streamlined into a simple natural language 34 | request. The underlying large language model (LLM) interprets the query, 35 | identifies the key entities (drug name, cancer type, location), and translates 36 | these into the precise parameters needed for the ClinicalTrials.gov API. 37 | 38 | The system returns relevant trials that match all criteria, presenting them in 39 | an easy-to-understand format. 
But the interaction doesn't end there—BioMCP 40 | maintains context throughout the conversation, enabling follow-up questions 41 | like: 42 | 43 | - Where exactly are these trials located and how far are they from downtown 44 | Cleveland? 45 | - What biomarker eligibility criteria do these trials require? 46 | - Are there exclusion criteria I should be aware of? 47 | 48 | For each of these questions, BioMCP calls the appropriate tool (trial 49 | locations, trial protocols) and processes the information to provide meaningful 50 | answers without requiring the user to navigate different interfaces or learn 51 | new query languages. 52 | 53 | ## Beyond Basic Search: Understanding Trial Details 54 | 55 | What truly sets BioMCP apart is its ability to go beyond simple listings. When 56 | asked about biomarker eligibility criteria, the system can extract this 57 | information from the full trial protocol, synthesize it, and present a clear 58 | summary of requirements. This capability transforms what would typically be 59 | hours of reading dense clinical documentation into a conversational exchange 60 | that delivers precisely what the researcher needs. 61 | 62 | ## Transforming Clinical Research Workflows 63 | 64 | The implications for clinical research are significant. By lowering the 65 | technical barriers to accessing trial information, BioMCP can help: 66 | 67 | - Researchers understand the landscape of current research in their field 68 | - Research teams identify promising studies more efficiently 69 | - Clinical research organizations track competing or complementary trials 70 | - Research coordinators identify potential recruitment sites based on location 71 | 72 | As part of the broader BioMCP ecosystem—which also includes access to genomic 73 | variant information and PubMed literature—this clinical trial search capability 74 | represents a fundamental shift in how we interact with biomedical information. 
75 | By bringing the power of natural language processing to specialized databases, 76 | BioMCP is helping to democratize access to critical health information and 77 | accelerate the research process. 78 | ``` -------------------------------------------------------------------------------- /src/biomcp/utils/query_utils.py: -------------------------------------------------------------------------------- ```python 1 | """Utilities for query parsing and manipulation.""" 2 | 3 | import re 4 | from typing import Any 5 | 6 | 7 | def parse_or_query(query: str) -> list[str]: 8 | """Parse OR query into individual search terms. 9 | 10 | Handles formats like: 11 | - "term1 OR term2" 12 | - 'term1 OR term2 OR "term with spaces"' 13 | - "TERM1 or term2 or term3" (case insensitive) 14 | 15 | Args: 16 | query: Query string that may contain OR operators 17 | 18 | Returns: 19 | List of individual search terms with quotes and whitespace cleaned 20 | 21 | Examples: 22 | >>> parse_or_query("PD-L1 OR CD274") 23 | ['PD-L1', 'CD274'] 24 | 25 | >>> parse_or_query('BRAF OR "v-raf murine" OR ARAF') 26 | ['BRAF', 'v-raf murine', 'ARAF'] 27 | """ 28 | # Split by OR (case insensitive) 29 | terms = re.split(r"\s+OR\s+", query, flags=re.IGNORECASE) 30 | 31 | # Clean up each term - remove quotes and extra whitespace 32 | cleaned_terms = [] 33 | for term in terms: 34 | # Remove surrounding quotes (both single and double) 35 | term = term.strip().strip('"').strip("'").strip() 36 | if term: 37 | cleaned_terms.append(term) 38 | 39 | return cleaned_terms 40 | 41 | 42 | def contains_or_operator(query: str) -> bool: 43 | """Check if a query contains OR operators. 
44 | 45 | Args: 46 | query: Query string to check 47 | 48 | Returns: 49 | True if query contains " OR " or " or ", False otherwise 50 | """ 51 | return " OR " in query or " or " in query 52 | 53 | 54 | async def search_with_or_support( 55 | query: str, 56 | search_func: Any, 57 | search_params: dict[str, Any], 58 | id_field: str = "id", 59 | fallback_id_field: str | None = None, 60 | ) -> dict[str, Any]: 61 | """Generic OR query search handler. 62 | 63 | This function handles OR queries by making multiple API calls and combining results. 64 | 65 | Args: 66 | query: Query string that may contain OR operators 67 | search_func: Async search function to call for each term 68 | search_params: Base parameters to pass to search function (excluding the query term) 69 | id_field: Primary field name for deduplication (default: "id") 70 | fallback_id_field: Alternative field name if primary is missing 71 | 72 | Returns: 73 | Combined results from all searches with duplicates removed 74 | """ 75 | # Check if this is an OR query 76 | if contains_or_operator(query): 77 | search_terms = parse_or_query(query) 78 | else: 79 | search_terms = [query] 80 | 81 | # Collect all unique results 82 | all_results = {} 83 | total_found = 0 84 | 85 | # Search for each term 86 | for term in search_terms: 87 | try: 88 | # Call the search function with the term 89 | results = await search_func(**{**search_params, "name": term}) 90 | 91 | # Extract results list (handle different response formats) 92 | items_key = None 93 | for key in [ 94 | "biomarkers", 95 | "organizations", 96 | "interventions", 97 | "diseases", 98 | "data", 99 | "items", 100 | ]: 101 | if key in results: 102 | items_key = key 103 | break 104 | 105 | if not items_key: 106 | continue 107 | 108 | # Add unique items (deduplicate by ID) 109 | for item in results.get(items_key, []): 110 | item_id = item.get(id_field) 111 | if not item_id and fallback_id_field: 112 | item_id = item.get(fallback_id_field) 113 | 114 | if item_id and 
item_id not in all_results: 115 | all_results[item_id] = item 116 | 117 | total_found += results.get("total", 0) 118 | 119 | except Exception as e: 120 | # Log the error and continue with other terms 121 | import logging 122 | 123 | logger = logging.getLogger(__name__) 124 | logger.warning(f"Failed to search for term '{term}': {e}") 125 | continue 126 | 127 | # Convert back to list 128 | unique_items = list(all_results.values()) 129 | 130 | # Return in standard format 131 | return { 132 | "items": unique_items, 133 | "total": len(unique_items), 134 | "search_terms": search_terms, 135 | "total_found_across_terms": total_found, 136 | } 137 | ``` -------------------------------------------------------------------------------- /tests/tdd/test_endpoint_documentation.py: -------------------------------------------------------------------------------- ```python 1 | """Test that endpoint documentation is kept up to date.""" 2 | 3 | import subprocess 4 | import sys 5 | from pathlib import Path 6 | 7 | 8 | class TestEndpointDocumentation: 9 | """Test the endpoint documentation generation.""" 10 | 11 | def test_third_party_endpoints_file_exists(self): 12 | """Test that THIRD_PARTY_ENDPOINTS.md exists.""" 13 | endpoints_file = ( 14 | Path(__file__).parent.parent.parent / "THIRD_PARTY_ENDPOINTS.md" 15 | ) 16 | assert endpoints_file.exists(), "THIRD_PARTY_ENDPOINTS.md must exist" 17 | 18 | def test_endpoints_documentation_is_current(self): 19 | """Test that the endpoints documentation can be generated without errors.""" 20 | # Run the generation script 21 | script_path = ( 22 | Path(__file__).parent.parent.parent 23 | / "scripts" 24 | / "generate_endpoints_doc.py" 25 | ) 26 | result = subprocess.run( # noqa: S603 27 | [sys.executable, str(script_path)], 28 | capture_output=True, 29 | text=True, 30 | check=False, 31 | ) 32 | 33 | assert result.returncode == 0, f"Script failed: {result.stderr}" 34 | 35 | # The script should report that it generated the file 36 | assert ( 37 | 
"Generated" in result.stdout or result.stdout == "" 38 | ), f"Unexpected output: {result.stdout}" 39 | 40 | def test_all_endpoints_documented(self): 41 | """Test that all endpoints in the registry are documented.""" 42 | from biomcp.utils.endpoint_registry import get_registry 43 | 44 | registry = get_registry() 45 | endpoints = registry.get_all_endpoints() 46 | 47 | # Read the documentation 48 | endpoints_file = ( 49 | Path(__file__).parent.parent.parent / "THIRD_PARTY_ENDPOINTS.md" 50 | ) 51 | content = endpoints_file.read_text() 52 | 53 | # Check each endpoint is mentioned 54 | for key, info in endpoints.items(): 55 | assert key in content, f"Endpoint {key} not found in documentation" 56 | assert ( 57 | info.url in content 58 | ), f"URL {info.url} not found in documentation" 59 | 60 | def test_documentation_contains_required_sections(self): 61 | """Test that documentation contains all required sections.""" 62 | endpoints_file = ( 63 | Path(__file__).parent.parent.parent / "THIRD_PARTY_ENDPOINTS.md" 64 | ) 65 | content = endpoints_file.read_text() 66 | 67 | required_sections = [ 68 | "# Third-Party Endpoints Used by BioMCP", 69 | "## Overview", 70 | "## Endpoints by Category", 71 | "### Biomedical Literature", 72 | "### Clinical Trials", 73 | "### Variant Databases", 74 | "### Cancer Genomics", 75 | "## Domain Summary", 76 | "## Compliance and Privacy", 77 | "## Network Control", 78 | "BIOMCP_OFFLINE", 79 | ] 80 | 81 | for section in required_sections: 82 | assert ( 83 | section in content 84 | ), f"Required section '{section}' not found in documentation" 85 | 86 | def test_endpoint_counts_accurate(self): 87 | """Test that endpoint counts in the overview are accurate.""" 88 | from biomcp.utils.endpoint_registry import get_registry 89 | 90 | registry = get_registry() 91 | endpoints = registry.get_all_endpoints() 92 | domains = registry.get_unique_domains() 93 | 94 | endpoints_file = ( 95 | Path(__file__).parent.parent.parent / "THIRD_PARTY_ENDPOINTS.md" 96 | ) 97 | 
content = endpoints_file.read_text() 98 | 99 | # Extract counts from overview 100 | import re 101 | 102 | match = re.search( 103 | r"BioMCP connects to (\d+) external domains across (\d+) endpoints", 104 | content, 105 | ) 106 | 107 | assert match, "Could not find endpoint counts in overview" 108 | 109 | doc_domains = int(match.group(1)) 110 | doc_endpoints = int(match.group(2)) 111 | 112 | assert ( 113 | doc_domains == len(domains) 114 | ), f"Document says {doc_domains} domains but registry has {len(domains)}" 115 | assert ( 116 | doc_endpoints == len(endpoints) 117 | ), f"Document says {doc_endpoints} endpoints but registry has {len(endpoints)}" 118 | ``` -------------------------------------------------------------------------------- /src/biomcp/cli/organizations.py: -------------------------------------------------------------------------------- ```python 1 | """CLI commands for organization search and lookup.""" 2 | 3 | import asyncio 4 | from typing import Annotated 5 | 6 | import typer 7 | 8 | from ..integrations.cts_api import CTSAPIError, get_api_key_instructions 9 | from ..organizations import get_organization, search_organizations 10 | from ..organizations.getter import format_organization_details 11 | from ..organizations.search import format_organization_results 12 | 13 | organization_app = typer.Typer( 14 | no_args_is_help=True, 15 | help="Search and retrieve organization information from NCI CTS API", 16 | ) 17 | 18 | 19 | @organization_app.command("search") 20 | def search_organizations_cli( 21 | name: Annotated[ 22 | str | None, 23 | typer.Argument( 24 | help="Organization name to search for (partial match supported)" 25 | ), 26 | ] = None, 27 | org_type: Annotated[ 28 | str | None, 29 | typer.Option( 30 | "--type", 31 | help="Type of organization (e.g., industry, academic)", 32 | ), 33 | ] = None, 34 | city: Annotated[ 35 | str | None, 36 | typer.Option( 37 | "--city", 38 | help="City location", 39 | ), 40 | ] = None, 41 | state: Annotated[ 42 | 
str | None, 43 | typer.Option( 44 | "--state", 45 | help="State location (2-letter code)", 46 | ), 47 | ] = None, 48 | page_size: Annotated[ 49 | int, 50 | typer.Option( 51 | "--page-size", 52 | help="Number of results per page", 53 | min=1, 54 | max=100, 55 | ), 56 | ] = 20, 57 | page: Annotated[ 58 | int, 59 | typer.Option( 60 | "--page", 61 | help="Page number", 62 | min=1, 63 | ), 64 | ] = 1, 65 | api_key: Annotated[ 66 | str | None, 67 | typer.Option( 68 | "--api-key", 69 | help="NCI API key (overrides NCI_API_KEY env var)", 70 | envvar="NCI_API_KEY", 71 | ), 72 | ] = None, 73 | ) -> None: 74 | """ 75 | Search for organizations in the NCI Clinical Trials database. 76 | 77 | Examples: 78 | # Search by name 79 | biomcp organization search "MD Anderson" 80 | 81 | # Search by type 82 | biomcp organization search --type academic 83 | 84 | # Search by location 85 | biomcp organization search --city Boston --state MA 86 | 87 | # Combine filters 88 | biomcp organization search Cancer --type industry --state CA 89 | """ 90 | try: 91 | results = asyncio.run( 92 | search_organizations( 93 | name=name, 94 | org_type=org_type, 95 | city=city, 96 | state=state, 97 | page_size=page_size, 98 | page=page, 99 | api_key=api_key, 100 | ) 101 | ) 102 | 103 | output = format_organization_results(results) 104 | typer.echo(output) 105 | 106 | except CTSAPIError as e: 107 | if "API key required" in str(e): 108 | typer.echo(get_api_key_instructions()) 109 | else: 110 | typer.echo(f"Error: {e}", err=True) 111 | raise typer.Exit(1) from e 112 | except Exception as e: 113 | typer.echo(f"Unexpected error: {e}", err=True) 114 | raise typer.Exit(1) from e 115 | 116 | 117 | @organization_app.command("get") 118 | def get_organization_cli( 119 | org_id: Annotated[ 120 | str, 121 | typer.Argument(help="Organization ID"), 122 | ], 123 | api_key: Annotated[ 124 | str | None, 125 | typer.Option( 126 | "--api-key", 127 | help="NCI API key (overrides NCI_API_KEY env var)", 128 | envvar="NCI_API_KEY", 
129 | ), 130 | ] = None, 131 | ) -> None: 132 | """ 133 | Get detailed information about a specific organization. 134 | 135 | Example: 136 | biomcp organization get ORG123456 137 | """ 138 | try: 139 | org_data = asyncio.run( 140 | get_organization( 141 | org_id=org_id, 142 | api_key=api_key, 143 | ) 144 | ) 145 | 146 | output = format_organization_details(org_data) 147 | typer.echo(output) 148 | 149 | except CTSAPIError as e: 150 | if "API key required" in str(e): 151 | typer.echo(get_api_key_instructions()) 152 | else: 153 | typer.echo(f"Error: {e}", err=True) 154 | raise typer.Exit(1) from e 155 | except Exception as e: 156 | typer.echo(f"Unexpected error: {e}", err=True) 157 | raise typer.Exit(1) from e 158 | ``` -------------------------------------------------------------------------------- /tests/bdd/search_variants/test_search.py: -------------------------------------------------------------------------------- ```python 1 | import json 2 | import shlex 3 | from typing import Any 4 | 5 | from assertpy import assert_that 6 | from pytest_bdd import parsers, scenarios, then, when 7 | from typer.testing import CliRunner 8 | 9 | from biomcp.cli import app 10 | 11 | scenarios("search.feature") 12 | 13 | runner = CliRunner() 14 | 15 | # Field mapping - Updated chromosome key 16 | FIELD_MAP = { 17 | "chromosome": ["chrom"], 18 | "frequency": ["gnomad_exome", "af", "af"], 19 | "gene": ["dbnsfp", "genename"], 20 | "hgvsc": ["dbnsfp", "hgvsc"], 21 | "hgvsp": ["dbnsfp", "hgvsp"], 22 | "cadd": ["cadd", "phred"], 23 | "polyphen": ["dbnsfp", "polyphen2", "hdiv", "pred"], 24 | "position": ["vcf", "position"], 25 | "rsid": ["dbsnp", "rsid"], 26 | "sift": ["dbnsfp", "sift", "pred"], 27 | "significance": ["clinvar", "rcv", "clinical_significance"], 28 | "uniprot_id": ["mutdb", "uniprot_id"], 29 | } 30 | 31 | 32 | def get_value(data: dict, key: str) -> Any | None: 33 | """Extract value from nested dictionary using field mapping.""" 34 | key_path = FIELD_MAP.get(key, [key]) 35 | 
current_value = data.get("hits") 36 | for key in key_path: 37 | if isinstance(current_value, dict): 38 | current_value = current_value.get(key) 39 | elif isinstance(current_value, list): 40 | current_value = current_value[0].get(key) 41 | if current_value and isinstance(current_value, list): 42 | return current_value[0] 43 | return current_value 44 | 45 | 46 | # --- @when Step --- 47 | @when( 48 | parsers.re(r'I run "(?P<command>.*?)"(?: #.*)?$'), 49 | target_fixture="variants_data", 50 | ) 51 | def variants_data(command) -> dict: 52 | """Run variant search command with --json and return parsed results.""" 53 | args = shlex.split(command)[1:] # trim 'biomcp' 54 | args += ["--json"] 55 | if "--size" not in args: 56 | args.extend(["--size", "10"]) 57 | 58 | result = runner.invoke(app, args, catch_exceptions=False) 59 | assert result.exit_code == 0, "CLI command failed" 60 | data = json.loads(result.stdout) 61 | return data 62 | 63 | 64 | def normalize(v): 65 | try: 66 | return float(v) 67 | except ValueError: 68 | try: 69 | return int(v) 70 | except ValueError: 71 | return v.lower() 72 | 73 | 74 | @then( 75 | parsers.re( 76 | r"each variant should have (?P<field>\w+) that (?P<operator>(?:is|equal|to|contains|greater|less|than|or|\s)+)\s+(?P<expected>.+)$" 77 | ) 78 | ) 79 | def check_variant_field(it, variants_data, field, operator, expected): 80 | """ 81 | For each variant, apply an assertpy operator against a given field. 82 | Supports operator names with spaces (e.g. "is equal to") or underscores (e.g. "is_equal_to"). 83 | """ 84 | # Normalize operator: lower case and replace spaces with underscores. 
85 | operator = operator.strip().lower().replace(" ", "_") 86 | successes = set() 87 | failures = set() 88 | for v_num, value in it(FIELD_MAP, variants_data, field): 89 | value = normalize(value) 90 | expected = normalize(expected) 91 | f = getattr(assert_that(value), operator) 92 | try: 93 | f(expected) 94 | successes.add(v_num) 95 | except AssertionError: 96 | failures.add(v_num) 97 | 98 | failures -= successes 99 | assert len(failures) == 0, f"Failure: {field} {operator} {expected}" 100 | 101 | 102 | @then( 103 | parsers.re( 104 | r"the number of variants (?P<operator>(?:is|equal|to|contains|greater|less|than|or|\s)+)\s+(?P<expected>\d+)$" 105 | ) 106 | ) 107 | def number_of_variants_check(variants_data, operator, expected): 108 | """Check the number of variants returned.""" 109 | if ( 110 | isinstance(variants_data, list) 111 | and len(variants_data) == 1 112 | and "error" in variants_data[0] 113 | ): 114 | count = 0 # If we have an error response, count as 0 variants 115 | elif isinstance(variants_data, dict) and "variants" in variants_data: 116 | # Handle new format with cBioPortal summary 117 | count = len(variants_data["variants"]) 118 | elif isinstance(variants_data, dict) and "hits" in variants_data: 119 | # Handle myvariant.info response format 120 | count = len(variants_data["hits"]) 121 | else: 122 | count = len(variants_data) if isinstance(variants_data, list) else 0 123 | operator = operator.strip().lower().replace(" ", "_") 124 | f = getattr(assert_that(count), operator) 125 | f(int(expected)) 126 | ``` -------------------------------------------------------------------------------- /src/biomcp/cli/diseases.py: -------------------------------------------------------------------------------- ```python 1 | """CLI commands for disease information and search.""" 2 | 3 | import asyncio 4 | from typing import Annotated 5 | 6 | import typer 7 | 8 | from ..diseases import get_disease 9 | from ..diseases.search import format_disease_results, search_diseases 
10 | from ..integrations.cts_api import CTSAPIError, get_api_key_instructions 11 | 12 | disease_app = typer.Typer( 13 | no_args_is_help=True, 14 | help="Search and retrieve disease information", 15 | ) 16 | 17 | 18 | @disease_app.command("get") 19 | def get_disease_cli( 20 | disease_name: Annotated[ 21 | str, 22 | typer.Argument(help="Disease name or identifier"), 23 | ], 24 | ) -> None: 25 | """ 26 | Get disease information from MyDisease.info. 27 | 28 | This returns detailed information including synonyms, definitions, 29 | and database cross-references. 30 | 31 | Examples: 32 | biomcp disease get melanoma 33 | biomcp disease get "lung cancer" 34 | biomcp disease get GIST 35 | """ 36 | result = asyncio.run(get_disease(disease_name)) 37 | typer.echo(result) 38 | 39 | 40 | @disease_app.command("search") 41 | def search_diseases_cli( 42 | name: Annotated[ 43 | str | None, 44 | typer.Argument( 45 | help="Disease name to search for (partial match supported)" 46 | ), 47 | ] = None, 48 | include_synonyms: Annotated[ 49 | bool, 50 | typer.Option( 51 | "--synonyms/--no-synonyms", 52 | help="[Deprecated] This option is ignored - API always searches synonyms", 53 | ), 54 | ] = True, 55 | category: Annotated[ 56 | str | None, 57 | typer.Option( 58 | "--category", 59 | help="Disease category/type filter", 60 | ), 61 | ] = None, 62 | page_size: Annotated[ 63 | int, 64 | typer.Option( 65 | "--page-size", 66 | help="Number of results per page", 67 | min=1, 68 | max=100, 69 | ), 70 | ] = 20, 71 | page: Annotated[ 72 | int, 73 | typer.Option( 74 | "--page", 75 | help="Page number", 76 | min=1, 77 | ), 78 | ] = 1, 79 | api_key: Annotated[ 80 | str | None, 81 | typer.Option( 82 | "--api-key", 83 | help="NCI API key (overrides NCI_API_KEY env var)", 84 | envvar="NCI_API_KEY", 85 | ), 86 | ] = None, 87 | source: Annotated[ 88 | str, 89 | typer.Option( 90 | "--source", 91 | help="Data source: 'mydisease' (default) or 'nci'", 92 | show_choices=True, 93 | ), 94 | ] = "mydisease", 95 | ) 
-> None: 96 | """ 97 | Search for diseases in MyDisease.info or NCI CTS database. 98 | 99 | The NCI source provides controlled vocabulary of cancer conditions 100 | used in clinical trials, with official terms and synonyms. 101 | 102 | Examples: 103 | # Search MyDisease.info (default) 104 | biomcp disease search melanoma 105 | 106 | # Search NCI cancer terms 107 | biomcp disease search melanoma --source nci 108 | 109 | # Search without synonyms 110 | biomcp disease search "breast cancer" --no-synonyms --source nci 111 | 112 | # Filter by category 113 | biomcp disease search --category neoplasm --source nci 114 | """ 115 | if source == "nci": 116 | # Use NCI CTS API 117 | try: 118 | results = asyncio.run( 119 | search_diseases( 120 | name=name, 121 | include_synonyms=include_synonyms, 122 | category=category, 123 | page_size=page_size, 124 | page=page, 125 | api_key=api_key, 126 | ) 127 | ) 128 | 129 | output = format_disease_results(results) 130 | typer.echo(output) 131 | 132 | except CTSAPIError as e: 133 | if "API key required" in str(e): 134 | typer.echo(get_api_key_instructions()) 135 | else: 136 | typer.echo(f"Error: {e}", err=True) 137 | raise typer.Exit(1) from e 138 | except Exception as e: 139 | typer.echo(f"Unexpected error: {e}", err=True) 140 | raise typer.Exit(1) from e 141 | else: 142 | # Default to MyDisease.info 143 | # For now, just search by name 144 | if name: 145 | result = asyncio.run(get_disease(name)) 146 | typer.echo(result) 147 | else: 148 | typer.echo("Please provide a disease name to search for.") 149 | raise typer.Exit(1) 150 | ``` -------------------------------------------------------------------------------- /tests/tdd/test_mcp_tools.py: -------------------------------------------------------------------------------- ```python 1 | """Tests for MCP tool wrappers.""" 2 | 3 | import json 4 | from unittest.mock import patch 5 | 6 | import pytest 7 | 8 | from biomcp.articles.search import _article_searcher 9 | 10 | 11 | class 
TestArticleSearcherMCPTool: 12 | """Test the _article_searcher MCP tool.""" 13 | 14 | @pytest.mark.asyncio 15 | async def test_article_searcher_with_all_params(self): 16 | """Test article_searcher with all parameters.""" 17 | mock_results = [{"title": "Test Article", "pmid": 12345}] 18 | 19 | with patch( 20 | "biomcp.articles.search_optimized.article_searcher_optimized" 21 | ) as mock_search: 22 | mock_search.return_value = json.dumps(mock_results) 23 | 24 | await _article_searcher( 25 | call_benefit="Testing search functionality", 26 | chemicals="aspirin,ibuprofen", 27 | diseases="cancer,diabetes", 28 | genes="BRAF,TP53", 29 | keywords="mutation,therapy", 30 | variants="V600E,R175H", 31 | include_preprints=True, 32 | ) 33 | 34 | # Verify the function was called 35 | mock_search.assert_called_once() 36 | 37 | # Check the parameters were passed correctly 38 | kwargs = mock_search.call_args[1] 39 | assert kwargs["call_benefit"] == "Testing search functionality" 40 | assert kwargs["chemicals"] == "aspirin,ibuprofen" 41 | assert kwargs["diseases"] == "cancer,diabetes" 42 | assert kwargs["genes"] == "BRAF,TP53" 43 | assert kwargs["keywords"] == "mutation,therapy" 44 | assert kwargs["variants"] == "V600E,R175H" 45 | assert kwargs["include_preprints"] is True 46 | assert kwargs.get("include_cbioportal", True) is True 47 | 48 | @pytest.mark.asyncio 49 | async def test_article_searcher_with_lists(self): 50 | """Test article_searcher with list inputs.""" 51 | with patch( 52 | "biomcp.articles.search_optimized.article_searcher_optimized" 53 | ) as mock_search: 54 | mock_search.return_value = "## Results" 55 | 56 | await _article_searcher( 57 | call_benefit="Testing with lists", 58 | chemicals=["drug1", "drug2"], 59 | diseases=["disease1"], 60 | genes=["GENE1"], 61 | include_preprints=False, 62 | ) 63 | 64 | # Check list parameters were passed correctly 65 | kwargs = mock_search.call_args[1] 66 | assert kwargs["call_benefit"] == "Testing with lists" 67 | assert 
kwargs["chemicals"] == ["drug1", "drug2"] 68 | assert kwargs["diseases"] == ["disease1"] 69 | assert kwargs["genes"] == ["GENE1"] 70 | assert kwargs["include_preprints"] is False 71 | 72 | @pytest.mark.asyncio 73 | async def test_article_searcher_minimal_params(self): 74 | """Test article_searcher with minimal parameters.""" 75 | with patch( 76 | "biomcp.articles.search_optimized.article_searcher_optimized" 77 | ) as mock_search: 78 | mock_search.return_value = "## No results" 79 | 80 | await _article_searcher(call_benefit="Minimal test") 81 | 82 | # Should still work with no search parameters 83 | kwargs = mock_search.call_args[1] 84 | assert kwargs["call_benefit"] == "Minimal test" 85 | assert kwargs.get("chemicals") is None 86 | assert kwargs.get("diseases") is None 87 | assert kwargs.get("genes") is None 88 | assert kwargs.get("keywords") is None 89 | assert kwargs.get("variants") is None 90 | 91 | @pytest.mark.asyncio 92 | async def test_article_searcher_empty_strings(self): 93 | """Test article_searcher with empty strings.""" 94 | with patch( 95 | "biomcp.articles.search_optimized.article_searcher_optimized" 96 | ) as mock_search: 97 | mock_search.return_value = "## Results" 98 | 99 | await _article_searcher( 100 | call_benefit="Empty string test", 101 | chemicals="", 102 | diseases="", 103 | genes="", 104 | ) 105 | 106 | # Empty strings are passed through 107 | kwargs = mock_search.call_args[1] 108 | assert kwargs["call_benefit"] == "Empty string test" 109 | assert kwargs["chemicals"] == "" 110 | assert kwargs["diseases"] == "" 111 | assert kwargs["genes"] == "" 112 | ``` -------------------------------------------------------------------------------- /docs/developer-guides/07-performance-optimizations.md: -------------------------------------------------------------------------------- ```markdown 1 | # Performance Optimizations 2 | 3 | This document describes the performance optimizations implemented in BioMCP to improve response times and throughput. 
4 | 5 | ## Overview 6 | 7 | BioMCP has been optimized for high-performance biomedical data retrieval through several key improvements: 8 | 9 | - **65% faster test execution** (from ~120s to ~42s) 10 | - **Reduced API calls** through intelligent caching and batching 11 | - **Lower latency** via connection pooling and prefetching 12 | - **Better resource utilization** with parallel processing 13 | 14 | ## Key Optimizations 15 | 16 | ### 1. Connection Pooling 17 | 18 | HTTP connections are now reused across requests, eliminating connection establishment overhead. 19 | 20 | **Configuration:** 21 | 22 | - `BIOMCP_USE_CONNECTION_POOL` - Enable/disable pooling (default: "true") 23 | - Automatically manages pools per event loop 24 | - Graceful cleanup on shutdown 25 | 26 | **Impact:** ~30% reduction in request latency for sequential operations 27 | 28 | ### 2. Parallel Test Execution 29 | 30 | Tests now run in parallel using pytest-xdist, dramatically reducing test suite execution time. 31 | 32 | **Usage:** 33 | 34 | ```bash 35 | make test # Automatically uses parallel execution 36 | ``` 37 | 38 | **Impact:** ~3x faster test execution (from ~120s to ~42s) 39 | 40 | ### 3. Request Batching 41 | 42 | Multiple API requests are batched together when possible, particularly for cBioPortal queries. 43 | 44 | **Features:** 45 | 46 | - Automatic batching based on size/time thresholds 47 | - Configurable batch size (default: 5 for cBioPortal) 48 | - Error isolation per request 49 | 50 | **Impact:** Up to 80% reduction in API calls for bulk operations 51 | 52 | ### 4. Smart Caching 53 | 54 | Multiple caching layers optimize repeated queries: 55 | 56 | - **LRU Cache:** Memory-bounded caching for recent requests 57 | - **Hash-based keys:** 10x faster cache key generation 58 | - **Shared validation context:** Eliminates redundant gene/entity validations 59 | 60 | **Configuration:** 61 | 62 | - Cache size: 1000 entries (configurable) 63 | - TTL: 5-30 minutes depending on data type 64 | 65 | ### 5.
Prefetching 66 | 67 | Common entities are prefetched on startup to warm caches: 68 | 69 | - Top genes: BRAF, EGFR, TP53, KRAS, etc. 70 | - Common diseases: lung cancer, breast cancer, etc. 71 | - Frequent chemicals: osimertinib, pembrolizumab, etc. 72 | 73 | **Impact:** First queries for common entities are instant 74 | 75 | ### 6. Pagination Support 76 | 77 | Europe PMC searches now use pagination for large result sets: 78 | 79 | - Optimal page size: 25 results 80 | - Progressive loading 81 | - Memory-efficient processing 82 | 83 | ### 7. Conditional Metrics 84 | 85 | Performance metrics are only collected when explicitly enabled, reducing overhead. 86 | 87 | **Configuration:** 88 | 89 | - `BIOMCP_METRICS_ENABLED` - Enable metrics (default: "false") 90 | 91 | ## Performance Benchmarks 92 | 93 | ### API Response Times 94 | 95 | | Operation | Before | After | Improvement | 96 | | ------------------------------ | ------ | ----- | ----------- | 97 | | Single gene search | 850ms | 320ms | 62% | 98 | | Bulk variant lookup | 4.2s | 1.1s | 74% | 99 | | Article search with cBioPortal | 2.1s | 780ms | 63% | 100 | 101 | ### Resource Usage 102 | 103 | | Metric | Before | After | Improvement | 104 | | ------------- | ------ | ----- | ----------- | 105 | | Memory (idle) | 145MB | 152MB | +5% | 106 | | Memory (peak) | 512MB | 385MB | -25% | 107 | | CPU (avg) | 35% | 28% | -20% | 108 | 109 | ## Best Practices 110 | 111 | 1. **Keep connection pooling enabled** unless experiencing issues 112 | 2. **Use the unified search** methods to benefit from parallel execution 113 | 3. **Batch operations** when performing multiple lookups 114 | 4. **Monitor cache hit rates** in production environments 115 | 116 | ## Troubleshooting 117 | 118 | ### Connection Pool Issues 119 | 120 | If experiencing connection errors: 121 | 122 | 1. Disable pooling: `export BIOMCP_USE_CONNECTION_POOL=false` 123 | 2. Check for firewall/proxy issues 124 | 3. 
Verify SSL certificates 125 | 126 | ### Memory Usage 127 | 128 | If memory usage is high: 129 | 130 | 1. Reduce cache size in `request_cache.py` 131 | 2. Lower connection pool limits 132 | 3. Disable prefetching by removing the lifespan hook 133 | 134 | ### Performance Regression 135 | 136 | To identify performance issues: 137 | 138 | 1. Enable metrics: `export BIOMCP_METRICS_ENABLED=true` 139 | 2. Check slow operations in logs 140 | 3. Profile with `py-spy` or similar tools 141 | 142 | ## Future Optimizations 143 | 144 | Planned improvements include: 145 | 146 | - GraphQL batching for complex queries 147 | - Redis integration for distributed caching 148 | - WebSocket support for real-time updates 149 | - GPU acceleration for variant analysis 150 | ``` -------------------------------------------------------------------------------- /docs/tutorials/remote-connection.md: -------------------------------------------------------------------------------- ```markdown 1 | # Connecting to Remote BioMCP 2 | 3 | This guide walks you through connecting Claude to the remote BioMCP server, providing instant access to biomedical research tools without any local installation. 4 | 5 | ## Overview 6 | 7 | The remote BioMCP server (https://remote.biomcp.org/mcp) provides cloud-hosted access to all BioMCP tools. This eliminates the need for local installation while maintaining full functionality. 8 | 9 | !!! success "Benefits of Remote Connection" - **No Installation Required**: Start using BioMCP immediately - **Always Up-to-Date**: Automatically receive the latest features and improvements - **Cloud-Powered**: Leverage server-side resources for faster searches - **Secure Authentication**: Uses Google OAuth for secure access 10 | 11 | !!! info "Privacy Notice" 12 | We log user emails and queries to improve the service. All data is handled according to our privacy policy. 
13 | 14 | ## Step-by-Step Setup 15 | 16 | ### Step 1: Access Custom Connectors 17 | 18 | Navigate to the **Custom Connectors** section in your Claude interface. This is where you'll configure the connection to BioMCP. 19 | 20 |  21 | 22 | ### Step 2: Add Custom Connector 23 | 24 | Click the **Add Custom Connector** button and enter the following details: 25 | 26 | - **Name**: BioMCP 27 | - **URL**: `https://remote.biomcp.org/mcp` 28 | 29 |  30 | 31 | ### Step 3: Verify Connector is Enabled 32 | 33 | After adding, you should see BioMCP listed with an "Enabled" status. This confirms the connector was added successfully. 34 | 35 |  36 | 37 | ### Step 4: Connect to BioMCP 38 | 39 | Return to the main Connectors section where you'll now see BioMCP available for connection. Click the **Connect** button. 40 | 41 |  42 | 43 | ### Step 5: Authenticate with Google 44 | 45 | You'll be redirected to Google OAuth for authentication. Sign in with any valid Google account. This step ensures secure access to the service. 46 | 47 |  48 | 49 | !!! note "Authentication" - Any valid Google account works - Your email is logged for service improvement - Authentication is handled securely through Google OAuth 50 | 51 | ### Step 6: Connection Success 52 | 53 | Once authenticated, you'll see a successful connection message displaying the available tool count. As of January 2025, there are 23 tools available (this number may increase as new features are added). 54 | 55 |  56 | 57 | ## Verifying Your Connection 58 | 59 | After successful connection, you can verify BioMCP is working by asking Claude: 60 | 61 | ``` 62 | What tools do you have available from BioMCP? 
63 | ``` 64 | 65 | Claude should list the available tools including: 66 | 67 | - Article search and retrieval (PubMed/PubTator3) 68 | - Clinical trials search (ClinicalTrials.gov and NCI) 69 | - Variant analysis (MyVariant.info) 70 | - Gene, drug, and disease information 71 | - Sequential thinking for complex research 72 | 73 | ## Troubleshooting 74 | 75 | ### Connection Failed 76 | 77 | - Ensure you entered the URL exactly as shown: `https://remote.biomcp.org/mcp` 78 | - Check your internet connection 79 | - Try disconnecting and reconnecting 80 | 81 | ### Authentication Issues 82 | 83 | - Make sure you're using a valid Google account 84 | - Clear your browser cache if authentication hangs 85 | - Try using a different browser if issues persist 86 | 87 | ### Tools Not Available 88 | 89 | - Disconnect and reconnect to BioMCP 90 | - Refresh your Claude session 91 | - Contact support if tools remain unavailable 92 | 93 | ## Next Steps 94 | 95 | Now that you're connected to BioMCP, you can: 96 | 97 | 1. **Search biomedical literature**: "Find recent papers on BRAF mutations in melanoma" 98 | 2. **Analyze clinical trials**: "What trials are recruiting for lung cancer with EGFR mutations?" 99 | 3. **Interpret variants**: "What is the clinical significance of TP53 p.R273H?" 100 | 4. 
**Explore drug information**: "Tell me about pembrolizumab's mechanism and indications" 101 | 102 | ## Support 103 | 104 | For issues or questions about the remote BioMCP connection: 105 | 106 | - GitHub Issues: [https://github.com/genomoncology/biomcp/issues](https://github.com/genomoncology/biomcp/issues) 107 | - Documentation: [https://biomcp.org](https://biomcp.org) 108 | ``` -------------------------------------------------------------------------------- /tests/config/test_smithery_config.py: -------------------------------------------------------------------------------- ```python 1 | #!/usr/bin/env python 2 | """ 3 | Test script to validate Smithery configuration against actual function implementations. 4 | This script checks that the schema definitions in smithery.yaml match the expected 5 | function parameters in your codebase. 6 | """ 7 | 8 | import os 9 | from typing import Any 10 | 11 | import pytest 12 | import yaml 13 | from pydantic import BaseModel 14 | 15 | from biomcp.articles.search import PubmedRequest 16 | 17 | # Import the functions we want to test 18 | from biomcp.trials.search import TrialQuery 19 | from biomcp.variants.search import VariantQuery 20 | 21 | 22 | @pytest.fixture 23 | def smithery_config(): 24 | """Load the Smithery configuration.""" 25 | # Get the project root directory 26 | project_root = os.path.abspath( 27 | os.path.join(os.path.dirname(__file__), "../..") 28 | ) 29 | config_path = os.path.join(project_root, "smithery.yaml") 30 | 31 | with open(config_path) as f: 32 | return yaml.safe_load(f) 33 | 34 | 35 | def test_smithery_config(smithery_config): 36 | """Test that all tool schemas in smithery.yaml match the expected function parameters.""" 37 | # Functions to test and their expected parameter types 38 | functions_to_test = { 39 | "trial_searcher": {"param_name": "query", "expected_type": TrialQuery}, 40 | "variant_searcher": { 41 | "param_name": "query", 42 | "expected_type": VariantQuery, 43 | }, 44 | 
"article_searcher": { 45 | "param_name": "query", 46 | "expected_type": PubmedRequest, 47 | }, 48 | "trial_protocol": {"param_name": "nct_id", "expected_type": str}, 49 | "trial_locations": {"param_name": "nct_id", "expected_type": str}, 50 | "trial_outcomes": {"param_name": "nct_id", "expected_type": str}, 51 | "trial_references": {"param_name": "nct_id", "expected_type": str}, 52 | "article_details": {"param_name": "pmid", "expected_type": str}, 53 | "variant_details": {"param_name": "variant_id", "expected_type": str}, 54 | } 55 | 56 | for tool_name, param_info in functions_to_test.items(): 57 | validate_tool_schema(smithery_config, tool_name, param_info) 58 | 59 | 60 | def validate_tool_schema( 61 | smithery_config, tool_name: str, param_info: dict[str, Any] 62 | ): 63 | """Validate that the tool schema in smithery.yaml matches the expected function parameter.""" 64 | param_name = param_info["param_name"] 65 | expected_type = param_info["expected_type"] 66 | 67 | # Check if the tool is defined in the smithery.yaml 68 | assert tool_name in smithery_config.get( 69 | "tools", {} 70 | ), f"Tool '{tool_name}' is not defined in smithery.yaml" 71 | 72 | tool_config = smithery_config["tools"][tool_name] 73 | 74 | # Check if the tool has an input schema 75 | assert ( 76 | "input" in tool_config 77 | ), f"Tool '{tool_name}' does not have an input schema defined" 78 | 79 | input_schema = tool_config["input"].get("schema", {}) 80 | 81 | # Check if the parameter is required 82 | if issubclass(expected_type, BaseModel): 83 | # For complex types like TrialQuery, check if 'query' is required 84 | assert ( 85 | "required" in input_schema 86 | ), f"Tool '{tool_name}' does not have required parameters specified" 87 | assert ( 88 | "query" in input_schema.get("required", []) 89 | ), f"Parameter 'query' for tool '{tool_name}' is not marked as required" 90 | else: 91 | assert ( 92 | "required" in input_schema 93 | ), f"Tool '{tool_name}' does not have required parameters specified" 
94 | assert ( 95 | param_name in input_schema.get("required", []) 96 | ), f"Parameter '{param_name}' for tool '{tool_name}' is not marked as required" 97 | 98 | # For complex types (Pydantic models), check if the schema references the correct type 99 | if issubclass(expected_type, BaseModel): 100 | properties = input_schema.get("properties", {}) 101 | assert ( 102 | "query" in properties 103 | ), f"Tool '{tool_name}' does not have a 'query' property defined" 104 | 105 | query_prop = properties["query"] 106 | assert ( 107 | "$ref" in query_prop 108 | ), f"Tool '{tool_name}' query property does not reference a schema" 109 | 110 | schema_ref = query_prop["$ref"] 111 | expected_schema_name = expected_type.__name__ 112 | assert schema_ref.endswith( 113 | expected_schema_name 114 | ), f"Tool '{tool_name}' references incorrect schema: {schema_ref}, expected: {expected_schema_name}" 115 | ``` -------------------------------------------------------------------------------- /scripts/check_http_imports.py: -------------------------------------------------------------------------------- ```python 1 | #!/usr/bin/env python3 2 | """Check for direct HTTP library imports outside of allowed files.""" 3 | 4 | import ast 5 | import sys 6 | from pathlib import Path 7 | 8 | # HTTP libraries to check for 9 | HTTP_LIBRARIES = { 10 | "httpx", 11 | "aiohttp", 12 | "requests", 13 | "urllib3", 14 | } # Note: urllib is allowed for URL parsing 15 | 16 | # Files allowed to import HTTP libraries 17 | ALLOWED_FILES = { 18 | "http_client.py", 19 | "http_client_simple.py", 20 | "http_client_test.py", 21 | "test_http_client.py", 22 | "connection_pool.py", # Connection pooling infrastructure 23 | } 24 | 25 | # Additional allowed patterns (for version checks, etc.) 
ALLOWED_PATTERNS = {
    # Allow httpx import just for version check
    ("health.py", "httpx"): "version check only",
}


def _is_violation(module_name: str, file_name: str) -> bool:
    """Return True when importing *module_name* in *file_name* breaks policy.

    A module is a violation when it is one of the tracked HTTP libraries and
    the (file, module) pair has not been explicitly whitelisted. Shared by
    both import-node checkers below so the policy lives in one place.
    """
    if module_name not in HTTP_LIBRARIES:
        return False
    return (file_name, module_name) not in ALLOWED_PATTERNS


def _check_import_node(
    node: ast.Import, file_name: str
) -> set[tuple[str, int]]:
    """Check an ``import x`` node for violations.

    Returns:
        Set of (library, line_number) tuples for disallowed imports.
    """
    return {
        (alias.name.split(".")[0], node.lineno)
        for alias in node.names
        if _is_violation(alias.name.split(".")[0], file_name)
    }


def _check_import_from_node(
    node: ast.ImportFrom, file_name: str
) -> set[tuple[str, int]]:
    """Check a ``from x import y`` node for violations."""
    if node.module is None:
        # Relative import like ``from . import x`` -- never an HTTP library.
        return set()
    module_name = node.module.split(".")[0]
    if _is_violation(module_name, file_name):
        return {(module_name, node.lineno)}
    return set()


def check_imports(file_path: Path) -> set[tuple[str, int]]:
    """Check a Python file for HTTP library imports.

    Returns set of (library, line_number) tuples for violations.
    """
    violations: set[tuple[str, int]] = set()

    # Files allowed to talk HTTP directly are skipped entirely.
    if file_path.name in ALLOWED_FILES:
        return violations

    try:
        with open(file_path, encoding="utf-8") as f:
            content = f.read()

        tree = ast.parse(content)

        for node in ast.walk(tree):
            if isinstance(node, ast.Import):
                violations.update(_check_import_node(node, file_path.name))
            elif isinstance(node, ast.ImportFrom):
                violations.update(
                    _check_import_from_node(node, file_path.name)
                )

    except Exception as e:
        # An unparsable file is reported but must not abort the whole scan.
        print(f"Error parsing {file_path}: {e}", file=sys.stderr)

    return violations


def find_python_files(root_dir: Path) -> list[Path]:
    """Find all Python files under *root_dir*, skipping vendored/cache dirs."""
    python_files = []

    for path in root_dir.rglob("*.py"):
        # Skip virtual environments, cache, hidden directories, etc.
        if any(
            part.startswith(".")
            or part in {"__pycache__", "venv", "env", ".tox"}
            for part in path.parts
        ):
            continue
        python_files.append(path)

    return python_files


def main() -> int:
    """Scan ``src/`` and report any direct HTTP library imports.

    Returns:
        Process exit code: 1 when violations were found, else 0.
    """
    # Get project root (parent of scripts directory)
    script_dir = Path(__file__).parent
    project_root = script_dir.parent
    src_dir = project_root / "src"

    # Flatten per-file violation sets into (file, lib, line) records.
    all_violations = [
        (file_path, lib, line)
        for file_path in find_python_files(src_dir)
        for lib, line in check_imports(file_path)
    ]

    if all_violations:
        print("❌ Found direct HTTP library imports:\n")
        for file_path, lib, line in sorted(all_violations):
            rel_path = file_path.relative_to(project_root)
            print(f"  {rel_path}:{line} - imports '{lib}'")

        print(f"\n❌ Total violations: {len(all_violations)}")
        print(
            "\nPlease use the centralized HTTP client (biomcp.http_client) instead."
        )
        print(
            "If you need to add an exception, update ALLOWED_FILES or ALLOWED_PATTERNS in this script."
        )
        return 1

    print("✅ No direct HTTP library imports found outside allowed files.")
    return 0


if __name__ == "__main__":
    sys.exit(main())
18 | 19 | Args: 20 | mutation_results: List of (result, study_id) tuples 21 | cancer_types_lookup: Cancer type lookup dictionary 22 | client: Client instance for API calls 23 | 24 | Returns: 25 | Dictionary with aggregated mutation data 26 | """ 27 | total_mutations = 0 28 | total_samples = 0 29 | hotspot_counts: dict[str, dict[str, Any]] = {} 30 | cancer_distribution: dict[str, int] = {} 31 | studies_with_data = 0 32 | 33 | for result, study_id in mutation_results: 34 | if isinstance(result, Exception): 35 | logger.debug(f"Failed to get mutations for {study_id}: {result}") 36 | continue 37 | 38 | if result and "mutations" in result: 39 | mutations = result["mutations"] 40 | sample_count = result["sample_count"] 41 | 42 | if mutations: 43 | studies_with_data += 1 44 | # Count unique samples with mutations 45 | unique_samples = { 46 | m.get("sampleId") for m in mutations if m.get("sampleId") 47 | } 48 | total_mutations += len(unique_samples) 49 | total_samples += sample_count 50 | 51 | # Process mutations for hotspots and cancer types 52 | study_cancer_type = await client._get_study_cancer_type( 53 | study_id, cancer_types_lookup 54 | ) 55 | _update_hotspot_counts( 56 | mutations, hotspot_counts, study_cancer_type 57 | ) 58 | _update_cancer_distribution( 59 | mutations, cancer_distribution, study_cancer_type 60 | ) 61 | 62 | return { 63 | "total_mutations": total_mutations, 64 | "total_samples": total_samples, 65 | "studies_with_data": studies_with_data, 66 | "hotspot_counts": hotspot_counts, 67 | "cancer_distribution": cancer_distribution, 68 | } 69 | 70 | 71 | def _update_hotspot_counts( 72 | mutations: list[dict[str, Any]], 73 | hotspot_counts: dict[str, dict[str, Any]], 74 | cancer_type: str, 75 | ) -> None: 76 | """Update hotspot counts from mutations.""" 77 | for mut in mutations: 78 | protein_change = mut.get("proteinChange", "") 79 | if protein_change: 80 | if protein_change not in hotspot_counts: 81 | hotspot_counts[protein_change] = { 82 | "count": 0, 83 | 
"cancer_types": set(), 84 | } 85 | hotspot_counts[protein_change]["count"] += 1 86 | hotspot_counts[protein_change]["cancer_types"].add(cancer_type) 87 | 88 | 89 | def _update_cancer_distribution( 90 | mutations: list[dict[str, Any]], 91 | cancer_distribution: dict[str, int], 92 | cancer_type: str, 93 | ) -> None: 94 | """Update cancer type distribution.""" 95 | cancer_distribution[cancer_type] = cancer_distribution.get( 96 | cancer_type, 0 97 | ) + len({m.get("sampleId") for m in mutations if m.get("sampleId")}) 98 | 99 | 100 | def format_hotspots( 101 | hotspot_counts: dict[str, dict[str, Any]], total_mutations: int 102 | ) -> list[GeneHotspot]: 103 | """Format hotspot counts into GeneHotspot objects.""" 104 | hotspots = [] 105 | 106 | for protein_change, data in sorted( 107 | hotspot_counts.items(), key=lambda x: x[1]["count"], reverse=True 108 | )[:5]: # Top 5 hotspots 109 | # Try to extract position from protein change 110 | position = 0 111 | try: 112 | match = re.search(r"(\d+)", protein_change) 113 | if match: 114 | position = int(match.group(1)) 115 | except Exception: 116 | logger.debug("Failed to extract position from protein change") 117 | 118 | hotspots.append( 119 | GeneHotspot( 120 | position=position, 121 | amino_acid_change=protein_change, 122 | count=data["count"], 123 | frequency=data["count"] / total_mutations 124 | if total_mutations > 0 125 | else 0.0, 126 | cancer_types=list(data["cancer_types"]), 127 | ) 128 | ) 129 | 130 | return hotspots 131 | ``` -------------------------------------------------------------------------------- /tests/tdd/workers/test_worker_sanitization.js: -------------------------------------------------------------------------------- ```javascript 1 | /** 2 | * Tests for worker_entry_stytch.js sanitization functionality 3 | */ 4 | 5 | const { test } = require("node:test"); 6 | const assert = require("node:assert"); 7 | 8 | // Mock the sanitizeObject function for testing 9 | const SENSITIVE_FIELDS = [ 10 | "api_key", 11 
  "apiKey",
  "api-key",
  "token",
  "secret",
  "password",
];

/**
 * Return a deep-copied version of `obj` where the value of every key whose
 * lower-cased name contains one of SENSITIVE_FIELDS is replaced with
 * "[REDACTED]". Primitives pass through unchanged; arrays are sanitized
 * element-wise. Mirrors the worker's sanitizer for test purposes.
 */
const sanitizeObject = (obj) => {
  if (!obj || typeof obj !== "object") return obj;

  // Handle arrays
  if (Array.isArray(obj)) {
    return obj.map((item) => sanitizeObject(item));
  }

  // Handle objects
  const sanitized = {};
  for (const [key, value] of Object.entries(obj)) {
    // Check if this key is sensitive
    const lowerKey = key.toLowerCase();
    if (
      SENSITIVE_FIELDS.some((field) => lowerKey.includes(field.toLowerCase()))
    ) {
      sanitized[key] = "[REDACTED]";
    } else if (typeof value === "object" && value !== null) {
      // Recursively sanitize nested objects
      sanitized[key] = sanitizeObject(value);
    } else {
      sanitized[key] = value;
    }
  }
  return sanitized;
};

// Test cases
test("should redact api_key field", () => {
  const input = {
    params: {
      arguments: {
        api_key: "AIzaSyB1234567890",
        gene: "BRAF",
        position: 140753336,
      },
    },
  };

  const result = sanitizeObject(input);
  assert.strictEqual(result.params.arguments.api_key, "[REDACTED]");
  assert.strictEqual(result.params.arguments.gene, "BRAF");
  assert.strictEqual(result.params.arguments.position, 140753336);
});

test("should handle nested sensitive fields", () => {
  const input = {
    outer: {
      token: "secret-token",
      inner: {
        password: "my-password",
        apiKey: "another-key",
        safe_field: "visible",
      },
    },
  };

  const result = sanitizeObject(input);
  assert.strictEqual(result.outer.token, "[REDACTED]");
  assert.strictEqual(result.outer.inner.password, "[REDACTED]");
  assert.strictEqual(result.outer.inner.apiKey, "[REDACTED]");
  assert.strictEqual(result.outer.inner.safe_field, "visible");
});

test("should handle arrays with sensitive data", () => {
  const input = {
    requests: [
      { api_key: "key1", data: "safe" },
      { api_key: "key2", data: "also safe" },
    ],
  };

  const result = sanitizeObject(input);
  assert.strictEqual(result.requests[0].api_key, "[REDACTED]");
  assert.strictEqual(result.requests[1].api_key, "[REDACTED]");
  assert.strictEqual(result.requests[0].data, "safe");
  assert.strictEqual(result.requests[1].data, "also safe");
});

test("should be case-insensitive for field names", () => {
  const input = {
    API_KEY: "uppercase",
    Api_Key: "mixed",
    "api-key": "hyphenated",
  };

  const result = sanitizeObject(input);
  assert.strictEqual(result.API_KEY, "[REDACTED]");
  assert.strictEqual(result.Api_Key, "[REDACTED]");
  assert.strictEqual(result["api-key"], "[REDACTED]");
});

test("should not modify non-sensitive fields", () => {
  const input = {
    gene: "TP53",
    chromosome: "chr17",
    position: 7577121,
    reference: "C",
    alternate: "T",
  };

  const result = sanitizeObject(input);
  assert.deepStrictEqual(result, input);
});

// NOTE: sensitive keys are redacted even when their value is null/undefined;
// redaction is keyed on the field name, not on the value.
test("should handle null and undefined values", () => {
  const input = {
    api_key: null,
    token: undefined,
    valid: "data",
  };

  const result = sanitizeObject(input);
  assert.strictEqual(result.api_key, "[REDACTED]");
  assert.strictEqual(result.token, "[REDACTED]");
  assert.strictEqual(result.valid, "data");
});

// The worker identifies the "think" tool by request name.
test("should handle think tool detection", () => {
  const thinkRequest = {
    params: {
      name: "think",
      arguments: {
        thought: "Analyzing the problem...",
        thoughtNumber: 1,
      },
    },
  };

  const toolName = thinkRequest.params?.name;
  assert.strictEqual(toolName, "think");
});

test("should handle domain-based filtering", () => {
  const searchRequest1
= {
    params: {
      name: "search",
      arguments: {
        domain: "thinking",
        query: "some query",
      },
    },
  };

  const searchRequest2 = {
    params: {
      name: "search",
      arguments: {
        domain: "think",
        query: "some query",
      },
    },
  };

  const domain1 = searchRequest1.params?.arguments?.domain;
  const domain2 = searchRequest2.params?.arguments?.domain;

  // Both the "thinking" and "think" domain spellings must be accepted.
  assert.ok(domain1 === "thinking" || domain1 === "think");
  assert.ok(domain2 === "thinking" || domain2 === "think");
});
```

--------------------------------------------------------------------------------
/src/biomcp/cli/interventions.py:
--------------------------------------------------------------------------------

```python
"""CLI commands for intervention search and lookup."""

import asyncio
from typing import Annotated

import typer

from ..integrations.cts_api import CTSAPIError, get_api_key_instructions
from ..interventions import get_intervention, search_interventions
from ..interventions.getter import format_intervention_details
from ..interventions.search import (
    INTERVENTION_TYPES,
    format_intervention_results,
)

# Sub-application mounted under the main BioMCP CLI as `biomcp intervention`.
intervention_app = typer.Typer(
    no_args_is_help=True,
    help="Search and retrieve intervention information from NCI CTS API",
)


@intervention_app.command("search")
def search_interventions_cli(
    name: Annotated[
        str | None,
        typer.Argument(
            help="Intervention name to search for (partial match supported)"
        ),
    ] = None,
    intervention_type: Annotated[
        str | None,
        typer.Option(
            "--type",
            help=f"Type of intervention. Options: {', '.join(INTERVENTION_TYPES)}",
            show_choices=True,
        ),
    ] = None,
    synonyms: Annotated[
        bool,
        typer.Option(
            "--synonyms/--no-synonyms",
            help="Include synonym matches in search",
        ),
    ] = True,
    page_size: Annotated[
        int,
        typer.Option(
            "--page-size",
            help="Number of results per page",
            min=1,
            max=100,
        ),
    ] = 20,
    page: Annotated[
        int,
        typer.Option(
            "--page",
            help="Page number",
            min=1,
        ),
    ] = 1,
    api_key: Annotated[
        str | None,
        typer.Option(
            "--api-key",
            help="NCI API key (overrides NCI_API_KEY env var)",
            envvar="NCI_API_KEY",
        ),
    ] = None,
) -> None:
    """
    Search for interventions (drugs, devices, procedures) in the NCI database.

    Examples:
        # Search by drug name
        biomcp intervention search pembrolizumab

        # Search by type
        biomcp intervention search --type Drug

        # Search for devices
        biomcp intervention search "CAR T" --type Biological

        # Search without synonyms
        biomcp intervention search imatinib --no-synonyms
    """
    try:
        # The search helper is async; drive it to completion from the sync CLI.
        results = asyncio.run(
            search_interventions(
                name=name,
                intervention_type=intervention_type,
                synonyms=synonyms,
                page_size=page_size,
                page=page,
                api_key=api_key,
            )
        )

        output = format_intervention_results(results)
        typer.echo(output)

    except CTSAPIError as e:
        # A missing API key gets friendly setup instructions; other CTS
        # errors are surfaced as-is. Both branches exit with code 1.
        if "API key required" in str(e):
            typer.echo(get_api_key_instructions())
        else:
            typer.echo(f"Error: {e}", err=True)
        raise typer.Exit(1) from e
    except Exception as e:
        typer.echo(f"Unexpected error: {e}", err=True)
        raise typer.Exit(1) from e


@intervention_app.command("get")
def get_intervention_cli(
    intervention_id: Annotated[
        str,
        typer.Argument(help="Intervention ID"),
    ],
    api_key: Annotated[
        str | None,
        typer.Option(
            "--api-key",
            help="NCI API key (overrides NCI_API_KEY env var)",
            envvar="NCI_API_KEY",
        ),
    ] = None,
) -> None:
    """
    Get detailed information about a specific intervention.

    Example:
        biomcp intervention get INT123456
    """
    try:
        intervention_data = asyncio.run(
            get_intervention(
                intervention_id=intervention_id,
                api_key=api_key,
            )
        )

        output = format_intervention_details(intervention_data)
        typer.echo(output)

    except CTSAPIError as e:
        # Same error-handling convention as the search command above.
        if "API key required" in str(e):
            typer.echo(get_api_key_instructions())
        else:
            typer.echo(f"Error: {e}", err=True)
        raise typer.Exit(1) from e
    except Exception as e:
        typer.echo(f"Unexpected error: {e}", err=True)
        raise typer.Exit(1) from e


@intervention_app.command("types")
def list_intervention_types() -> None:
    """
    List all available intervention types.
    """
    typer.echo("## Available Intervention Types\n")
    for int_type in INTERVENTION_TYPES:
        typer.echo(f"- {int_type}")
    typer.echo("\nUse these values with the --type option when searching.")
```

--------------------------------------------------------------------------------
/tests/tdd/test_pten_r173_search.py:
--------------------------------------------------------------------------------

```python
"""Test case demonstrating PTEN R173 search limitations."""

import asyncio
import json

import pytest

from biomcp.articles.search import PubmedRequest, search_articles


# NOTE(review): integration test -- calls the live article search backend,
# so it is network-dependent and result counts may vary over time.
@pytest.mark.asyncio
async def test_pten_r173_search_limitations():
    """Demonstrate that current AND logic is too restrictive for finding PTEN R173 papers."""

    # Test 1: Current approach with multiple keywords
    request_restrictive = PubmedRequest(
        genes=["PTEN"], keywords=["R173", "Arg173"]
    )
    result_restrictive = await search_articles(
        request_restrictive, output_json=True
    )
    data_restrictive = json.loads(result_restrictive)

    # Test 2: Less restrictive approach
    request_less_restrictive = PubmedRequest(genes=["PTEN"], keywords=["R173"])
    result_less_restrictive = await search_articles(
        request_less_restrictive, output_json=True
    )
    data_less_restrictive = json.loads(result_less_restrictive)

    # Test 3: Alternative variant notations
    request_notation = PubmedRequest(genes=["PTEN"], keywords=["p.R173C"])
    result_notation = await search_articles(request_notation, output_json=True)
    data_notation = json.loads(result_notation)

    print("\nPTEN R173 Search Results:")
    print(
        f"1. PTEN + R173 + Arg173 (AND logic): {len(data_restrictive)} articles"
    )
    print(f"2. PTEN + R173 only: {len(data_less_restrictive)} articles")
    print(f"3. PTEN + p.R173C: {len(data_notation)} articles")

    # The restrictive search should find fewer results
    assert len(data_restrictive) <= len(data_less_restrictive)

    # Show some example articles found
    if data_less_restrictive:
        print("\nExample articles found with 'PTEN + R173':")
        for i, article in enumerate(data_less_restrictive[:5]):
            title = article.get("title", "No title")
            pmid = article.get("pmid", "N/A")
            # NOTE(review): assumes the year field is a string -- year[:4]
            # below would fail on an int; confirm against the API payload.
            year = article.get("pub_year", article.get("date", "N/A"))
            print(f"{i + 1}. {title[:80]}... (PMID: {pmid}, Year: {year[:4]})")


@pytest.mark.asyncio
async def test_specific_pten_papers_not_found():
    """Test that specific PTEN R173 papers mentioned by user are not found."""

    # Papers mentioned by user that should be found
    expected_papers = [
        "Mester et al 2018 Human Mutation",
        "Mighell et al 2020 AJHG",
        "Smith et al 2016 Proteins",
        "Smith et al 2019 AJHG",
        "Smith et al 2023 JPCB",
    ]

    # Search for Smith IN papers on PTEN
    request = PubmedRequest(keywords=["Smith IN", "PTEN"])
    result = await search_articles(request, output_json=True)
    data = json.loads(result)

    print(f"\nSmith IN + PTEN search found {len(data)} articles")

    # Check if any contain R173 in title/abstract
    r173_papers = []
    for article in data:
        title = article.get("title", "")
        abstract = article.get("abstract", "")
        if (
            "R173" in title
            or "R173" in abstract
            or "Arg173" in title
            or "Arg173" in abstract
        ):
            r173_papers.append(article)

    print(f"Papers mentioning R173/Arg173: {len(r173_papers)}")

    # The issue: R173 might only be in full text, not abstract
    assert len(r173_papers) < len(
        expected_papers
    ), "Not all expected R173 papers are found"


def test_and_logic_explanation():
    """Document why AND logic causes issues for variant searches."""

    explanation = """
    Current search behavior:
    - Query: genes=['PTEN'], keywords=['R173', 'Arg173']
    - Translates to: "@GENE_PTEN AND R173 AND Arg173"
    - This requires ALL terms to be present

    Issues:
    1. Papers may use either "R173" OR "Arg173", not both
    2. Variant notations vary: "R173C", "p.R173C", "c.517C>T", etc.
    3. Specific mutation details may only be in full text, not abstract
    4. AND logic is too restrictive for synonym/variant searches

    Potential solutions:
    1. Implement OR logic within variant/keyword groups
    2. Add variant notation normalization
    3. Support multiple search strategies (AND vs OR)
    4. Consider full-text search capabilities
    """

    print(explanation)
    assert True  # This test is for documentation


if __name__ == "__main__":
    # Run the tests to demonstrate the issue
    asyncio.run(test_pten_r173_search_limitations())
    asyncio.run(test_specific_pten_papers_not_found())
    test_and_logic_explanation()
```