This is page 4 of 19. Use http://codebase.md/genomoncology/biomcp?lines=true&page={x} to view the full context. # Directory Structure ``` ├── .github │ ├── actions │ │ └── setup-python-env │ │ └── action.yml │ ├── dependabot.yml │ └── workflows │ ├── ci.yml │ ├── deploy-docs.yml │ ├── main.yml.disabled │ ├── on-release-main.yml │ └── validate-codecov-config.yml ├── .gitignore ├── .pre-commit-config.yaml ├── BIOMCP_DATA_FLOW.md ├── CHANGELOG.md ├── CNAME ├── codecov.yaml ├── docker-compose.yml ├── Dockerfile ├── docs │ ├── apis │ │ ├── error-codes.md │ │ ├── overview.md │ │ └── python-sdk.md │ ├── assets │ │ ├── biomcp-cursor-locations.png │ │ ├── favicon.ico │ │ ├── icon.png │ │ ├── logo.png │ │ ├── mcp_architecture.txt │ │ └── remote-connection │ │ ├── 00_connectors.png │ │ ├── 01_add_custom_connector.png │ │ ├── 02_connector_enabled.png │ │ ├── 03_connect_to_biomcp.png │ │ ├── 04_select_google_oauth.png │ │ └── 05_success_connect.png │ ├── backend-services-reference │ │ ├── 01-overview.md │ │ ├── 02-biothings-suite.md │ │ ├── 03-cbioportal.md │ │ ├── 04-clinicaltrials-gov.md │ │ ├── 05-nci-cts-api.md │ │ ├── 06-pubtator3.md │ │ └── 07-alphagenome.md │ ├── blog │ │ ├── ai-assisted-clinical-trial-search-analysis.md │ │ ├── images │ │ │ ├── deep-researcher-video.png │ │ │ ├── researcher-announce.png │ │ │ ├── researcher-drop-down.png │ │ │ ├── researcher-prompt.png │ │ │ ├── trial-search-assistant.png │ │ │ └── what_is_biomcp_thumbnail.png │ │ └── researcher-persona-resource.md │ ├── changelog.md │ ├── CNAME │ ├── concepts │ │ ├── 01-what-is-biomcp.md │ │ ├── 02-the-deep-researcher-persona.md │ │ └── 03-sequential-thinking-with-the-think-tool.md │ ├── developer-guides │ │ ├── 01-server-deployment.md │ │ ├── 02-contributing-and-testing.md │ │ ├── 03-third-party-endpoints.md │ │ ├── 04-transport-protocol.md │ │ ├── 05-error-handling.md │ │ ├── 06-http-client-and-caching.md │ │ ├── 07-performance-optimizations.md │ │ └── generate_endpoints.py │ ├── faq-condensed.md │ ├── FDA_SECURITY.md │ ├── genomoncology.md │ ├── getting-started │ │ ├── 01-quickstart-cli.md │ │ ├── 02-claude-desktop-integration.md │ │ └── 03-authentication-and-api-keys.md │ ├── how-to-guides │ │ ├── 01-find-articles-and-cbioportal-data.md │ │ ├── 02-find-trials-with-nci-and-biothings.md │ │ ├── 03-get-comprehensive-variant-annotations.md │ │ ├── 04-predict-variant-effects-with-alphagenome.md │ │ ├── 05-logging-and-monitoring-with-bigquery.md │ │ └── 06-search-nci-organizations-and-interventions.md │ ├── index.md │ ├── policies.md │ ├── reference │ │ ├── architecture-diagrams.md │ │ ├── quick-architecture.md │ │ ├── quick-reference.md │ │ └── visual-architecture.md │ ├── robots.txt │ ├── stylesheets │ │ ├── announcement.css │ │ └── extra.css │ ├── troubleshooting.md │ ├── tutorials │ │ ├── biothings-prompts.md │ │ ├── claude-code-biomcp-alphagenome.md │ │ ├── nci-prompts.md │ │ ├── openfda-integration.md │ │ ├── openfda-prompts.md │ │ ├── pydantic-ai-integration.md │ │ └── remote-connection.md │ ├── user-guides │ │ ├── 01-command-line-interface.md │ │ ├── 02-mcp-tools-reference.md │ │ └── 03-integrating-with-ides-and-clients.md │ └── workflows │ └── all-workflows.md ├── example_scripts │ ├── mcp_integration.py │ └── python_sdk.py ├── glama.json ├── LICENSE ├── lzyank.toml ├── Makefile ├── mkdocs.yml ├── package-lock.json ├── package.json ├── pyproject.toml ├── README.md ├── scripts │ ├── check_docs_in_mkdocs.py │ ├── check_http_imports.py │ └── generate_endpoints_doc.py ├── smithery.yaml ├── src │ └── biomcp │ ├── __init__.py 
│ ├── __main__.py │ ├── articles │ │ ├── __init__.py │ │ ├── autocomplete.py │ │ ├── fetch.py │ │ ├── preprints.py │ │ ├── search_optimized.py │ │ ├── search.py │ │ └── unified.py │ ├── biomarkers │ │ ├── __init__.py │ │ └── search.py │ ├── cbioportal_helper.py │ ├── circuit_breaker.py │ ├── cli │ │ ├── __init__.py │ │ ├── articles.py │ │ ├── biomarkers.py │ │ ├── diseases.py │ │ ├── health.py │ │ ├── interventions.py │ │ ├── main.py │ │ ├── openfda.py │ │ ├── organizations.py │ │ ├── server.py │ │ ├── trials.py │ │ └── variants.py │ ├── connection_pool.py │ ├── constants.py │ ├── core.py │ ├── diseases │ │ ├── __init__.py │ │ ├── getter.py │ │ └── search.py │ ├── domain_handlers.py │ ├── drugs │ │ ├── __init__.py │ │ └── getter.py │ ├── exceptions.py │ ├── genes │ │ ├── __init__.py │ │ └── getter.py │ ├── http_client_simple.py │ ├── http_client.py │ ├── individual_tools.py │ ├── integrations │ │ ├── __init__.py │ │ ├── biothings_client.py │ │ └── cts_api.py │ ├── interventions │ │ ├── __init__.py │ │ ├── getter.py │ │ └── search.py │ ├── logging_filter.py │ ├── metrics_handler.py │ ├── metrics.py │ ├── openfda │ │ ├── __init__.py │ │ ├── adverse_events_helpers.py │ │ ├── adverse_events.py │ │ ├── cache.py │ │ ├── constants.py │ │ ├── device_events_helpers.py │ │ ├── device_events.py │ │ ├── drug_approvals.py │ │ ├── drug_labels_helpers.py │ │ ├── drug_labels.py │ │ ├── drug_recalls_helpers.py │ │ ├── drug_recalls.py │ │ ├── drug_shortages_detail_helpers.py │ │ ├── drug_shortages_helpers.py │ │ ├── drug_shortages.py │ │ ├── exceptions.py │ │ ├── input_validation.py │ │ ├── rate_limiter.py │ │ ├── utils.py │ │ └── validation.py │ ├── organizations │ │ ├── __init__.py │ │ ├── getter.py │ │ └── search.py │ ├── parameter_parser.py │ ├── prefetch.py │ ├── query_parser.py │ ├── query_router.py │ ├── rate_limiter.py │ ├── render.py │ ├── request_batcher.py │ ├── resources │ │ ├── __init__.py │ │ ├── getter.py │ │ ├── instructions.md │ │ └── researcher.md │ ├── retry.py │ ├── router_handlers.py │ ├── router.py │ ├── shared_context.py │ ├── thinking │ │ ├── __init__.py │ │ ├── sequential.py │ │ └── session.py │ ├── thinking_tool.py │ ├── thinking_tracker.py │ ├── trials │ │ ├── __init__.py │ │ ├── getter.py │ │ ├── nci_getter.py │ │ ├── nci_search.py │ │ └── search.py │ ├── utils │ │ ├── __init__.py │ │ ├── cancer_types_api.py │ │ ├── cbio_http_adapter.py │ │ ├── endpoint_registry.py │ │ ├── gene_validator.py │ │ ├── metrics.py │ │ ├── mutation_filter.py │ │ ├── query_utils.py │ │ ├── rate_limiter.py │ │ └── request_cache.py │ ├── variants │ │ ├── __init__.py │ │ ├── alphagenome.py │ │ ├── cancer_types.py │ │ ├── cbio_external_client.py │ │ ├── cbioportal_mutations.py │ │ ├── cbioportal_search_helpers.py │ │ ├── cbioportal_search.py │ │ ├── constants.py │ │ ├── external.py │ │ ├── filters.py │ │ ├── getter.py │ │ ├── links.py │ │ └── search.py │ └── workers │ ├── __init__.py │ ├── worker_entry_stytch.js │ ├── worker_entry.js │ └── worker.py ├── tests │ ├── bdd │ │ ├── cli_help │ │ │ ├── help.feature │ │ │ └── test_help.py │ │ ├── conftest.py │ │ ├── features │ │ │ └── alphagenome_integration.feature │ │ ├── fetch_articles │ │ │ ├── fetch.feature │ │ │ └── test_fetch.py │ │ ├── get_trials │ │ │ ├── get.feature │ │ │ └── test_get.py │ │ ├── get_variants │ │ │ ├── get.feature │ │ │ └── test_get.py │ │ ├── search_articles │ │ │ ├── autocomplete.feature │ │ │ ├── search.feature │ │ │ ├── test_autocomplete.py │ │ │ └── test_search.py │ │ ├── search_trials │ │ │ ├── search.feature │ │ │ └── test_search.py 
│   │   ├── search_variants
│   │   │   ├── search.feature
│   │   │   └── test_search.py
│   │   └── steps
│   │       └── test_alphagenome_steps.py
│   ├── config
│   │   └── test_smithery_config.py
│   ├── conftest.py
│   ├── data
│   │   ├── ct_gov
│   │   │   ├── clinical_trials_api_v2.yaml
│   │   │   ├── trials_NCT04280705.json
│   │   │   └── trials_NCT04280705.txt
│   │   ├── myvariant
│   │   │   ├── myvariant_api.yaml
│   │   │   ├── myvariant_field_descriptions.csv
│   │   │   ├── variants_full_braf_v600e.json
│   │   │   ├── variants_full_braf_v600e.txt
│   │   │   └── variants_part_braf_v600_multiple.json
│   │   ├── openfda
│   │   │   ├── drugsfda_detail.json
│   │   │   ├── drugsfda_search.json
│   │   │   ├── enforcement_detail.json
│   │   │   └── enforcement_search.json
│   │   └── pubtator
│   │       ├── pubtator_autocomplete.json
│   │       └── pubtator3_paper.txt
│   ├── integration
│   │   ├── test_openfda_integration.py
│   │   ├── test_preprints_integration.py
│   │   ├── test_simple.py
│   │   └── test_variants_integration.py
│   ├── tdd
│   │   ├── articles
│   │   │   ├── test_autocomplete.py
│   │   │   ├── test_cbioportal_integration.py
│   │   │   ├── test_fetch.py
│   │   │   ├── test_preprints.py
│   │   │   ├── test_search.py
│   │   │   └── test_unified.py
│   │   ├── conftest.py
│   │   ├── drugs
│   │   │   ├── __init__.py
│   │   │   └── test_drug_getter.py
│   │   ├── openfda
│   │   │   ├── __init__.py
│   │   │   ├── test_adverse_events.py
│   │   │   ├── test_device_events.py
│   │   │   ├── test_drug_approvals.py
│   │   │   ├── test_drug_labels.py
│   │   │   ├── test_drug_recalls.py
│   │   │   ├── test_drug_shortages.py
│   │   │   └── test_security.py
│   │   ├── test_biothings_integration_real.py
│   │   ├── test_biothings_integration.py
│   │   ├── test_circuit_breaker.py
│   │   ├── test_concurrent_requests.py
│   │   ├── test_connection_pool.py
│   │   ├── test_domain_handlers.py
│   │   ├── test_drug_approvals.py
│   │   ├── test_drug_recalls.py
│   │   ├── test_drug_shortages.py
│   │   ├── test_endpoint_documentation.py
│   │   ├── test_error_scenarios.py
│   │   ├── test_europe_pmc_fetch.py
│   │   ├── test_mcp_integration.py
│   │   ├── test_mcp_tools.py
│   │   ├── test_metrics.py
│   │   ├── test_nci_integration.py
│   │   ├── test_nci_mcp_tools.py
│   │   ├── test_network_policies.py
│   │   ├── test_offline_mode.py
│   │   ├── test_openfda_unified.py
│   │   ├── test_pten_r173_search.py
│   │   ├── test_render.py
│   │   ├── test_request_batcher.py.disabled
│   │   ├── test_retry.py
│   │   ├── test_router.py
│   │   ├── test_shared_context.py.disabled
│   │   ├── test_unified_biothings.py
│   │   ├── thinking
│   │   │   ├── __init__.py
│   │   │   └── test_sequential.py
│   │   ├── trials
│   │   │   ├── test_backward_compatibility.py
│   │   │   ├── test_getter.py
│   │   │   └── test_search.py
│   │   ├── utils
│   │   │   ├── test_gene_validator.py
│   │   │   ├── test_mutation_filter.py
│   │   │   ├── test_rate_limiter.py
│   │   │   └── test_request_cache.py
│   │   ├── variants
│   │   │   ├── constants.py
│   │   │   ├── test_alphagenome_api_key.py
│   │   │   ├── test_alphagenome_comprehensive.py
│   │   │   ├── test_alphagenome.py
│   │   │   ├── test_cbioportal_mutations.py
│   │   │   ├── test_cbioportal_search.py
│   │   │   ├── test_external_integration.py
│   │   │   ├── test_external.py
│   │   │   ├── test_extract_gene_aa_change.py
│   │   │   ├── test_filters.py
│   │   │   ├── test_getter.py
│   │   │   ├── test_links.py
│   │   │   └── test_search.py
│   │   └── workers
│   │       └── test_worker_sanitization.js
│   └── test_pydantic_ai_integration.py
├── THIRD_PARTY_ENDPOINTS.md
├── tox.ini
├── uv.lock
└── wrangler.toml
```

# Files

--------------------------------------------------------------------------------
/src/biomcp/connection_pool.py:
--------------------------------------------------------------------------------

```python
1 | """Connection pool manager with proper event loop lifecycle management.
2 | 
3 | This module provides HTTP connection pooling that is properly integrated
4 | with asyncio event loops. It ensures that connection pools are:
5 | - Created per event loop to avoid cross-loop usage
6 | - Automatically cleaned up when event loops are garbage collected
7 | - Reused across requests for better performance
8 | 
9 | Key Features:
10 | - Event loop isolation - each loop gets its own pools
11 | - Weak references prevent memory leaks
12 | - Automatic cleanup on loop destruction
13 | - Thread-safe pool management
14 | 
15 | Example:
16 |     ```python
17 |     # Get a connection pool for the current event loop
18 |     pool = await get_connection_pool(verify=True, timeout=httpx.Timeout(30))
19 | 
20 |     # Use the pool for multiple requests (no need to close)
21 |     response = await pool.get("https://api.example.com/data")
22 |     ```
23 | 
24 | Environment Variables:
25 |     BIOMCP_USE_CONNECTION_POOL: Enable/disable pooling (default: "true")
26 | """
27 | 
28 | import asyncio
29 | import ssl
30 | import weakref
31 | 
32 | # NOTE: httpx import is allowed in this file for connection pooling infrastructure
33 | import httpx
34 | 
35 | 
36 | class EventLoopConnectionPools:
37 |     """Manages connection pools per event loop.
38 | 
39 |     This class ensures that each asyncio event loop has its own set of
40 |     connection pools, preventing cross-loop contamination and ensuring
41 |     proper cleanup when event loops are destroyed.
42 | 
43 |     Attributes:
44 |         _loop_pools: Weak key dictionary mapping event loops to their pools
45 |         _lock: Asyncio lock for thread-safe pool creation
46 |     """
47 | 
48 |     def __init__(self):
49 |         # Use weak references to avoid keeping event loops alive
50 |         self._loop_pools: weakref.WeakKeyDictionary = (
51 |             weakref.WeakKeyDictionary()
52 |         )
53 |         self._lock = asyncio.Lock()
54 | 
55 |     async def get_pool(
56 |         self, verify: ssl.SSLContext | str | bool, timeout: httpx.Timeout
57 |     ) -> httpx.AsyncClient:
58 |         """Get or create a connection pool for the current event loop."""
59 |         try:
60 |             loop = asyncio.get_running_loop()
61 |         except RuntimeError:
62 |             # No event loop running, return a single-use client
63 |             return self._create_client(verify, timeout, pooled=False)
64 | 
65 |         # Get or create pools dict for this event loop
66 |         async with self._lock:
67 |             if loop not in self._loop_pools:
68 |                 self._loop_pools[loop] = {}
69 |                 # Register cleanup when loop is garbage collected
70 |                 self._register_loop_cleanup(loop)
71 | 
72 |             pools = self._loop_pools[loop]
73 |             pool_key = self._get_pool_key(verify)
74 | 
75 |             # Check if we have a valid pool
76 |             if pool_key in pools and not pools[pool_key].is_closed:
77 |                 return pools[pool_key]
78 | 
79 |             # Create new pool
80 |             client = self._create_client(verify, timeout, pooled=True)
81 |             pools[pool_key] = client
82 |             return client
83 | 
84 |     def _get_pool_key(self, verify: ssl.SSLContext | str | bool) -> str:
85 |         """Generate a key for the connection pool."""
86 |         if isinstance(verify, ssl.SSLContext):
87 |             return f"ssl_{id(verify)}"
88 |         return str(verify)
89 | 
90 |     def _create_client(
91 |         self,
92 |         verify: ssl.SSLContext | str | bool,
93 |         timeout: httpx.Timeout,
94 |         pooled: bool = True,
95 |     ) -> httpx.AsyncClient:
96 |         """Create a new HTTP client."""
97 |         if pooled:
98 |             limits = httpx.Limits(
99 |                 max_keepalive_connections=20,
100 |                 max_connections=100,
101 |                 keepalive_expiry=30,
102 |             )
103 |         else:
104 |             # Single-use client
105 |             limits = httpx.Limits(max_keepalive_connections=0)
106 | 
107 |         return httpx.AsyncClient(
108 |             verify=verify,
109 |             http2=False,  # HTTP/2 can add overhead
110 |             timeout=timeout,
111 |             limits=limits,
112 |         )
113 | 
114 |     def _register_loop_cleanup(self, loop: asyncio.AbstractEventLoop):
115 |         """Register cleanup when event loop is garbage collected."""
116 |         # Store pools to close when loop is garbage collected
117 |         # Note: We can't create weak references to dicts, so we'll
118 |         # clean up pools when the loop itself is garbage collected
119 | 
120 |         def cleanup():
121 |             # Get pools for this loop if they still exist
122 |             pools = self._loop_pools.get(loop, {})
123 |             if pools:
124 |                 # Try to close all clients gracefully
125 |                 for client in list(pools.values()):
126 |                     if client and not client.is_closed:
127 |                         # Close synchronously since loop might be gone
128 |                         import contextlib
129 | 
130 |                         with contextlib.suppress(Exception):
131 |                             client._transport.close()
132 | 
133 |         # Register finalizer on the loop itself
134 |         weakref.finalize(loop, cleanup)
135 | 
136 |     async def close_all(self):
137 |         """Close all connection pools."""
138 |         async with self._lock:
139 |             all_clients = []
140 |             for pools in self._loop_pools.values():
141 |                 all_clients.extend(pools.values())
142 | 
143 |             # Close all clients
144 |             close_tasks = []
145 |             for client in all_clients:
146 |                 if client and not client.is_closed:
147 |                     close_tasks.append(client.aclose())
148 | 
149 |             if close_tasks:
150 |                 await asyncio.gather(*close_tasks, return_exceptions=True)
151 | 
152 |             self._loop_pools.clear()
153 | 
154 | 
155 | # Global instance
156 | _pool_manager = EventLoopConnectionPools()
157 | 
158 | 
159 | async def get_connection_pool(
160 |     verify: ssl.SSLContext | str | bool,
161 |     timeout: httpx.Timeout,
162 | ) -> httpx.AsyncClient:
163 |     """Get a connection pool for the current event loop."""
164 |     return await _pool_manager.get_pool(verify, timeout)
165 | 
166 | 
167 | async def close_all_pools():
168 |     """Close all connection pools."""
169 |     await _pool_manager.close_all()
170 | 
```

--------------------------------------------------------------------------------
/src/biomcp/parameter_parser.py:
--------------------------------------------------------------------------------

```python
1 | """Parameter parsing and validation for BioMCP."""
2 | 
3 | import json
4 | import logging
5 | from typing import Any
6 | 
7 | from biomcp.exceptions import InvalidParameterError
8 | 
9 | logger = logging.getLogger(__name__)
10 | 
11 | 
12 | class ParameterParser:
13 |     """Handles parameter parsing and validation for search requests."""
14 | 
15 |     @staticmethod
16 |     def parse_list_param(
17 |         param: str | list[str] | None, param_name: str
18 |     ) -> list[str] | None:
19 |         """Convert various input formats to lists.
20 | 
21 |         Handles:
22 |         - JSON arrays: '["item1", "item2"]' -> ['item1', 'item2']
23 |         - Comma-separated: 'item1, item2' -> ['item1', 'item2']
24 |         - Single values: 'item' -> ['item']
25 |         - None values: None -> None
26 |         - Already parsed lists: ['item'] -> ['item']
27 | 
28 |         Args:
29 |             param: The parameter to parse
30 |             param_name: Name of the parameter for error messages
31 | 
32 |         Returns:
33 |             Parsed list or None
34 | 
35 |         Raises:
36 |             InvalidParameterError: If parameter cannot be parsed
37 |         """
38 |         if param is None:
39 |             return None
40 | 
41 |         if isinstance(param, str):
42 |             # First try to parse as JSON array
43 |             if param.startswith("["):
44 |                 try:
45 |                     parsed = json.loads(param)
46 |                     if not isinstance(parsed, list):
47 |                         raise InvalidParameterError(
48 |                             param_name,
49 |                             param,
50 |                             "JSON array or comma-separated string",
51 |                         )
52 |                     return parsed
53 |                 except (json.JSONDecodeError, TypeError) as e:
54 |                     logger.debug(f"Failed to parse {param_name} as JSON: {e}")
55 | 
56 |             # If it's a comma-separated string, split it
57 |             if "," in param:
58 |                 return [item.strip() for item in param.split(",")]
59 | 
60 |             # Otherwise return as single-item list
61 |             return [param]
62 | 
63 |         # If it's already a list, validate and return as-is
64 |         if isinstance(param, list):
65 |             # Validate all items are strings
66 |             if not all(isinstance(item, str) for item in param):
67 |                 raise InvalidParameterError(
68 |                     param_name, param, "list of strings"
69 |                 )
70 |             return param
71 | 
72 |         # Invalid type
73 |         raise InvalidParameterError(
74 |             param_name, param, "string, list of strings, or None"
75 |         )
76 | 
77 |     @staticmethod
78 |     def normalize_phase(phase: str | None) -> str | None:
79 |         """Normalize phase values for clinical trials.
80 | 
81 |         Converts various formats to standard enum values:
82 |         - "Phase 3" -> "PHASE3"
83 |         - "phase 3" -> "PHASE3"
84 |         - "PHASE 3" -> "PHASE3"
85 |         - "phase3" -> "PHASE3"
86 | 
87 |         Args:
88 |             phase: Phase value to normalize
89 | 
90 |         Returns:
91 |             Normalized phase value or None
92 |         """
93 |         if phase is None:
94 |             return None
95 | 
96 |         # Convert to uppercase and remove spaces
97 |         normalized = phase.upper().replace(" ", "")
98 | 
99 |         # Validate it matches expected pattern
100 |         valid_phases = [
101 |             "EARLYPHASE1",
102 |             "PHASE1",
103 |             "PHASE2",
104 |             "PHASE3",
105 |             "PHASE4",
106 |             "NOTAPPLICABLE",
107 |         ]
108 |         if normalized not in valid_phases:
109 |             # Try to be helpful with common mistakes
110 |             if "EARLY" in normalized and "1" in normalized:
111 |                 return "EARLYPHASE1"
112 |             if "NOT" in normalized and "APPLICABLE" in normalized:
113 |                 return "NOTAPPLICABLE"
114 | 
115 |             raise InvalidParameterError(
116 |                 "phase", phase, f"one of: {', '.join(valid_phases)}"
117 |             )
118 | 
119 |         return normalized
120 | 
121 |     @staticmethod
122 |     def validate_page_params(page: int, page_size: int) -> tuple[int, int]:
123 |         """Validate pagination parameters.
124 | 
125 |         Args:
126 |             page: Page number (minimum 1)
127 |             page_size: Results per page (1-100)
128 | 
129 |         Returns:
130 |             Validated (page, page_size) tuple
131 | 
132 |         Raises:
133 |             InvalidParameterError: If parameters are invalid
134 |         """
135 |         if page < 1:
136 |             raise InvalidParameterError("page", page, "integer >= 1")
137 | 
138 |         if page_size < 1 or page_size > 100:
139 |             raise InvalidParameterError(
140 |                 "page_size", page_size, "integer between 1 and 100"
141 |             )
142 | 
143 |         return page, page_size
144 | 
145 |     @staticmethod
146 |     def parse_search_params(
147 |         params: dict[str, Any], domain: str
148 |     ) -> dict[str, Any]:
149 |         """Parse and validate all search parameters for a domain.
150 | 
151 |         Args:
152 |             params: Raw parameters dictionary
153 |             domain: Domain being searched
154 | 
155 |         Returns:
156 |             Validated parameters dictionary
157 |         """
158 |         parsed: dict[str, Any] = {}
159 | 
160 |         # Common list parameters
161 |         list_params = [
162 |             "genes",
163 |             "diseases",
164 |             "variants",
165 |             "chemicals",
166 |             "keywords",
167 |             "conditions",
168 |             "interventions",
169 |         ]
170 | 
171 |         for param_name in list_params:
172 |             if param_name in params and params[param_name] is not None:
173 |                 parsed[param_name] = ParameterParser.parse_list_param(
174 |                     params[param_name], param_name
175 |                 )
176 | 
177 |         # Domain-specific parameters
178 |         if (
179 |             domain == "trial"
180 |             and "phase" in params
181 |             and params.get("phase") is not None
182 |         ):
183 |             parsed["phase"] = ParameterParser.normalize_phase(
184 |                 params.get("phase")
185 |             )
186 | 
187 |         # Pass through other parameters
188 |         for key, value in params.items():
189 |             if key not in parsed and key not in list_params and key != "phase":
190 |                 parsed[key] = value
191 | 
192 |         return parsed
193 | 
```

--------------------------------------------------------------------------------
/src/biomcp/openfda/drug_labels.py:
--------------------------------------------------------------------------------

```python
1 | """
2 | OpenFDA Drug Labels (SPL) integration.
3 | """
4 | 
5 | import logging
6 | 
7 | from .constants import (
8 |     OPENFDA_DEFAULT_LIMIT,
9 |     OPENFDA_DISCLAIMER,
10 |     OPENFDA_DRUG_LABELS_URL,
11 |     OPENFDA_MAX_LIMIT,
12 | )
13 | from .drug_labels_helpers import (
14 |     build_label_search_query,
15 |     format_label_header,
16 |     format_label_section,
17 |     format_label_summary,
18 |     get_default_sections,
19 |     get_section_titles,
20 | )
21 | from .utils import clean_text, format_count, make_openfda_request
22 | 
23 | logger = logging.getLogger(__name__)
24 | 
25 | 
26 | async def search_drug_labels(
27 |     name: str | None = None,
28 |     indication: str | None = None,
29 |     boxed_warning: bool = False,
30 |     section: str | None = None,
31 |     limit: int = OPENFDA_DEFAULT_LIMIT,
32 |     skip: int = 0,
33 |     api_key: str | None = None,
34 | ) -> str:
35 |     """
36 |     Search FDA drug product labels (SPL).
37 | 
38 |     Args:
39 |         name: Drug name to search for
40 |         indication: Search for drugs indicated for this condition
41 |         boxed_warning: Filter for drugs with boxed warnings
42 |         section: Specific label section to search
43 |         limit: Maximum number of results
44 |         skip: Number of results to skip
45 | 
46 |     api_key: Optional OpenFDA API key (overrides OPENFDA_API_KEY env var)
47 | 
48 |     Returns:
49 |         Formatted string with drug label information
50 |     """
51 |     if not name and not indication and not section and not boxed_warning:
52 |         return (
53 |             "⚠️ Please specify a drug name, indication, or label section to search.\n\n"
54 |             "Examples:\n"
55 |             "- Search by name: --name 'pembrolizumab'\n"
56 |             "- Search by indication: --indication 'melanoma'\n"
57 |             "- Search by section: --section 'contraindications'"
58 |         )
59 | 
60 |     # Build and execute search
61 |     search_query = build_label_search_query(
62 |         name, indication, boxed_warning, section
63 |     )
64 |     params = {
65 |         "search": search_query,
66 |         "limit": min(limit, OPENFDA_MAX_LIMIT),
67 |         "skip": skip,
68 |     }
69 | 
70 |     response, error = await make_openfda_request(
71 |         OPENFDA_DRUG_LABELS_URL, params, "openfda_drug_labels", api_key
72 |     )
73 | 
74 |     if error:
75 |         return f"⚠️ Error searching drug labels: {error}"
76 | 
77 |     if not response or not response.get("results"):
78 |         return _format_no_results(name, indication, section)
79 | 
80 |     results = response["results"]
81 |     total = (
82 |         response.get("meta", {}).get("results", {}).get("total", len(results))
83 |     )
84 | 
85 |     # Build output
86 |     output = ["## FDA Drug Labels\n"]
87 |     output.extend(_format_search_summary(name, indication, section, total))
88 | 
89 |     # Display results
90 |     output.append(
91 |         f"### Results (showing {min(len(results), 5)} of {total}):\n"
92 |     )
93 |     for i, result in enumerate(results[:5], 1):
94 |         output.extend(format_label_summary(result, i))
95 | 
96 |     # Add tip for getting full labels
97 |     if total > 0 and results and "set_id" in results[0]:
98 |         output.append(
99 |             "\n💡 **Tip**: Use `biomcp openfda label-get <label_id>` to retrieve "
100 |             "the complete label for any drug."
101 |         )
102 | 
103 |     output.append(f"\n{OPENFDA_DISCLAIMER}")
104 |     return "\n".join(output)
105 | 
106 | 
107 | def _format_no_results(
108 |     name: str | None, indication: str | None, section: str | None
109 | ) -> str:
110 |     """Format no results message."""
111 |     search_desc = []
112 |     if name:
113 |         search_desc.append(f"drug '{name}'")
114 |     if indication:
115 |         search_desc.append(f"indication '{indication}'")
116 |     if section:
117 |         search_desc.append(f"section '{section}'")
118 |     return f"No drug labels found for {' and '.join(search_desc)}."
119 | 
120 | 
121 | def _format_search_summary(
122 |     name: str | None, indication: str | None, section: str | None, total: int
123 | ) -> list[str]:
124 |     """Format the search summary."""
125 |     output = []
126 | 
127 |     search_desc = []
128 |     if name:
129 |         search_desc.append(f"**Drug**: {name}")
130 |     if indication:
131 |         search_desc.append(f"**Indication**: {indication}")
132 |     if section:
133 |         search_desc.append(f"**Section**: {section}")
134 | 
135 |     if search_desc:
136 |         output.append(" | ".join(search_desc))
137 |     output.append(f"**Total Labels Found**: {format_count(total, 'label')}\n")
138 | 
139 |     return output
140 | 
141 | 
142 | async def get_drug_label(
143 |     set_id: str,
144 |     sections: list[str] | None = None,
145 |     api_key: str | None = None,
146 | ) -> str:
147 |     """
148 |     Get detailed drug label information by set ID.
149 | 
150 |     Args:
151 |         set_id: Label set ID
152 |         sections: Specific sections to retrieve (default: key sections)
153 | 
154 |     api_key: Optional OpenFDA API key (overrides OPENFDA_API_KEY env var)
155 | 
156 |     Returns:
157 |         Formatted string with detailed label information
158 |     """
159 |     params = {
160 |         "search": f'set_id:"{set_id}"',
161 |         "limit": 1,
162 |     }
163 | 
164 |     response, error = await make_openfda_request(
165 |         OPENFDA_DRUG_LABELS_URL, params, "openfda_drug_label_detail", api_key
166 |     )
167 | 
168 |     if error:
169 |         return f"⚠️ Error retrieving drug label: {error}"
170 | 
171 |     if not response or not response.get("results"):
172 |         return f"Drug label with ID '{set_id}' not found."
173 | 
174 |     result = response["results"][0]
175 | 
176 |     # Use default sections if not specified
177 |     if not sections:
178 |         sections = get_default_sections()
179 | 
180 |     # Build output
181 |     output = format_label_header(result, set_id)
182 | 
183 |     # Boxed warning (if exists)
184 |     if "boxed_warning" in result:
185 |         output.extend(_format_boxed_warning(result["boxed_warning"]))
186 | 
187 |     # Display requested sections
188 |     section_titles = get_section_titles()
189 |     for section in sections:
190 |         output.extend(format_label_section(result, section, section_titles))
191 | 
192 |     output.append(f"\n{OPENFDA_DISCLAIMER}")
193 |     return "\n".join(output)
194 | 
195 | 
196 | def _format_boxed_warning(boxed_warning: list) -> list[str]:
197 |     """Format boxed warning section."""
198 |     output = ["### ⚠️ BOXED WARNING\n"]
199 |     warning_text = clean_text(" ".join(boxed_warning))
200 |     output.append(warning_text)
201 |     output.append("")
202 |     return output
203 | 
```

--------------------------------------------------------------------------------
/src/biomcp/cli/articles.py:
--------------------------------------------------------------------------------

```python
1 | import asyncio
2 | import json
3 | from typing import Annotated
4 | 
5 | import typer
6 | 
7 | from ..articles import fetch
8 | from ..articles.search import PubmedRequest, search_articles
9 | from ..articles.unified import search_articles_unified
10 | 
11 | article_app = typer.Typer(help="Search and retrieve biomedical articles.")
12 | 
13 | 
14 | async def get_article_details(
15 |     identifier: str, output_json: bool = False
16 | ) -> str:
17 |     """Get article details handling both PMIDs and DOIs with proper output format."""
18 |     # Use the fetch module functions directly to control output format
19 |     if fetch.is_doi(identifier):
20 |         from ..articles.preprints import fetch_europe_pmc_article
21 | 
22 |         return await fetch_europe_pmc_article(
23 |             identifier, output_json=output_json
24 |         )
25 |     elif fetch.is_pmid(identifier):
26 |         return await fetch.fetch_articles(
27 |             [int(identifier)], full=True, output_json=output_json
28 |         )
29 |     else:
30 |         # Unknown identifier format
31 |         error_data = [
32 |             {
33 |                 "error": f"Invalid identifier format: {identifier}. Expected either a PMID (numeric) or DOI (10.xxxx/xxxx format)."
34 |             }
35 |         ]
36 |         if output_json:
37 |             return json.dumps(error_data, indent=2)
38 |         else:
39 |             from .. import render
40 | 
41 |             return render.to_markdown(error_data)
42 | 
43 | 
44 | @article_app.command("search")
45 | def search_article(
46 |     genes: Annotated[
47 |         list[str] | None,
48 |         typer.Option(
49 |             "--gene",
50 |             "-g",
51 |             help="Gene name to search for (can be specified multiple times)",
52 |         ),
53 |     ] = None,
54 |     variants: Annotated[
55 |         list[str] | None,
56 |         typer.Option(
57 |             "--variant",
58 |             "-v",
59 |             help="Genetic variant to search for (can be specified multiple times)",
60 |         ),
61 |     ] = None,
62 |     diseases: Annotated[
63 |         list[str] | None,
64 |         typer.Option(
65 |             "--disease",
66 |             "-d",
67 |             help="Disease name to search for (can be specified multiple times)",
68 |         ),
69 |     ] = None,
70 |     chemicals: Annotated[
71 |         list[str] | None,
72 |         typer.Option(
73 |             "--chemical",
74 |             "-c",
75 |             help="Chemical name to search for (can be specified multiple times)",
76 |         ),
77 |     ] = None,
78 |     keywords: Annotated[
79 |         list[str] | None,
80 |         typer.Option(
81 |             "--keyword",
82 |             "-k",
83 |             help="Keyword to search for (can be specified multiple times)",
84 |         ),
85 |     ] = None,
86 |     page: Annotated[
87 |         int,
88 |         typer.Option(
89 |             "--page",
90 |             "-p",
91 |             help="Page number for pagination (starts at 1)",
92 |         ),
93 |     ] = 1,
94 |     output_json: Annotated[
95 |         bool,
96 |         typer.Option(
97 |             "--json",
98 |             "-j",
99 |             help="Render in JSON format",
100 |             case_sensitive=False,
101 |         ),
102 |     ] = False,
103 |     include_preprints: Annotated[
104 |         bool,
105 |         typer.Option(
106 |             "--include-preprints/--no-preprints",
107 |             help="Include preprint articles from bioRxiv/medRxiv and Europe PMC",
108 |         ),
109 |     ] = True,
110 | ):
111 |     """Search biomedical research articles"""
112 |     request = PubmedRequest(
113 |         genes=genes or [],
114 |         variants=variants or [],
115 |         diseases=diseases or [],
116 |         chemicals=chemicals or [],
117 |         keywords=keywords or [],
118 |     )
119 | 
120 |     if include_preprints:
121 |         result = asyncio.run(
122 |             search_articles_unified(
123 |                 request,
124 |                 include_pubmed=True,
125 |                 include_preprints=True,
126 |                 output_json=output_json,
127 |             )
128 |         )
129 |     else:
130 |         result = asyncio.run(search_articles(request, output_json))
131 |     typer.echo(result)
132 | 
133 | 
134 | @article_app.command("get")
135 | def get_article(
136 |     identifiers: Annotated[
137 |         list[str],
138 |         typer.Argument(
139 |             help="Article identifiers - PubMed IDs (e.g., 38768446) or DOIs (e.g., 10.1101/2024.01.20.23288905)",
140 |         ),
141 |     ],
142 |     full: Annotated[
143 |         bool,
144 |         typer.Option(
145 |             "--full",
146 |             "-f",
147 |             help="Whether to fetch full article text (PubMed only)",
148 |         ),
149 |     ] = False,
150 |     output_json: Annotated[
151 |         bool,
152 |         typer.Option(
153 |             "--json",
154 |             "-j",
155 |             help="Render in JSON format",
156 |             case_sensitive=False,
157 |         ),
158 |     ] = False,
159 | ):
160 |     """
161 |     Retrieve articles by PubMed ID or DOI.
162 | 
163 |     Supports:
164 |     - PubMed IDs for published articles (e.g., 38768446)
165 |     - DOIs for Europe PMC preprints (e.g., 10.1101/2024.01.20.23288905)
166 | 
167 |     For multiple articles, results are returned as a list.
168 | """ 169 | # Handle single identifier 170 | if len(identifiers) == 1: 171 | result = asyncio.run( 172 | get_article_details(identifiers[0], output_json=output_json) 173 | ) 174 | else: 175 | # For multiple identifiers, we need to handle them individually 176 | # since they might be a mix of PMIDs and DOIs 177 | results = [] 178 | for identifier in identifiers: 179 | article_result = asyncio.run( 180 | get_article_details(identifier, output_json=True) 181 | ) 182 | # Parse the result and add to list 183 | try: 184 | article_data = json.loads(article_result) 185 | if isinstance(article_data, list): 186 | results.extend(article_data) 187 | else: 188 | results.append(article_data) 189 | except json.JSONDecodeError: 190 | # This shouldn't happen with our new function 191 | results.append({ 192 | "error": f"Failed to parse result for {identifier}" 193 | }) 194 | 195 | if output_json: 196 | result = json.dumps(results, indent=2) 197 | else: 198 | from .. import render 199 | 200 | result = render.to_markdown(results) 201 | 202 | typer.echo(result) 203 | ``` -------------------------------------------------------------------------------- /tests/tdd/variants/test_extract_gene_aa_change.py: -------------------------------------------------------------------------------- ```python 1 | """Tests for _extract_gene_aa_change method in external.py.""" 2 | 3 | import pytest 4 | 5 | from biomcp.variants.external import ExternalVariantAggregator 6 | 7 | 8 | class TestExtractGeneAAChange: 9 | """Test the _extract_gene_aa_change method.""" 10 | 11 | @pytest.fixture 12 | def aggregator(self): 13 | """Create an ExternalVariantAggregator instance.""" 14 | return ExternalVariantAggregator() 15 | 16 | def test_extract_from_docm(self, aggregator): 17 | """Test extraction from DOCM data.""" 18 | variant_data = {"docm": {"gene": "BRAF", "aa_change": "p.V600E"}} 19 | 20 | result = aggregator._extract_gene_aa_change(variant_data) 21 | assert result == "BRAF V600E" 22 | 23 | def test_extract_from_hgvsp_long_format(self, aggregator): 24 | """Test extraction from hgvsp with long amino acid names.""" 25 | variant_data = { 26 | "cadd": {"gene": {"genename": "TP53"}}, 27 | "hgvsp": ["p.Arg175His"], 28 | } 29 | 30 | result = aggregator._extract_gene_aa_change(variant_data) 31 | # The code doesn't convert all long forms, just checks for Val/Ala 32 | assert result == "TP53 Arg175His" 33 | 34 | def test_extract_from_hgvsp_with_dbnsfp(self, aggregator): 35 | """Test extraction from hgvsp with dbnsfp gene name.""" 36 | variant_data = { 37 | "dbnsfp": {"genename": "EGFR"}, 38 | "hgvsp": ["p.Leu858Arg"], 39 | } 40 | 41 | result = aggregator._extract_gene_aa_change(variant_data) 42 | # The code doesn't convert Leu/Arg to L/R 43 | assert result == "EGFR Leu858Arg" 44 | 45 | def test_extract_from_cadd_data(self, aggregator): 46 | """Test extraction from CADD annotations.""" 47 | variant_data = { 48 | "cadd": { 49 | "gene": {"genename": "KRAS", "prot": {"protpos": 12}}, 50 | "oaa": "G", 51 | "naa": "D", 52 | } 53 | } 54 | 55 | result = aggregator._extract_gene_aa_change(variant_data) 56 | assert result == "KRAS G12D" 57 | 58 | def test_extract_from_docm_without_p_prefix(self, aggregator): 59 | """Test extraction from DOCM without p. 
prefix.""" 60 | variant_data = {"docm": {"gene": "PIK3CA", "aa_change": "E545K"}} 61 | 62 | result = aggregator._extract_gene_aa_change(variant_data) 63 | assert result == "PIK3CA E545K" 64 | 65 | def test_extract_with_multiple_hgvsp(self, aggregator): 66 | """Test handling of multiple hgvsp entries - should take first.""" 67 | variant_data = { 68 | "cadd": {"gene": {"genename": "BRCA1"}}, 69 | "hgvsp": ["p.Gln1756Ter", "p.Gln1756*"], 70 | } 71 | 72 | result = aggregator._extract_gene_aa_change(variant_data) 73 | # Takes the first one, doesn't convert Gln/Ter 74 | assert result == "BRCA1 Gln1756Ter" 75 | 76 | def test_extract_with_special_characters(self, aggregator): 77 | """Test extraction with special characters in protein change.""" 78 | variant_data = { 79 | "cadd": {"gene": {"genename": "MLH1"}}, 80 | "hgvsp": ["p.Lys618Alafs*9"], 81 | } 82 | 83 | result = aggregator._extract_gene_aa_change(variant_data) 84 | # Should extract the basic AA change pattern 85 | assert result is not None 86 | assert "MLH1" in result 87 | 88 | def test_extract_no_gene_name(self, aggregator): 89 | """Test when gene name is missing.""" 90 | variant_data = {"hgvsp": ["p.Val600Glu"]} 91 | 92 | result = aggregator._extract_gene_aa_change(variant_data) 93 | assert result is None 94 | 95 | def test_extract_no_aa_change(self, aggregator): 96 | """Test when AA change is missing.""" 97 | variant_data = {"cadd": {"gene": {"genename": "BRAF"}}} 98 | 99 | result = aggregator._extract_gene_aa_change(variant_data) 100 | assert result is None 101 | 102 | def test_extract_empty_variant_data(self, aggregator): 103 | """Test with empty variant data.""" 104 | result = aggregator._extract_gene_aa_change({}) 105 | assert result is None 106 | 107 | def test_extract_malformed_hgvsp(self, aggregator): 108 | """Test with malformed HGVS protein notation.""" 109 | variant_data = { 110 | "clinvar": { 111 | "gene": {"symbol": "MYC"}, 112 | "hgvs": {"protein": ["invalid_format"]}, 113 | } 114 | } 115 | 116 | result = aggregator._extract_gene_aa_change(variant_data) 117 | assert result is None 118 | 119 | def test_extract_priority_order(self, aggregator): 120 | """Test that DOCM is prioritized for AA change, CADD for gene name.""" 121 | variant_data = { 122 | "docm": {"gene": "BRAF", "aa_change": "p.V600E"}, 123 | "hgvsp": ["p.Val600Lys"], # Different change 124 | "cadd": { 125 | "gene": {"genename": "WRONG", "prot": {"protpos": 600}}, 126 | "oaa": "V", 127 | "naa": "K", 128 | }, 129 | } 130 | 131 | result = aggregator._extract_gene_aa_change(variant_data) 132 | # CADD is prioritized for gene name, DOCM for AA change 133 | assert result == "WRONG V600E" 134 | 135 | def test_extract_regex_with_val_ala(self, aggregator): 136 | """Test regex extraction when Val/Ala are present.""" 137 | # The code specifically looks for Val or Ala to trigger regex 138 | variant_data = { 139 | "cadd": {"gene": {"genename": "TEST1"}}, 140 | "hgvsp": ["p.Val600Ala"], 141 | } 142 | 143 | result = aggregator._extract_gene_aa_change(variant_data) 144 | # The regex doesn't find a match in "Val600Ala" because it's looking for [A-Z]\d+[A-Z] 145 | # which would match "V600A" but not "Val600Ala" 146 | assert result == "TEST1 Val600Ala" 147 | 148 | def test_extract_handles_exceptions_gracefully(self, aggregator): 149 | """Test that exceptions are handled gracefully.""" 150 | # This should trigger an exception internally but return None 151 | variant_data = { 152 | "cadd": {"gene": {"genename": "GENE"}}, 153 | "hgvsp": None, # This will cause issues 154 | } 155 | 156 | 
result = aggregator._extract_gene_aa_change(variant_data) 157 | assert result is None 158 | ``` -------------------------------------------------------------------------------- /tests/tdd/test_openfda_unified.py: -------------------------------------------------------------------------------- ```python 1 | """Tests for OpenFDA integration with unified search/fetch tools.""" 2 | 3 | import pytest 4 | 5 | 6 | class TestOpenFDAUnifiedIntegration: 7 | """Test OpenFDA domain integration in unified tools.""" 8 | 9 | def test_openfda_domains_registered(self): 10 | """Test that OpenFDA domains are properly registered in constants.""" 11 | from biomcp.constants import ( 12 | DOMAIN_TO_PLURAL, 13 | PLURAL_TO_DOMAIN, 14 | VALID_DOMAINS, 15 | VALID_DOMAINS_PLURAL, 16 | ) 17 | 18 | # List of OpenFDA domains 19 | openfda_domains = [ 20 | "fda_adverse", 21 | "fda_label", 22 | "fda_device", 23 | "fda_approval", 24 | "fda_recall", 25 | "fda_shortage", 26 | ] 27 | 28 | openfda_plurals = [ 29 | "fda_adverse_events", 30 | "fda_labels", 31 | "fda_device_events", 32 | "fda_approvals", 33 | "fda_recalls", 34 | "fda_shortages", 35 | ] 36 | 37 | # Check that all OpenFDA domains are registered 38 | for domain in openfda_domains: 39 | assert domain in VALID_DOMAINS, f"{domain} not in VALID_DOMAINS" 40 | assert ( 41 | domain in DOMAIN_TO_PLURAL 42 | ), f"{domain} not in DOMAIN_TO_PLURAL" 43 | 44 | # Check plural forms 45 | for plural in openfda_plurals: 46 | assert ( 47 | plural in VALID_DOMAINS_PLURAL 48 | ), f"{plural} not in VALID_DOMAINS_PLURAL" 49 | assert ( 50 | plural in PLURAL_TO_DOMAIN 51 | ), f"{plural} not in PLURAL_TO_DOMAIN" 52 | 53 | # Check mappings are correct 54 | assert DOMAIN_TO_PLURAL["fda_adverse"] == "fda_adverse_events" 55 | assert DOMAIN_TO_PLURAL["fda_label"] == "fda_labels" 56 | assert DOMAIN_TO_PLURAL["fda_device"] == "fda_device_events" 57 | assert DOMAIN_TO_PLURAL["fda_approval"] == "fda_approvals" 58 | assert DOMAIN_TO_PLURAL["fda_recall"] == "fda_recalls" 59 | assert DOMAIN_TO_PLURAL["fda_shortage"] == "fda_shortages" 60 | 61 | assert PLURAL_TO_DOMAIN["fda_adverse_events"] == "fda_adverse" 62 | assert PLURAL_TO_DOMAIN["fda_labels"] == "fda_label" 63 | assert PLURAL_TO_DOMAIN["fda_device_events"] == "fda_device" 64 | assert PLURAL_TO_DOMAIN["fda_approvals"] == "fda_approval" 65 | assert PLURAL_TO_DOMAIN["fda_recalls"] == "fda_recall" 66 | assert PLURAL_TO_DOMAIN["fda_shortages"] == "fda_shortage" 67 | 68 | def test_openfda_search_domain_type_hints(self): 69 | """Test that OpenFDA domains are in search tool type hints.""" 70 | import inspect 71 | 72 | from biomcp.router import search 73 | 74 | # Get the function signature 75 | sig = inspect.signature(search) 76 | domain_param = sig.parameters.get("domain") 77 | 78 | # Check if domain parameter exists 79 | assert ( 80 | domain_param is not None 81 | ), "domain parameter not found in search function" 82 | 83 | # Get the annotation 84 | annotation = domain_param.annotation 85 | 86 | # The annotation should be a Literal type that includes OpenFDA domains 87 | # We can't directly check the Literal values due to how Python handles it, 88 | # but we can verify that it's properly annotated 89 | assert ( 90 | annotation is not None 91 | ), "domain parameter has no type annotation" 92 | 93 | def test_openfda_fetch_domain_type_hints(self): 94 | """Test that OpenFDA domains are in fetch tool type hints.""" 95 | import inspect 96 | 97 | from biomcp.router import fetch 98 | 99 | # Get the function signature 100 | sig = inspect.signature(fetch) 101 | 
domain_param = sig.parameters.get("domain") 102 | 103 | # Check if domain parameter exists 104 | assert ( 105 | domain_param is not None 106 | ), "domain parameter not found in fetch function" 107 | 108 | # Get the annotation 109 | annotation = domain_param.annotation 110 | 111 | # The annotation should be a Literal type that includes OpenFDA domains 112 | assert ( 113 | annotation is not None 114 | ), "domain parameter has no type annotation" 115 | 116 | @pytest.mark.asyncio 117 | async def test_openfda_search_basic_call(self): 118 | """Test that OpenFDA domain search doesn't raise errors with basic call.""" 119 | from unittest.mock import AsyncMock, patch 120 | 121 | # Mock the OpenFDA search function that will be imported 122 | with patch( 123 | "biomcp.openfda.adverse_events.search_adverse_events", 124 | new_callable=AsyncMock, 125 | ) as mock_search: 126 | mock_search.return_value = ( 127 | "## FDA Adverse Event Reports\n\nTest results" 128 | ) 129 | 130 | from biomcp.router import search 131 | 132 | # This should not raise an error 133 | result = await search( 134 | query=None, # Required parameter 135 | domain="fda_adverse", 136 | chemicals=["test"], 137 | page_size=1, 138 | ) 139 | 140 | # Basic check that result has expected structure 141 | assert isinstance(result, dict) 142 | assert "results" in result 143 | 144 | @pytest.mark.asyncio 145 | async def test_openfda_fetch_basic_call(self): 146 | """Test that OpenFDA domain fetch doesn't raise errors with basic call.""" 147 | from unittest.mock import AsyncMock, patch 148 | 149 | # Mock the OpenFDA get function that will be imported 150 | with patch( 151 | "biomcp.openfda.drug_approvals.get_drug_approval", 152 | new_callable=AsyncMock, 153 | ) as mock_get: 154 | mock_get.return_value = "## Drug Approval Details\n\nTest details" 155 | 156 | from biomcp.router import fetch 157 | 158 | # This should not raise an error 159 | result = await fetch( 160 | id="TEST123", 161 | domain="fda_approval", 162 | ) 163 | 164 | # Basic check that result has expected structure 165 | assert isinstance(result, dict) 166 | assert "title" in result 167 | assert "text" in result 168 | assert "metadata" in result 169 | ``` -------------------------------------------------------------------------------- /tests/tdd/articles/test_preprints.py: -------------------------------------------------------------------------------- ```python 1 | """Tests for preprint search functionality.""" 2 | 3 | from unittest.mock import AsyncMock, patch 4 | 5 | import pytest 6 | 7 | from biomcp.articles.preprints import ( 8 | BiorxivClient, 9 | BiorxivResponse, 10 | BiorxivResult, 11 | EuropePMCClient, 12 | EuropePMCResponse, 13 | PreprintSearcher, 14 | ) 15 | from biomcp.articles.search import PubmedRequest, ResultItem 16 | from biomcp.core import PublicationState 17 | 18 | 19 | class TestBiorxivClient: 20 | """Tests for BiorxivClient.""" 21 | 22 | @pytest.mark.asyncio 23 | async def test_search_biorxiv_success(self): 24 | """Test successful bioRxiv search.""" 25 | client = BiorxivClient() 26 | 27 | # Mock response 28 | mock_response = BiorxivResponse( 29 | collection=[ 30 | BiorxivResult( 31 | doi="10.1101/2024.01.01.123456", 32 | title="Test BRAF Mutation Study", 33 | authors="Smith, J.; Doe, J.", 34 | date="2024-01-01", 35 | abstract="Study about BRAF mutations in cancer.", 36 | server="biorxiv", 37 | ) 38 | ], 39 | total=1, 40 | ) 41 | 42 | with patch("biomcp.http_client.request_api") as mock_request: 43 | mock_request.return_value = (mock_response, None) 44 | 45 | results = 
await client.search("BRAF") 46 | 47 | assert len(results) == 1 48 | assert results[0].doi == "10.1101/2024.01.01.123456" 49 | assert results[0].title == "Test BRAF Mutation Study" 50 | assert results[0].publication_state == PublicationState.PREPRINT 51 | assert "preprint" in results[0].journal.lower() 52 | 53 | @pytest.mark.asyncio 54 | async def test_search_biorxiv_no_results(self): 55 | """Test bioRxiv search with no results.""" 56 | client = BiorxivClient() 57 | 58 | with patch("biomcp.http_client.request_api") as mock_request: 59 | mock_request.return_value = ( 60 | None, 61 | {"code": 404, "message": "Not found"}, 62 | ) 63 | 64 | results = await client.search("nonexistent") 65 | 66 | assert len(results) == 0 67 | 68 | 69 | class TestEuropePMCClient: 70 | """Tests for EuropePMCClient.""" 71 | 72 | @pytest.mark.asyncio 73 | async def test_search_europe_pmc_success(self): 74 | """Test successful Europe PMC search.""" 75 | client = EuropePMCClient() 76 | 77 | # Mock response 78 | mock_response = EuropePMCResponse( 79 | hitCount=1, 80 | resultList={ 81 | "result": [ 82 | { 83 | "id": "PPR123456", 84 | "doi": "10.1101/2024.01.02.654321", 85 | "title": "TP53 Mutation Analysis", 86 | "authorString": "Johnson, A., Williams, B.", 87 | "journalTitle": "bioRxiv", 88 | "firstPublicationDate": "2024-01-02", 89 | "abstractText": "Analysis of TP53 mutations.", 90 | } 91 | ] 92 | }, 93 | ) 94 | 95 | with patch("biomcp.http_client.request_api") as mock_request: 96 | mock_request.return_value = (mock_response, None) 97 | 98 | results = await client.search("TP53") 99 | 100 | assert len(results) == 1 101 | assert results[0].doi == "10.1101/2024.01.02.654321" 102 | assert results[0].title == "TP53 Mutation Analysis" 103 | assert results[0].publication_state == PublicationState.PREPRINT 104 | 105 | 106 | class TestPreprintSearcher: 107 | """Tests for PreprintSearcher.""" 108 | 109 | @pytest.mark.asyncio 110 | async def test_search_combined_sources(self): 111 | """Test searching across multiple preprint sources.""" 112 | searcher = PreprintSearcher() 113 | 114 | # Mock both clients 115 | mock_biorxiv_results = [ 116 | ResultItem( 117 | doi="10.1101/2024.01.01.111111", 118 | title="BRAF Study 1", 119 | date="2024-01-01", 120 | publication_state=PublicationState.PREPRINT, 121 | ) 122 | ] 123 | 124 | mock_europe_results = [ 125 | ResultItem( 126 | doi="10.1101/2024.01.02.222222", 127 | title="BRAF Study 2", 128 | date="2024-01-02", 129 | publication_state=PublicationState.PREPRINT, 130 | ) 131 | ] 132 | 133 | searcher.biorxiv_client.search = AsyncMock( 134 | return_value=mock_biorxiv_results 135 | ) 136 | searcher.europe_pmc_client.search = AsyncMock( 137 | return_value=mock_europe_results 138 | ) 139 | 140 | request = PubmedRequest(genes=["BRAF"]) 141 | response = await searcher.search(request) 142 | 143 | assert response.count == 2 144 | assert len(response.results) == 2 145 | # Results should be sorted by date (newest first) 146 | assert response.results[0].doi == "10.1101/2024.01.02.222222" 147 | assert response.results[1].doi == "10.1101/2024.01.01.111111" 148 | 149 | @pytest.mark.asyncio 150 | async def test_search_duplicate_removal(self): 151 | """Test that duplicate DOIs are removed.""" 152 | searcher = PreprintSearcher() 153 | 154 | # Create duplicate results with same DOI 155 | duplicate_doi = "10.1101/2024.01.01.999999" 156 | 157 | mock_biorxiv_results = [ 158 | ResultItem( 159 | doi=duplicate_doi, 160 | title="Duplicate Study", 161 | date="2024-01-01", 162 | 
publication_state=PublicationState.PREPRINT, 163 | ) 164 | ] 165 | 166 | mock_europe_results = [ 167 | ResultItem( 168 | doi=duplicate_doi, 169 | title="Duplicate Study", 170 | date="2024-01-01", 171 | publication_state=PublicationState.PREPRINT, 172 | ) 173 | ] 174 | 175 | searcher.biorxiv_client.search = AsyncMock( 176 | return_value=mock_biorxiv_results 177 | ) 178 | searcher.europe_pmc_client.search = AsyncMock( 179 | return_value=mock_europe_results 180 | ) 181 | 182 | request = PubmedRequest(keywords=["test"]) 183 | response = await searcher.search(request) 184 | 185 | assert response.count == 1 186 | assert len(response.results) == 1 187 | assert response.results[0].doi == duplicate_doi 188 | ``` -------------------------------------------------------------------------------- /tests/tdd/test_render.py: -------------------------------------------------------------------------------- ```python 1 | from biomcp import render 2 | 3 | 4 | def test_render_full_json(data_dir): 5 | input_data = (data_dir / "ct_gov/trials_NCT04280705.json").read_text() 6 | expect_markdown = (data_dir / "ct_gov/trials_NCT04280705.txt").read_text() 7 | markdown = render.to_markdown(input_data) 8 | assert markdown == expect_markdown 9 | 10 | input_data = ( 11 | data_dir / "myvariant/variants_full_braf_v600e.json" 12 | ).read_text() 13 | expect_markdown = ( 14 | data_dir / "myvariant/variants_full_braf_v600e.txt" 15 | ).read_text() 16 | markdown = render.to_markdown(input_data) 17 | print("==" * 100) 18 | print(markdown) 19 | print("==" * 100) 20 | assert markdown == expect_markdown 21 | 22 | 23 | def test_render_with_nones(): 24 | markdown = render.to_markdown(data) 25 | assert ( 26 | markdown 27 | == """# Studies 28 | 29 | ## Protocol Section 30 | 31 | ### Design Module 32 | Study Type: interventional 33 | Phases: phase2 34 | 35 | ### Identification Module 36 | Brief Title: 37 | study of autologous tumor infiltrating lymphocytes in patients with 38 | solid tumors 39 | Nct Id: nct03645928 40 | 41 | ### Status Module 42 | Overall Status: recruiting 43 | 44 | #### Completion Date Struct 45 | Date: 2029-08-09 46 | 47 | #### Start Date Struct 48 | Date: 2019-05-07 49 | """ 50 | ) 51 | 52 | 53 | data = { 54 | "next_page_token": None, 55 | "studies": [ 56 | { 57 | "derived_section": None, 58 | "document_section": None, 59 | "has_results": None, 60 | "protocol_section": { 61 | "arms_interventions_module": None, 62 | "conditions_module": None, 63 | "contacts_locations_module": None, 64 | "description_module": None, 65 | "design_module": { 66 | "design_info": None, 67 | "enrollment_info": None, 68 | "phases": ["phase2"], 69 | "study_type": "interventional", 70 | }, 71 | "eligibility_module": None, 72 | "identification_module": { 73 | "acronym": None, 74 | "brief_title": "study " 75 | "of " 76 | "autologous " 77 | "tumor " 78 | "infiltrating " 79 | "lymphocytes " 80 | "in " 81 | "patients " 82 | "with " 83 | "solid " 84 | "tumors", 85 | "nct_id": "nct03645928", 86 | "official_title": None, 87 | "org_study_id_info": None, 88 | "organization": None, 89 | "secondary_id_infos": None, 90 | }, 91 | "outcomes_module": None, 92 | "oversight_module": None, 93 | "references_module": None, 94 | "sponsor_collaborators_module": None, 95 | "status_module": { 96 | "completion_date_struct": { 97 | "date": "2029-08-09", 98 | "type": None, 99 | }, 100 | "expanded_access_info": None, 101 | "last_known_status": None, 102 | "last_update_post_date_struct": None, 103 | "last_update_submit_date": None, 104 | "overall_status": "recruiting", 105 | 
"primary_completion_date_struct": None, 106 | "results_first_post_date_struct": None, 107 | "results_first_submit_date": None, 108 | "results_first_submit_qc_date": None, 109 | "start_date_struct": {"date": "2019-05-07", "type": None}, 110 | "status_verified_date": None, 111 | "study_first_post_date_struct": None, 112 | "study_first_submit_date": None, 113 | "study_first_submit_qc_date": None, 114 | "why_stopped": None, 115 | }, 116 | }, 117 | "results_section": None, 118 | }, 119 | ], 120 | } 121 | 122 | 123 | def test_transform_key_protocol_section(): 124 | assert render.transform_key("protocol_section") == "Protocol Section" 125 | 126 | 127 | def test_transform_key_nct_number(): 128 | assert render.transform_key("nct_number") == "Nct Number" 129 | 130 | 131 | def test_transform_key_study_url(): 132 | assert render.transform_key("study_url") == "Study Url" 133 | 134 | 135 | def test_transform_key_allcaps(): 136 | assert render.transform_key("allcaps") == "Allcaps" 137 | 138 | 139 | def test_transform_key_primary_purpose(): 140 | assert render.transform_key("primary_purpose") == "Primary Purpose" 141 | 142 | 143 | def test_transform_key_underscores(): 144 | assert render.transform_key("some_key_name") == "Some Key Name" 145 | 146 | 147 | def test_transform_key_lowercase(): 148 | assert render.transform_key("somekey") == "Somekey" 149 | 150 | 151 | def test_transform_key_nctid(): 152 | assert render.transform_key("nct_id") == "Nct Id" 153 | 154 | 155 | def test_transform_key_4dct(): 156 | assert render.transform_key("4dct") == "4dct" 157 | 158 | 159 | def test_wrap_preserve_newlines_blank(): 160 | assert render.wrap_preserve_newlines("", 20) == [] 161 | 162 | 163 | def test_wrap_preserve_newlines_short_line(): 164 | text = "hello world" 165 | assert render.wrap_preserve_newlines(text, 20) == ["hello world"] 166 | 167 | 168 | def test_wrap_preserve_newlines_long(): 169 | text = "this line is definitely longer than twenty characters" 170 | lines = render.wrap_preserve_newlines(text, 20) 171 | assert len(lines) > 1 172 | assert "this line is" in lines[0] 173 | 174 | 175 | def test_process_scalar_list_fits(): 176 | lines = [] 177 | render.process_scalar_list( 178 | "conditions", 179 | lines, 180 | ["condition1", "condition2"], 181 | ) 182 | assert lines == ["Conditions: condition1, condition2"] 183 | 184 | 185 | def test_process_scalar_list_too_long(): 186 | lines = [] 187 | big_list = ["test_value" * 10, "another" * 5] 188 | render.process_scalar_list("giant_field", lines, big_list) 189 | assert lines[0].startswith("Giant Field:") 190 | assert lines[1].startswith("- test_value") 191 | 192 | 193 | def test_render_key_value_short(): 194 | lines = [] 195 | render.render_key_value(lines, "nct_number", "nct100") 196 | assert lines == ["Nct Number: nct100"] 197 | 198 | 199 | def test_render_key_value_long(): 200 | lines = [] 201 | render.render_key_value(lines, "brief_summary", "hello " * 15) 202 | # first line "brief summary:" 203 | assert lines[0] == "Brief Summary:" 204 | assert lines[1].startswith(" hello hello") 205 | ``` -------------------------------------------------------------------------------- /src/biomcp/articles/search_optimized.py: -------------------------------------------------------------------------------- ```python 1 | """Optimized article search with caching and parallel processing.""" 2 | 3 | import asyncio 4 | import hashlib 5 | 6 | from .. 
import ensure_list 7 | from ..shared_context import get_search_context 8 | from ..utils.request_cache import get_cache 9 | from .search import PubmedRequest 10 | from .unified import search_articles_unified 11 | 12 | # Cache for article search results (5 minute TTL) 13 | _search_cache = get_cache("article_search", ttl_seconds=300) 14 | 15 | 16 | def _get_search_cache_key( 17 | request: PubmedRequest, include_preprints: bool, include_cbioportal: bool 18 | ) -> str: 19 | """Generate a cache key for search requests.""" 20 | # Create a deterministic key from search parameters 21 | key_parts = [ 22 | f"chemicals:{sorted(request.chemicals)}", 23 | f"diseases:{sorted(request.diseases)}", 24 | f"genes:{sorted(request.genes)}", 25 | f"keywords:{sorted(request.keywords)}", 26 | f"variants:{sorted(request.variants)}", 27 | f"preprints:{include_preprints}", 28 | f"cbioportal:{include_cbioportal}", 29 | ] 30 | key_string = "|".join(key_parts) 31 | return hashlib.sha256(key_string.encode()).hexdigest() 32 | 33 | 34 | async def article_searcher_optimized( 35 | call_benefit: str, 36 | chemicals: list[str] | str | None = None, 37 | diseases: list[str] | str | None = None, 38 | genes: list[str] | str | None = None, 39 | keywords: list[str] | str | None = None, 40 | variants: list[str] | str | None = None, 41 | include_preprints: bool = True, 42 | include_cbioportal: bool = True, 43 | ) -> str: 44 | """Optimized version of article_searcher with caching and context reuse.""" 45 | 46 | # Convert parameters to PubmedRequest 47 | request = PubmedRequest( 48 | chemicals=ensure_list(chemicals, split_strings=True), 49 | diseases=ensure_list(diseases, split_strings=True), 50 | genes=ensure_list(genes, split_strings=True), 51 | keywords=ensure_list(keywords, split_strings=True), 52 | variants=ensure_list(variants, split_strings=True), 53 | ) 54 | 55 | # Check cache first 56 | cache_key = _get_search_cache_key( 57 | request, include_preprints, include_cbioportal 58 | ) 59 | cached_result = await _search_cache.get(cache_key) 60 | if cached_result is not None: 61 | return cached_result 62 | 63 | # Check if we're in a search context (for reusing validated entities) 64 | context = get_search_context() 65 | if context and request.genes: 66 | # Pre-validate genes using cached results 67 | valid_genes = [] 68 | for gene in request.genes: 69 | if await context.validate_gene(gene): 70 | valid_genes.append(gene) 71 | request.genes = valid_genes 72 | 73 | # Check if we have cached cBioPortal summaries 74 | if include_cbioportal and request.genes: 75 | for gene in request.genes[:1]: # Just first gene 76 | summary = context.get_gene_summary(gene) 77 | if summary: 78 | # We have a cached summary, can skip that part 79 | pass 80 | 81 | # Perform the search 82 | result = await search_articles_unified( 83 | request, 84 | include_pubmed=True, 85 | include_preprints=include_preprints, 86 | include_cbioportal=include_cbioportal, 87 | ) 88 | 89 | # Cache the result (5 minute TTL) 90 | await _search_cache.set(cache_key, result, ttl=300) 91 | 92 | return result 93 | 94 | 95 | # Additional optimization: Batch article searches 96 | class ArticleSearchBatcher: 97 | """Batch multiple article searches to reduce overhead.""" 98 | 99 | def __init__(self, batch_size: int = 5, timeout: float = 0.1): 100 | self.batch_size = batch_size 101 | self.timeout = timeout 102 | self._pending_searches: list[tuple[PubmedRequest, asyncio.Future]] = [] 103 | self._batch_task: asyncio.Task | None = None 104 | 105 | async def search(self, request: PubmedRequest) -> 
str: 106 | """Add a search to the batch.""" 107 | future = asyncio.get_event_loop().create_future() 108 | self._pending_searches.append((request, future)) 109 | 110 | # Start batch processing if not already running 111 | if self._batch_task is None or self._batch_task.done(): 112 | self._batch_task = asyncio.create_task(self._process_batch()) 113 | 114 | return await future 115 | 116 | async def _process_batch(self): 117 | """Process pending searches in batch.""" 118 | await asyncio.sleep(self.timeout) # Wait for more requests 119 | 120 | if not self._pending_searches: 121 | return 122 | 123 | # Take up to batch_size searches 124 | batch = self._pending_searches[: self.batch_size] 125 | self._pending_searches = self._pending_searches[self.batch_size :] 126 | 127 | # Process searches in parallel 128 | search_tasks = [] 129 | for request, _ in batch: 130 | task = search_articles_unified(request, include_pubmed=True) 131 | search_tasks.append(task) 132 | 133 | results = await asyncio.gather(*search_tasks, return_exceptions=True) 134 | 135 | # Set results on futures 136 | for (_, future), result in zip(batch, results, strict=False): 137 | if isinstance(result, Exception): 138 | future.set_exception(result) 139 | else: 140 | future.set_result(result) 141 | 142 | 143 | # Global batcher instance 144 | _article_batcher = ArticleSearchBatcher() 145 | 146 | 147 | async def article_searcher_batched( 148 | call_benefit: str, 149 | chemicals: list[str] | str | None = None, 150 | diseases: list[str] | str | None = None, 151 | genes: list[str] | str | None = None, 152 | keywords: list[str] | str | None = None, 153 | variants: list[str] | str | None = None, 154 | include_preprints: bool = True, 155 | include_cbioportal: bool = True, 156 | ) -> str: 157 | """Batched version of article_searcher for multiple concurrent searches.""" 158 | 159 | request = PubmedRequest( 160 | chemicals=ensure_list(chemicals, split_strings=True), 161 | diseases=ensure_list(diseases, split_strings=True), 162 | genes=ensure_list(genes, split_strings=True), 163 | keywords=ensure_list(keywords, split_strings=True), 164 | variants=ensure_list(variants, split_strings=True), 165 | ) 166 | 167 | # Use the optimized version with caching 168 | return await article_searcher_optimized( 169 | call_benefit=call_benefit, 170 | chemicals=request.chemicals, 171 | diseases=request.diseases, 172 | genes=request.genes, 173 | keywords=request.keywords, 174 | variants=request.variants, 175 | include_preprints=include_preprints, 176 | include_cbioportal=include_cbioportal, 177 | ) 178 | ``` -------------------------------------------------------------------------------- /tests/tdd/variants/test_cbioportal_mutations.py: -------------------------------------------------------------------------------- ```python 1 | """Tests for cBioPortal mutation-specific search functionality.""" 2 | 3 | import pytest 4 | 5 | from biomcp.utils.mutation_filter import MutationFilter 6 | from biomcp.variants.cbioportal_mutations import ( 7 | CBioPortalMutationClient, 8 | MutationHit, 9 | StudyMutationSummary, 10 | format_mutation_search_result, 11 | ) 12 | 13 | 14 | class TestCBioPortalMutationSearch: 15 | """Test mutation-specific search functionality.""" 16 | 17 | @pytest.mark.asyncio 18 | @pytest.mark.integration 19 | async def test_search_specific_mutation_srsf2_f57y(self): 20 | """Test searching for SRSF2 F57Y mutation.""" 21 | client = CBioPortalMutationClient() 22 | 23 | result = await client.search_specific_mutation( 24 | gene="SRSF2", mutation="F57Y", 
max_studies=10 25 | ) 26 | 27 | assert result is not None 28 | assert result.gene == "SRSF2" 29 | assert result.specific_mutation == "F57Y" 30 | assert result.studies_with_mutation >= 0 31 | 32 | # If mutations found, check structure 33 | if result.studies_with_mutation > 0: 34 | assert len(result.top_studies) > 0 35 | top_study = result.top_studies[0] 36 | assert isinstance(top_study, StudyMutationSummary) 37 | assert top_study.mutation_count > 0 38 | 39 | @pytest.mark.asyncio 40 | @pytest.mark.integration 41 | async def test_search_mutation_pattern_srsf2_f57(self): 42 | """Test searching for SRSF2 F57* mutations.""" 43 | client = CBioPortalMutationClient() 44 | 45 | result = await client.search_specific_mutation( 46 | gene="SRSF2", pattern="F57*", max_studies=10 47 | ) 48 | 49 | assert result is not None 50 | assert result.gene == "SRSF2" 51 | assert result.pattern == "F57*" 52 | 53 | # F57* should match F57Y, F57C, etc. 54 | if result.total_mutations > 0: 55 | assert result.mutation_types is not None 56 | # Check that we found some F57 mutations 57 | f57_mutations = [ 58 | mut for mut in result.mutation_types if mut.startswith("F57") 59 | ] 60 | assert len(f57_mutations) > 0 61 | 62 | @pytest.mark.asyncio 63 | @pytest.mark.integration 64 | async def test_search_braf_v600e(self): 65 | """Test searching for BRAF V600E - a very common mutation.""" 66 | client = CBioPortalMutationClient() 67 | 68 | result = await client.search_specific_mutation( 69 | gene="BRAF", mutation="V600E", max_studies=20 70 | ) 71 | 72 | assert result is not None 73 | assert result.gene == "BRAF" 74 | assert result.specific_mutation == "V600E" 75 | # V600E is very common, should have many studies 76 | assert result.studies_with_mutation > 10 77 | assert len(result.top_studies) > 0 78 | 79 | # Check melanoma is in top cancer types 80 | cancer_types = [s.cancer_type for s in result.top_studies] 81 | # At least some melanoma studies should have V600E 82 | assert any("melanoma" in ct.lower() for ct in cancer_types) 83 | 84 | def test_filter_mutations_specific(self): 85 | """Test filtering for specific mutations.""" 86 | mutations = [ 87 | MutationHit( 88 | study_id="study1", 89 | molecular_profile_id="study1_mutations", 90 | protein_change="F57Y", 91 | mutation_type="Missense", 92 | ), 93 | MutationHit( 94 | study_id="study1", 95 | molecular_profile_id="study1_mutations", 96 | protein_change="F57C", 97 | mutation_type="Missense", 98 | ), 99 | MutationHit( 100 | study_id="study2", 101 | molecular_profile_id="study2_mutations", 102 | protein_change="R88Q", 103 | mutation_type="Missense", 104 | ), 105 | ] 106 | 107 | # Filter for F57Y 108 | mutation_filter = MutationFilter(specific_mutation="F57Y") 109 | filtered = mutation_filter.filter_mutations(mutations) 110 | assert len(filtered) == 1 111 | assert filtered[0].protein_change == "F57Y" 112 | 113 | def test_filter_mutations_pattern(self): 114 | """Test filtering with wildcard patterns.""" 115 | mutations = [ 116 | MutationHit( 117 | study_id="study1", 118 | molecular_profile_id="study1_mutations", 119 | protein_change="F57Y", 120 | mutation_type="Missense", 121 | ), 122 | MutationHit( 123 | study_id="study1", 124 | molecular_profile_id="study1_mutations", 125 | protein_change="F57C", 126 | mutation_type="Missense", 127 | ), 128 | MutationHit( 129 | study_id="study2", 130 | molecular_profile_id="study2_mutations", 131 | protein_change="R88Q", 132 | mutation_type="Missense", 133 | ), 134 | ] 135 | 136 | # Filter for F57* 137 | mutation_filter = 
MutationFilter(pattern="F57*") 138 | filtered = mutation_filter.filter_mutations(mutations) 139 | assert len(filtered) == 2 140 | assert all(m.protein_change.startswith("F57") for m in filtered) 141 | 142 | def test_format_mutation_search_result(self): 143 | """Test formatting of mutation search results.""" 144 | from biomcp.variants.cbioportal_mutations import MutationSearchResult 145 | 146 | result = MutationSearchResult( 147 | gene="SRSF2", 148 | specific_mutation="F57Y", 149 | total_studies=100, 150 | studies_with_mutation=3, 151 | total_mutations=5, 152 | top_studies=[ 153 | StudyMutationSummary( 154 | study_id="msk_ch_2023", 155 | study_name="Cancer Therapy and Clonal Hematopoiesis", 156 | cancer_type="mixed", 157 | mutation_count=5, 158 | sample_count=100, 159 | ), 160 | StudyMutationSummary( 161 | study_id="mds_mskcc_2020", 162 | study_name="Myelodysplastic Syndrome Study", 163 | cancer_type="mds", 164 | mutation_count=2, 165 | sample_count=50, 166 | ), 167 | ], 168 | mutation_types={"F57Y": 5}, 169 | ) 170 | 171 | formatted = format_mutation_search_result(result) 172 | 173 | assert "SRSF2" in formatted 174 | assert "F57Y" in formatted 175 | assert "**Studies with Mutation**: 3" in formatted 176 | assert "msk_ch_2023" in formatted 177 | assert "| 5 |" in formatted # mutation count 178 | ``` -------------------------------------------------------------------------------- /docs/backend-services-reference/06-pubtator3.md: -------------------------------------------------------------------------------- ```markdown 1 | # PubTator3 API 2 | 3 | This document describes the PubTator3 API used by BioMCP for searching biomedical literature and retrieving article details with annotations. Understanding this API provides context for how BioMCP's article commands function. 4 | 5 | ## Overview 6 | 7 | The PubTator3 API provides a way to search for and retrieve biomedical articles 8 | with entity annotations. This document outlines the API implementation details. 9 | PubTator3 is a web-based tool that provides annotations of biomedical entities 10 | in PubMed abstracts and PMC full-text articles. BioMCP uses the PubTator3 API 11 | to search for and retrieve biomedical articles and their annotated entities ( 12 | genes, variants, diseases, chemicals, etc.). 13 | 14 | > **CLI Documentation**: For information on using these APIs through the BioMCP 15 | > command line interface, see 16 | > the [Articles CLI Documentation](../user-guides/01-command-line-interface.md#article-commands). 17 | 18 | ## Usage Guide 19 | 20 | For practical examples of searching articles with PubTator3, see [How to Find Articles and cBioPortal Data](../how-to-guides/01-find-articles-and-cbioportal-data.md). 21 | 22 | ## API Workflow 23 | 24 | The PubTator3 integration follows a three-step workflow: 25 | 26 | 1. **Entity Autocomplete**: Get standardized entity identifiers 27 | 2. **Search**: Find articles using entity identifiers and keywords 28 | 3. **Fetch**: Retrieve full article details by PMID 29 | 30 | ## API Endpoints 31 | 32 | ### Entity Autocomplete API 33 | 34 | **Endpoint:** 35 | `https://www.ncbi.nlm.nih.gov/research/pubtator3-api/entity/autocomplete/` 36 | 37 | This endpoint helps normalize entity names to their standard identifiers, 38 | improving search precision. 
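For orientation, the same normalization call can be made directly from Python. The sketch below is illustrative only — it assumes nothing beyond the endpoint URL and the `query`, `concept`, and `limit` parameters documented in the table that follows, and the use of `httpx` here is purely for the example (it does not imply how BioMCP calls this API internally).

```python
# Minimal sketch: normalize an entity name via the PubTator3 autocomplete endpoint.
# Only the publicly documented endpoint and parameters are assumed.
import asyncio

import httpx

AUTOCOMPLETE_URL = (
    "https://www.ncbi.nlm.nih.gov/research/pubtator3-api/entity/autocomplete/"
)


async def autocomplete_entity(
    query: str, concept: str = "GENE", limit: int = 2
) -> list[dict]:
    """Return candidate standardized identifiers (e.g. "@GENE_BRAF") for a text query."""
    async with httpx.AsyncClient() as client:
        response = await client.get(
            AUTOCOMPLETE_URL,
            params={"query": query, "concept": concept, "limit": limit},
        )
        response.raise_for_status()
        return response.json()


if __name__ == "__main__":
    # Prints a JSON list of matches, as shown in the example response below.
    print(asyncio.run(autocomplete_entity("BRAF")))
```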
39 | 40 | #### Parameters 41 | 42 | | Parameter | Description | Example | 43 | | --------- | --------------------------- | ----------------------------------- | 44 | | `query` | Text to autocomplete | `BRAF` | 45 | | `concept` | Entity type | `GENE`, `CHEMICAL`, `DISEASE`, etc. | 46 | | `limit` | Number of results to return | `2` | 47 | 48 | #### Example Request and Response 49 | 50 | ```bash 51 | curl "https://www.ncbi.nlm.nih.gov/research/pubtator3-api/entity/autocomplete/?query=BRAF&concept=GENE&limit=2" 52 | ``` 53 | 54 | Response: 55 | 56 | ```json 57 | [ 58 | { 59 | "_id": "@GENE_BRAF", 60 | "biotype": "gene", 61 | "name": "BRAF", 62 | "description": "All Species", 63 | "match": "Matched on name <m>BRAF</m>" 64 | }, 65 | { 66 | "_id": "@GENE_BRAFP1", 67 | "biotype": "gene", 68 | "name": "BRAFP1", 69 | "description": "All Species", 70 | "match": "Matched on name <m>BRAFP1</m>" 71 | } 72 | ] 73 | ``` 74 | 75 | ### Entity Search API 76 | 77 | **Endpoint:** `https://www.ncbi.nlm.nih.gov/research/pubtator3-api/search/` 78 | 79 | This endpoint allows searching for PMIDs (PubMed IDs) based on entity 80 | identifiers and keywords. 81 | 82 | #### Parameters 83 | 84 | | Parameter | Description | Example | 85 | | --------- | ------------------------------- | ---------------------- | 86 | | `text` | Entity identifier or text query | `@CHEMICAL_remdesivir` | 87 | 88 | #### Example Request and Response 89 | 90 | ```bash 91 | curl "https://www.ncbi.nlm.nih.gov/research/pubtator3-api/search/?text=@CHEMICAL_remdesivir" 92 | ``` 93 | 94 | Response (truncated): 95 | 96 | ```json 97 | { 98 | "results": [ 99 | { 100 | "_id": "37711410", 101 | "pmid": 37711410, 102 | "title": "Remdesivir.", 103 | "journal": "Hosp Pharm", 104 | "authors": ["Levien TL", "Baker DE"], 105 | "date": "2023-10-01T00:00:00Z", 106 | "doi": "10.1177/0018578721999804", 107 | "meta_date_publication": "2023 Oct", 108 | "meta_volume": "58" 109 | } 110 | // More results... 111 | ] 112 | } 113 | ``` 114 | 115 | ### Article Fetch API 116 | 117 | **Endpoint:** 118 | `https://www.ncbi.nlm.nih.gov/research/pubtator3-api/publications/export/biocjson` 119 | 120 | This endpoint retrieves detailed information about specific articles, including 121 | annotations. 122 | 123 | #### Parameters 124 | 125 | | Parameter | Description | Example | 126 | | ----------- | --------------------------------------------- | ---------- | 127 | | `pmids` | List of PubMed IDs to retrieve | `29355051` | 128 | | `full_text` | Whether to include full text (when available) | `true` | 129 | 130 | #### Example Request 131 | 132 | ```bash 133 | curl "https://www.ncbi.nlm.nih.gov/research/pubtator3-api/publications/export/biocjson?pmids=29355051&full=true" 134 | ``` 135 | 136 | Response format (truncated): 137 | 138 | ```json 139 | { 140 | "PubTator3": [ 141 | { 142 | "_id": "29355051|PMC6142073", 143 | "id": "6142073", 144 | "infons": {}, 145 | "passages": [ 146 | { 147 | "infons": { 148 | "name_3": "surname:Hu;given-names:Minghua", 149 | "name_2": "surname:Luo;given-names:Xia", 150 | "name_1": "surname:Luo;given-names:Shuang", 151 | "article-id_pmid": "29355051" 152 | // More metadata... 153 | } 154 | } 155 | // More passages... 156 | ] 157 | } 158 | ] 159 | } 160 | ``` 161 | 162 | ## Entity Types 163 | 164 | PubTator3 annotates several types of biomedical entities: 165 | 166 | 1. **Genes/Proteins**: Gene or protein names (e.g., BRAF, TP53) 167 | 2. **Genetic Variants**: Genetic variations (e.g., BRAF V600E) 168 | 3. 
**Diseases**: Disease names and conditions (e.g., Melanoma) 169 | 4. **Chemicals/Drugs**: Chemical substances or drugs (e.g., Vemurafenib) 170 | 171 | ## Integration Strategy for BioMCP 172 | 173 | The recommended workflow for integrating with PubTator3 in BioMCP is: 174 | 175 | 1. **Entity Normalization**: Use the autocomplete API to convert user-provided 176 | entity names to standardized identifiers 177 | 2. **Literature Search**: Use the search API with these identifiers to find 178 | relevant PMIDs 179 | 3. **Data Retrieval**: Fetch detailed article data with annotations using the 180 | fetch API 181 | 182 | This workflow ensures consistent entity handling and optimal search results. 183 | 184 | ## Authentication 185 | 186 | The PubTator3 API is public and does not require authentication for basic 187 | usage. However, there are rate limits in place to prevent abuse. 188 | 189 | ## Rate Limits and Best Practices 190 | 191 | - **Request Limits**: Approximately 30 requests per minute 192 | - **Batch Requests**: For article retrieval, batch multiple PMIDs in a single 193 | request 194 | - **Caching**: Implement caching to minimize repeated requests 195 | - **Specific Queries**: Use specific entity names rather than general terms for 196 | better results 197 | 198 | ## Error Handling 199 | 200 | Common error responses: 201 | 202 | - **400**: Invalid parameters 203 | - **404**: Articles not found 204 | - **429**: Rate limit exceeded 205 | - **500**: Server error 206 | 207 | ## More Information 208 | 209 | For complete API documentation, visit 210 | the [PubTator3 API Documentation](https://www.ncbi.nlm.nih.gov/research/pubtator3/api). 211 | ``` -------------------------------------------------------------------------------- /docs/backend-services-reference/04-clinicaltrials-gov.md: -------------------------------------------------------------------------------- ```markdown 1 | # ClinicalTrials.gov API 2 | 3 | This document outlines the key aspects of the public ClinicalTrials.gov v2 API utilized by BioMCP. Understanding these details can be helpful for advanced users interpreting BioMCP results or for developers extending its capabilities. BioMCP's CLI commands often simplify or combine these parameters for ease of use; refer to the [Trials CLI Documentation](../user-guides/01-command-line-interface.md#trial-commands) for specific command options. 4 | 5 | ## Overview 6 | 7 | The [ClinicalTrials.gov](https://clinicaltrials.gov/) API provides programmatic 8 | access to clinical trial information. This document outlines the API 9 | implementation details for searching and retrieving clinical trial data. 10 | 11 | > **CLI Documentation**: For information on using these APIs through the BioMCP 12 | > command line interface, see the [Trials CLI Documentation](../user-guides/01-command-line-interface.md#trial-commands). 13 | 14 | ## API Endpoints 15 | 16 | ### Search API 17 | 18 | **Endpoint:** `https://clinicaltrials.gov/api/v2/studies` 19 | 20 | This endpoint allows searching for clinical trials using various parameters. 
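Several of the parameters documented below can be combined in a single request. The following is a minimal, illustrative Python sketch (using `httpx`) that mirrors the curl example further down while adding a status filter, field selection, and a total count. The `totalCount` response field is an assumption tied to `countTotal=true`; verify it against the live API before relying on it.

```python
# Minimal sketch: combine condition, intervention, status, and field-selection
# parameters in one ClinicalTrials.gov v2 search request.
import asyncio

import httpx

STUDIES_URL = "https://clinicaltrials.gov/api/v2/studies"


async def search_studies() -> dict:
    params = {
        "query.cond": "Melanoma",
        "query.intr": "BRAF",
        "filter.overallStatus": "RECRUITING",
        "fields": "NCTId,BriefTitle,OverallStatus,HasResults",
        "countTotal": "true",
    }
    async with httpx.AsyncClient() as client:
        response = await client.get(STUDIES_URL, params=params)
        response.raise_for_status()
        return response.json()


if __name__ == "__main__":
    data = asyncio.run(search_studies())
    # "totalCount" is expected when countTotal=true (assumption; check the live response).
    print(f"Total matching studies: {data.get('totalCount')}")
```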
21 | 22 | #### Key Parameters 23 | 24 | | Parameter | Description | Example Value | 25 | | ---------------------- | ----------------------------------- | ----------------------------------------------- | 26 | | `query.cond` | "Conditions or disease" query | `lung cancer` | 27 | | `query.term` | "Other terms" query | `AREA[LastUpdatePostDate]RANGE[2023-01-15,MAX]` | 28 | | `query.intr` | "Intervention/treatment" query | `Vemurafenib` | 29 | | `query.locn` | "Location terms" query | `New York` | 30 | | `query.titles` | "Title/acronym" query | `BRAF Melanoma` | 31 | | `query.outc` | "Outcome measure" query | `overall survival` | 32 | | `query.spons` | "Sponsor/collaborator" query | `National Cancer Institute` | 33 | | `query.lead` | Searches in "LeadSponsorName" field | `MD Anderson` | 34 | | `query.id` | "Study IDs" query (OR semantics) | `NCT04267848` | 35 | | `filter.overallStatus` | Comma-separated list of statuses | `NOT_YET_RECRUITING,RECRUITING` | 36 | | `filter.geo` | Geo-location filter | `distance(39.0035707,-77.1013313,50mi)` | 37 | | `filter.ids` | Filter by NCT IDs (AND semantics) | `NCT04852770,NCT01728545` | 38 | | `filter.advanced` | Advanced filter query | `AREA[StartDate]2022` | 39 | | `sort` | Sort order | `LastUpdatePostDate:desc` | 40 | | `fields` | Fields to return | `NCTId,BriefTitle,OverallStatus,HasResults` | 41 | 42 | | `countTotal` | Count total number of studies | `true` or `false` | 43 | 44 | #### Example Request 45 | 46 | ```bash 47 | curl -X GET "https://clinicaltrials.gov/api/v2/studies?query.cond=Melanoma&query.intr=BRAF" 48 | ``` 49 | 50 | ### Study Details API 51 | 52 | **Endpoint:** `https://clinicaltrials.gov/api/v2/studies/{NCT_ID}` 53 | 54 | This endpoint retrieves detailed information about a specific clinical trial. 55 | 56 | #### Example Request 57 | 58 | ```bash 59 | curl -X GET "https://clinicaltrials.gov/api/v2/studies/NCT04267848" 60 | ``` 61 | 62 | #### Response Modules 63 | 64 | The API response contains various modules of information: 65 | 66 | - **protocolSection**: Basic study information, eligibility criteria, and 67 | design 68 | - **resultsSection**: Study outcomes and results (when available) 69 | - **documentSection**: Related documents 70 | - **derivedSection**: Derived data elements 71 | - **annotationsSection**: Additional annotations 72 | 73 | ## Implementation Details 74 | 75 | ### NCT ID Filtering Semantics 76 | 77 | BioMCP uses intelligent filtering when NCT IDs are provided: 78 | 79 | - **ID-only mode**: When NCT IDs are the only filter criteria, `query.id` is used for fast direct lookup 80 | - **Intersection mode**: When NCT IDs are combined with other filters (conditions, interventions, etc.), `filter.ids` is used to ensure results match ALL criteria 81 | 82 | This ensures that specifying NCT IDs restricts results rather than expanding them. 83 | 84 | ### Query Building 85 | 86 | When constructing API queries, parameters must be properly formatted according to the API documentation. 87 | 88 | For implementation details on query building in BioMCP, see the [HTTP Client Developer Guide](../developer-guides/06-http-client-and-caching.md). 89 | 90 | ### Response Parsing 91 | 92 | The API returns data in JSON format (or CSV if specified). 
Key sections in the 93 | response include: 94 | 95 | - `protocolSection`: Contains study protocol details 96 | - `identificationModule`: Basic identifiers including NCT ID and title 97 | - `statusModule`: Current recruitment status and study dates 98 | - `sponsorCollaboratorsModule`: Information about sponsors and 99 | collaborators 100 | - `designModule`: Study design information including interventions 101 | - `eligibilityModule`: Inclusion/exclusion criteria and eligible population 102 | - `contactsLocationsModule`: Study sites and contact information 103 | - `referencesModule`: Related publications 104 | 105 | ### Error Handling 106 | 107 | The API returns standard HTTP status codes. Common error scenarios include: 108 | 109 | - **404**: Trial not found 110 | - **429**: Rate limit exceeded 111 | - **400**: Invalid query parameters 112 | 113 | For implementation details on error handling in BioMCP, see the [Error Handling Developer Guide](../developer-guides/05-error-handling.md). 114 | 115 | ## Authentication 116 | 117 | The ClinicalTrials.gov API is public and does not require authentication for 118 | basic usage. However, there are rate limits in place. 119 | 120 | ## Rate Limits and Best Practices 121 | 122 | - **Rate Limit**: Approximately 50 requests per minute per IP address 123 | - **Caching**: Implement caching to minimize repeated requests 124 | - **Pagination**: For large result sets, use the pagination functionality with 125 | 126 | - **Focused Queries**: Use specific search terms rather than broad queries to 127 | get more relevant results 128 | - **Field Selection**: Use the fields parameter to request only the data you 129 | need 130 | 131 | ## More Information 132 | 133 | For complete API documentation, visit 134 | the [ClinicalTrials.gov API Documentation](https://clinicaltrials.gov/data-api/about-api) 135 | ``` -------------------------------------------------------------------------------- /docs/how-to-guides/05-logging-and-monitoring-with-bigquery.md: -------------------------------------------------------------------------------- ```markdown 1 | # BigQuery Logging for BioMCP 2 | 3 | This document outlines how BioMCP uses Google BigQuery for logging user interactions and API usage. 4 | 5 | ## Overview 6 | 7 | BioMCP integrates with Google BigQuery to log user interactions, queries, and API usage. This logging provides valuable insights into how the system is being used, helps with debugging, and enables analytics for improving the service. 8 | 9 | ## Prerequisites 10 | 11 | - A Google Cloud Platform (GCP) account 12 | - A BigQuery dataset and table created in your GCP project 13 | - A GCP service account with BigQuery permissions 14 | 15 | ## Setting Up BigQuery for BioMCP 16 | 17 | 1. **Create a BigQuery Dataset and Table** 18 | 19 | - In the Google Cloud Console, navigate to BigQuery 20 | - Create a new dataset (e.g., `biomcp_logs`) 21 | - Create a table within the dataset (e.g., `worker_logs`) with the following schema: 22 | ``` 23 | timestamp: TIMESTAMP 24 | userEmail: STRING 25 | query: STRING 26 | ``` 27 | - Adjust the schema as needed for your specific logging requirements 28 | 29 | 2. **Create a Service Account** 30 | 31 | - Navigate to "IAM & Admin" > "Service Accounts" in the Google Cloud Console 32 | - Create a new service account with a descriptive name (e.g., `biomcp-bigquery-logger`) 33 | - Assign the "BigQuery Data Editor" role to the service account 34 | - Create and download a JSON key for the service account 35 | 36 | 3. 
**Configure BioMCP with BigQuery Credentials** 37 | 38 | - Open `wrangler.toml` in the BioMCP project 39 | - Update the following variables with your BigQuery information: 40 | ```toml 41 | BQ_PROJECT_ID = "your-gcp-project-id" 42 | BQ_DATASET = "biomcp_logs" 43 | BQ_TABLE = "worker_logs" 44 | ``` 45 | - For the service account key, use Cloudflare's secret management: 46 | ```bash 47 | npx wrangler secret put BQ_SA_KEY_JSON 48 | ``` 49 | When prompted, paste the entire JSON content of your service account key file 50 | 51 | ## How BigQuery Logging Works 52 | 53 | The BioMCP worker uses the following process to log data to BigQuery: 54 | 55 | 1. **Authentication**: The worker generates a JWT token using the service account credentials 56 | 2. **Token Exchange**: The JWT is exchanged for a Google OAuth access token 57 | 3. **Data Insertion**: The worker uses BigQuery's streaming insert API to log events 58 | 59 | The implementation includes: 60 | 61 | - Token caching to minimize authentication requests 62 | - Error handling for failed logging attempts 63 | - Automatic retry logic for transient failures 64 | 65 | ## Logged Information 66 | 67 | By default, the following information is logged to BigQuery: 68 | 69 | - **timestamp**: When the event occurred 70 | - **userEmail**: The email address of the authenticated user (if available) 71 | - **query**: The query or request that was made 72 | 73 | You can extend the logging schema to include additional information as needed. 74 | 75 | ## Accessing and Analyzing Logs 76 | 77 | To access and analyze the logs: 78 | 79 | 1. **Query the BigQuery Table** 80 | 81 | - Use the BigQuery console or SQL to query your logs 82 | - Example query to see recent logs: 83 | ```sql 84 | SELECT timestamp, userEmail, query 85 | FROM `your-project.biomcp_logs.worker_logs` 86 | ORDER BY timestamp DESC 87 | LIMIT 100 88 | ``` 89 | 90 | 2. 
**Create Visualizations** 91 | 92 | - Use Google Data Studio to create dashboards based on your BigQuery data 93 | - Connect Data Studio to your BigQuery table and create visualizations 94 | 95 | ## Security Considerations 96 | 97 | - The service account key is sensitive information and should be protected 98 | - Use Cloudflare's secret management to store the key securely 99 | - Consider implementing field-level encryption for sensitive data 100 | - Implement data retention policies to comply with privacy regulations 101 | - **IMPORTANT: Never include PHI (Protected Health Information) or PII (Personally Identifiable Information) in queries or logs** 102 | - Ensure all queries are sanitized to remove patient identifiers, medical record numbers, and other sensitive information 103 | - Consider implementing automatic redaction of potential PHI/PII from logs 104 | - Regularly audit logs to ensure compliance with HIPAA and other privacy regulations 105 | - Remember that BigQuery logs are not designed for storing protected health information 106 | 107 | ### Automatic Sanitization 108 | 109 | BioMCP automatically sanitizes sensitive data before logging to BigQuery: 110 | 111 | - **API Keys and Secrets**: Fields containing `api_key`, `apiKey`, `api-key`, `token`, `secret`, or `password` are automatically redacted 112 | - **Nested Objects**: Sanitization works recursively through nested objects and arrays 113 | - **Case-Insensitive**: Field name matching is case-insensitive to catch variations 114 | - **Preserved Structure**: The original request structure is maintained with sensitive values replaced by `[REDACTED]` 115 | 116 | Example of sanitization: 117 | 118 | ```javascript 119 | // Original request 120 | { 121 | "params": { 122 | "arguments": { 123 | "api_key": "AIzaSyB1234567890", 124 | "gene": "BRAF" 125 | } 126 | } 127 | } 128 | 129 | // Sanitized for BigQuery 130 | { 131 | "params": { 132 | "arguments": { 133 | "api_key": "[REDACTED]", 134 | "gene": "BRAF" 135 | } 136 | } 137 | } 138 | ``` 139 | 140 | ### Excluded Queries 141 | 142 | Certain types of queries are automatically excluded from BigQuery logging: 143 | 144 | - **Think Tool Calls**: Any calls to the `think` tool are not logged 145 | - **Thinking Domain**: Queries with `domain="thinking"` or `domain="think"` are excluded 146 | - **Privacy-First Design**: This ensures that internal reasoning and analysis steps remain private 147 | 148 | ## Troubleshooting 149 | 150 | - **Authentication Failures**: Verify that the service account key is correctly formatted and has the necessary permissions 151 | - **Insertion Errors**: Check that the BigQuery table schema matches the data being inserted 152 | - **Missing Logs**: Ensure that the worker has network access to the BigQuery API 153 | 154 | ## Example Code 155 | 156 | The worker includes the following key functions for BigQuery logging: 157 | 158 | - `getBQToken()`: Fetches and caches a BigQuery OAuth token 159 | - `insertEvent()`: Inserts a single row into BigQuery via streaming insert 160 | - `sanitizeObject()`: Recursively sanitizes sensitive fields from objects before logging 161 | 162 | These functions handle the authentication and data insertion process automatically. 
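To make the redaction behavior concrete, here is an illustrative Python equivalent of the recursive sanitization described above. The actual implementation is the worker's JavaScript `sanitizeObject()` function; this sketch only reproduces the documented behavior (case-insensitive matching on sensitive field names, recursion through nested objects and arrays, values replaced with `[REDACTED]`).

```python
# Illustrative Python equivalent of the worker's sanitizeObject() behavior.
# Field-name markers come from the list documented above.
SENSITIVE_KEYS = ("api_key", "apikey", "api-key", "token", "secret", "password")


def sanitize(value):
    """Recursively redact sensitive fields while preserving the original structure."""
    if isinstance(value, dict):
        return {
            key: "[REDACTED]"
            if any(marker in key.lower() for marker in SENSITIVE_KEYS)
            else sanitize(val)
            for key, val in value.items()
        }
    if isinstance(value, list):
        return [sanitize(item) for item in value]
    return value


# Mirrors the example above: the API key is redacted, the gene symbol is preserved.
request = {"params": {"arguments": {"api_key": "AIzaSyB1234567890", "gene": "BRAF"}}}
assert sanitize(request) == {
    "params": {"arguments": {"api_key": "[REDACTED]", "gene": "BRAF"}}
}
```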
163 | 164 | ## Testing 165 | 166 | BioMCP includes comprehensive tests for the BigQuery logging functionality: 167 | 168 | ### JavaScript Tests 169 | 170 | The sanitization logic is tested using Node.js built-in test framework: 171 | 172 | ```bash 173 | # Run JavaScript worker tests 174 | make test-js 175 | 176 | # Or run directly 177 | node --test tests/tdd/workers/test_worker_sanitization.js 178 | ``` 179 | 180 | Tests cover: 181 | 182 | - API key redaction 183 | - Nested sensitive field handling 184 | - Array sanitization 185 | - Case-insensitive field matching 186 | - Think tool detection 187 | - Domain-based filtering 188 | ``` -------------------------------------------------------------------------------- /src/biomcp/organizations/search.py: -------------------------------------------------------------------------------- ```python 1 | """Search functionality for organizations via NCI CTS API.""" 2 | 3 | import logging 4 | from typing import Any 5 | 6 | from ..constants import NCI_ORGANIZATIONS_URL 7 | from ..integrations.cts_api import CTSAPIError, make_cts_request 8 | from ..utils import parse_or_query 9 | 10 | logger = logging.getLogger(__name__) 11 | 12 | 13 | async def search_organizations( 14 | name: str | None = None, 15 | org_type: str | None = None, 16 | city: str | None = None, 17 | state: str | None = None, 18 | page_size: int = 20, 19 | page: int = 1, 20 | api_key: str | None = None, 21 | ) -> dict[str, Any]: 22 | """ 23 | Search for organizations in the NCI CTS database. 24 | 25 | Args: 26 | name: Organization name to search for (partial match) 27 | org_type: Type of organization (e.g., "industry", "academic") 28 | city: City location 29 | state: State location (2-letter code) 30 | page_size: Number of results per page 31 | page: Page number 32 | api_key: Optional API key (if not provided, uses NCI_API_KEY env var) 33 | 34 | Returns: 35 | Dictionary with search results containing: 36 | - organizations: List of organization records 37 | - total: Total number of results 38 | - page: Current page 39 | - page_size: Results per page 40 | 41 | Raises: 42 | CTSAPIError: If the API request fails 43 | """ 44 | # Build query parameters 45 | params: dict[str, Any] = { 46 | "size": page_size, 47 | } 48 | 49 | # Note: The NCI API doesn't support offset/page pagination for organizations 50 | # It uses cursor-based pagination or returns all results up to size limit 51 | 52 | # Add search filters with correct API parameter names 53 | if name: 54 | params["name"] = name 55 | if org_type: 56 | params["type"] = org_type 57 | if city: 58 | params["org_city"] = city 59 | if state: 60 | params["org_state_or_province"] = state 61 | 62 | try: 63 | # Make API request 64 | response = await make_cts_request( 65 | url=NCI_ORGANIZATIONS_URL, 66 | params=params, 67 | api_key=api_key, 68 | ) 69 | 70 | # Process response - adapt to actual API format 71 | # This is a reasonable structure based on typical REST APIs 72 | organizations = response.get("data", response.get("organizations", [])) 73 | total = response.get("total", len(organizations)) 74 | 75 | return { 76 | "organizations": organizations, 77 | "total": total, 78 | "page": page, 79 | "page_size": page_size, 80 | } 81 | 82 | except CTSAPIError: 83 | raise 84 | except Exception as e: 85 | logger.error(f"Failed to search organizations: {e}") 86 | raise CTSAPIError(f"Organization search failed: {e!s}") from e 87 | 88 | 89 | def format_organization_results(results: dict[str, Any]) -> str: 90 | """ 91 | Format organization search results as markdown. 
92 | 93 | Args: 94 | results: Search results dictionary 95 | 96 | Returns: 97 | Formatted markdown string 98 | """ 99 | organizations = results.get("organizations", []) 100 | total = results.get("total", 0) 101 | 102 | if not organizations: 103 | return "No organizations found matching the search criteria." 104 | 105 | # Build markdown output 106 | lines = [ 107 | f"## Organization Search Results ({total} found)", 108 | "", 109 | ] 110 | 111 | for org in organizations: 112 | org_id = org.get("id", org.get("org_id", "Unknown")) 113 | name = org.get("name", "Unknown Organization") 114 | org_type = org.get("type", org.get("category", "Unknown")) 115 | city = org.get("city", "") 116 | state = org.get("state", "") 117 | 118 | lines.append(f"### {name}") 119 | lines.append(f"- **ID**: {org_id}") 120 | lines.append(f"- **Type**: {org_type}") 121 | 122 | if city or state: 123 | location_parts = [p for p in [city, state] if p] 124 | lines.append(f"- **Location**: {', '.join(location_parts)}") 125 | 126 | lines.append("") 127 | 128 | return "\n".join(lines) 129 | 130 | 131 | async def search_organizations_with_or( 132 | name_query: str, 133 | org_type: str | None = None, 134 | city: str | None = None, 135 | state: str | None = None, 136 | page_size: int = 20, 137 | page: int = 1, 138 | api_key: str | None = None, 139 | ) -> dict[str, Any]: 140 | """ 141 | Search for organizations with OR query support. 142 | 143 | This function handles OR queries by making multiple API calls and combining results. 144 | For example: "MD Anderson OR Mayo Clinic" will search for each term. 145 | 146 | Args: 147 | name_query: Name query that may contain OR operators 148 | Other args same as search_organizations 149 | 150 | Returns: 151 | Combined results from all searches with duplicates removed 152 | """ 153 | # Check if this is an OR query 154 | if " OR " in name_query or " or " in name_query: 155 | search_terms = parse_or_query(name_query) 156 | logger.info(f"Parsed OR query into terms: {search_terms}") 157 | else: 158 | # Single term search 159 | search_terms = [name_query] 160 | 161 | # Collect all unique organizations 162 | all_organizations = {} 163 | total_found = 0 164 | 165 | # Search for each term 166 | for term in search_terms: 167 | logger.info(f"Searching organizations for term: {term}") 168 | try: 169 | results = await search_organizations( 170 | name=term, 171 | org_type=org_type, 172 | city=city, 173 | state=state, 174 | page_size=page_size, 175 | page=page, 176 | api_key=api_key, 177 | ) 178 | 179 | # Add unique organizations (deduplicate by ID) 180 | for org in results.get("organizations", []): 181 | org_id = org.get("id", org.get("org_id")) 182 | if org_id and org_id not in all_organizations: 183 | all_organizations[org_id] = org 184 | 185 | total_found += results.get("total", 0) 186 | 187 | except Exception as e: 188 | logger.warning(f"Failed to search for term '{term}': {e}") 189 | # Continue with other terms 190 | 191 | # Convert back to list and apply pagination 192 | unique_organizations = list(all_organizations.values()) 193 | 194 | # Sort by name for consistent results 195 | unique_organizations.sort(key=lambda x: x.get("name", "").lower()) 196 | 197 | # Apply pagination to combined results 198 | start_idx = (page - 1) * page_size 199 | end_idx = start_idx + page_size 200 | paginated_organizations = unique_organizations[start_idx:end_idx] 201 | 202 | return { 203 | "organizations": paginated_organizations, 204 | "total": len(unique_organizations), 205 | "page": page, 206 | "page_size": 
page_size, 207 | "search_terms": search_terms, # Include what we searched for 208 | "total_found_across_terms": total_found, # Total before deduplication 209 | } 210 | ``` -------------------------------------------------------------------------------- /smithery.yaml: -------------------------------------------------------------------------------- ```yaml 1 | # Smithery configuration file: https://smithery.ai/docs/config#smitheryyaml 2 | 3 | startCommand: 4 | type: stdio 5 | configSchema: 6 | # JSON Schema defining the configuration options for the MCP. 7 | type: object 8 | properties: {} 9 | commandFunction: 10 | # A JS function that produces the CLI command based on the given config to start the MCP on stdio. 11 | |- 12 | (config) => ({ command: 'biomcp', args: ['run'], env: {} }) 13 | exampleConfig: {} 14 | 15 | schemas: 16 | TrialQuery: 17 | type: object 18 | properties: 19 | conditions: 20 | type: array 21 | items: 22 | type: string 23 | description: "List of condition terms." 24 | terms: 25 | type: array 26 | items: 27 | type: string 28 | description: "General search terms that don't fit specific categories." 29 | interventions: 30 | type: array 31 | items: 32 | type: string 33 | description: "Intervention names." 34 | recruiting_status: 35 | type: string 36 | description: "Study recruitment status." 37 | study_type: 38 | type: string 39 | description: "Type of study." 40 | nct_ids: 41 | type: array 42 | items: 43 | type: string 44 | description: "Clinical trial NCT IDs" 45 | lat: 46 | type: number 47 | description: "Latitude for location search" 48 | long: 49 | type: number 50 | description: "Longitude for location search" 51 | distance: 52 | type: integer 53 | description: "Distance from lat/long in miles" 54 | min_date: 55 | type: string 56 | description: "Minimum date for filtering" 57 | max_date: 58 | type: string 59 | description: "Maximum date for filtering" 60 | date_field: 61 | type: string 62 | description: "Date field to filter on" 63 | phase: 64 | type: string 65 | description: "Trial phase filter" 66 | age_group: 67 | type: string 68 | description: "Age group filter" 69 | primary_purpose: 70 | type: string 71 | description: "Primary purpose of the trial" 72 | intervention_type: 73 | type: string 74 | description: "Type of intervention" 75 | sponsor_type: 76 | type: string 77 | description: "Type of sponsor" 78 | study_design: 79 | type: string 80 | description: "Study design" 81 | sort: 82 | type: string 83 | description: "Sort order for results" 84 | next_page_hash: 85 | type: string 86 | description: "Token to retrieve the next page of results" 87 | 88 | VariantQuery: 89 | type: object 90 | properties: 91 | gene: 92 | type: string 93 | description: "Gene symbol to search for (e.g. BRAF, TP53)" 94 | hgvsp: 95 | type: string 96 | description: "Protein change notation (e.g., p.V600E, p.Arg557His)" 97 | hgvsc: 98 | type: string 99 | description: "cDNA notation (e.g., c.1799T>A)" 100 | rsid: 101 | type: string 102 | description: "dbSNP rsID (e.g., rs113488022)" 103 | region: 104 | type: string 105 | description: "Genomic region as chr:start-end (e.g. 
chr1:12345-67890)" 106 | significance: 107 | type: string 108 | description: "ClinVar clinical significance" 109 | max_frequency: 110 | type: number 111 | description: "Maximum population allele frequency threshold" 112 | min_frequency: 113 | type: number 114 | description: "Minimum population allele frequency threshold" 115 | cadd: 116 | type: number 117 | description: "Minimum CADD phred score" 118 | polyphen: 119 | type: string 120 | description: "PolyPhen-2 prediction" 121 | sift: 122 | type: string 123 | description: "SIFT prediction" 124 | sources: 125 | type: array 126 | items: 127 | type: string 128 | description: "Include only specific data sources" 129 | size: 130 | type: integer 131 | description: "Number of results to return" 132 | default: 40 133 | offset: 134 | type: integer 135 | description: "Result offset for pagination" 136 | default: 0 137 | 138 | PubmedRequest: 139 | type: object 140 | properties: 141 | chemicals: 142 | type: array 143 | items: 144 | type: string 145 | description: "List of chemicals for filtering results." 146 | diseases: 147 | type: array 148 | items: 149 | type: string 150 | description: "Diseases such as Hypertension, Lung Adenocarcinoma, etc." 151 | genes: 152 | type: array 153 | items: 154 | type: string 155 | description: "List of genes for filtering results." 156 | keywords: 157 | type: array 158 | items: 159 | type: string 160 | description: "List of other keywords for filtering results." 161 | variants: 162 | type: array 163 | items: 164 | type: string 165 | description: "List of variants for filtering results." 166 | 167 | tools: 168 | trial_searcher: 169 | input: 170 | schema: 171 | type: object 172 | properties: 173 | query: 174 | $ref: "#/schemas/TrialQuery" 175 | required: ["query"] 176 | 177 | variant_searcher: 178 | input: 179 | schema: 180 | type: object 181 | properties: 182 | query: 183 | $ref: "#/schemas/VariantQuery" 184 | required: ["query"] 185 | 186 | article_searcher: 187 | input: 188 | schema: 189 | type: object 190 | properties: 191 | query: 192 | $ref: "#/schemas/PubmedRequest" 193 | required: ["query"] 194 | 195 | # Simple string parameter functions 196 | trial_protocol: 197 | input: 198 | schema: 199 | type: object 200 | properties: 201 | nct_id: 202 | type: string 203 | description: "A single NCT ID (e.g., NCT04280705)" 204 | required: ["nct_id"] 205 | 206 | trial_locations: 207 | input: 208 | schema: 209 | type: object 210 | properties: 211 | nct_id: 212 | type: string 213 | description: "A single NCT ID (e.g., NCT04280705)" 214 | required: ["nct_id"] 215 | 216 | trial_outcomes: 217 | input: 218 | schema: 219 | type: object 220 | properties: 221 | nct_id: 222 | type: string 223 | description: "A single NCT ID (e.g., NCT04280705)" 224 | required: ["nct_id"] 225 | 226 | trial_references: 227 | input: 228 | schema: 229 | type: object 230 | properties: 231 | nct_id: 232 | type: string 233 | description: "A single NCT ID (e.g., NCT04280705)" 234 | required: ["nct_id"] 235 | 236 | article_details: 237 | input: 238 | schema: 239 | type: object 240 | properties: 241 | pmid: 242 | type: string 243 | description: "A single PubMed ID (e.g., 34397683)" 244 | required: ["pmid"] 245 | 246 | variant_details: 247 | input: 248 | schema: 249 | type: object 250 | properties: 251 | variant_id: 252 | type: string 253 | description: "A variant identifier (e.g., chr7:g.140453136A>T)" 254 | required: ["variant_id"] 255 | ``` -------------------------------------------------------------------------------- 
/tests/tdd/openfda/test_adverse_events.py: -------------------------------------------------------------------------------- ```python 1 | """ 2 | Unit tests for OpenFDA adverse events integration. 3 | """ 4 | 5 | from unittest.mock import patch 6 | 7 | import pytest 8 | 9 | from biomcp.openfda.adverse_events import ( 10 | get_adverse_event, 11 | search_adverse_events, 12 | ) 13 | 14 | 15 | @pytest.mark.asyncio 16 | async def test_search_adverse_events_by_drug(): 17 | """Test searching adverse events by drug name.""" 18 | mock_response = { 19 | "meta": {"results": {"total": 100}}, 20 | "results": [ 21 | { 22 | "patient": { 23 | "drug": [ 24 | { 25 | "medicinalproduct": "IMATINIB", 26 | "openfda": { 27 | "brand_name": ["GLEEVEC"], 28 | "generic_name": ["IMATINIB MESYLATE"], 29 | }, 30 | } 31 | ], 32 | "reaction": [ 33 | {"reactionmeddrapt": "NAUSEA"}, 34 | {"reactionmeddrapt": "FATIGUE"}, 35 | ], 36 | "patientonsetage": "45", 37 | "patientsex": 2, 38 | }, 39 | "serious": "1", 40 | "seriousnesshospitalization": "1", 41 | "receivedate": "20240115", 42 | } 43 | ], 44 | } 45 | 46 | with patch( 47 | "biomcp.openfda.adverse_events.make_openfda_request" 48 | ) as mock_request: 49 | mock_request.return_value = (mock_response, None) 50 | 51 | result = await search_adverse_events(drug="imatinib", limit=10) 52 | 53 | # Verify the request was made correctly 54 | mock_request.assert_called_once() 55 | call_args = mock_request.call_args 56 | assert "imatinib" in call_args[0][1]["search"].lower() 57 | 58 | # Check the output contains expected information 59 | assert "FDA Adverse Event Reports" in result 60 | assert "imatinib" in result.lower() 61 | assert "NAUSEA" in result 62 | assert "FATIGUE" in result 63 | assert "100 reports" in result 64 | 65 | 66 | @pytest.mark.asyncio 67 | async def test_search_adverse_events_by_reaction(): 68 | """Test searching adverse events by reaction.""" 69 | mock_response = { 70 | "meta": {"results": {"total": 50}}, 71 | "results": [ 72 | { 73 | "patient": { 74 | "drug": [{"medicinalproduct": "ASPIRIN"}], 75 | "reaction": [{"reactionmeddrapt": "HEADACHE"}], 76 | }, 77 | "serious": "0", 78 | "receivedate": "20240201", 79 | } 80 | ], 81 | } 82 | 83 | with patch( 84 | "biomcp.openfda.adverse_events.make_openfda_request" 85 | ) as mock_request: 86 | mock_request.return_value = (mock_response, None) 87 | 88 | result = await search_adverse_events(reaction="headache", limit=10) 89 | 90 | # Verify the request 91 | mock_request.assert_called_once() 92 | call_args = mock_request.call_args 93 | assert "headache" in call_args[0][1]["search"].lower() 94 | 95 | # Check output 96 | assert "HEADACHE" in result 97 | assert "50 reports" in result 98 | 99 | 100 | @pytest.mark.asyncio 101 | async def test_search_adverse_events_no_params(): 102 | """Test that searching without parameters returns helpful message.""" 103 | result = await search_adverse_events() 104 | 105 | assert "Please specify" in result 106 | assert "drug name or reaction" in result 107 | assert "Examples:" in result 108 | 109 | 110 | @pytest.mark.asyncio 111 | async def test_search_adverse_events_no_results(): 112 | """Test handling when no results are found.""" 113 | with patch( 114 | "biomcp.openfda.adverse_events.make_openfda_request" 115 | ) as mock_request: 116 | mock_request.return_value = ({"results": []}, None) 117 | 118 | result = await search_adverse_events(drug="nonexistentdrug") 119 | 120 | assert "No adverse event reports found" in result 121 | assert "nonexistentdrug" in result 122 | 123 | 124 | 
@pytest.mark.asyncio 125 | async def test_search_adverse_events_error(): 126 | """Test error handling in adverse event search.""" 127 | with patch( 128 | "biomcp.openfda.adverse_events.make_openfda_request" 129 | ) as mock_request: 130 | mock_request.return_value = (None, "API rate limit exceeded") 131 | 132 | result = await search_adverse_events(drug="aspirin") 133 | 134 | assert "Error searching adverse events" in result 135 | assert "API rate limit exceeded" in result 136 | 137 | 138 | @pytest.mark.asyncio 139 | async def test_get_adverse_event_detail(): 140 | """Test getting detailed adverse event report.""" 141 | mock_response = { 142 | "results": [ 143 | { 144 | "safetyreportid": "12345678", 145 | "patient": { 146 | "patientonsetage": "55", 147 | "patientsex": 1, 148 | "patientweight": "75", 149 | "drug": [ 150 | { 151 | "medicinalproduct": "DRUG A", 152 | "drugindication": "HYPERTENSION", 153 | "drugdosagetext": "100mg daily", 154 | "drugadministrationroute": "048", 155 | "actiondrug": 4, 156 | } 157 | ], 158 | "reaction": [ 159 | {"reactionmeddrapt": "DIZZINESS", "reactionoutcome": 1} 160 | ], 161 | }, 162 | "serious": "1", 163 | "seriousnesshospitalization": "1", 164 | "receivedate": "20240115", 165 | "reporttype": 1, 166 | } 167 | ] 168 | } 169 | 170 | with patch( 171 | "biomcp.openfda.adverse_events.make_openfda_request" 172 | ) as mock_request: 173 | mock_request.return_value = (mock_response, None) 174 | 175 | result = await get_adverse_event("12345678") 176 | 177 | # Verify request 178 | mock_request.assert_called_once() 179 | call_args = mock_request.call_args 180 | assert "12345678" in call_args[0][1]["search"] 181 | 182 | # Check detailed output 183 | assert "12345678" in result 184 | assert "Patient Information" in result 185 | assert "55 years" in result 186 | assert "Male" in result 187 | assert "75 kg" in result 188 | assert "DRUG A" in result 189 | assert "HYPERTENSION" in result 190 | assert "100mg daily" in result 191 | assert "DIZZINESS" in result 192 | assert "Recovered/Resolved" in result 193 | 194 | 195 | @pytest.mark.asyncio 196 | async def test_get_adverse_event_not_found(): 197 | """Test handling when adverse event report is not found.""" 198 | with patch( 199 | "biomcp.openfda.adverse_events.make_openfda_request" 200 | ) as mock_request: 201 | mock_request.return_value = ({"results": []}, None) 202 | 203 | result = await get_adverse_event("NOTFOUND123") 204 | 205 | assert "NOTFOUND123" in result 206 | assert "not found" in result 207 | ``` -------------------------------------------------------------------------------- /src/biomcp/openfda/adverse_events_helpers.py: -------------------------------------------------------------------------------- ```python 1 | """ 2 | Helper functions for OpenFDA adverse events to reduce complexity. 
3 | """ 4 | 5 | from collections import Counter 6 | from typing import Any 7 | 8 | from .utils import ( 9 | extract_drug_names, 10 | extract_reactions, 11 | format_count, 12 | format_drug_list, 13 | ) 14 | 15 | 16 | def format_search_summary( 17 | drug: str | None, reaction: str | None, serious: bool | None, total: int 18 | ) -> list[str]: 19 | """Format the search summary section.""" 20 | output = [] 21 | 22 | # Add search criteria 23 | search_desc = [] 24 | if drug: 25 | search_desc.append(f"**Drug**: {drug}") 26 | if reaction: 27 | search_desc.append(f"**Reaction**: {reaction}") 28 | if serious is not None: 29 | search_desc.append(f"**Serious Events**: {'Yes' if serious else 'No'}") 30 | 31 | if search_desc: 32 | output.append(" | ".join(search_desc)) 33 | output.append( 34 | f"**Total Reports Found**: {format_count(total, 'report')}\n" 35 | ) 36 | 37 | return output 38 | 39 | 40 | def format_top_reactions(results: list[dict[str, Any]]) -> list[str]: 41 | """Format top reported reactions from search results.""" 42 | output = [] 43 | all_reactions = [] 44 | 45 | for result in results: 46 | all_reactions.extend(extract_reactions(result)) 47 | 48 | if all_reactions: 49 | reaction_counts = Counter(all_reactions) 50 | top_reactions = reaction_counts.most_common(10) 51 | 52 | output.append("### Top Reported Reactions:") 53 | for rxn, count in top_reactions: 54 | percentage = (count / len(results)) * 100 55 | output.append(f"- **{rxn}**: {count} reports ({percentage:.1f}%)") 56 | output.append("") 57 | 58 | return output 59 | 60 | 61 | def format_report_summary( 62 | result: dict[str, Any], report_num: int 63 | ) -> list[str]: 64 | """Format a single report summary.""" 65 | output = [f"#### Report {report_num}"] 66 | 67 | # Extract key information 68 | drugs = extract_drug_names(result) 69 | reactions = extract_reactions(result) 70 | 71 | # Patient info 72 | patient = result.get("patient", {}) 73 | age = patient.get("patientonsetage") 74 | sex_map = {0: "Unknown", 1: "Male", 2: "Female"} 75 | sex = sex_map.get(patient.get("patientsex"), "Unknown") 76 | 77 | # Serious outcomes 78 | serious_flag = result.get("serious", "0") 79 | outcomes = [] 80 | for code in [ 81 | "seriousnessdeath", 82 | "seriousnesslifethreatening", 83 | "seriousnesshospitalization", 84 | "seriousnessdisabling", 85 | ]: 86 | if result.get(code) == "1": 87 | outcomes.append(code.replace("seriousness", "").title()) 88 | 89 | # Format output 90 | output.append(f"- **Drugs**: {format_drug_list(drugs)}") 91 | output.append(f"- **Reactions**: {', '.join(reactions[:5])}") 92 | if age: 93 | output.append(f"- **Patient**: {age} years, {sex}") 94 | if serious_flag == "1" and outcomes: 95 | output.append(f"- **Serious Outcome**: {', '.join(outcomes)}") 96 | 97 | # Dates 98 | receive_date = result.get("receivedate", "") 99 | if receive_date: 100 | output.append( 101 | f"- **Report Date**: {receive_date[:4]}-{receive_date[4:6]}-{receive_date[6:]}" 102 | ) 103 | 104 | output.append("") 105 | return output 106 | 107 | 108 | def format_drug_details(drugs: list[dict[str, Any]]) -> list[str]: 109 | """Format drug information details.""" 110 | from .utils import clean_text 111 | 112 | output = ["### Drug Information"] 113 | 114 | for i, drug in enumerate(drugs, 1): 115 | output.append( 116 | f"\n#### Drug {i}: {drug.get('medicinalproduct', 'Unknown')}" 117 | ) 118 | 119 | if "drugindication" in drug: 120 | output.append(f"- **Indication**: {drug['drugindication']}") 121 | 122 | if "drugdosagetext" in drug: 123 | dosage = 
clean_text(drug["drugdosagetext"]) 124 | output.append(f"- **Dosage**: {dosage}") 125 | 126 | if "drugadministrationroute" in drug: 127 | output.append(f"- **Route**: {drug['drugadministrationroute']}") 128 | 129 | # Drug action taken 130 | action_map = { 131 | 1: "Drug withdrawn", 132 | 2: "Dose reduced", 133 | 3: "Dose increased", 134 | 4: "Dose not changed", 135 | 5: "Unknown", 136 | 6: "Not applicable", 137 | } 138 | action_code = drug.get("actiondrug") 139 | action = ( 140 | action_map.get(action_code, "Unknown") 141 | if action_code is not None 142 | else "Unknown" 143 | ) 144 | output.append(f"- **Action Taken**: {action}") 145 | 146 | output.append("") 147 | return output 148 | 149 | 150 | def format_reaction_details(reactions: list[dict[str, Any]]) -> list[str]: 151 | """Format adverse reaction details.""" 152 | output = ["### Adverse Reactions"] 153 | 154 | for reaction in reactions: 155 | rxn_name = reaction.get("reactionmeddrapt", "Unknown") 156 | outcome_map = { 157 | 1: "Recovered/Resolved", 158 | 2: "Recovering/Resolving", 159 | 3: "Not recovered/Not resolved", 160 | 4: "Recovered/Resolved with sequelae", 161 | 5: "Fatal", 162 | 6: "Unknown", 163 | } 164 | outcome_code = reaction.get("reactionoutcome") 165 | outcome = ( 166 | outcome_map.get(outcome_code, "Unknown") 167 | if outcome_code is not None 168 | else "Unknown" 169 | ) 170 | output.append(f"- **{rxn_name}**: {outcome}") 171 | 172 | output.append("") 173 | return output 174 | 175 | 176 | def format_report_metadata(result: dict[str, Any]) -> list[str]: 177 | """Format report metadata information.""" 178 | output = ["### Report Information"] 179 | 180 | receive_date = result.get("receivedate", "") 181 | if receive_date: 182 | formatted_date = ( 183 | f"{receive_date[:4]}-{receive_date[4:6]}-{receive_date[6:]}" 184 | ) 185 | output.append(f"- **Report Date**: {formatted_date}") 186 | 187 | report_type_map = { 188 | 1: "Spontaneous", 189 | 2: "Report from study", 190 | 3: "Other", 191 | 4: "Not available to sender", 192 | } 193 | report_type_code = result.get("reporttype") 194 | report_type = ( 195 | report_type_map.get(report_type_code, "Unknown") 196 | if report_type_code is not None 197 | else "Unknown" 198 | ) 199 | output.append(f"- **Report Type**: {report_type}") 200 | 201 | # Seriousness 202 | if result.get("serious") == "1": 203 | outcomes = [] 204 | if result.get("seriousnessdeath") == "1": 205 | outcomes.append("Death") 206 | if result.get("seriousnesslifethreatening") == "1": 207 | outcomes.append("Life-threatening") 208 | if result.get("seriousnesshospitalization") == "1": 209 | outcomes.append("Hospitalization") 210 | if result.get("seriousnessdisabling") == "1": 211 | outcomes.append("Disability") 212 | if result.get("seriousnesscongenitalanomali") == "1": 213 | outcomes.append("Congenital anomaly") 214 | if result.get("seriousnessother") == "1": 215 | outcomes.append("Other serious") 216 | 217 | if outcomes: 218 | output.append(f"- **Serious Outcomes**: {', '.join(outcomes)}") 219 | 220 | return output 221 | ``` -------------------------------------------------------------------------------- /docs/blog/researcher-persona-resource.md: -------------------------------------------------------------------------------- ```markdown 1 | # BioMCP Deep Researcher Persona 2 | 3 | With the release of BioMCP v0.1.2, users can now access a specialized 4 | Researcher Persona that transforms Claude into a rigorous biomedical research 5 | assistant using BioMCP's built-in sequential thinking capabilities. 
6 | 7 | This persona is designed to leverage BioMCP's suite of tools for accessing 8 | PubMed articles, ClinicalTrials.gov data, and genomic variant information, 9 | while incorporating Claude's web search capabilities to produce comprehensive, 10 | thoroughly-researched reports. 11 | 12 | ## How to Use the Researcher Persona 13 | 14 | Getting started with the BioMCP Researcher Persona is straightforward: 15 | 16 | 1. Configure Claude Desktop by updating your configuration JSON with: 17 | 18 | ```json 19 | { 20 | "mcpServers": { 21 | "biomcp": { 22 | "command": "uv", 23 | "args": ["run", "--with", "biomcp-python>=0.1.2", "biomcp", "run"] 24 | } 25 | } 26 | } 27 | ``` 28 | 29 | 2. Restart Claude Desktop (the `>=0.1.2` ensures the latest version is used, which includes the built-in think tool) 30 | 31 | 3. Select the "Researcher" persona from the dropdown menu 32 |  33 | 34 | 4. Ask your biomedical research question 35 | 36 | The Researcher Persona will then work through its 10-step process, keeping you 37 | updated on its progress and ultimately producing a comprehensive research 38 | brief. 39 | 40 | ## Video Demonstration 41 | 42 | Below is a video demonstrating the Researcher Persona in action: 43 | 44 | [](https://youtu.be/tBGG53O-7Hg) 45 | 46 | ## Sequential Thinking: A Rigorous 10-Step Research Process 47 | 48 | What makes the Researcher Persona so powerful is its integration with BioMCP's 49 | built-in 'think' tool, which guides the AI through a comprehensive 50 | 10-step research methodology: 51 | 52 | 1. **Topic Scoping & Domain Framework**: Creating a comprehensive structure to 53 | ensure complete coverage 54 | 2. **Initial Information Gathering**: Establishing baseline terminology and 55 | recent developments 56 | 3. **Focused & Frontier Retrieval**: Filling knowledge gaps and identifying 57 | cutting-edge developments 58 | 4. **Primary Trials Analysis**: Identifying and analyzing key clinical trials 59 | 5. **Primary Literature Analysis**: Identifying and analyzing pivotal 60 | publications 61 | 6. **Initial Evidence Synthesis**: Creating a preliminary framework of findings 62 | 7. **Integrated Gap-Filling**: Addressing identified knowledge gaps 63 | 8. **Comprehensive Evidence Synthesis**: Creating a final integrated framework 64 | with quality assessment 65 | 9. **Self-Critique and Verification**: Rigorously assessing the quality and 66 | comprehensiveness 67 | 10. **Research Brief Creation**: Producing the final deliverable with all 68 | required elements 69 | 70 | [](https://github.com/genomoncology/biomcp/blob/main/src/biomcp/resources/researcher.md) 71 | 72 | This structured approach ensures that no important aspects of the research 73 | question are overlooked and that the final output is comprehensive, 74 | well-organized, and backed by current evidence. 75 | 76 | ## Put to the Test: Emerging Treatment Strategies for Head and Neck Cancer 77 | 78 | To evaluate the effectiveness of the Researcher Persona, we conducted a 79 | head-to-head comparison with other AI research approaches. We asked the same 80 | question to five different systems: "What are the emerging treatment strategies 81 | for head and neck cancer?" 82 | 83 | The results were impressive. The BioMCP-powered Researcher Persona, combined 84 | with Claude's web search capabilities and the built-in think tool, 85 | produced the highest-rated research brief among all approaches tested. 
86 | 87 | [](https://github.com/genomoncology/biomcp-examples#researcher-announcement) 88 | 89 | The research brief produced by the BioMCP Researcher Persona stood out for 90 | several reasons: 91 | 92 | 1. **Comprehensive domain coverage**: The report covered all relevant treatment 93 | modalities (immunotherapy, targeted therapy, radiation techniques, surgery, 94 | combination approaches) 95 | 2. **Structured evidence categorization**: Findings were clearly organized by 96 | level of evidence (Established, Emerging, Experimental, Theoretical) 97 | 3. **Evidence quality assessment**: The brief included critical evaluation of 98 | source quality and evidence strength 99 | 4. **Thorough citation**: All claims were backed by specific references to 100 | scientific literature or clinical trials 101 | 5. **Self-critique**: The report included transparent limitations and 102 | identified areas requiring further research 103 | 104 | ## Explore the Example and Evaluations 105 | 106 | We've documented this comparison in detail in 107 | the [biomcp-examples repository](https://github.com/genomoncology/biomcp-examples), 108 | where you can find: 109 | 110 | - The full research briefs produced by each approach 111 | - Independent evaluations by three different AI judges (Claude 3.7, Gemini 2.5 112 | Pro, and OpenAI o3) 113 | - Detailed scoring against a rubric that prioritizes accuracy, clarity, and 114 | comprehensiveness 115 | - Analysis of strengths and weaknesses of each approach 116 | 117 | The consensus among the judges placed the BioMCP-powered brief at the top, 118 | highlighting its exceptional structure, evidence-based approach, and 119 | comprehensive coverage. 120 | 121 | ## Beyond the Example: Wide-Ranging Applications 122 | 123 | While our example focused on head and neck cancer treatments, the BioMCP 124 | Researcher Persona can tackle a wide range of biomedical research questions: 125 | 126 | - **Therapeutic comparisons**: "Compare the efficacy and safety profiles of JAK 127 | inhibitors versus biologics for treating rheumatoid arthritis" 128 | - **Disease mechanisms**: "What is the current understanding of gut microbiome 129 | dysbiosis in inflammatory bowel disease?" 130 | - **Biomarker investigations**: "What emerging biomarkers show promise for 131 | early detection of pancreatic cancer?" 132 | - **Treatment protocols**: "What are the latest guidelines for managing 133 | anticoagulation in patients with atrial fibrillation and chronic kidney 134 | disease?" 135 | 136 | ## Join the BioMCP Community 137 | 138 | The Researcher Persona is just one example of how BioMCP is transforming 139 | AI-assisted biomedical research. We invite you to: 140 | 141 | 1. Try the Researcher Persona with your own research questions 142 | 2. Contribute to 143 | the [biomcp-examples repository](https://github.com/genomoncology/biomcp-examples) 144 | with your experiments 145 | 3. Share your feedback and suggestions for future improvements 146 | 147 | By combining specialized biomedical data access with structured research 148 | methodologies, BioMCP is helping researchers produce more comprehensive, 149 | accurate, and useful biomedical research briefs than ever before. 150 | 151 | Have a complex biomedical research question? Give the BioMCP Researcher Persona 152 | a try and experience the difference a structured, tool-powered approach can 153 | make! 
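_Postscript for the programmatically inclined:_ the same kind of variant lookup the persona performs through its tools can also be called directly from Python. The sketch below is illustrative only — the module path and keyword arguments are taken from the project's own test suite rather than a documented public API, so treat them as assumptions that may change between releases.

```python
# Illustrative only: calling one of BioMCP's retrieval helpers directly.
# Import path and arguments mirror the project's tests; they are assumptions,
# not a stable public API.
import asyncio

from biomcp.variants import getter


async def main() -> None:
    # Fetch annotations for the BRAF V600E rsID, requesting hg38 coordinates
    # and the external annotation sources (TCGA, cBioPortal, etc.).
    report = await getter.get_variant(
        "rs113488022",
        assembly="hg38",
        include_external=True,
    )
    print(report)


if __name__ == "__main__":
    asyncio.run(main())
```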
154 | ``` -------------------------------------------------------------------------------- /mkdocs.yml: -------------------------------------------------------------------------------- ```yaml 1 | site_name: BioMCP 2 | repo_url: https://github.com/genomoncology/biomcp 3 | site_url: https://biomcp.org/ 4 | site_description: Biomedical Model Context Protocol Server 5 | site_author: Ian Maurer 6 | edit_uri: edit/main/docs/ 7 | repo_name: genomoncology/biomcp 8 | copyright: Maintained by <a href="https://genomoncology.com">genomoncology</a>. 9 | 10 | nav: 11 | - Home: index.md 12 | 13 | - Getting Started: 14 | - Quick Start: getting-started/01-quickstart-cli.md 15 | - Claude Desktop: getting-started/02-claude-desktop-integration.md 16 | - API Keys: getting-started/03-authentication-and-api-keys.md 17 | - FAQ: faq-condensed.md 18 | - Troubleshooting: troubleshooting.md 19 | 20 | - User Guide: 21 | - Overview: concepts/01-what-is-biomcp.md 22 | - Finding Articles: how-to-guides/01-find-articles-and-cbioportal-data.md 23 | - Finding Trials: how-to-guides/02-find-trials-with-nci-and-biothings.md 24 | - Analyzing Variants: how-to-guides/03-get-comprehensive-variant-annotations.md 25 | - Predicting Effects: how-to-guides/04-predict-variant-effects-with-alphagenome.md 26 | - Searching Organizations: how-to-guides/06-search-nci-organizations-and-interventions.md 27 | - Research Workflows: workflows/all-workflows.md 28 | - Examples: 29 | - Pydantic AI Integration: tutorials/pydantic-ai-integration.md 30 | - Remote Connection: tutorials/remote-connection.md 31 | - BioThings Examples: tutorials/biothings-prompts.md 32 | - NCI Examples: tutorials/nci-prompts.md 33 | - AlphaGenome Tutorial: tutorials/claude-code-biomcp-alphagenome.md 34 | - OpenFDA Examples: tutorials/openfda-prompts.md 35 | - Concepts: 36 | - Deep Researcher: concepts/02-the-deep-researcher-persona.md 37 | - Sequential Thinking: concepts/03-sequential-thinking-with-the-think-tool.md 38 | 39 | - Reference: 40 | - Quick Reference: reference/quick-reference.md 41 | - CLI Commands: user-guides/01-command-line-interface.md 42 | - MCP Tools: user-guides/02-mcp-tools-reference.md 43 | - API Documentation: 44 | - API Overview: apis/overview.md 45 | - Python SDK: apis/python-sdk.md 46 | - Error Codes: apis/error-codes.md 47 | - IDE Integration: user-guides/03-integrating-with-ides-and-clients.md 48 | 49 | - Developer: 50 | - Architecture: 51 | - Overview: reference/quick-architecture.md 52 | - Visual Diagrams: reference/visual-architecture.md 53 | - Detailed Diagrams: reference/architecture-diagrams.md 54 | - Data Sources: 55 | - Overview: backend-services-reference/01-overview.md 56 | - PubTator3/PubMed: backend-services-reference/06-pubtator3.md 57 | - ClinicalTrials.gov: backend-services-reference/04-clinicaltrials-gov.md 58 | - NCI CTS API: backend-services-reference/05-nci-cts-api.md 59 | - BioThings Suite: backend-services-reference/02-biothings-suite.md 60 | - cBioPortal: backend-services-reference/03-cbioportal.md 61 | - AlphaGenome: backend-services-reference/07-alphagenome.md 62 | - OpenFDA: tutorials/openfda-integration.md 63 | - Development: 64 | - Contributing: developer-guides/02-contributing-and-testing.md 65 | - Deployment: developer-guides/01-server-deployment.md 66 | - BigQuery Monitoring: how-to-guides/05-logging-and-monitoring-with-bigquery.md 67 | - Technical Details: 68 | - Transport Protocol: developer-guides/04-transport-protocol.md 69 | - Error Handling: developer-guides/05-error-handling.md 70 | - HTTP Client: 
developer-guides/06-http-client-and-caching.md 71 | - Performance: developer-guides/07-performance-optimizations.md 72 | - Third-Party APIs: developer-guides/03-third-party-endpoints.md 73 | - Security: 74 | - FDA Integration Security: FDA_SECURITY.md 75 | 76 | - About: 77 | - Blog: 78 | - Clinical Trial Search: blog/ai-assisted-clinical-trial-search-analysis.md 79 | - Researcher Persona: blog/researcher-persona-resource.md 80 | - Project: 81 | - Changelog: changelog.md 82 | - Policies: policies.md 83 | - GenomOncology: genomoncology.md 84 | 85 | plugins: 86 | - search: 87 | lang: en 88 | separator: '[\s\-\.]+' 89 | - mkdocstrings: 90 | handlers: 91 | python: 92 | paths: ["src/biomcp"] 93 | # Note: sitemap plugin requires additional installation 94 | # Uncomment after installing: pip install mkdocs-sitemap 95 | # - sitemap: 96 | # changefreq: weekly 97 | # priority: 0.5 98 | theme: 99 | name: material 100 | # custom_dir: overrides 101 | favicon: assets/favicon.ico 102 | logo: assets/icon.png 103 | features: 104 | - navigation.tabs 105 | - navigation.tabs.sticky 106 | - navigation.sections 107 | - navigation.instant 108 | - navigation.tracking 109 | - navigation.top 110 | - toc.follow 111 | - search.suggest 112 | - search.highlight 113 | palette: 114 | - media: "(prefers-color-scheme: light)" 115 | scheme: default 116 | primary: white 117 | accent: deep orange 118 | toggle: 119 | icon: material/brightness-7 120 | name: Switch to dark mode 121 | - media: "(prefers-color-scheme: dark)" 122 | scheme: slate 123 | primary: black 124 | accent: deep orange 125 | toggle: 126 | icon: material/brightness-4 127 | name: Switch to light mode 128 | icon: 129 | repo: fontawesome/brands/github 130 | 131 | extra: 132 | social: 133 | - icon: fontawesome/brands/github 134 | link: https://github.com/genomoncology/biomcp 135 | - icon: fontawesome/brands/python 136 | link: https://pypi.org/project/biomcp-python 137 | meta: 138 | - property: og:type 139 | content: website 140 | - property: og:title 141 | content: BioMCP - Biomedical Model Context Protocol Server 142 | - property: og:description 143 | content: AI-powered biomedical research tool integrating PubMed, ClinicalTrials.gov, and genomic databases 144 | - property: og:image 145 | content: https://biomcp.org/assets/icon.png 146 | - property: og:url 147 | content: https://biomcp.org/ 148 | - name: twitter:card 149 | content: summary 150 | - name: twitter:title 151 | content: BioMCP - Biomedical Model Context Protocol 152 | - name: twitter:description 153 | content: AI-powered biomedical research tool for PubMed, clinical trials, and genomic data 154 | - name: keywords 155 | content: biomedical, MCP, AI, PubMed, clinical trials, genomics, bioinformatics, Claude Desktop 156 | 157 | extra_css: 158 | - stylesheets/extra.css 159 | - stylesheets/announcement.css 160 | 161 | # extra_javascript: (removed - no third-party dependencies) 162 | markdown_extensions: 163 | - toc: 164 | permalink: true 165 | - pymdownx.arithmatex: 166 | generic: true 167 | - admonition # Nice looking note/warning boxes 168 | - pymdownx.details # Collapsible sections 169 | - pymdownx.highlight: # Code highlighting 170 | anchor_linenums: true 171 | - pymdownx.inlinehilite 172 | - pymdownx.snippets # Include content from other files 173 | - pymdownx.superfences # Nested code blocks 174 | - pymdownx.tabbed: # Tabbed content 175 | alternate_style: true 176 | ``` -------------------------------------------------------------------------------- /tests/tdd/variants/test_getter.py: 
-------------------------------------------------------------------------------- ```python 1 | """Tests for variant getter module.""" 2 | 3 | from unittest.mock import AsyncMock, patch 4 | 5 | import pytest 6 | 7 | from biomcp.constants import DEFAULT_ASSEMBLY 8 | from biomcp.variants import getter 9 | 10 | 11 | class TestGetVariant: 12 | """Test the get_variant function.""" 13 | 14 | @pytest.mark.asyncio 15 | async def test_get_variant_default_assembly(self): 16 | """Test that get_variant defaults to hg19 assembly.""" 17 | mock_response = { 18 | "_id": "rs113488022", 19 | "dbsnp": {"rsid": "rs113488022"}, 20 | } 21 | 22 | with patch("biomcp.http_client.request_api") as mock_request: 23 | mock_request.return_value = (mock_response, None) 24 | 25 | await getter.get_variant("rs113488022") 26 | 27 | # Verify assembly parameter was passed with default value 28 | call_args = mock_request.call_args 29 | assert call_args[1]["request"]["assembly"] == "hg19" 30 | 31 | @pytest.mark.asyncio 32 | async def test_get_variant_hg38_assembly(self): 33 | """Test that get_variant accepts hg38 assembly parameter.""" 34 | mock_response = { 35 | "_id": "rs113488022", 36 | "dbsnp": {"rsid": "rs113488022"}, 37 | } 38 | 39 | with patch("biomcp.http_client.request_api") as mock_request: 40 | mock_request.return_value = (mock_response, None) 41 | 42 | await getter.get_variant("rs113488022", assembly="hg38") 43 | 44 | # Verify assembly parameter was passed correctly 45 | call_args = mock_request.call_args 46 | assert call_args[1]["request"]["assembly"] == "hg38" 47 | 48 | @pytest.mark.asyncio 49 | async def test_get_variant_hg19_assembly(self): 50 | """Test that get_variant accepts hg19 assembly parameter explicitly.""" 51 | mock_response = { 52 | "_id": "rs113488022", 53 | "dbsnp": {"rsid": "rs113488022"}, 54 | } 55 | 56 | with patch("biomcp.http_client.request_api") as mock_request: 57 | mock_request.return_value = (mock_response, None) 58 | 59 | await getter.get_variant("rs113488022", assembly="hg19") 60 | 61 | # Verify assembly parameter was passed correctly 62 | call_args = mock_request.call_args 63 | assert call_args[1]["request"]["assembly"] == "hg19" 64 | 65 | @pytest.mark.asyncio 66 | async def test_get_variant_includes_all_fields(self): 67 | """Test that request includes all required fields.""" 68 | mock_response = {"_id": "rs113488022"} 69 | 70 | with patch("biomcp.http_client.request_api") as mock_request: 71 | mock_request.return_value = (mock_response, None) 72 | 73 | await getter.get_variant("rs113488022", assembly="hg38") 74 | 75 | # Verify both fields and assembly are in request 76 | call_args = mock_request.call_args 77 | request_params = call_args[1]["request"] 78 | assert "fields" in request_params 79 | assert request_params["fields"] == "all" 80 | assert "assembly" in request_params 81 | assert request_params["assembly"] == "hg38" 82 | 83 | @pytest.mark.asyncio 84 | async def test_get_variant_with_external_annotations(self): 85 | """Test that assembly parameter works with external annotations.""" 86 | from biomcp.variants.external import EnhancedVariantAnnotation 87 | 88 | mock_response = { 89 | "_id": "rs113488022", 90 | "dbsnp": {"rsid": "rs113488022"}, 91 | "dbnsfp": {"genename": "BRAF"}, 92 | } 93 | 94 | with ( 95 | patch("biomcp.http_client.request_api") as mock_request, 96 | patch( 97 | "biomcp.variants.getter.ExternalVariantAggregator" 98 | ) as mock_aggregator, 99 | ): 100 | mock_request.return_value = (mock_response, None) 101 | 102 | # Mock the aggregator with proper 
EnhancedVariantAnnotation 103 | mock_enhanced = EnhancedVariantAnnotation( 104 | variant_id="rs113488022", 105 | tcga=None, 106 | thousand_genomes=None, 107 | cbioportal=None, 108 | error_sources=[], 109 | ) 110 | 111 | mock_agg_instance = AsyncMock() 112 | mock_agg_instance.get_enhanced_annotations = AsyncMock( 113 | return_value=mock_enhanced 114 | ) 115 | mock_aggregator.return_value = mock_agg_instance 116 | 117 | await getter.get_variant( 118 | "rs113488022", 119 | assembly="hg38", 120 | include_external=True, 121 | ) 122 | 123 | # Verify assembly was still passed correctly 124 | call_args = mock_request.call_args 125 | assert call_args[1]["request"]["assembly"] == "hg38" 126 | 127 | 128 | class TestVariantDetailsMCPTool: 129 | """Test the _variant_details MCP tool.""" 130 | 131 | @pytest.mark.asyncio 132 | async def test_variant_details_default_assembly(self): 133 | """Test that _variant_details defaults to hg19 assembly.""" 134 | with patch("biomcp.variants.getter.get_variant") as mock_get: 135 | mock_get.return_value = "Variant details" 136 | 137 | await getter._variant_details( 138 | call_benefit="Testing default assembly", 139 | variant_id="rs113488022", 140 | ) 141 | 142 | # Verify get_variant was called with default assembly 143 | mock_get.assert_called_once_with( 144 | "rs113488022", 145 | output_json=False, 146 | include_external=True, 147 | assembly=DEFAULT_ASSEMBLY, 148 | ) 149 | 150 | @pytest.mark.asyncio 151 | async def test_variant_details_custom_assembly(self): 152 | """Test that _variant_details accepts custom assembly parameter.""" 153 | with patch("biomcp.variants.getter.get_variant") as mock_get: 154 | mock_get.return_value = "Variant details" 155 | 156 | await getter._variant_details( 157 | call_benefit="Testing hg38 assembly", 158 | variant_id="rs113488022", 159 | assembly="hg38", 160 | ) 161 | 162 | # Verify get_variant was called with hg38 163 | mock_get.assert_called_once_with( 164 | "rs113488022", 165 | output_json=False, 166 | include_external=True, 167 | assembly="hg38", 168 | ) 169 | 170 | @pytest.mark.asyncio 171 | async def test_variant_details_with_all_params(self): 172 | """Test that all parameters are passed through correctly.""" 173 | with patch("biomcp.variants.getter.get_variant") as mock_get: 174 | mock_get.return_value = "Variant details" 175 | 176 | await getter._variant_details( 177 | call_benefit="Testing all parameters", 178 | variant_id="chr7:g.140453136A>T", 179 | include_external=False, 180 | assembly="hg19", 181 | ) 182 | 183 | # Verify all params were passed 184 | mock_get.assert_called_once_with( 185 | "chr7:g.140453136A>T", 186 | output_json=False, 187 | include_external=False, 188 | assembly="hg19", 189 | ) 190 | ``` -------------------------------------------------------------------------------- /docs/developer-guides/04-transport-protocol.md: -------------------------------------------------------------------------------- ```markdown 1 | # Transport Protocol Guide 2 | 3 | This guide explains BioMCP's transport protocol options, with a focus on the new Streamable HTTP transport that provides better scalability and reliability for production deployments. 
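To make the sections below concrete, here is a minimal sketch of a single streamable HTTP exchange from a client's perspective. It assumes a server started locally with `biomcp run --mode streamable_http`, uses `httpx` purely for illustration, and the session ID is a made-up placeholder — real clients also send full protocol details in the `initialize` parameters.

```python
# Minimal illustration of a streamable HTTP request (not an official client).
# Assumptions: server running on localhost:8000, httpx installed, placeholder session_id.
import httpx

payload = {
    "jsonrpc": "2.0",
    "id": 1,
    "method": "initialize",
    "params": {},  # real clients include protocol version and client info here
}

# The session_id query parameter keeps related requests on the same session.
response = httpx.post(
    "http://localhost:8000/mcp",
    params={"session_id": "abc123"},
    json=payload,
    timeout=30,
)
print(response.status_code)
# Quick operations come back as JSON; long-running ones are streamed as SSE.
print(response.headers.get("content-type"))
```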
4 | 5 | ## Overview 6 | 7 | BioMCP supports multiple transport protocols to accommodate different deployment scenarios: 8 | 9 | | Transport | Use Case | Endpoint | Protocol Version | 10 | | ------------------- | -------------------------------------------- | -------- | ---------------- | 11 | | **STDIO** | Local development, direct Claude integration | N/A | All | 12 | | **Worker/SSE** | Legacy cloud deployments | `/sse` | Pre-2025 | 13 | | **Streamable HTTP** | Modern cloud deployments | `/mcp` | 2025-03-26+ | 14 | 15 | ## Streamable HTTP Transport 16 | 17 | ### What is Streamable HTTP? 18 | 19 | Streamable HTTP is the latest MCP transport protocol (specification version 2025-03-26) that provides: 20 | 21 | - **Single endpoint** (`/mcp`) for all operations 22 | - **Dynamic response modes**: JSON for quick operations, SSE for long-running tasks 23 | - **Session management** via `session_id` query parameter 24 | - **Better scalability**: No permanent connections required 25 | - **Automatic reconnection** and session recovery 26 | 27 | ### Architecture 28 | 29 | The Streamable HTTP transport follows this flow: 30 | 31 | 1. **MCP Client** sends POST request to `/mcp` endpoint 32 | 2. **BioMCP Server** processes the request 33 | 3. **Response Type** determined by operation: 34 | - Quick operations return JSON response 35 | - Long operations return SSE stream 36 | 4. **Session Management** maintains state via session_id parameter 37 | 38 | ### Implementation Details 39 | 40 | BioMCP leverages FastMCP's native streamable HTTP support: 41 | 42 | ```python 43 | # In core.py 44 | mcp_app = FastMCP( 45 | name="BioMCP", 46 | stateless_http=True, # Enables streamable HTTP 47 | ) 48 | ``` 49 | 50 | The transport is automatically handled by FastMCP 1.12.3+, providing: 51 | 52 | - Request routing 53 | - Session management 54 | - Response type negotiation 55 | - Error handling 56 | 57 | ## Migration Guide 58 | 59 | ### From SSE to Streamable HTTP 60 | 61 | If you're currently using the legacy SSE transport, migrate to streamable HTTP: 62 | 63 | #### 1. Update Server Configuration 64 | 65 | **Before (SSE/Worker mode):** 66 | 67 | ```bash 68 | biomcp run --mode worker 69 | ``` 70 | 71 | **After (Streamable HTTP):** 72 | 73 | ```bash 74 | biomcp run --mode streamable_http 75 | ``` 76 | 77 | #### 2. Update Client Configuration 78 | 79 | **MCP Inspector:** 80 | 81 | ```bash 82 | npx @modelcontextprotocol/inspector uv run --with . biomcp run --mode streamable_http 83 | ``` 84 | 85 | **Claude Desktop Configuration:** 86 | 87 | ```json 88 | { 89 | "mcpServers": { 90 | "biomcp": { 91 | "command": "docker", 92 | "args": [ 93 | "run", 94 | "-p", 95 | "8000:8000", 96 | "biomcp:latest", 97 | "biomcp", 98 | "run", 99 | "--mode", 100 | "streamable_http" 101 | ] 102 | } 103 | } 104 | } 105 | ``` 106 | 107 | #### 3. 
Update Cloudflare Worker 108 | 109 | The worker now supports both GET (legacy SSE) and POST (streamable HTTP) on the `/mcp` endpoint: 110 | 111 | ```javascript 112 | // Automatically routes based on method 113 | .get("/mcp", async (c) => { 114 | // Legacy SSE transport 115 | }) 116 | .post("/mcp", async (c) => { 117 | // Streamable HTTP transport 118 | }) 119 | ``` 120 | 121 | ### Backward Compatibility 122 | 123 | All legacy endpoints remain functional: 124 | 125 | - `/sse` - Server-sent events transport 126 | - `/health` - Health check endpoint 127 | - `/events` - Event streaming endpoint 128 | 129 | ## Configuration Options 130 | 131 | ### Server Modes 132 | 133 | ```bash 134 | # Local development (STDIO) 135 | biomcp run 136 | 137 | # Legacy SSE transport 138 | biomcp run --mode worker 139 | 140 | # Modern streamable HTTP 141 | biomcp run --mode streamable_http --host 0.0.0.0 --port 8000 142 | ``` 143 | 144 | ### Environment Variables 145 | 146 | | Variable | Description | Default | 147 | | --------------- | ----------------------- | ------- | 148 | | `MCP_TRANSPORT` | Override transport mode | None | 149 | | `MCP_HOST` | Server bind address | 0.0.0.0 | 150 | | `MCP_PORT` | Server port | 8000 | 151 | 152 | ## Session Management 153 | 154 | Streamable HTTP uses session IDs to maintain state across requests: 155 | 156 | ```http 157 | POST /mcp?session_id=abc123 HTTP/1.1 158 | Content-Type: application/json 159 | 160 | { 161 | "jsonrpc": "2.0", 162 | "method": "initialize", 163 | "params": {...} 164 | } 165 | ``` 166 | 167 | Sessions are: 168 | 169 | - Created automatically on first request 170 | - Maintained in server memory 171 | - Cleaned up after inactivity timeout 172 | - Isolated between different clients 173 | 174 | ## Performance Considerations 175 | 176 | ### Response Mode Selection 177 | 178 | The server automatically selects the optimal response mode: 179 | 180 | | Operation Type | Response Mode | Example | 181 | | ----------------- | ------------- | ---------------------- | 182 | | Quick queries | JSON | `search(limit=10)` | 183 | | Large results | SSE | `search(limit=1000)` | 184 | | Real-time updates | SSE | Thinking tool progress | 185 | 186 | ### Optimization Tips 187 | 188 | 1. **Use session IDs** for related requests to avoid re-initialization 189 | 2. **Batch operations** when possible to reduce round trips 190 | 3. **Set appropriate timeouts** for long-running operations 191 | 4. **Monitor response times** to identify bottlenecks 192 | 193 | ## Troubleshooting 194 | 195 | ### Common Issues 196 | 197 | #### 1. Connection Refused 198 | 199 | ``` 200 | Error: connect ECONNREFUSED 127.0.0.1:8000 201 | ``` 202 | 203 | **Solution**: Ensure server is running with `--host 0.0.0.0` for Docker deployments. 204 | 205 | #### 2. Session Not Found 206 | 207 | ``` 208 | Error: Session 'xyz' not found 209 | ``` 210 | 211 | **Solution**: Session may have expired. Omit session_id to create new session. 212 | 213 | #### 3. Timeout on Large Results 214 | 215 | ``` 216 | Error: Request timeout after 30s 217 | ``` 218 | 219 | **Solution**: Increase client timeout or reduce result size with `limit` parameter. 220 | 221 | ### Debug Mode 222 | 223 | Enable debug logging to troubleshoot transport issues: 224 | 225 | ```bash 226 | LOG_LEVEL=debug biomcp run --mode streamable_http 227 | ``` 228 | 229 | ## Security Considerations 230 | 231 | ### Authentication 232 | 233 | BioMCP does not implement authentication at the transport layer. 
Secure your deployment using: 234 | 235 | - **API Gateway**: AWS API Gateway, Kong, etc. 236 | - **Reverse Proxy**: Nginx with auth modules 237 | - **Cloud IAM**: Platform-specific access controls 238 | 239 | ### Rate Limiting 240 | 241 | Implement rate limiting at the infrastructure layer: 242 | 243 | ```nginx 244 | # Nginx example 245 | limit_req_zone $binary_remote_addr zone=mcp:10m rate=10r/s; 246 | 247 | location /mcp { 248 | limit_req zone=mcp burst=20; 249 | proxy_pass http://biomcp:8000; 250 | } 251 | ``` 252 | 253 | ### CORS Configuration 254 | 255 | For browser-based clients, configure CORS headers: 256 | 257 | ```python 258 | # Handled automatically by FastMCP when stateless_http=True 259 | ``` 260 | 261 | ## Monitoring 262 | 263 | ### Health Checks 264 | 265 | ```bash 266 | # Check server health 267 | curl http://localhost:8000/health 268 | 269 | # Response 270 | {"status": "ok", "transport": "streamable_http"} 271 | ``` 272 | 273 | ### Metrics 274 | 275 | Monitor these key metrics: 276 | 277 | - Request rate on `/mcp` endpoint 278 | - Response time percentiles (p50, p95, p99) 279 | - Session count and duration 280 | - Error rate by error type 281 | 282 | ## Next Steps 283 | 284 | - Review [MCP Specification](https://spec.modelcontextprotocol.io) for protocol details 285 | 286 | For questions or issues, please visit our [GitHub repository](https://github.com/genomoncology/biomcp). 287 | ``` -------------------------------------------------------------------------------- /tests/tdd/test_europe_pmc_fetch.py: -------------------------------------------------------------------------------- ```python 1 | """Tests for Europe PMC article fetching via DOI.""" 2 | 3 | import json 4 | from unittest.mock import Mock, patch 5 | 6 | import pytest 7 | 8 | from biomcp.articles.fetch import _article_details, is_doi, is_pmid 9 | from biomcp.articles.preprints import fetch_europe_pmc_article 10 | 11 | 12 | class TestDOIDetection: 13 | """Test DOI and PMID detection functions.""" 14 | 15 | def test_valid_dois(self): 16 | """Test that valid DOIs are correctly identified.""" 17 | valid_dois = [ 18 | "10.1101/2024.01.20.23288905", 19 | "10.1038/nature12373", 20 | "10.1016/j.cell.2023.05.001", 21 | "10.1126/science.abc1234", 22 | ] 23 | for doi in valid_dois: 24 | assert ( 25 | is_doi(doi) is True 26 | ), f"Expected {doi} to be identified as DOI" 27 | assert ( 28 | is_pmid(doi) is False 29 | ), f"Expected {doi} NOT to be identified as PMID" 30 | 31 | def test_valid_pmids(self): 32 | """Test that valid PMIDs are correctly identified.""" 33 | valid_pmids = [ 34 | "35271234", 35 | "12345678", 36 | "1", 37 | "999999999", 38 | ] 39 | for pmid in valid_pmids: 40 | assert ( 41 | is_pmid(pmid) is True 42 | ), f"Expected {pmid} to be identified as PMID" 43 | assert ( 44 | is_doi(pmid) is False 45 | ), f"Expected {pmid} NOT to be identified as DOI" 46 | 47 | def test_invalid_identifiers(self): 48 | """Test that invalid identifiers are rejected by both functions.""" 49 | invalid_ids = [ 50 | "PMC11193658", # PMC ID 51 | "abc123", # Random string 52 | "10.1101", # Incomplete DOI 53 | "nature12373", # DOI without prefix 54 | "", # Empty string 55 | ] 56 | for identifier in invalid_ids: 57 | assert ( 58 | is_doi(identifier) is False 59 | ), f"Expected {identifier} NOT to be identified as DOI" 60 | assert ( 61 | is_pmid(identifier) is False 62 | ), f"Expected {identifier} NOT to be identified as PMID" 63 | 64 | 65 | class TestEuropePMCFetch: 66 | """Test Europe PMC article fetching.""" 67 | 68 | 
@pytest.mark.asyncio 69 | async def test_fetch_europe_pmc_article_success(self): 70 | """Test successful fetch from Europe PMC.""" 71 | # Mock the response 72 | mock_response = Mock() 73 | mock_response.hitCount = 1 74 | mock_response.results = [ 75 | Mock( 76 | id="PPR790987", 77 | source="PPR", 78 | pmid=None, 79 | pmcid=None, 80 | doi="10.1101/2024.01.20.23288905", 81 | title="Test Article Title", 82 | authorString="Author A, Author B, Author C", 83 | journalTitle=None, 84 | pubYear="2024", 85 | firstPublicationDate="2024-01-23", 86 | abstractText="This is the abstract text.", 87 | ) 88 | ] 89 | 90 | with patch( 91 | "biomcp.articles.preprints.http_client.request_api" 92 | ) as mock_request: 93 | mock_request.return_value = (mock_response, None) 94 | 95 | result = await fetch_europe_pmc_article( 96 | "10.1101/2024.01.20.23288905", output_json=True 97 | ) 98 | data = json.loads(result) 99 | 100 | assert len(data) == 1 101 | article = data[0] 102 | assert article["doi"] == "10.1101/2024.01.20.23288905" 103 | assert article["title"] == "Test Article Title" 104 | assert article["journal"] == "Preprint Server (preprint)" 105 | assert article["date"] == "2024-01-23" 106 | assert article["authors"] == ["Author A", "Author B", "Author C"] 107 | assert article["abstract"] == "This is the abstract text." 108 | assert article["source"] == "Europe PMC" 109 | assert article["pmid"] is None 110 | assert "europepmc.org" in article["pmc_url"] 111 | 112 | @pytest.mark.asyncio 113 | async def test_fetch_europe_pmc_article_not_found(self): 114 | """Test fetch when article is not found in Europe PMC.""" 115 | mock_response = Mock() 116 | mock_response.hitCount = 0 117 | mock_response.results = [] 118 | 119 | with patch( 120 | "biomcp.articles.preprints.http_client.request_api" 121 | ) as mock_request: 122 | mock_request.return_value = (mock_response, None) 123 | 124 | result = await fetch_europe_pmc_article( 125 | "10.1101/invalid.doi", output_json=True 126 | ) 127 | data = json.loads(result) 128 | 129 | assert len(data) == 1 130 | assert data[0]["error"] == "Article not found in Europe PMC" 131 | 132 | @pytest.mark.asyncio 133 | async def test_fetch_europe_pmc_article_error(self): 134 | """Test fetch when Europe PMC API returns an error.""" 135 | mock_error = Mock() 136 | mock_error.code = 500 137 | mock_error.message = "Internal Server Error" 138 | 139 | with patch( 140 | "biomcp.articles.preprints.http_client.request_api" 141 | ) as mock_request: 142 | mock_request.return_value = (None, mock_error) 143 | 144 | result = await fetch_europe_pmc_article( 145 | "10.1101/2024.01.20.23288905", output_json=True 146 | ) 147 | data = json.loads(result) 148 | 149 | assert len(data) == 1 150 | assert data[0]["error"] == "Error 500: Internal Server Error" 151 | 152 | 153 | class TestArticleDetailsRouting: 154 | """Test that _article_details correctly routes DOIs to Europe PMC.""" 155 | 156 | @pytest.mark.asyncio 157 | async def test_doi_routes_to_europe_pmc(self): 158 | """Test that DOIs are routed to fetch_europe_pmc_article.""" 159 | test_doi = "10.1101/2024.01.20.23288905" 160 | 161 | with patch( 162 | "biomcp.articles.preprints.fetch_europe_pmc_article" 163 | ) as mock_europe_pmc: 164 | mock_europe_pmc.return_value = "Europe PMC result" 165 | 166 | result = await _article_details("Test", test_doi) 167 | 168 | mock_europe_pmc.assert_called_once_with(test_doi, output_json=True) 169 | assert result == "Europe PMC result" 170 | 171 | @pytest.mark.asyncio 172 | async def test_pmid_routes_to_pubtator(self): 173 | 
"""Test that PMIDs are routed to fetch_articles.""" 174 | test_pmid = "35271234" 175 | 176 | with patch( 177 | "biomcp.articles.fetch.fetch_articles" 178 | ) as mock_fetch_articles: 179 | mock_fetch_articles.return_value = "PubTator result" 180 | 181 | result = await _article_details("Test", test_pmid) 182 | 183 | mock_fetch_articles.assert_called_once_with( 184 | [35271234], full=True, output_json=True 185 | ) 186 | assert result == "PubTator result" 187 | 188 | @pytest.mark.asyncio 189 | async def test_invalid_identifier_returns_error(self): 190 | """Test that invalid identifiers return an error.""" 191 | invalid_id = "PMC12345" 192 | 193 | result = await _article_details("Test", invalid_id) 194 | 195 | data = json.loads(result) 196 | assert len(data) == 1 197 | assert "Invalid identifier format" in data[0]["error"] 198 | assert "PMC12345" in data[0]["error"] 199 | ``` -------------------------------------------------------------------------------- /src/biomcp/workers/worker_entry.js: -------------------------------------------------------------------------------- ```javascript 1 | /** 2 | * BioMCP Worker – Auth‑less version (rev 1.8) 3 | * 4 | * Fix: Added improved error handling and increased timeouts for list requests 5 | */ 6 | 7 | // Server URL will be configured from environment variables 8 | let REMOTE_MCP_SERVER_URL = "http://localhost:8000"; // Default fallback 9 | const DEBUG = true; 10 | 11 | const log = (m) => DEBUG && console.log("[DEBUG]", m); 12 | const CORS = { 13 | "Access-Control-Allow-Origin": "*", 14 | "Access-Control-Allow-Methods": "GET, POST, OPTIONS", 15 | "Access-Control-Allow-Headers": "*", 16 | "Access-Control-Max-Age": "86400", 17 | }; 18 | const json = (o, s = 200) => 19 | new Response(JSON.stringify(o, null, 2), { 20 | status: s, 21 | headers: { "Content-Type": "application/json", ...CORS }, 22 | }); 23 | 24 | let forwardPath = "/messages"; // for proxying JSON‑RPC POSTS (no query) 25 | let resourceEndpoint = null; // full string we echo back (/messages/?sid=…) 26 | 27 | // Track active SSE connections to avoid duplicate connections 28 | const activeConnections = new Map(); 29 | 30 | export default { 31 | async fetch(req, env, ctx) { 32 | // Use environment variable if available, otherwise use the default 33 | REMOTE_MCP_SERVER_URL = env.REMOTE_MCP_SERVER_URL || REMOTE_MCP_SERVER_URL; 34 | 35 | const url = new URL(req.url); 36 | log(`${req.method} ${url.pathname}${url.search}`); 37 | 38 | if (req.method === "OPTIONS") 39 | return new Response(null, { status: 204, headers: CORS }); 40 | if (url.pathname === "/status" || url.pathname === "/debug") 41 | return json({ 42 | worker: "BioMCP-authless", 43 | remote: REMOTE_MCP_SERVER_URL, 44 | forwardPath, 45 | resourceEndpoint, 46 | }); 47 | if (url.pathname === "/sse" || url.pathname === "/events") 48 | return serveSSE(req, ctx); 49 | 50 | if (req.method === "POST") { 51 | const sid = url.searchParams.get("session_id"); 52 | if (!sid) return new Response("Missing session_id", { status: 400 }); 53 | return proxyPost(req, forwardPath, sid); 54 | } 55 | 56 | return new Response("Not found", { status: 404 }); 57 | }, 58 | }; 59 | 60 | async function proxyPost(req, path, sid) { 61 | const body = await req.text(); 62 | const target = `${REMOTE_MCP_SERVER_URL}${path}?session_id=${encodeURIComponent( 63 | sid, 64 | )}`; 65 | 66 | try { 67 | // Parse the request to check if it's a list request that might need a longer timeout 68 | let jsonBody; 69 | try { 70 | jsonBody = JSON.parse(body); 71 | } catch (e) { 72 | // Not 
valid JSON, proceed with normal request 73 | jsonBody = {}; 74 | } 75 | 76 | // Set a longer timeout for list requests that tend to time out 77 | const timeout = 78 | jsonBody.method && 79 | (jsonBody.method === "tools/list" || jsonBody.method === "resources/list") 80 | ? 30000 81 | : 10000; 82 | 83 | // Use AbortController to implement timeout 84 | const controller = new AbortController(); 85 | const timeoutId = setTimeout(() => controller.abort(), timeout); 86 | 87 | log(`Proxying ${jsonBody.method || "request"} with timeout ${timeout}ms`); 88 | 89 | const resp = await fetch(target, { 90 | method: "POST", 91 | headers: { "Content-Type": "application/json" }, 92 | body, 93 | signal: controller.signal, 94 | }); 95 | 96 | clearTimeout(timeoutId); 97 | 98 | // If it's a list request, cache the response for future use 99 | if ( 100 | jsonBody.method && 101 | (jsonBody.method === "tools/list" || jsonBody.method === "resources/list") 102 | ) { 103 | log(`Received response for ${jsonBody.method}`); 104 | } 105 | 106 | return new Response(await resp.text(), { 107 | status: resp.status, 108 | headers: { "Content-Type": "application/json", ...CORS }, 109 | }); 110 | } catch (error) { 111 | log(`POST error: ${error.message}`); 112 | 113 | // For timeout errors, provide a default empty response for list requests 114 | if (error.name === "AbortError") { 115 | try { 116 | const jsonBody = JSON.parse(body); 117 | if (jsonBody.method === "tools/list") { 118 | log("Returning empty tools list due to timeout"); 119 | return new Response( 120 | JSON.stringify({ 121 | jsonrpc: "2.0", 122 | id: jsonBody.id, 123 | result: { tools: [] }, 124 | }), 125 | { 126 | status: 200, 127 | headers: { "Content-Type": "application/json", ...CORS }, 128 | }, 129 | ); 130 | } else if (jsonBody.method === "resources/list") { 131 | log("Returning empty resources list due to timeout"); 132 | return new Response( 133 | JSON.stringify({ 134 | jsonrpc: "2.0", 135 | id: jsonBody.id, 136 | result: { resources: [] }, 137 | }), 138 | { 139 | status: 200, 140 | headers: { "Content-Type": "application/json", ...CORS }, 141 | }, 142 | ); 143 | } 144 | } catch (e) { 145 | // If parsing fails, fall through to default error response 146 | } 147 | } 148 | 149 | return new Response(JSON.stringify({ error: error.message }), { 150 | status: 502, 151 | headers: { "Content-Type": "application/json", ...CORS }, 152 | }); 153 | } 154 | } 155 | 156 | function serveSSE(clientReq, ctx) { 157 | const enc = new TextEncoder(); 158 | let keepalive; 159 | const upstreamCtl = new AbortController(); 160 | 161 | const stream = new ReadableStream({ 162 | async start(ctrl) { 163 | ctrl.enqueue(enc.encode("event: ready\ndata: {}\n\n")); 164 | 165 | clientReq.signal.addEventListener("abort", () => { 166 | clearInterval(keepalive); 167 | upstreamCtl.abort(); 168 | ctrl.close(); 169 | }); 170 | 171 | try { 172 | const u = await fetch(`${REMOTE_MCP_SERVER_URL}/sse`, { 173 | headers: { Accept: "text/event-stream" }, 174 | signal: upstreamCtl.signal, 175 | }); 176 | 177 | if (!u.ok || !u.body) throw new Error(`Upstream SSE ${u.status}`); 178 | const r = u.body.getReader(); 179 | 180 | while (true) { 181 | const { value, done } = await r.read(); 182 | if (done) break; 183 | if (value) { 184 | const text = new TextDecoder().decode(value); 185 | // capture first endpoint once 186 | if (!resourceEndpoint) { 187 | const m = text.match( 188 | /data:\s*(\/messages\/\?session_id=[A-Za-z0-9_-]+)/, 189 | ); 190 | if (m) { 191 | resourceEndpoint = m[1]; 192 | forwardPath = 
resourceEndpoint.split("?")[0]; 193 | log(`Captured endpoint ${resourceEndpoint}`); 194 | ctrl.enqueue( 195 | enc.encode(`event: resource\ndata: ${resourceEndpoint}\n\n`), 196 | ); 197 | } 198 | } 199 | ctrl.enqueue(value); 200 | } 201 | } 202 | } catch (e) { 203 | if (e.name !== "AbortError") { 204 | log(`SSE error: ${e.message}`); 205 | ctrl.enqueue(enc.encode(`event: error\ndata: ${e.message}\n\n`)); 206 | } 207 | } 208 | 209 | // Reduce keepalive interval to 5 seconds to prevent timeouts 210 | keepalive = setInterval(() => { 211 | try { 212 | ctrl.enqueue(enc.encode(":keepalive\n\n")); 213 | } catch (_) { 214 | clearInterval(keepalive); 215 | } 216 | }, 5000); 217 | }, 218 | }); 219 | 220 | return new Response(stream, { 221 | headers: { 222 | "Content-Type": "text/event-stream", 223 | "Cache-Control": "no-cache", 224 | Connection: "keep-alive", 225 | ...CORS, 226 | }, 227 | }); 228 | } 229 | ```
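The worker above expects clients to open the SSE stream first, read the advertised `/messages/?session_id=…` endpoint from the `resource` event, and then POST JSON-RPC calls carrying that `session_id`. As a hedged Python sketch of that second step only (the worker URL is a placeholder and the session ID is assumed to have been captured from the SSE stream already):

```python
# Hedged sketch of a client POSTing through the worker; URL and session_id are placeholders.
import httpx

WORKER_URL = "https://example-worker.example.workers.dev"  # placeholder, not a real deployment
session_id = "abc123"  # in practice, parsed from the SSE "resource" event

# The worker proxies any POST carrying a session_id to the remote MCP server,
# allowing 30s for tools/list and resources/list and 10s for other methods.
request = {
    "jsonrpc": "2.0",
    "id": 1,
    "method": "tools/list",
    "params": {},
}

resp = httpx.post(
    f"{WORKER_URL}/messages",
    params={"session_id": session_id},
    json=request,
    timeout=35,
)
print(resp.json())
```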