This is page 3 of 19. Use http://codebase.md/genomoncology/biomcp?lines=true&page={x} to view the full context. # Directory Structure ``` ├── .github │ ├── actions │ │ └── setup-python-env │ │ └── action.yml │ ├── dependabot.yml │ └── workflows │ ├── ci.yml │ ├── deploy-docs.yml │ ├── main.yml.disabled │ ├── on-release-main.yml │ └── validate-codecov-config.yml ├── .gitignore ├── .pre-commit-config.yaml ├── BIOMCP_DATA_FLOW.md ├── CHANGELOG.md ├── CNAME ├── codecov.yaml ├── docker-compose.yml ├── Dockerfile ├── docs │ ├── apis │ │ ├── error-codes.md │ │ ├── overview.md │ │ └── python-sdk.md │ ├── assets │ │ ├── biomcp-cursor-locations.png │ │ ├── favicon.ico │ │ ├── icon.png │ │ ├── logo.png │ │ ├── mcp_architecture.txt │ │ └── remote-connection │ │ ├── 00_connectors.png │ │ ├── 01_add_custom_connector.png │ │ ├── 02_connector_enabled.png │ │ ├── 03_connect_to_biomcp.png │ │ ├── 04_select_google_oauth.png │ │ └── 05_success_connect.png │ ├── backend-services-reference │ │ ├── 01-overview.md │ │ ├── 02-biothings-suite.md │ │ ├── 03-cbioportal.md │ │ ├── 04-clinicaltrials-gov.md │ │ ├── 05-nci-cts-api.md │ │ ├── 06-pubtator3.md │ │ └── 07-alphagenome.md │ ├── blog │ │ ├── ai-assisted-clinical-trial-search-analysis.md │ │ ├── images │ │ │ ├── deep-researcher-video.png │ │ │ ├── researcher-announce.png │ │ │ ├── researcher-drop-down.png │ │ │ ├── researcher-prompt.png │ │ │ ├── trial-search-assistant.png │ │ │ └── what_is_biomcp_thumbnail.png │ │ └── researcher-persona-resource.md │ ├── changelog.md │ ├── CNAME │ ├── concepts │ │ ├── 01-what-is-biomcp.md │ │ ├── 02-the-deep-researcher-persona.md │ │ └── 03-sequential-thinking-with-the-think-tool.md │ ├── developer-guides │ │ ├── 01-server-deployment.md │ │ ├── 02-contributing-and-testing.md │ │ ├── 03-third-party-endpoints.md │ │ ├── 04-transport-protocol.md │ │ ├── 05-error-handling.md │ │ ├── 06-http-client-and-caching.md │ │ ├── 07-performance-optimizations.md │ │ └── generate_endpoints.py │ ├── faq-condensed.md │ 
├── FDA_SECURITY.md │ ├── genomoncology.md │ ├── getting-started │ │ ├── 01-quickstart-cli.md │ │ ├── 02-claude-desktop-integration.md │ │ └── 03-authentication-and-api-keys.md │ ├── how-to-guides │ │ ├── 01-find-articles-and-cbioportal-data.md │ │ ├── 02-find-trials-with-nci-and-biothings.md │ │ ├── 03-get-comprehensive-variant-annotations.md │ │ ├── 04-predict-variant-effects-with-alphagenome.md │ │ ├── 05-logging-and-monitoring-with-bigquery.md │ │ └── 06-search-nci-organizations-and-interventions.md │ ├── index.md │ ├── policies.md │ ├── reference │ │ ├── architecture-diagrams.md │ │ ├── quick-architecture.md │ │ ├── quick-reference.md │ │ └── visual-architecture.md │ ├── robots.txt │ ├── stylesheets │ │ ├── announcement.css │ │ └── extra.css │ ├── troubleshooting.md │ ├── tutorials │ │ ├── biothings-prompts.md │ │ ├── claude-code-biomcp-alphagenome.md │ │ ├── nci-prompts.md │ │ ├── openfda-integration.md │ │ ├── openfda-prompts.md │ │ ├── pydantic-ai-integration.md │ │ └── remote-connection.md │ ├── user-guides │ │ ├── 01-command-line-interface.md │ │ ├── 02-mcp-tools-reference.md │ │ └── 03-integrating-with-ides-and-clients.md │ └── workflows │ └── all-workflows.md ├── example_scripts │ ├── mcp_integration.py │ └── python_sdk.py ├── glama.json ├── LICENSE ├── lzyank.toml ├── Makefile ├── mkdocs.yml ├── package-lock.json ├── package.json ├── pyproject.toml ├── README.md ├── scripts │ ├── check_docs_in_mkdocs.py │ ├── check_http_imports.py │ └── generate_endpoints_doc.py ├── smithery.yaml ├── src │ └── biomcp │ ├── __init__.py │ ├── __main__.py │ ├── articles │ │ ├── __init__.py │ │ ├── autocomplete.py │ │ ├── fetch.py │ │ ├── preprints.py │ │ ├── search_optimized.py │ │ ├── search.py │ │ └── unified.py │ ├── biomarkers │ │ ├── __init__.py │ │ └── search.py │ ├── cbioportal_helper.py │ ├── circuit_breaker.py │ ├── cli │ │ ├── __init__.py │ │ ├── articles.py │ │ ├── biomarkers.py │ │ ├── diseases.py │ │ ├── health.py │ │ ├── interventions.py │ │ ├── main.py │ │ 
├── openfda.py │ │ ├── organizations.py │ │ ├── server.py │ │ ├── trials.py │ │ └── variants.py │ ├── connection_pool.py │ ├── constants.py │ ├── core.py │ ├── diseases │ │ ├── __init__.py │ │ ├── getter.py │ │ └── search.py │ ├── domain_handlers.py │ ├── drugs │ │ ├── __init__.py │ │ └── getter.py │ ├── exceptions.py │ ├── genes │ │ ├── __init__.py │ │ └── getter.py │ ├── http_client_simple.py │ ├── http_client.py │ ├── individual_tools.py │ ├── integrations │ │ ├── __init__.py │ │ ├── biothings_client.py │ │ └── cts_api.py │ ├── interventions │ │ ├── __init__.py │ │ ├── getter.py │ │ └── search.py │ ├── logging_filter.py │ ├── metrics_handler.py │ ├── metrics.py │ ├── openfda │ │ ├── __init__.py │ │ ├── adverse_events_helpers.py │ │ ├── adverse_events.py │ │ ├── cache.py │ │ ├── constants.py │ │ ├── device_events_helpers.py │ │ ├── device_events.py │ │ ├── drug_approvals.py │ │ ├── drug_labels_helpers.py │ │ ├── drug_labels.py │ │ ├── drug_recalls_helpers.py │ │ ├── drug_recalls.py │ │ ├── drug_shortages_detail_helpers.py │ │ ├── drug_shortages_helpers.py │ │ ├── drug_shortages.py │ │ ├── exceptions.py │ │ ├── input_validation.py │ │ ├── rate_limiter.py │ │ ├── utils.py │ │ └── validation.py │ ├── organizations │ │ ├── __init__.py │ │ ├── getter.py │ │ └── search.py │ ├── parameter_parser.py │ ├── prefetch.py │ ├── query_parser.py │ ├── query_router.py │ ├── rate_limiter.py │ ├── render.py │ ├── request_batcher.py │ ├── resources │ │ ├── __init__.py │ │ ├── getter.py │ │ ├── instructions.md │ │ └── researcher.md │ ├── retry.py │ ├── router_handlers.py │ ├── router.py │ ├── shared_context.py │ ├── thinking │ │ ├── __init__.py │ │ ├── sequential.py │ │ └── session.py │ ├── thinking_tool.py │ ├── thinking_tracker.py │ ├── trials │ │ ├── __init__.py │ │ ├── getter.py │ │ ├── nci_getter.py │ │ ├── nci_search.py │ │ └── search.py │ ├── utils │ │ ├── __init__.py │ │ ├── cancer_types_api.py │ │ ├── cbio_http_adapter.py │ │ ├── endpoint_registry.py │ │ ├── 
gene_validator.py │ │ ├── metrics.py │ │ ├── mutation_filter.py │ │ ├── query_utils.py │ │ ├── rate_limiter.py │ │ └── request_cache.py │ ├── variants │ │ ├── __init__.py │ │ ├── alphagenome.py │ │ ├── cancer_types.py │ │ ├── cbio_external_client.py │ │ ├── cbioportal_mutations.py │ │ ├── cbioportal_search_helpers.py │ │ ├── cbioportal_search.py │ │ ├── constants.py │ │ ├── external.py │ │ ├── filters.py │ │ ├── getter.py │ │ ├── links.py │ │ └── search.py │ └── workers │ ├── __init__.py │ ├── worker_entry_stytch.js │ ├── worker_entry.js │ └── worker.py ├── tests │ ├── bdd │ │ ├── cli_help │ │ │ ├── help.feature │ │ │ └── test_help.py │ │ ├── conftest.py │ │ ├── features │ │ │ └── alphagenome_integration.feature │ │ ├── fetch_articles │ │ │ ├── fetch.feature │ │ │ └── test_fetch.py │ │ ├── get_trials │ │ │ ├── get.feature │ │ │ └── test_get.py │ │ ├── get_variants │ │ │ ├── get.feature │ │ │ └── test_get.py │ │ ├── search_articles │ │ │ ├── autocomplete.feature │ │ │ ├── search.feature │ │ │ ├── test_autocomplete.py │ │ │ └── test_search.py │ │ ├── search_trials │ │ │ ├── search.feature │ │ │ └── test_search.py │ │ ├── search_variants │ │ │ ├── search.feature │ │ │ └── test_search.py │ │ └── steps │ │ └── test_alphagenome_steps.py │ ├── config │ │ └── test_smithery_config.py │ ├── conftest.py │ ├── data │ │ ├── ct_gov │ │ │ ├── clinical_trials_api_v2.yaml │ │ │ ├── trials_NCT04280705.json │ │ │ └── trials_NCT04280705.txt │ │ ├── myvariant │ │ │ ├── myvariant_api.yaml │ │ │ ├── myvariant_field_descriptions.csv │ │ │ ├── variants_full_braf_v600e.json │ │ │ ├── variants_full_braf_v600e.txt │ │ │ └── variants_part_braf_v600_multiple.json │ │ ├── openfda │ │ │ ├── drugsfda_detail.json │ │ │ ├── drugsfda_search.json │ │ │ ├── enforcement_detail.json │ │ │ └── enforcement_search.json │ │ └── pubtator │ │ ├── pubtator_autocomplete.json │ │ └── pubtator3_paper.txt │ ├── integration │ │ ├── test_openfda_integration.py │ │ ├── test_preprints_integration.py │ │ ├── 
test_simple.py │ │ └── test_variants_integration.py │ ├── tdd │ │ ├── articles │ │ │ ├── test_autocomplete.py │ │ │ ├── test_cbioportal_integration.py │ │ │ ├── test_fetch.py │ │ │ ├── test_preprints.py │ │ │ ├── test_search.py │ │ │ └── test_unified.py │ │ ├── conftest.py │ │ ├── drugs │ │ │ ├── __init__.py │ │ │ └── test_drug_getter.py │ │ ├── openfda │ │ │ ├── __init__.py │ │ │ ├── test_adverse_events.py │ │ │ ├── test_device_events.py │ │ │ ├── test_drug_approvals.py │ │ │ ├── test_drug_labels.py │ │ │ ├── test_drug_recalls.py │ │ │ ├── test_drug_shortages.py │ │ │ └── test_security.py │ │ ├── test_biothings_integration_real.py │ │ ├── test_biothings_integration.py │ │ ├── test_circuit_breaker.py │ │ ├── test_concurrent_requests.py │ │ ├── test_connection_pool.py │ │ ├── test_domain_handlers.py │ │ ├── test_drug_approvals.py │ │ ├── test_drug_recalls.py │ │ ├── test_drug_shortages.py │ │ ├── test_endpoint_documentation.py │ │ ├── test_error_scenarios.py │ │ ├── test_europe_pmc_fetch.py │ │ ├── test_mcp_integration.py │ │ ├── test_mcp_tools.py │ │ ├── test_metrics.py │ │ ├── test_nci_integration.py │ │ ├── test_nci_mcp_tools.py │ │ ├── test_network_policies.py │ │ ├── test_offline_mode.py │ │ ├── test_openfda_unified.py │ │ ├── test_pten_r173_search.py │ │ ├── test_render.py │ │ ├── test_request_batcher.py.disabled │ │ ├── test_retry.py │ │ ├── test_router.py │ │ ├── test_shared_context.py.disabled │ │ ├── test_unified_biothings.py │ │ ├── thinking │ │ │ ├── __init__.py │ │ │ └── test_sequential.py │ │ ├── trials │ │ │ ├── test_backward_compatibility.py │ │ │ ├── test_getter.py │ │ │ └── test_search.py │ │ ├── utils │ │ │ ├── test_gene_validator.py │ │ │ ├── test_mutation_filter.py │ │ │ ├── test_rate_limiter.py │ │ │ └── test_request_cache.py │ │ ├── variants │ │ │ ├── constants.py │ │ │ ├── test_alphagenome_api_key.py │ │ │ ├── test_alphagenome_comprehensive.py │ │ │ ├── test_alphagenome.py │ │ │ ├── test_cbioportal_mutations.py │ │ │ ├── 
test_cbioportal_search.py │ │ │ ├── test_external_integration.py │ │ │ ├── test_external.py │ │ │ ├── test_extract_gene_aa_change.py │ │ │ ├── test_filters.py │ │ │ ├── test_getter.py │ │ │ ├── test_links.py │ │ │ └── test_search.py │ │ └── workers │ │ └── test_worker_sanitization.js │ └── test_pydantic_ai_integration.py ├── THIRD_PARTY_ENDPOINTS.md ├── tox.ini ├── uv.lock └── wrangler.toml ``` # Files -------------------------------------------------------------------------------- /src/biomcp/interventions/getter.py: -------------------------------------------------------------------------------- ```python 1 | """Get specific intervention details via NCI CTS API.""" 2 | 3 | import logging 4 | from typing import Any 5 | 6 | from ..constants import NCI_INTERVENTIONS_URL 7 | from ..integrations.cts_api import CTSAPIError, make_cts_request 8 | 9 | logger = logging.getLogger(__name__) 10 | 11 | 12 | async def get_intervention( 13 | intervention_id: str, 14 | api_key: str | None = None, 15 | ) -> dict[str, Any]: 16 | """ 17 | Get detailed information about a specific intervention. 
18 | 19 | Args: 20 | intervention_id: Intervention ID 21 | api_key: Optional API key (if not provided, uses NCI_API_KEY env var) 22 | 23 | Returns: 24 | Dictionary with intervention details 25 | 26 | Raises: 27 | CTSAPIError: If the API request fails or intervention not found 28 | """ 29 | try: 30 | # Make API request 31 | url = f"{NCI_INTERVENTIONS_URL}/{intervention_id}" 32 | response = await make_cts_request( 33 | url=url, 34 | api_key=api_key, 35 | ) 36 | 37 | # Return the intervention data 38 | if "data" in response: 39 | return response["data"] 40 | elif "intervention" in response: 41 | return response["intervention"] 42 | else: 43 | return response 44 | 45 | except CTSAPIError: 46 | raise 47 | except Exception as e: 48 | logger.error(f"Failed to get intervention {intervention_id}: {e}") 49 | raise CTSAPIError(f"Failed to retrieve intervention: {e!s}") from e 50 | 51 | 52 | def _format_intervention_header(intervention: dict[str, Any]) -> list[str]: 53 | """Format intervention header and basic info.""" 54 | int_id = intervention.get( 55 | "id", intervention.get("intervention_id", "Unknown") 56 | ) 57 | name = intervention.get("name", "Unknown Intervention") 58 | int_type = intervention.get( 59 | "type", intervention.get("category", "Unknown") 60 | ) 61 | 62 | return [ 63 | f"## Intervention: {name}", 64 | "", 65 | "### Basic Information", 66 | f"- **ID**: {int_id}", 67 | f"- **Type**: {int_type}", 68 | ] 69 | 70 | 71 | def _format_intervention_synonyms(synonyms: Any) -> list[str]: 72 | """Format intervention synonyms section.""" 73 | if not synonyms: 74 | return [] 75 | 76 | lines = ["", "### Synonyms"] 77 | if isinstance(synonyms, list): 78 | for syn in synonyms: 79 | lines.append(f"- {syn}") 80 | else: 81 | lines.append(f"- {synonyms}") 82 | 83 | return lines 84 | 85 | 86 | def _format_intervention_regulatory(intervention: dict[str, Any]) -> list[str]: 87 | """Format regulatory information section.""" 88 | if not intervention.get("fda_approved"): 89 | return 
[] 90 | 91 | lines = [ 92 | "", 93 | "### Regulatory Status", 94 | f"- **FDA Approved**: {'Yes' if intervention['fda_approved'] else 'No'}", 95 | ] 96 | 97 | if intervention.get("approval_date"): 98 | lines.append(f"- **Approval Date**: {intervention['approval_date']}") 99 | 100 | return lines 101 | 102 | 103 | def _format_intervention_indications(indications: Any) -> list[str]: 104 | """Format clinical indications section.""" 105 | if not indications: 106 | return [] 107 | 108 | lines = ["", "### Clinical Indications"] 109 | if isinstance(indications, list): 110 | for indication in indications: 111 | lines.append(f"- {indication}") 112 | else: 113 | lines.append(f"- {indications}") 114 | 115 | return lines 116 | 117 | 118 | def format_intervention_details(intervention: dict[str, Any]) -> str: 119 | """ 120 | Format intervention details as markdown. 121 | 122 | Args: 123 | intervention: Intervention data dictionary 124 | 125 | Returns: 126 | Formatted markdown string 127 | """ 128 | lines = _format_intervention_header(intervention) 129 | 130 | # Add synonyms 131 | lines.extend( 132 | _format_intervention_synonyms(intervention.get("synonyms", [])) 133 | ) 134 | 135 | # Add description 136 | if intervention.get("description"): 137 | lines.extend([ 138 | "", 139 | "### Description", 140 | intervention["description"], 141 | ]) 142 | 143 | # Add mechanism of action for drugs 144 | if intervention.get("mechanism_of_action"): 145 | lines.extend([ 146 | "", 147 | "### Mechanism of Action", 148 | intervention["mechanism_of_action"], 149 | ]) 150 | 151 | # Add regulatory info 152 | lines.extend(_format_intervention_regulatory(intervention)) 153 | 154 | # Add clinical indications 155 | lines.extend( 156 | _format_intervention_indications(intervention.get("indications")) 157 | ) 158 | 159 | # Add related trials count if available 160 | if intervention.get("trial_count"): 161 | lines.extend([ 162 | "", 163 | "### Clinical Trial Activity", 164 | f"- **Number of Trials**: 
{intervention['trial_count']}", 165 | ]) 166 | 167 | return "\n".join(lines) 168 | ``` -------------------------------------------------------------------------------- /src/biomcp/thinking/session.py: -------------------------------------------------------------------------------- ```python 1 | """Session management for sequential thinking.""" 2 | 3 | import uuid 4 | from collections import defaultdict 5 | from dataclasses import dataclass, field 6 | from datetime import datetime 7 | from typing import Any 8 | 9 | 10 | @dataclass 11 | class ThoughtEntry: 12 | """Represents a single thought in the thinking process.""" 13 | 14 | thought: str 15 | thought_number: int 16 | total_thoughts: int 17 | next_thought_needed: bool 18 | timestamp: datetime = field(default_factory=datetime.now) 19 | is_revision: bool = False 20 | revises_thought: int | None = None 21 | branch_from_thought: int | None = None 22 | branch_id: str | None = None 23 | metadata: dict[str, Any] = field(default_factory=dict) 24 | 25 | 26 | @dataclass 27 | class ThinkingSession: 28 | """Manages state for a thinking session.""" 29 | 30 | session_id: str = field(default_factory=lambda: str(uuid.uuid4())) 31 | created_at: datetime = field(default_factory=datetime.now) 32 | thought_history: list[ThoughtEntry] = field(default_factory=list) 33 | thought_branches: dict[str, list[ThoughtEntry]] = field( 34 | default_factory=lambda: defaultdict(list) 35 | ) 36 | metadata: dict[str, Any] = field(default_factory=dict) 37 | 38 | def add_thought(self, entry: ThoughtEntry) -> None: 39 | """Add a thought to the session.""" 40 | # If this is a revision, replace the original thought 41 | if entry.is_revision and entry.revises_thought: 42 | for i, thought in enumerate(self.thought_history): 43 | if thought.thought_number == entry.revises_thought: 44 | self.thought_history[i] = entry 45 | return 46 | 47 | # Add to appropriate collection 48 | if entry.branch_id: 49 | self.thought_branches[entry.branch_id].append(entry) 50 | 
else: 51 | self.thought_history.append(entry) 52 | 53 | def get_thought(self, thought_number: int) -> ThoughtEntry | None: 54 | """Get a specific thought by number.""" 55 | for thought in self.thought_history: 56 | if thought.thought_number == thought_number: 57 | return thought 58 | return None 59 | 60 | def get_branch_thoughts(self, branch_id: str) -> list[ThoughtEntry]: 61 | """Get all thoughts in a specific branch.""" 62 | return self.thought_branches.get(branch_id, []) 63 | 64 | def get_all_thoughts(self) -> list[ThoughtEntry]: 65 | """Get all thoughts across main history and branches.""" 66 | all_thoughts = list(self.thought_history) 67 | for branch_thoughts in self.thought_branches.values(): 68 | all_thoughts.extend(branch_thoughts) 69 | return sorted(all_thoughts, key=lambda t: t.timestamp) 70 | 71 | 72 | class SessionManager: 73 | """Manages multiple thinking sessions.""" 74 | 75 | def __init__(self): 76 | self.sessions: dict[str, ThinkingSession] = {} 77 | self._current_session_id: str | None = None 78 | 79 | def create_session(self) -> ThinkingSession: 80 | """Create a new thinking session.""" 81 | session = ThinkingSession() 82 | self.sessions[session.session_id] = session 83 | self._current_session_id = session.session_id 84 | return session 85 | 86 | def get_session( 87 | self, session_id: str | None = None 88 | ) -> ThinkingSession | None: 89 | """Get a session by ID or the current session.""" 90 | if session_id: 91 | return self.sessions.get(session_id) 92 | elif self._current_session_id: 93 | return self.sessions.get(self._current_session_id) 94 | return None 95 | 96 | def get_or_create_session( 97 | self, session_id: str | None = None 98 | ) -> ThinkingSession: 99 | """Get existing session or create new one.""" 100 | if session_id and session_id in self.sessions: 101 | self._current_session_id = session_id 102 | return self.sessions[session_id] 103 | 104 | session = self.get_session() 105 | if not session: 106 | session = self.create_session() 107 
| return session 108 | 109 | def clear_session(self, session_id: str | None = None) -> None: 110 | """Clear a specific session or the current session.""" 111 | if session_id: 112 | self.sessions.pop(session_id, None) 113 | if self._current_session_id == session_id: 114 | self._current_session_id = None 115 | elif self._current_session_id: 116 | self.sessions.pop(self._current_session_id, None) 117 | self._current_session_id = None 118 | 119 | def clear_all_sessions(self) -> None: 120 | """Clear all sessions.""" 121 | self.sessions.clear() 122 | self._current_session_id = None 123 | 124 | 125 | # Global session manager instance 126 | _session_manager = SessionManager() 127 | ``` -------------------------------------------------------------------------------- /.github/workflows/ci.yml: -------------------------------------------------------------------------------- ```yaml 1 | name: CI 2 | 3 | on: 4 | push: 5 | branches: [main, develop] 6 | pull_request: 7 | branches: [main] 8 | workflow_dispatch: 9 | 10 | env: 11 | PYTHON_VERSION: "3.12" 12 | UV_VERSION: "0.4.29" 13 | 14 | jobs: 15 | # Quality check from main.yml - uses make check 16 | quality: 17 | runs-on: ubuntu-latest 18 | name: Quality 19 | steps: 20 | - name: Check out 21 | uses: actions/checkout@v5 22 | 23 | - uses: actions/cache@v4 24 | with: 25 | path: ~/.cache/pre-commit 26 | key: pre-commit-${{ hashFiles('.pre-commit-config.yaml') }} 27 | 28 | - name: Set up Python 29 | uses: actions/setup-python@v6 30 | with: 31 | python-version: ${{ env.PYTHON_VERSION }} 32 | 33 | - name: Install uv 34 | uses: astral-sh/setup-uv@v7 35 | with: 36 | version: ${{ env.UV_VERSION }} 37 | 38 | - name: Install dependencies 39 | run: | 40 | uv sync --group dev 41 | 42 | - name: Run checks 43 | run: make check 44 | 45 | # Tests and type check specifically on Python 3.11 46 | tests-and-type-check: 47 | runs-on: ubuntu-latest 48 | name: Tests and Type Check (Python 3.11) 49 | steps: 50 | - name: Check out 51 | uses: 
actions/checkout@v5 52 | 53 | - name: Set up Python 54 | uses: actions/setup-python@v6 55 | with: 56 | python-version: "3.11" 57 | 58 | - name: Install uv 59 | uses: astral-sh/setup-uv@v7 60 | with: 61 | version: ${{ env.UV_VERSION }} 62 | 63 | - name: Install dependencies 64 | run: | 65 | uv sync --group dev 66 | 67 | - name: Run tests 68 | run: uv run python -m pytest tests -m "not integration" --cov --cov-config=pyproject.toml --cov-report=xml 69 | 70 | - name: Check typing 71 | run: uv run mypy 72 | 73 | - name: Upload coverage reports to Codecov with GitHub Action on Python 3.11 74 | uses: codecov/codecov-action@v5 75 | 76 | # Documentation check from main.yml 77 | check-docs: 78 | runs-on: ubuntu-latest 79 | name: Check Docs 80 | steps: 81 | - name: Check out 82 | uses: actions/checkout@v5 83 | 84 | - name: Set up Python 85 | uses: actions/setup-python@v6 86 | with: 87 | python-version: ${{ env.PYTHON_VERSION }} 88 | 89 | - name: Install uv 90 | uses: astral-sh/setup-uv@v7 91 | with: 92 | version: ${{ env.UV_VERSION }} 93 | 94 | - name: Install dependencies 95 | run: | 96 | uv sync --group dev 97 | 98 | - name: Check if documentation can be built 99 | run: uv run mkdocs build -s 100 | 101 | # Build package check 102 | build-package: 103 | runs-on: ubuntu-latest 104 | name: Build Package 105 | steps: 106 | - uses: actions/checkout@v5 107 | 108 | - name: Set up Python 109 | uses: actions/setup-python@v6 110 | with: 111 | python-version: ${{ env.PYTHON_VERSION }} 112 | 113 | - name: Install uv 114 | uses: astral-sh/setup-uv@v7 115 | with: 116 | version: ${{ env.UV_VERSION }} 117 | 118 | - name: Build package 119 | run: | 120 | uvx --from build pyproject-build --installer uv 121 | 122 | - name: Check package 123 | run: | 124 | uvx twine check dist/* 125 | 126 | - name: Upload artifacts 127 | uses: actions/upload-artifact@v4 128 | with: 129 | name: dist 130 | path: dist/ 131 | 132 | # MCP integration test - quick check 133 | test-mcp: 134 | runs-on: ubuntu-latest 
135 | name: Test MCP Integration 136 | steps: 137 | - uses: actions/checkout@v5 138 | 139 | - name: Set up Python 140 | uses: actions/setup-python@v6 141 | with: 142 | python-version: ${{ env.PYTHON_VERSION }} 143 | 144 | - name: Install uv 145 | uses: astral-sh/setup-uv@v7 146 | with: 147 | version: ${{ env.UV_VERSION }} 148 | 149 | - name: Install dependencies 150 | run: | 151 | uv sync --group dev 152 | 153 | - name: Test MCP server startup 154 | run: | 155 | timeout 10s uv run biomcp run || code=$?; if [[ $code -ne 124 && $code -ne 0 ]]; then exit $code; fi 156 | 157 | - name: Run MCP integration tests 158 | run: | 159 | uv run python -m pytest tests/tdd/test_mcp_integration.py -v 160 | 161 | # Run integration tests separately - allowed to fail 162 | integration-tests: 163 | runs-on: ubuntu-latest 164 | name: Integration Tests (Optional) 165 | continue-on-error: true 166 | steps: 167 | - name: Check out 168 | uses: actions/checkout@v5 169 | 170 | - name: Set up Python 171 | uses: actions/setup-python@v6 172 | with: 173 | python-version: "3.11" 174 | 175 | - name: Install uv 176 | uses: astral-sh/setup-uv@v7 177 | with: 178 | version: ${{ env.UV_VERSION }} 179 | 180 | - name: Install dependencies 181 | run: | 182 | uv sync --group dev 183 | 184 | - name: Run integration tests 185 | run: | 186 | uv run python -m pytest tests -m "integration" -v --tb=short 187 | continue-on-error: true 188 | ``` -------------------------------------------------------------------------------- /docs/backend-services-reference/03-cbioportal.md: -------------------------------------------------------------------------------- ```markdown 1 | # cBioPortal Integration 2 | 3 | BioMCP integrates with [cBioPortal](https://www.cbioportal.org/), a comprehensive cancer genomics portal that provides visualization and analysis tools for large-scale cancer genomics datasets. 
4 | 5 | ## Overview 6 | 7 | The cBioPortal integration enhances article searches by automatically including relevant cancer genomics data when searching for genes. This integration provides: 8 | 9 | 1. **Gene-level summaries** - Mutation frequency and distribution across cancer studies 10 | 2. **Mutation-specific searches** - Find studies containing specific mutations (e.g., BRAF V600E) 11 | 3. **Cancer type resolution** - Accurate cancer type categorization using cBioPortal's API 12 | 13 | ## How It Works 14 | 15 | ### Automatic Integration 16 | 17 | When you search for articles with a gene parameter, BioMCP automatically queries cBioPortal to provide additional context: 18 | 19 | ```python 20 | # Basic gene search includes cBioPortal summary 21 | search(domain="article", genes=["BRAF"], diseases=["melanoma"]) 22 | ``` 23 | 24 | This returns: 25 | 26 | - Standard PubMed/PubTator3 article results 27 | - cBioPortal summary showing mutation frequency across cancer studies 28 | - Top cancer types where the gene is mutated 29 | 30 | ### Mutation-Specific Searches 31 | 32 | To search for specific mutations, include the mutation notation in keywords: 33 | 34 | ```python 35 | # Search for BRAF V600E mutation 36 | search(domain="article", genes=["BRAF"], keywords=["V600E"]) 37 | 38 | # Search for SRSF2 F57Y mutation 39 | search(domain="article", genes=["SRSF2"], keywords=["F57Y"]) 40 | 41 | # Use wildcards for mutation patterns (e.g., any amino acid at position 57) 42 | search(domain="article", genes=["SRSF2"], keywords=["F57*"]) 43 | ``` 44 | 45 | Mutation-specific searches return: 46 | 47 | - Total number of studies in cBioPortal 48 | - Number of studies containing the mutation 49 | - Top studies ranked by mutation count 50 | - Cancer type distribution 51 | 52 | ## Example Output 53 | 54 | ### Gene-Level Summary 55 | 56 | ``` 57 | ### cBioPortal Summary for BRAF 58 | - **Mutation Frequency**: 76.7% (368 mutations in 480 samples) 59 | - **Top Cancer Types**: Melanoma 
(45%), Thyroid (23%), Colorectal (18%) 60 | - **Top Mutations**: V600E (89%), V600K (7%), G469A (2%) 61 | ``` 62 | 63 | ### Mutation-Specific Results 64 | 65 | ``` 66 | ### cBioPortal Mutation Search: BRAF 67 | **Specific Mutation**: V600E 68 | - **Total Studies**: 2340 69 | - **Studies with Mutation**: 170 70 | - **Total Mutations Found**: 5780 71 | 72 | **Top Studies by Mutation Count:** 73 | | Count | Study ID | Cancer Type | Study Name | 74 | |-------|----------|-------------|------------| 75 | | 804 | msk_met_2021 | Mixed Cancer Types | MSK MetTropism (MSK, Cell 2021) | 76 | | 555 | msk_chord_2024 | Mixed Cancer Types | MSK-CHORD (MSK, Nature 2024) | 77 | | 295 | msk_impact_2017 | Mixed Cancer Types | MSK-IMPACT Clinical Sequencing Cohort | 78 | ``` 79 | 80 | ## Supported Mutation Notations 81 | 82 | The integration recognizes standard protein change notation: 83 | 84 | - **Specific mutations**: `V600E`, `F57Y`, `T790M` 85 | - **Wildcard patterns**: `F57*` (matches F57Y, F57L, etc.) 86 | - **Multiple mutations**: Include multiple keywords for OR search 87 | 88 | ## API Details 89 | 90 | ### Endpoints Used 91 | 92 | 1. **Gene Information**: `/api/genes/{gene}` 93 | 2. **Cancer Types**: `/api/cancer-types` 94 | 3. **Mutation Data**: `/api/mutations/fetch` 95 | 4. **Study Information**: `/api/studies` 96 | 97 | ### Rate Limiting 98 | 99 | - Conservative rate limit of 5 requests/second 100 | - Results cached for 15-30 minutes (mutations) or 24 hours (cancer types) 101 | 102 | ### Authentication 103 | 104 | Optional authentication via environment variable: 105 | 106 | ```bash 107 | export CBIO_TOKEN="your-api-token" 108 | ``` 109 | 110 | Public cBioPortal instance works without authentication but may have rate limits. 111 | 112 | ## CLI Usage 113 | 114 | For detailed command-line options for searching articles with cBioPortal integration, see the [CLI User Guide](../user-guides/01-command-line-interface.md#article-commands). 
115 | 116 | ## Performance Considerations 117 | 118 | 1. **Caching**: Results are cached to minimize API calls 119 | 120 | - Gene summaries: 15 minutes 121 | - Mutation searches: 30 minutes 122 | - Cancer types: 24 hours 123 | 124 | 2. **Graceful Degradation**: If cBioPortal is unavailable, searches continue without the additional data 125 | 126 | 3. **Parallel Processing**: API calls are made in parallel with article searches for optimal performance 127 | 128 | ## Limitations 129 | 130 | 1. Only works with valid HUGO gene symbols 131 | 2. Mutation searches require exact protein change notation 132 | 3. Limited to mutations in cBioPortal's curated studies 133 | 4. Rate limits may apply for high-volume usage 134 | 135 | ## Error Handling 136 | 137 | The integration handles various error scenarios: 138 | 139 | - Invalid gene symbols are validated before API calls 140 | - Network timeouts fall back to article-only results 141 | - API errors are logged but don't block search results 142 | ``` -------------------------------------------------------------------------------- /src/biomcp/utils/cancer_types_api.py: -------------------------------------------------------------------------------- ```python 1 | """Cancer type utilities using cBioPortal API.""" 2 | 3 | import logging 4 | 5 | from ..utils.cbio_http_adapter import CBioHTTPAdapter 6 | from ..utils.request_cache import request_cache 7 | 8 | logger = logging.getLogger(__name__) 9 | 10 | 11 | class CancerTypeAPIClient: 12 | """Client for fetching cancer types from cBioPortal API.""" 13 | 14 | def __init__(self): 15 | """Initialize the cancer type API client.""" 16 | self.http_adapter = CBioHTTPAdapter() 17 | # Cache for cancer types 18 | self._cancer_types_cache: dict[str, str] | None = None 19 | 20 | @request_cache(ttl=86400) # Cache for 24 hours 21 | async def get_all_cancer_types(self) -> dict[str, str]: 22 | """Fetch all cancer types from cBioPortal API. 
23 | 24 | Returns: 25 | Dictionary mapping cancer type IDs to display names 26 | """ 27 | if self._cancer_types_cache is not None: 28 | return self._cancer_types_cache 29 | 30 | try: 31 | cancer_types, error = await self.http_adapter.get( 32 | "/cancer-types", 33 | endpoint_key="cbioportal_cancer_types", 34 | cache_ttl=86400, # 24 hours 35 | ) 36 | 37 | if error: 38 | logger.error(f"Failed to fetch cancer types: {error.message}") 39 | return {} 40 | 41 | if cancer_types: 42 | # Build mapping from ID to name 43 | result = {} 44 | for ct in cancer_types: 45 | cancer_type_id = ct.get("cancerTypeId", "") 46 | name = ct.get("name", "") 47 | 48 | if cancer_type_id and name: 49 | result[cancer_type_id.lower()] = name 50 | 51 | # Also add common abbreviations 52 | short_name = ct.get("shortName", "") 53 | if short_name and short_name != cancer_type_id: 54 | result[short_name.lower()] = name 55 | 56 | self._cancer_types_cache = result 57 | logger.info(f"Loaded {len(result)} cancer types from API") 58 | return result 59 | 60 | return {} 61 | 62 | except Exception as e: 63 | logger.error(f"Error fetching cancer types: {e}") 64 | return {} 65 | 66 | async def get_cancer_type_name(self, cancer_type_id: str) -> str: 67 | """Get the display name for a cancer type ID. 
68 | 69 | Args: 70 | cancer_type_id: The cancer type identifier 71 | 72 | Returns: 73 | Display name or the original ID if not found 74 | """ 75 | if not cancer_type_id: 76 | return "Unknown" 77 | 78 | cancer_types = await self.get_all_cancer_types() 79 | 80 | # Try exact match (case-insensitive) 81 | normalized_id = cancer_type_id.lower() 82 | if normalized_id in cancer_types: 83 | return cancer_types[normalized_id] 84 | 85 | # If not found, return the original ID with title case 86 | if cancer_type_id == cancer_type_id.lower(): 87 | return cancer_type_id.title() 88 | return cancer_type_id 89 | 90 | @request_cache(ttl=3600) # Cache for 1 hour 91 | async def get_study_cancer_type(self, study_id: str) -> str: 92 | """Get cancer type for a specific study. 93 | 94 | Args: 95 | study_id: The study identifier 96 | 97 | Returns: 98 | Cancer type name or "Unknown" 99 | """ 100 | try: 101 | study_data, error = await self.http_adapter.get( 102 | f"/studies/{study_id}", 103 | endpoint_key="cbioportal_studies", 104 | cache_ttl=3600, # 1 hour 105 | ) 106 | 107 | if error or not study_data: 108 | logger.debug(f"Study {study_id} not found") 109 | return "Unknown" 110 | 111 | cancer_type_id = study_data.get("cancerType", {}).get( 112 | "cancerTypeId", "" 113 | ) 114 | 115 | if cancer_type_id and cancer_type_id != "unknown": 116 | return await self.get_cancer_type_name(cancer_type_id) 117 | 118 | # Fallback to the cancer type name directly 119 | cancer_type_name = study_data.get("cancerType", {}).get("name", "") 120 | if cancer_type_name: 121 | return cancer_type_name 122 | 123 | return "Unknown" 124 | 125 | except Exception as e: 126 | logger.debug(f"Error fetching study {study_id}: {e}") 127 | return "Unknown" 128 | 129 | 130 | # Global instance for reuse 131 | _cancer_type_client: CancerTypeAPIClient | None = None 132 | 133 | 134 | def get_cancer_type_client() -> CancerTypeAPIClient: 135 | """Get or create the global cancer type client.""" 136 | global _cancer_type_client 137 | 
if _cancer_type_client is None: 138 | _cancer_type_client = CancerTypeAPIClient() 139 | return _cancer_type_client 140 | ``` -------------------------------------------------------------------------------- /tests/tdd/utils/test_mutation_filter.py: -------------------------------------------------------------------------------- ```python 1 | """Tests for mutation filter utility.""" 2 | 3 | from biomcp.utils.mutation_filter import MutationFilter 4 | 5 | 6 | class MockMutation: 7 | """Mock mutation object for testing.""" 8 | 9 | def __init__(self, protein_change: str): 10 | self.protein_change = protein_change 11 | 12 | 13 | class TestMutationFilter: 14 | """Test mutation filtering functionality.""" 15 | 16 | def test_specific_mutation_filter(self): 17 | """Test filtering for specific mutations.""" 18 | mutation_filter = MutationFilter(specific_mutation="V600E") 19 | 20 | assert mutation_filter.matches("V600E") 21 | assert not mutation_filter.matches("V600K") 22 | assert not mutation_filter.matches("V600") 23 | assert not mutation_filter.matches("") 24 | 25 | def test_wildcard_pattern_filter(self): 26 | """Test filtering with wildcard patterns.""" 27 | mutation_filter = MutationFilter(pattern="V600*") 28 | 29 | assert mutation_filter.matches("V600E") 30 | assert mutation_filter.matches("V600K") 31 | assert mutation_filter.matches("V600D") 32 | assert not mutation_filter.matches("V601E") 33 | assert not mutation_filter.matches("K600E") 34 | 35 | def test_pattern_without_wildcard(self): 36 | """Test pattern matching without wildcard.""" 37 | # Pattern does exact match via regex (no prefix matching without *) 38 | mutation_filter = MutationFilter(pattern="F57") 39 | 40 | # Exact match works 41 | assert mutation_filter.matches("F57") 42 | # No prefix matching without wildcard 43 | assert not mutation_filter.matches("F57Y") 44 | assert not mutation_filter.matches("F57L") 45 | assert not mutation_filter.matches("F58Y") 46 | 47 | def test_no_filter(self): 48 | """Test when 
no filter is specified.""" 49 | mutation_filter = MutationFilter() 50 | 51 | assert mutation_filter.matches("V600E") 52 | assert mutation_filter.matches("anything") 53 | # Empty protein change returns False even with no filter 54 | assert not mutation_filter.matches("") 55 | 56 | def test_filter_mutations_list(self): 57 | """Test filtering a list of mutations.""" 58 | mutations = [ 59 | MockMutation("V600E"), 60 | MockMutation("V600K"), 61 | MockMutation("V600D"), 62 | MockMutation("T790M"), 63 | MockMutation("L858R"), 64 | ] 65 | 66 | # Test specific mutation 67 | mutation_filter1 = MutationFilter(specific_mutation="V600E") 68 | filtered1 = mutation_filter1.filter_mutations(mutations) 69 | assert len(filtered1) == 1 70 | assert filtered1[0].protein_change == "V600E" 71 | 72 | # Test pattern 73 | mutation_filter2 = MutationFilter(pattern="V600*") 74 | filtered2 = mutation_filter2.filter_mutations(mutations) 75 | assert len(filtered2) == 3 76 | assert all(m.protein_change.startswith("V600") for m in filtered2) 77 | 78 | # Test no filter 79 | mutation_filter3 = MutationFilter() 80 | filtered3 = mutation_filter3.filter_mutations(mutations) 81 | assert len(filtered3) == 5 82 | 83 | def test_string_representations(self): 84 | """Test string representations of filters.""" 85 | mutation_filter1 = MutationFilter(specific_mutation="V600E") 86 | assert str(mutation_filter1) == "MutationFilter(specific=V600E)" 87 | assert ( 88 | repr(mutation_filter1) 89 | == "MutationFilter(specific_mutation='V600E', pattern=None)" 90 | ) 91 | 92 | mutation_filter2 = MutationFilter(pattern="V600*") 93 | assert str(mutation_filter2) == "MutationFilter(pattern=V600*)" 94 | 95 | mutation_filter3 = MutationFilter() 96 | assert str(mutation_filter3) == "MutationFilter(no_filter)" 97 | 98 | def test_edge_cases(self): 99 | """Test edge cases in mutation matching.""" 100 | # Empty protein change 101 | mutation_filter = MutationFilter(specific_mutation="V600E") 102 | assert not 
mutation_filter.matches("") 103 | assert not mutation_filter.matches(None) 104 | 105 | # Complex patterns 106 | mutation_filter2 = MutationFilter(pattern="[VL]600*") 107 | # This will use regex escaping, so won't work as expected 108 | # But should not crash 109 | assert not mutation_filter2.matches("V600E") # Because [ is escaped 110 | 111 | def test_filter_mutations_preserves_type(self): 112 | """Test that filter preserves the original list type.""" 113 | mutations = [ 114 | MockMutation("V600E"), 115 | MockMutation("V600K"), 116 | ] 117 | 118 | mutation_filter = MutationFilter(pattern="V600*") 119 | result = mutation_filter.filter_mutations(mutations) 120 | 121 | # Result should be a list 122 | assert isinstance(result, list) 123 | assert len(result) == 2 124 | ``` -------------------------------------------------------------------------------- /src/biomcp/variants/getter.py: -------------------------------------------------------------------------------- ```python 1 | """Getter module for retrieving variant details.""" 2 | 3 | import json 4 | import logging 5 | from typing import Annotated 6 | 7 | from .. import ensure_list, http_client, render 8 | from ..constants import DEFAULT_ASSEMBLY, MYVARIANT_GET_URL 9 | from .external import ExternalVariantAggregator, format_enhanced_annotations 10 | from .filters import filter_variants 11 | from .links import inject_links 12 | 13 | logger = logging.getLogger(__name__) 14 | 15 | 16 | async def get_variant( 17 | variant_id: str, 18 | output_json: bool = False, 19 | include_external: bool = False, 20 | assembly: str = DEFAULT_ASSEMBLY, 21 | ) -> str: 22 | """ 23 | Get variant details from MyVariant.info using the variant identifier. 24 | 25 | The identifier can be a full HGVS-style string (e.g. "chr7:g.140453136A>T") 26 | or an rsID (e.g. "rs113488022"). The API response is expected to include a 27 | "hits" array; this function extracts the first hit. 
28 | 29 | Args: 30 | variant_id: Variant identifier (HGVS or rsID) 31 | output_json: Return JSON format if True, else Markdown 32 | include_external: Include external annotations (TCGA, 1000 Genomes, cBioPortal) 33 | assembly: Genome assembly (hg19 or hg38), defaults to hg19 34 | 35 | Returns: 36 | Formatted variant data as JSON or Markdown string 37 | 38 | If output_json is True, the result is returned as a formatted JSON string; 39 | otherwise, it is rendered as Markdown. 40 | """ 41 | response, error = await http_client.request_api( 42 | url=f"{MYVARIANT_GET_URL}/{variant_id}", 43 | request={"fields": "all", "assembly": assembly}, 44 | method="GET", 45 | domain="myvariant", 46 | ) 47 | 48 | data_to_return: list = ensure_list(response) 49 | 50 | # Inject database links into the variant data 51 | if not error: 52 | data_to_return = inject_links(data_to_return) 53 | data_to_return = filter_variants(data_to_return) 54 | 55 | # Add external annotations if requested 56 | if include_external and data_to_return: 57 | logger.info( 58 | f"Adding external annotations for {len(data_to_return)} variants" 59 | ) 60 | aggregator = ExternalVariantAggregator() 61 | 62 | for _i, variant_data in enumerate(data_to_return): 63 | logger.info( 64 | f"Processing variant {_i}: keys={list(variant_data.keys())}" 65 | ) 66 | # Get enhanced annotations 67 | enhanced = await aggregator.get_enhanced_annotations( 68 | variant_id, 69 | include_tcga=True, 70 | include_1000g=True, 71 | include_cbioportal=True, 72 | variant_data=variant_data, 73 | ) 74 | 75 | # Add formatted annotations to the variant data 76 | formatted = format_enhanced_annotations(enhanced) 77 | logger.info( 78 | f"Formatted external annotations: {formatted['external_annotations'].keys()}" 79 | ) 80 | variant_data.update(formatted["external_annotations"]) 81 | 82 | if error: 83 | data_to_return = [{"error": f"Error {error.code}: {error.message}"}] 84 | 85 | if output_json: 86 | return json.dumps(data_to_return, indent=2) 87 | 
else: 88 | return render.to_markdown(data_to_return) 89 | 90 | 91 | async def _variant_details( 92 | call_benefit: Annotated[ 93 | str, 94 | "Define and summarize why this function is being called and the intended benefit", 95 | ], 96 | variant_id: str, 97 | include_external: Annotated[ 98 | bool, 99 | "Include annotations from external sources (TCGA, 1000 Genomes, cBioPortal)", 100 | ] = True, 101 | assembly: Annotated[ 102 | str, 103 | "Genome assembly (hg19 or hg38). Default: hg19", 104 | ] = DEFAULT_ASSEMBLY, 105 | ) -> str: 106 | """ 107 | Retrieves detailed information for a *single* genetic variant. 108 | 109 | Parameters: 110 | - call_benefit: Define and summarize why this function is being called and the intended benefit 111 | - variant_id: A variant identifier ("chr7:g.140453136A>T") 112 | - include_external: Include annotations from TCGA, 1000 Genomes, cBioPortal, and Mastermind 113 | - assembly: Genome assembly (hg19 or hg38). Default: hg19 114 | 115 | Process: Queries the MyVariant.info GET endpoint, optionally fetching 116 | additional annotations from external databases 117 | Output: A Markdown formatted string containing comprehensive 118 | variant annotations (genomic context, frequencies, 119 | predictions, clinical data, external annotations). Returns error if invalid. 120 | Note: Use the variant_searcher to find the variant id first. 
121 | """ 122 | return await get_variant( 123 | variant_id, 124 | output_json=False, 125 | include_external=include_external, 126 | assembly=assembly, 127 | ) 128 | ``` -------------------------------------------------------------------------------- /src/biomcp/integrations/cts_api.py: -------------------------------------------------------------------------------- ```python 1 | """NCI Clinical Trials Search API integration helper.""" 2 | 3 | import json 4 | import logging 5 | import os 6 | from typing import Any, Literal 7 | 8 | from ..constants import NCI_API_KEY_ENV 9 | from ..http_client import request_api 10 | 11 | logger = logging.getLogger(__name__) 12 | 13 | 14 | class CTSAPIError(Exception): 15 | """Error raised when CTS API requests fail.""" 16 | 17 | pass 18 | 19 | 20 | def _validate_api_key(api_key: str | None) -> str: 21 | """Validate and return API key.""" 22 | if not api_key: 23 | api_key = os.getenv(NCI_API_KEY_ENV) 24 | 25 | if not api_key: 26 | raise CTSAPIError( 27 | f"NCI API key required. 
Please set {NCI_API_KEY_ENV} environment " 28 | "variable or provide api_key parameter.\n" 29 | "Get a free API key at: https://www.cancer.gov/research/participate/" 30 | "clinical-trials-search/developers" 31 | ) 32 | 33 | return api_key 34 | 35 | 36 | def _prepare_request_data( 37 | method: str, 38 | params: dict[str, Any] | None, 39 | json_data: dict[str, Any] | None, 40 | headers: dict[str, str], 41 | ) -> dict[str, Any]: 42 | """Prepare request data based on method.""" 43 | if method == "GET": 44 | request_data = params or {} 45 | logger.debug(f"CTS API GET request with params: {params}") 46 | else: 47 | request_data = json_data or {} 48 | if method == "POST": 49 | logger.debug(f"CTS API POST request with data: {json_data}") 50 | 51 | # Add headers to request data 52 | if headers: 53 | request_data["_headers"] = json.dumps(headers) 54 | 55 | return request_data 56 | 57 | 58 | def _handle_api_error(error: Any) -> None: 59 | """Handle API errors with appropriate messages.""" 60 | if error.code == 401: 61 | raise CTSAPIError( 62 | f"Invalid API key. Please check your {NCI_API_KEY_ENV} " 63 | "environment variable or api_key parameter." 64 | ) 65 | elif error.code == 403: 66 | raise CTSAPIError( 67 | "Access forbidden. Your API key may not have permission " 68 | "to access this resource." 69 | ) 70 | else: 71 | raise CTSAPIError(f"CTS API error: {error.message}") 72 | 73 | 74 | async def make_cts_request( 75 | url: str, 76 | method: Literal["GET", "POST"] = "GET", 77 | params: dict[str, Any] | None = None, 78 | json_data: dict[str, Any] | None = None, 79 | api_key: str | None = None, 80 | ) -> dict[str, Any]: 81 | """ 82 | Make a request to the NCI CTS API with proper authentication. 
83 | 84 | Args: 85 | url: Full URL to the CTS API endpoint 86 | method: HTTP method (GET or POST) 87 | params: Query parameters 88 | json_data: JSON data for POST requests 89 | api_key: Optional API key (if not provided, uses NCI_API_KEY env var) 90 | 91 | Returns: 92 | JSON response from the API 93 | 94 | Raises: 95 | CTSAPIError: If the request fails or API key is missing 96 | """ 97 | # Validate API key 98 | api_key = _validate_api_key(api_key) 99 | 100 | # Prepare headers 101 | headers = {"x-api-key": api_key, "Accept": "application/json"} 102 | 103 | try: 104 | # Prepare request data 105 | request_data = _prepare_request_data( 106 | method, params, json_data, headers 107 | ) 108 | 109 | # Make API request 110 | response, error = await request_api( 111 | url=url, 112 | request=request_data, 113 | method=method, 114 | cache_ttl=0, # Disable caching for NCI API to ensure fresh results 115 | ) 116 | 117 | # Handle errors 118 | if error: 119 | _handle_api_error(error) 120 | 121 | if response is None: 122 | raise CTSAPIError("No response received from NCI CTS API") 123 | 124 | return response 125 | 126 | except Exception as e: 127 | # Re-raise CTSAPIError as-is 128 | if isinstance(e, CTSAPIError): 129 | raise 130 | 131 | # Wrap other exceptions 132 | logger.error(f"CTS API request failed: {e}") 133 | raise CTSAPIError(f"Failed to connect to NCI CTS API: {e!s}") from e 134 | 135 | 136 | def get_api_key_instructions() -> str: 137 | """ 138 | Get user-friendly instructions for obtaining and setting the API key. 
139 | 140 | Returns: 141 | Formatted string with instructions 142 | """ 143 | return ( 144 | "## NCI Clinical Trials API Key Required\n\n" 145 | "To use NCI's Clinical Trials Search API, you need an API key.\n\n" 146 | "**Option 1: Set environment variable (recommended)**\n" 147 | "```bash\n" 148 | f"export {NCI_API_KEY_ENV}='your-api-key'\n" 149 | "```\n\n" 150 | "**Option 2: Provide via CLI**\n" 151 | "```bash\n" 152 | "biomcp trial search --api-key YOUR_KEY --condition melanoma\n" 153 | "```\n\n" 154 | "**Get your free API key:**\n" 155 | "Visit https://www.cancer.gov/research/participate/clinical-trials-search/developers\n\n" 156 | "The API key provides access to NCI's comprehensive cancer clinical trials " 157 | "database with advanced search capabilities." 158 | ) 159 | ``` -------------------------------------------------------------------------------- /tests/tdd/variants/test_alphagenome_api_key.py: -------------------------------------------------------------------------------- ```python 1 | """Test AlphaGenome per-request API key functionality.""" 2 | 3 | import os 4 | from unittest.mock import MagicMock, patch 5 | 6 | import pandas as pd 7 | import pytest 8 | 9 | from biomcp.variants.alphagenome import predict_variant_effects 10 | 11 | 12 | @pytest.mark.asyncio 13 | async def test_api_key_parameter_overrides_env_var(): 14 | """Test that api_key parameter takes precedence over environment variable.""" 15 | # Set up environment variable 16 | with patch.dict("os.environ", {"ALPHAGENOME_API_KEY": "env-key"}): 17 | # Mock AlphaGenome modules 18 | mock_genome = MagicMock() 19 | mock_client = MagicMock() 20 | mock_scorers = MagicMock() 21 | 22 | # Mock successful prediction 23 | test_scores_df = pd.DataFrame({ 24 | "output_type": ["RNA_SEQ"], 25 | "raw_score": [1.5], 26 | "gene_name": ["BRAF"], 27 | "track_name": [None], 28 | }) 29 | 30 | # Track which API key was used 31 | api_keys_used = [] 32 | 33 | def track_create(api_key): 34 | 
api_keys_used.append(api_key) 35 | mock_model = MagicMock() 36 | mock_model.score_variant.return_value = test_scores_df 37 | return mock_model 38 | 39 | mock_client.create.side_effect = track_create 40 | 41 | mock_scorers.tidy_scores.return_value = test_scores_df 42 | mock_scorers.get_recommended_scorers.return_value = [] 43 | 44 | # Create a mock module with the correct attributes 45 | mock_models = MagicMock() 46 | mock_models.dna_client = mock_client 47 | mock_models.variant_scorers = mock_scorers 48 | 49 | mock_data = MagicMock() 50 | mock_data.genome = mock_genome 51 | 52 | with patch.dict( 53 | "sys.modules", 54 | { 55 | "alphagenome.data": mock_data, 56 | "alphagenome.models": mock_models, 57 | }, 58 | ): 59 | # Test with parameter API key 60 | result = await predict_variant_effects( 61 | "chr7", 140753336, "A", "T", api_key="param-key" 62 | ) 63 | 64 | # Verify the parameter key was used, not the env var 65 | assert len(api_keys_used) == 1 66 | assert api_keys_used[0] == "param-key" 67 | assert "BRAF" in result 68 | 69 | 70 | @pytest.mark.asyncio 71 | async def test_no_api_key_shows_instructions(): 72 | """Test that missing API key shows helpful instructions.""" 73 | # Ensure no environment variable is set 74 | with patch.dict("os.environ", {}, clear=True): 75 | # Remove ALPHAGENOME_API_KEY if it exists 76 | os.environ.pop("ALPHAGENOME_API_KEY", None) 77 | 78 | result = await predict_variant_effects( 79 | "chr7", 140753336, "A", "T", skip_cache=True 80 | ) 81 | 82 | # Check for instructions 83 | assert "AlphaGenome API key required" in result 84 | assert "My AlphaGenome API key is" in result 85 | assert "ACTION REQUIRED" in result 86 | assert "https://deepmind.google.com/science/alphagenome" in result 87 | 88 | 89 | @pytest.mark.asyncio 90 | async def test_env_var_used_when_no_parameter(): 91 | """Test that environment variable is used when no parameter is provided.""" 92 | # Set up environment variable 93 | with patch.dict("os.environ", 
{"ALPHAGENOME_API_KEY": "env-key"}): 94 | # Mock AlphaGenome modules 95 | mock_genome = MagicMock() 96 | mock_client = MagicMock() 97 | mock_scorers = MagicMock() 98 | 99 | # Mock successful prediction 100 | test_scores_df = pd.DataFrame({ 101 | "output_type": ["RNA_SEQ"], 102 | "raw_score": [1.5], 103 | "gene_name": ["BRAF"], 104 | "track_name": [None], 105 | }) 106 | 107 | # Track which API key was used 108 | api_keys_used = [] 109 | 110 | def track_create(api_key): 111 | api_keys_used.append(api_key) 112 | mock_model = MagicMock() 113 | mock_model.score_variant.return_value = test_scores_df 114 | return mock_model 115 | 116 | mock_client.create.side_effect = track_create 117 | 118 | mock_scorers.tidy_scores.return_value = test_scores_df 119 | mock_scorers.get_recommended_scorers.return_value = [] 120 | 121 | # Create a mock module with the correct attributes 122 | mock_models = MagicMock() 123 | mock_models.dna_client = mock_client 124 | mock_models.variant_scorers = mock_scorers 125 | 126 | mock_data = MagicMock() 127 | mock_data.genome = mock_genome 128 | 129 | with patch.dict( 130 | "sys.modules", 131 | { 132 | "alphagenome.data": mock_data, 133 | "alphagenome.models": mock_models, 134 | }, 135 | ): 136 | # Test without parameter API key 137 | result = await predict_variant_effects("chr7", 140753336, "A", "T") 138 | 139 | # Verify the env var key was used 140 | assert len(api_keys_used) == 1 141 | assert api_keys_used[0] == "env-key" 142 | assert "BRAF" in result 143 | ``` -------------------------------------------------------------------------------- /src/biomcp/request_batcher.py: -------------------------------------------------------------------------------- ```python 1 | """Request batching utility for combining multiple small requests. 2 | 3 | This module provides a request batcher that accumulates multiple requests 4 | and processes them together in batches, reducing the number of API calls 5 | and improving performance for bulk operations. 
6 | 7 | Key Features: 8 | - Automatic batching based on size or time threshold 9 | - Configurable batch size and timeout 10 | - Thread-safe request accumulation 11 | - Error propagation to individual requests 12 | 13 | Example: 14 | ```python 15 | async def batch_api_call(params_list): 16 | # Make a single API call with multiple parameters 17 | return await api.bulk_request(params_list) 18 | 19 | batcher = RequestBatcher( 20 | batch_func=batch_api_call, 21 | batch_size=10, 22 | batch_timeout=0.1 23 | ) 24 | 25 | # Individual requests are automatically batched 26 | result1 = await batcher.request({"id": 1}) 27 | result2 = await batcher.request({"id": 2}) 28 | ``` 29 | """ 30 | 31 | import asyncio 32 | from collections.abc import Callable, Coroutine 33 | from typing import Any, TypeVar 34 | 35 | T = TypeVar("T") 36 | 37 | 38 | class RequestBatcher: 39 | """Batches multiple requests together to reduce overhead. 40 | 41 | This is particularly useful for APIs that support batch operations 42 | or when network latency dominates over processing time. 43 | 44 | The batcher accumulates requests until either: 45 | 1. The batch size threshold is reached 46 | 2. The batch timeout expires 47 | 48 | At which point all accumulated requests are processed together. 49 | """ 50 | 51 | def __init__( 52 | self, 53 | batch_func: Callable[[list[Any]], Coroutine[Any, Any, list[Any]]], 54 | batch_size: int = 10, 55 | batch_timeout: float = 0.05, # 50ms 56 | ): 57 | """Initialize the batcher. 
58 | 59 | Args: 60 | batch_func: Async function that processes a batch of requests 61 | batch_size: Maximum number of requests to batch together 62 | batch_timeout: Maximum time to wait for batch to fill (seconds) 63 | """ 64 | self.batch_func = batch_func 65 | self.batch_size = batch_size 66 | self.batch_timeout = batch_timeout 67 | self.pending_requests: list[tuple[Any, asyncio.Future]] = [] 68 | self.batch_task: asyncio.Task | None = None 69 | self._lock = asyncio.Lock() 70 | 71 | async def request(self, params: Any) -> Any: 72 | """Add a request to the batch and wait for result.""" 73 | future: asyncio.Future[Any] = asyncio.Future() 74 | 75 | async with self._lock: 76 | self.pending_requests.append((params, future)) 77 | 78 | # Check if we should flush immediately 79 | if len(self.pending_requests) >= self.batch_size: 80 | await self._flush_batch() 81 | elif not self.batch_task or self.batch_task.done(): 82 | # Start a timer to flush the batch 83 | self.batch_task = asyncio.create_task(self._batch_timer()) 84 | 85 | return await future 86 | 87 | async def _batch_timer(self): 88 | """Timer that flushes the batch after timeout.""" 89 | await asyncio.sleep(self.batch_timeout) 90 | async with self._lock: 91 | await self._flush_batch() 92 | 93 | async def _flush_batch(self): 94 | """Process all pending requests as a batch.""" 95 | if not self.pending_requests: 96 | return 97 | 98 | # Extract current batch 99 | batch = self.pending_requests.copy() 100 | self.pending_requests.clear() 101 | 102 | # Cancel timer if running 103 | if self.batch_task and not self.batch_task.done(): 104 | self.batch_task.cancel() 105 | 106 | # Process batch 107 | try: 108 | params_list = [params for params, _ in batch] 109 | results = await self.batch_func(params_list) 110 | 111 | # Distribute results to futures 112 | for i, (_, future) in enumerate(batch): 113 | if not future.done(): 114 | if i < len(results): 115 | future.set_result(results[i]) 116 | else: 117 | future.set_exception( 118 
| Exception(f"No result for request at index {i}") 119 | ) 120 | except Exception as e: 121 | # Propagate error to all futures 122 | for _, future in batch: 123 | if not future.done(): 124 | future.set_exception(e) 125 | 126 | 127 | # Example usage for autocomplete batching 128 | async def batch_autocomplete_requests(requests: list[dict]) -> list[Any]: 129 | """Process multiple autocomplete requests in parallel. 130 | 131 | This is an example implementation that could be used to batch 132 | autocomplete requests more efficiently. 133 | """ 134 | from .articles.autocomplete import EntityRequest, autocomplete 135 | 136 | tasks = [] 137 | for req in requests: 138 | entity_req = EntityRequest(**req) 139 | tasks.append(autocomplete(entity_req)) 140 | 141 | return await asyncio.gather(*tasks) 142 | ``` -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- ```toml 1 | [project] 2 | name = "biomcp-python" 3 | version = "0.4.6" 4 | description = "Biomedical Model Context Protocol Server" 5 | authors = [{ name = "Ian Maurer", email = "[email protected]" }] 6 | readme = "README.md" 7 | keywords = ['python'] 8 | requires-python = ">=3.10,<4.0" 9 | classifiers = [ 10 | "Intended Audience :: Developers", 11 | "Programming Language :: Python", 12 | "Programming Language :: Python :: 3", 13 | "Programming Language :: Python :: 3.10", 14 | "Programming Language :: Python :: 3.11", 15 | "Programming Language :: Python :: 3.12", 16 | "Programming Language :: Python :: 3.13", 17 | "Topic :: Software Development :: Libraries :: Python Modules", 18 | ] 19 | dependencies = [ 20 | "certifi>=2025.1.31", 21 | "diskcache>=5.6.3", 22 | "httpx>=0.28.1", 23 | "mcp[cli]>=1.12.3,<2.0.0", 24 | "platformdirs>=4.3.6", 25 | "psutil>=7.0.0", 26 | "pydantic>=2.10.6", 27 | "python-dotenv>=1.0.0", 28 | "rich>=14.0.0", 29 | "typer>=0.15.2", 30 | "uvicorn>=0.34.2", 31 | 
"alphagenome>=0.1.0", 32 | ] 33 | 34 | [project.urls] 35 | Homepage = "https://genomoncology.com/biomcp/" 36 | Repository = "https://github.com/genomoncology/biomcp" 37 | Documentation = "https://genomoncology.com/biomcp/" 38 | 39 | [dependency-groups] 40 | dev = [ 41 | "pytest>=7.2.0", 42 | "pytest-xdist>=3.5.0", 43 | "pre-commit>=2.20.0", 44 | "tox-uv>=1.11.3", 45 | "deptry>=0.22.0", 46 | "mypy>=0.991", 47 | "pytest-cov>=4.0.0", 48 | "pytest-asyncio>=0.24.0", 49 | "ruff>=0.9.2", 50 | "mkdocs>=1.4.2", 51 | "mkdocs-material>=8.5.10", 52 | "mkdocstrings[python]>=0.26.1", 53 | "anyio>=4.8.0", 54 | # "ipython>=9.0.2", 55 | "pytest-bdd>=8.1.0", 56 | "tomlkit>=0.13.2", 57 | "assertpy>=1.1", 58 | "twine>=4.0.0", 59 | "pandas>=2.0.0", # Used for mocking AlphaGenome responses in tests 60 | "PyYAML>=6.0.0", # Used for mkdocs.yml parsing in scripts 61 | "pydantic-ai>=0.0.14", # For testing Pydantic AI integration 62 | ] 63 | 64 | [project.optional-dependencies] 65 | api = [ 66 | ] 67 | 68 | worker = [ 69 | "fastapi>=0.110.0", 70 | "starlette>=0.36.0", 71 | "uvicorn>=0.28.0", 72 | ] 73 | 74 | [build-system] 75 | requires = ["setuptools >= 61.0"] 76 | build-backend = "setuptools.build_meta" 77 | 78 | [tool.setuptools.package-data] 79 | biomcp = ["resources/*.md"] 80 | 81 | [project.scripts] 82 | biomcp = "biomcp.__main__:main" 83 | 84 | [tool.mypy] 85 | files = ["src"] 86 | ignore_missing_imports = true 87 | disallow_untyped_defs = false 88 | disallow_any_unimported = false 89 | no_implicit_optional = true 90 | check_untyped_defs = false 91 | warn_return_any = false 92 | warn_unused_ignores = true 93 | show_error_codes = true 94 | plugins = [ 95 | "pydantic.mypy" 96 | ] 97 | disable_error_code = [ 98 | "union-attr", 99 | "prop-decorator", 100 | ] 101 | 102 | [tool.pytest.ini_options] 103 | testpaths = ["tests"] 104 | addopts = "--import-mode=importlib" 105 | asyncio_mode = "auto" 106 | asyncio_default_fixture_loop_scope = "function" 107 | markers = [ 108 | "integration: marks 
tests as integration tests (deselect with '-m \"not integration\"')", 109 | ] 110 | filterwarnings = [ 111 | # Ignore protobuf version warnings from AlphaGenome 112 | "ignore:Protobuf gencode version.*is exactly one major version older.*:UserWarning", 113 | # Ignore false positive warning from pytest-xdist about coroutines 114 | # This occurs during parallel test execution when mock objects are cleaned up 115 | "ignore:coroutine 'search_trials_unified' was never awaited:RuntimeWarning", 116 | ] 117 | 118 | [tool.ruff] 119 | target-version = "py310" 120 | line-length = 79 121 | fix = true 122 | unsafe-fixes = true 123 | 124 | [tool.ruff.lint] 125 | select = [ 126 | # flake8-2020 127 | "YTT", 128 | # flake8-bandit 129 | "S", 130 | # flake8-bugbear 131 | "B", 132 | # flake8-builtins 133 | "A", 134 | # flake8-comprehensions 135 | "C4", 136 | # flake8-debugger 137 | "T10", 138 | # flake8-simplify 139 | "SIM", 140 | # isort 141 | "I", 142 | # mccabe 143 | "C90", 144 | # pycodestyle 145 | "E", "W", 146 | # pyflakes 147 | "F", 148 | # pygrep-hooks 149 | "PGH", 150 | # pyupgrade 151 | "UP", 152 | # ruff 153 | "RUF", 154 | ] 155 | ignore = [ 156 | # LineTooLong 157 | "E501", 158 | # DoNotAssignLambda 159 | "E731", 160 | # Consider unpacking 161 | "RUF005", 162 | # Union for type annotations 163 | "UP007", 164 | # Asserts are ok when I say they are ok. 
165 | "S101", 166 | ] 167 | 168 | [tool.ruff.lint.per-file-ignores] 169 | "tests/*" = ["S101"] 170 | "__init__.py" = ["I001"] 171 | "src/biomcp/variants/external.py" = ["C901"] # Complex API interactions are acceptable 172 | 173 | [tool.ruff.format] 174 | preview = true 175 | 176 | [tool.ruff.lint.flake8-bugbear] 177 | extend-immutable-calls = [ 178 | "fastapi.Depends", 179 | "fastapi.Query", 180 | "typer.Argument", 181 | "typer.Option", 182 | ] 183 | 184 | [tool.coverage.report] 185 | skip_empty = true 186 | 187 | [tool.coverage.run] 188 | branch = true 189 | source = ["src"] 190 | omit = [ 191 | "src/*/__main__.py", 192 | "src/*/server.py", 193 | "src/*/http_client.py", 194 | ] 195 | 196 | [tool.deptry] 197 | exclude = [ 198 | "example_scripts/python_sdk.py", 199 | "venv", 200 | ".venv", 201 | ".direnv", 202 | "tests", 203 | ".git", 204 | "build", 205 | "dist", 206 | "scripts", 207 | "spike", 208 | ] 209 | 210 | [tool.deptry.per_rule_ignores] 211 | DEP001 = ["alphagenome"] # Optional dependency, must be installed manually 212 | DEP002 = ["uvicorn"] 213 | DEP003 = ["biomcp", "alphagenome"] 214 | ``` -------------------------------------------------------------------------------- /docs/getting-started/01-quickstart-cli.md: -------------------------------------------------------------------------------- ```markdown 1 | # Quickstart: BioMCP CLI 2 | 3 | Get started with BioMCP in under 5 minutes! This guide walks you through installation and your first biomedical search. 
4 | 5 | ## Prerequisites 6 | 7 | - Python 3.10 or higher 8 | - [uv](https://docs.astral.sh/uv/) package manager (recommended) or pip 9 | 10 | ## Installation 11 | 12 | ### Option 1: Using uv (Recommended) 13 | 14 | ```bash 15 | # Install uv if you haven't already 16 | curl -LsSf https://astral.sh/uv/install.sh | sh 17 | 18 | # Install BioMCP 19 | uv tool install biomcp 20 | ``` 21 | 22 | ### Option 2: Using pip 23 | 24 | ```bash 25 | pip install biomcp 26 | ``` 27 | 28 | ## Your First Search 29 | 30 | Let's search for recent articles about BRAF mutations in melanoma: 31 | 32 | ```bash 33 | biomcp article search \ 34 | --gene BRAF --disease melanoma --limit 5 35 | ``` 36 | 37 | This command: 38 | 39 | - Searches PubMed/PubTator3 for articles 40 | - Filters by BRAF gene and melanoma disease 41 | - Returns the 5 most recent results 42 | - Automatically includes cBioPortal cancer genomics data 43 | - Includes preprints from bioRxiv/medRxiv by default 44 | 45 | ## Understanding the Output 46 | 47 | The search returns: 48 | 49 | 1. **cBioPortal Summary** (if gene specified): Cancer genomics data showing mutation frequencies and hotspots 50 | 2. 
**Article Results**: Each result includes: 51 | - Title and authors 52 | - Journal and publication date 53 | - PubMed ID and direct link 54 | - Abstract snippet 55 | - Annotated entities (genes, diseases, chemicals) 56 | 57 | ## Essential Commands 58 | 59 | ### Search Clinical Trials 60 | 61 | Find active trials for lung cancer: 62 | 63 | ```bash 64 | biomcp trial search \ 65 | --condition "lung cancer" \ 66 | --status RECRUITING --limit 5 67 | ``` 68 | 69 | ### Get Gene Information 70 | 71 | Retrieve details about the TP53 tumor suppressor: 72 | 73 | ```bash 74 | biomcp gene get TP53 75 | ``` 76 | 77 | ### Look Up Drug Information 78 | 79 | Get details about imatinib (Gleevec): 80 | 81 | ```bash 82 | biomcp drug get imatinib 83 | ``` 84 | 85 | ### Search for Genetic Variants 86 | 87 | Find pathogenic variants in the BRCA1 gene: 88 | 89 | ```bash 90 | biomcp variant search \ 91 | --gene BRCA1 --significance pathogenic \ 92 | --limit 5 93 | ``` 94 | 95 | ## Configure and Explore 96 | 97 | ### Set Up API Keys (Optional but Recommended) 98 | 99 | Some features require API keys for enhanced functionality: 100 | 101 | ```bash 102 | # For NCI clinical trials database 103 | export NCI_API_KEY="your-key-here" 104 | 105 | # For AlphaGenome variant predictions 106 | export ALPHAGENOME_API_KEY="your-key-here" 107 | 108 | # For additional cBioPortal features 109 | export CBIO_TOKEN="your-token-here" 110 | ``` 111 | 112 | See [Authentication and API Keys](03-authentication-and-api-keys.md) for detailed setup. 
113 | 114 | ### Explore Advanced Features 115 | 116 | - **Combine Multiple Filters**: 117 | 118 | ```bash 119 | biomcp article search \ 120 | --gene EGFR --disease "lung cancer" \ 121 | --chemical erlotinib 122 | ``` 123 | 124 | - **Use OR Logic in Keywords**: 125 | 126 | ```bash 127 | biomcp article search --gene BRAF --keyword "V600E|p.V600E|c.1799T>A" 128 | ``` 129 | 130 | - **Exclude Preprints**: 131 | ```bash 132 | biomcp article search --gene TP53 --no-preprints 133 | ``` 134 | 135 | ### Get Help 136 | 137 | View all available commands: 138 | 139 | ```bash 140 | biomcp --help 141 | ``` 142 | 143 | Get help for a specific command: 144 | 145 | ```bash 146 | biomcp article search --help 147 | ``` 148 | 149 | ## Common Use Cases 150 | 151 | ### 1. Research a Specific Mutation 152 | 153 | ```bash 154 | # Find articles about EGFR T790M resistance mutation 155 | biomcp article search --gene EGFR \ 156 | --keyword "T790M|p.T790M" \ 157 | --disease "lung cancer" 158 | ``` 159 | 160 | ### 2. Find Trials for a Patient 161 | 162 | ```bash 163 | # Active trials for HER2-positive breast cancer 164 | biomcp trial search \ 165 | --condition "breast cancer" \ 166 | --keyword "HER2 positive" \ 167 | --status RECRUITING 168 | ``` 169 | 170 | ### 3. 
Investigate Drug Mechanisms 171 | 172 | ```bash 173 | # Get information about pembrolizumab 174 | biomcp drug get pembrolizumab 175 | 176 | # Find articles about its use in melanoma 177 | biomcp article search --chemical pembrolizumab --disease melanoma 178 | ``` 179 | 180 | ## Troubleshooting 181 | 182 | ### Command Not Found 183 | 184 | If `biomcp` is not recognized: 185 | 186 | - Ensure your PATH includes the installation directory 187 | - Try running with full path: `~/.local/bin/biomcp` 188 | - Restart your terminal after installation 189 | 190 | ### No Results Found 191 | 192 | If searches return no results: 193 | 194 | - Check spelling of gene names (use official symbols) 195 | - Try broader search terms 196 | - Remove filters one by one to identify the constraint 197 | 198 | ### API Rate Limits 199 | 200 | If you encounter rate limit errors: 201 | 202 | - Add delays between requests 203 | - Consider setting up API keys for higher limits 204 | - Use the `--limit` parameter to reduce result count 205 | 206 | ## Next Steps 207 | 208 | Now that you've run your first searches, explore these resources: 209 | 210 | 1. **[Complete CLI Reference](../user-guides/01-command-line-interface.md)** - Comprehensive documentation for all commands and options 211 | 2. **[Claude Desktop Integration](02-claude-desktop-integration.md)** - Use BioMCP with AI assistants 212 | 3. **[Set up API Keys](03-authentication-and-api-keys.md)** - Enable advanced features with NCI, AlphaGenome, and cBioPortal 213 | 4. **[How-to Guides](../how-to-guides/01-find-articles-and-cbioportal-data.md)** - Step-by-step tutorials for complex research workflows 214 | 5. **[Deep Researcher Persona](../concepts/02-the-deep-researcher-persona.md)** - Learn about BioMCP's philosophy and methodology 215 | 216 | Happy researching! 
🧬🔬 217 | ``` -------------------------------------------------------------------------------- /tests/integration/test_preprints_integration.py: -------------------------------------------------------------------------------- ```python 1 | """Integration tests for preprint search functionality.""" 2 | 3 | import asyncio 4 | 5 | import pytest 6 | 7 | from biomcp.articles.preprints import ( 8 | BiorxivClient, 9 | EuropePMCClient, 10 | PreprintSearcher, 11 | ) 12 | from biomcp.articles.search import PubmedRequest 13 | from biomcp.core import PublicationState 14 | 15 | 16 | class TestBiorxivIntegration: 17 | """Integration tests for bioRxiv API.""" 18 | 19 | @pytest.mark.asyncio 20 | async def test_biorxiv_real_search(self): 21 | """Test real bioRxiv API search.""" 22 | client = BiorxivClient() 23 | 24 | # Try multiple search terms to find one with results 25 | search_terms = ["cancer", "gene", "cell", "protein", "RNA", "DNA"] 26 | results = [] 27 | successful_term = None 28 | 29 | for term in search_terms: 30 | results = await client.search(term) 31 | if len(results) > 0: 32 | successful_term = term 33 | break 34 | 35 | # If no results with any term, the API might be down or have no recent articles 36 | if len(results) == 0: 37 | pytest.skip( 38 | "No results found with any search term - API may be down or have no matching recent articles" 39 | ) 40 | 41 | # Check the structure of results 42 | first_result = results[0] 43 | assert first_result.doi is not None 44 | assert first_result.title is not None 45 | assert first_result.publication_state == PublicationState.PREPRINT 46 | assert "preprint" in first_result.journal.lower() 47 | 48 | print( 49 | f"Found {len(results)} bioRxiv results for term '{successful_term}'" 50 | ) 51 | print(f"First result: {first_result.title}") 52 | 53 | 54 | class TestEuropePMCIntegration: 55 | """Integration tests for Europe PMC API.""" 56 | 57 | @pytest.mark.asyncio 58 | async def test_europe_pmc_real_search(self): 59 | """Test real Europe 
PMC API search for preprints.""" 60 | client = EuropePMCClient() 61 | 62 | # Try multiple search terms to find one with results 63 | search_terms = [ 64 | "cancer", 65 | "gene", 66 | "cell", 67 | "protein", 68 | "SARS-CoV-2", 69 | "COVID", 70 | ] 71 | results = [] 72 | successful_term = None 73 | 74 | for term in search_terms: 75 | results = await client.search(term) 76 | if len(results) > 0: 77 | successful_term = term 78 | break 79 | 80 | # If no results with any term, the API might be down 81 | if len(results) == 0: 82 | pytest.skip( 83 | "No results found with any search term - Europe PMC API may be down" 84 | ) 85 | 86 | # Check the structure 87 | first_result = results[0] 88 | assert first_result.title is not None 89 | assert first_result.publication_state == PublicationState.PREPRINT 90 | 91 | print( 92 | f"Found {len(results)} Europe PMC preprint results for term '{successful_term}'" 93 | ) 94 | print(f"First result: {first_result.title}") 95 | if first_result.doi: 96 | print(f"DOI: {first_result.doi}") 97 | 98 | 99 | class TestPreprintSearcherIntegration: 100 | """Integration tests for combined preprint search.""" 101 | 102 | @pytest.mark.asyncio 103 | async def test_combined_search_real(self): 104 | """Test searching across both preprint sources.""" 105 | searcher = PreprintSearcher() 106 | 107 | # Try different search combinations 108 | search_configs = [ 109 | {"genes": ["TP53"], "diseases": ["cancer"]}, 110 | {"keywords": ["protein", "structure"]}, 111 | {"genes": ["BRAF"], "diseases": ["melanoma"]}, 112 | {"keywords": ["gene", "expression"]}, 113 | ] 114 | 115 | response = None 116 | successful_config = None 117 | 118 | for config in search_configs: 119 | request = PubmedRequest(**config) 120 | response = await searcher.search(request) 121 | if response.count > 0: 122 | successful_config = config 123 | break 124 | 125 | print(f"Total results: {response.count if response else 0}") 126 | 127 | # Check if we got any results 128 | if response and 
response.count > 0: 129 | # Check result structure 130 | first = response.results[0] 131 | assert first.title is not None 132 | assert first.publication_state == PublicationState.PREPRINT 133 | 134 | print(f"Successful search config: {successful_config}") 135 | print(f"First result: {first.title}") 136 | print(f"Date: {first.date}") 137 | print(f"Journal: {first.journal}") 138 | else: 139 | pytest.skip( 140 | "No results found with any search configuration - APIs may be down" 141 | ) 142 | 143 | 144 | if __name__ == "__main__": 145 | # Run the tests directly 146 | asyncio.run(TestBiorxivIntegration().test_biorxiv_real_search()) 147 | print("\n" + "=" * 50 + "\n") 148 | asyncio.run(TestEuropePMCIntegration().test_europe_pmc_real_search()) 149 | print("\n" + "=" * 50 + "\n") 150 | asyncio.run(TestPreprintSearcherIntegration().test_combined_search_real()) 151 | ``` -------------------------------------------------------------------------------- /docs/developer-guides/05-error-handling.md: -------------------------------------------------------------------------------- ```markdown 1 | # Error Handling Guide 2 | 3 | ## Overview 4 | 5 | BioMCP uses a consistent error handling pattern across all HTTP operations. This guide explains the error types, when they occur, and how to handle them. 6 | 7 | ## Error Structure 8 | 9 | All HTTP operations return a tuple: `(data, error)` where one is always `None`. 10 | 11 | ```python 12 | data, error = await http_client.request_api(...) 
13 | if error: 14 | # Handle error 15 | logger.error(f"Request failed: {error.code} - {error.message}") 16 | else: 17 | # Process data 18 | process_result(data) 19 | ``` 20 | 21 | ## Error Types 22 | 23 | ### Network Errors 24 | 25 | - **When**: Connection timeout, DNS resolution failure, network unreachable 26 | - **Error Code**: Various HTTP client exceptions 27 | - **Handling**: Retry with exponential backoff or fail gracefully 28 | 29 | ### HTTP Status Errors 30 | 31 | - **When**: Server returns 4xx or 5xx status codes 32 | - **Error Codes**: 33 | - `400-499`: Client errors (bad request, unauthorized, not found) 34 | - `500-599`: Server errors (internal error, service unavailable) 35 | - **Handling**: 36 | - 4xx: Fix request parameters or authentication 37 | - 5xx: Retry with backoff or use cached data 38 | 39 | ### Circuit Breaker Errors 40 | 41 | - **When**: Too many consecutive failures to a domain 42 | - **Error**: Circuit breaker opens to prevent cascading failures 43 | - **Handling**: Wait for recovery timeout or use alternative data source 44 | 45 | ### Offline Mode Errors 46 | 47 | - **When**: `BIOMCP_OFFLINE=true` and no cached data available 48 | - **Error**: Request blocked in offline mode 49 | - **Handling**: Use cached data only or inform user about offline status 50 | 51 | ### Parse Errors 52 | 53 | - **When**: Response is not valid JSON or doesn't match expected schema 54 | - **Error**: JSON decode error or validation error 55 | - **Handling**: Log error and treat as service issue 56 | 57 | ## Best Practices 58 | 59 | ### 1. Always Check Errors 60 | 61 | ```python 62 | # ❌ Bad - ignoring error 63 | data, _ = await http_client.request_api(...) 64 | process(data) # data might be None! 65 | 66 | # ✅ Good - checking error 67 | data, error = await http_client.request_api(...) 68 | if error: 69 | logger.warning(f"Failed to fetch data: {error}") 70 | return None 71 | process(data) 72 | ``` 73 | 74 | ### 2. 
Provide Context in Error Messages 75 | 76 | ```python 77 | # ❌ Bad - generic error 78 | if error: 79 | logger.error("Request failed") 80 | 81 | # ✅ Good - contextual error 82 | if error: 83 | logger.error(f"Failed to fetch gene {gene_id} from cBioPortal: {error.message}") 84 | ``` 85 | 86 | ### 3. Graceful Degradation 87 | 88 | ```python 89 | async def get_variant_with_fallback(variant_id: str): 90 | # Try primary source 91 | data, error = await primary_source.get_variant(variant_id) 92 | if not error: 93 | return data 94 | 95 | logger.warning(f"Primary source failed: {error}, trying secondary") 96 | 97 | # Try secondary source 98 | data, error = await secondary_source.get_variant(variant_id) 99 | if not error: 100 | return data 101 | 102 | # Use cached data as last resort 103 | return get_cached_variant(variant_id) 104 | ``` 105 | 106 | ### 4. User-Friendly Error Messages 107 | 108 | ```python 109 | def format_error_for_user(error: RequestError) -> str: 110 | if error.code >= 500: 111 | return "The service is temporarily unavailable. Please try again later." 112 | elif error.code == 404: 113 | return "The requested data was not found." 114 | elif error.code == 401: 115 | return "Authentication required. Please check your credentials." 116 | elif "OFFLINE" in str(error): 117 | return "You are in offline mode. Only cached data is available." 118 | else: 119 | return "An error occurred while fetching data. Please try again." 120 | ``` 121 | 122 | ## Testing Error Conditions 123 | 124 | ### 1. Simulate Network Errors 125 | 126 | ```python 127 | with patch("biomcp.http_client.call_http") as mock: 128 | mock.side_effect = Exception("Network error") 129 | data, error = await client.fetch_data() 130 | assert error is not None 131 | assert data is None 132 | ``` 133 | 134 | ### 2. 
Test Circuit Breaker 135 | 136 | ```python 137 | # Simulate multiple failures 138 | for _ in range(5): 139 | with patch("biomcp.http_client.call_http") as mock: 140 | mock.return_value = (500, "Server Error") 141 | await client.fetch_data() 142 | 143 | # Circuit should be open 144 | data, error = await client.fetch_data() 145 | assert error is not None 146 | assert "circuit" in error.message.lower() 147 | ``` 148 | 149 | ### 3. Test Offline Mode 150 | 151 | ```python 152 | with patch.dict(os.environ, {"BIOMCP_OFFLINE": "true"}): 153 | data, error = await client.fetch_data() 154 | # Should only return cached data or error 155 | ``` 156 | 157 | ## Common Patterns 158 | 159 | ### Retry with Backoff 160 | 161 | The centralized HTTP client automatically retries with exponential backoff for: 162 | 163 | - Network errors 164 | - 5xx server errors 165 | - Rate limit errors (429) 166 | 167 | ### Caching 168 | 169 | Failed requests don't overwrite cached data, ensuring availability during outages. 170 | 171 | ### Rate Limiting 172 | 173 | Requests are automatically rate-limited per domain to prevent overwhelming services. 174 | 175 | ## Debugging 176 | 177 | Enable debug logging to see all HTTP requests and errors: 178 | 179 | ```python 180 | import logging 181 | logging.getLogger("biomcp.http_client").setLevel(logging.DEBUG) 182 | ``` 183 | 184 | This will show: 185 | 186 | - All HTTP requests with URLs and methods 187 | - Response status codes and times 188 | - Error details and retry attempts 189 | - Circuit breaker state changes 190 | ``` -------------------------------------------------------------------------------- /src/biomcp/openfda/cache.py: -------------------------------------------------------------------------------- ```python 1 | """ 2 | Simple in-memory caching for OpenFDA API responses. 3 | 4 | This module provides a time-based cache to reduce API calls and improve performance. 5 | Cache entries expire after a configurable TTL (time-to-live). 
6 | """ 7 | 8 | import hashlib 9 | import json 10 | import logging 11 | import os 12 | from datetime import datetime, timedelta 13 | from typing import Any 14 | 15 | logger = logging.getLogger(__name__) 16 | 17 | # Cache configuration 18 | CACHE_TTL_MINUTES = int(os.environ.get("BIOMCP_FDA_CACHE_TTL", "15")) 19 | MAX_CACHE_SIZE = int(os.environ.get("BIOMCP_FDA_MAX_CACHE_SIZE", "100")) 20 | MAX_RESPONSE_SIZE = int( 21 | os.environ.get("BIOMCP_FDA_MAX_RESPONSE_SIZE", str(1024 * 1024)) 22 | ) # 1MB default 23 | 24 | # Global cache dictionary 25 | _cache: dict[str, tuple[Any, datetime]] = {} 26 | 27 | 28 | def _generate_cache_key(endpoint: str, params: dict[str, Any]) -> str: 29 | """ 30 | Generate a unique cache key for an API request. 31 | 32 | Args: 33 | endpoint: The API endpoint URL 34 | params: Query parameters 35 | 36 | Returns: 37 | A unique hash key for the request 38 | """ 39 | # Remove sensitive parameters before hashing 40 | safe_params = { 41 | k: v 42 | for k, v in params.items() 43 | if k.lower() not in ["api_key", "apikey", "key", "token", "secret"] 44 | } 45 | 46 | # Sort params for consistent hashing 47 | sorted_params = json.dumps(safe_params, sort_keys=True) 48 | combined = f"{endpoint}:{sorted_params}" 49 | 50 | # Use SHA256 for cache key 51 | return hashlib.sha256(combined.encode()).hexdigest() 52 | 53 | 54 | def get_cached_response( 55 | endpoint: str, params: dict[str, Any] 56 | ) -> dict[str, Any] | None: 57 | """ 58 | Retrieve a cached response if available and not expired. 
59 | 60 | Args: 61 | endpoint: The API endpoint URL 62 | params: Query parameters 63 | 64 | Returns: 65 | Cached response data or None if not found/expired 66 | """ 67 | cache_key = _generate_cache_key(endpoint, params) 68 | 69 | if cache_key in _cache: 70 | data, timestamp = _cache[cache_key] 71 | 72 | # Check if cache entry is still valid 73 | age = datetime.now() - timestamp 74 | if age < timedelta(minutes=CACHE_TTL_MINUTES): 75 | logger.debug( 76 | f"Cache hit for {endpoint} (age: {age.total_seconds():.1f}s)" 77 | ) 78 | return data 79 | else: 80 | # Remove expired entry 81 | del _cache[cache_key] 82 | logger.debug(f"Cache expired for {endpoint}") 83 | 84 | return None 85 | 86 | 87 | def set_cached_response( 88 | endpoint: str, params: dict[str, Any], response: dict[str, Any] 89 | ) -> None: 90 | """ 91 | Store a response in the cache. 92 | 93 | Args: 94 | endpoint: The API endpoint URL 95 | params: Query parameters 96 | response: Response data to cache 97 | """ 98 | # Check response size limit 99 | import json 100 | import sys 101 | 102 | # Better size estimation using JSON serialization 103 | try: 104 | response_json = json.dumps(response) 105 | response_size = len(response_json.encode("utf-8")) 106 | except (TypeError, ValueError): 107 | # If can't serialize, use sys.getsizeof 108 | response_size = sys.getsizeof(response) 109 | 110 | if response_size > MAX_RESPONSE_SIZE: 111 | logger.warning( 112 | f"Response too large to cache: {response_size} bytes > {MAX_RESPONSE_SIZE} bytes" 113 | ) 114 | return 115 | 116 | # Check cache size limit 117 | if len(_cache) >= MAX_CACHE_SIZE: 118 | # Remove oldest entries (simple FIFO) 119 | oldest_keys = sorted(_cache.keys(), key=lambda k: _cache[k][1])[ 120 | : len(_cache) - MAX_CACHE_SIZE + 1 121 | ] 122 | 123 | for key in oldest_keys: 124 | del _cache[key] 125 | 126 | logger.debug( 127 | f"Cache size limit reached, removed {len(oldest_keys)} entries" 128 | ) 129 | 130 | cache_key = _generate_cache_key(endpoint, params) 
131 | _cache[cache_key] = (response, datetime.now()) 132 | 133 | logger.debug(f"Cached response for {endpoint} (cache size: {len(_cache)})") 134 | 135 | 136 | def clear_cache() -> None: 137 | """Clear all cached responses.""" 138 | global _cache 139 | size = len(_cache) 140 | _cache = {} 141 | logger.info(f"Cleared FDA cache ({size} entries)") 142 | 143 | 144 | def get_cache_stats() -> dict[str, Any]: 145 | """ 146 | Get cache statistics. 147 | 148 | Returns: 149 | Dictionary with cache statistics 150 | """ 151 | now = datetime.now() 152 | valid_count = 0 153 | total_age = 0.0 154 | 155 | for _data, timestamp in _cache.values(): 156 | age = (now - timestamp).total_seconds() 157 | if age < CACHE_TTL_MINUTES * 60: 158 | valid_count += 1 159 | total_age += age 160 | 161 | avg_age = total_age / valid_count if valid_count > 0 else 0 162 | 163 | return { 164 | "total_entries": len(_cache), 165 | "valid_entries": valid_count, 166 | "expired_entries": len(_cache) - valid_count, 167 | "average_age_seconds": avg_age, 168 | "ttl_minutes": CACHE_TTL_MINUTES, 169 | "max_size": MAX_CACHE_SIZE, 170 | } 171 | 172 | 173 | def is_cacheable_request(endpoint: str, params: dict[str, Any]) -> bool: 174 | """ 175 | Determine if a request should be cached. 
176 | 177 | Args: 178 | endpoint: The API endpoint URL 179 | params: Query parameters 180 | 181 | Returns: 182 | True if the request should be cached 183 | """ 184 | # Don't cache if caching is disabled 185 | if CACHE_TTL_MINUTES <= 0: 186 | return False 187 | 188 | # Don't cache very large requests 189 | return params.get("limit", 0) <= 100 190 | ``` -------------------------------------------------------------------------------- /tests/tdd/drugs/test_drug_getter.py: -------------------------------------------------------------------------------- ```python 1 | """Unit tests for drug information retrieval.""" 2 | 3 | import json 4 | 5 | import pytest 6 | 7 | from biomcp.drugs.getter import get_drug 8 | 9 | 10 | class TestDrugGetter: 11 | """Test drug information retrieval.""" 12 | 13 | @pytest.fixture 14 | def mock_drug_response(self): 15 | """Mock drug response from MyChem.info.""" 16 | return { 17 | "_id": "CHEMBL941", 18 | "name": "Imatinib", 19 | "drugbank": { 20 | "id": "DB00619", 21 | "name": "Imatinib", 22 | "description": "Imatinib is a tyrosine kinase inhibitor...", 23 | "indication": "Treatment of chronic myeloid leukemia...", 24 | "mechanism_of_action": "Inhibits BCR-ABL tyrosine kinase...", 25 | "products": {"name": ["Gleevec", "Glivec"]}, 26 | }, 27 | "chembl": { 28 | "molecule_chembl_id": "CHEMBL941", 29 | "pref_name": "IMATINIB", 30 | }, 31 | "pubchem": {"cid": 5291}, 32 | "chebi": {"id": "CHEBI:45783", "name": "imatinib"}, 33 | "inchikey": "KTUFNOKKBVMGRW-UHFFFAOYSA-N", 34 | "formula": "C29H31N7O", 35 | } 36 | 37 | @pytest.mark.asyncio 38 | async def test_get_drug_by_name(self, monkeypatch, mock_drug_response): 39 | """Test getting drug by name.""" 40 | # Mock the API call 41 | call_count = 0 42 | responses = [ 43 | # Query response 44 | ({"hits": [{"_id": "CHEMBL941"}]}, None), 45 | # Get response 46 | (mock_drug_response, None), 47 | ] 48 | 49 | async def mock_request_api(url, request, method, domain): 50 | nonlocal call_count 51 | result = 
responses[call_count] 52 | call_count += 1 53 | return result 54 | 55 | monkeypatch.setattr("biomcp.http_client.request_api", mock_request_api) 56 | 57 | result = await get_drug("imatinib") 58 | 59 | assert "## Drug: Imatinib" in result 60 | assert "DrugBank ID**: DB00619" in result 61 | assert "ChEMBL ID**: CHEMBL941" in result 62 | assert "Formula**: C29H31N7O" in result 63 | assert "Trade Names**: Gleevec, Glivec" in result 64 | assert "External Links" in result 65 | assert "DrugBank](https://www.drugbank.ca/drugs/DB00619)" in result 66 | 67 | @pytest.mark.asyncio 68 | async def test_get_drug_by_id(self, monkeypatch, mock_drug_response): 69 | """Test getting drug by DrugBank ID.""" 70 | 71 | # Mock the API call 72 | async def mock_request_api(url, request, method, domain): 73 | return (mock_drug_response, None) 74 | 75 | monkeypatch.setattr("biomcp.http_client.request_api", mock_request_api) 76 | 77 | result = await get_drug("DB00619") 78 | 79 | assert "## Drug: Imatinib" in result 80 | assert "DrugBank ID**: DB00619" in result 81 | 82 | @pytest.mark.asyncio 83 | async def test_get_drug_json_output(self, monkeypatch, mock_drug_response): 84 | """Test getting drug with JSON output.""" 85 | 86 | # Mock the API call 87 | async def mock_request_api(url, request, method, domain): 88 | return (mock_drug_response, None) 89 | 90 | monkeypatch.setattr("biomcp.http_client.request_api", mock_request_api) 91 | 92 | result = await get_drug("DB00619", output_json=True) 93 | data = json.loads(result) 94 | 95 | assert data["drug_id"] == "CHEMBL941" 96 | assert data["name"] == "Imatinib" 97 | assert data["drugbank_id"] == "DB00619" 98 | assert ( 99 | data["_links"]["DrugBank"] 100 | == "https://www.drugbank.ca/drugs/DB00619" 101 | ) 102 | 103 | @pytest.mark.asyncio 104 | async def test_drug_not_found(self, monkeypatch): 105 | """Test drug not found.""" 106 | 107 | # Mock the API call 108 | async def mock_request_api(url, request, method, domain): 109 | return ({"hits": []}, 
None) 110 | 111 | monkeypatch.setattr("biomcp.http_client.request_api", mock_request_api) 112 | 113 | result = await get_drug("INVALID_DRUG_XYZ") 114 | 115 | assert "Drug 'INVALID_DRUG_XYZ' not found" in result 116 | 117 | @pytest.mark.asyncio 118 | async def test_drug_with_description_truncation(self, monkeypatch): 119 | """Test drug with long description gets truncated.""" 120 | long_desc = "A" * 600 121 | mock_response = { 122 | "_id": "TEST001", 123 | "name": "TestDrug", 124 | "drugbank": {"id": "DB99999", "description": long_desc}, 125 | } 126 | 127 | async def mock_request_api(url, request, method, domain): 128 | return (mock_response, None) 129 | 130 | monkeypatch.setattr("biomcp.http_client.request_api", mock_request_api) 131 | 132 | result = await get_drug("DB99999") 133 | 134 | assert "Description" in result 135 | assert "A" * 500 in result 136 | assert "..." in result # Truncation indicator 137 | 138 | @pytest.mark.asyncio 139 | async def test_drug_error_handling(self, monkeypatch): 140 | """Test error handling.""" 141 | 142 | # Mock the API call to raise an exception 143 | async def mock_request_api(url, request, method, domain): 144 | raise Exception("API error") 145 | 146 | monkeypatch.setattr("biomcp.http_client.request_api", mock_request_api) 147 | 148 | result = await get_drug("imatinib") 149 | 150 | # When an exception occurs, it's caught and the drug is reported as not found 151 | assert "Drug 'imatinib' not found in MyChem.info" in result 152 | ``` -------------------------------------------------------------------------------- /src/biomcp/drugs/getter.py: -------------------------------------------------------------------------------- ```python 1 | """Drug information retrieval from MyChem.info.""" 2 | 3 | import json 4 | import logging 5 | 6 | from ..integrations import BioThingsClient 7 | 8 | logger = logging.getLogger(__name__) 9 | 10 | 11 | def _add_drug_links(drug_info, result: dict) -> None: 12 | """Add external database links for the 
drug.""" 13 | links = {} 14 | 15 | if drug_info.drugbank_id: 16 | links["DrugBank"] = ( 17 | f"https://www.drugbank.ca/drugs/{drug_info.drugbank_id}" 18 | ) 19 | 20 | if drug_info.chembl_id: 21 | links["ChEMBL"] = ( 22 | f"https://www.ebi.ac.uk/chembl/compound_report_card/{drug_info.chembl_id}/" 23 | ) 24 | 25 | if drug_info.pubchem_cid: 26 | links["PubChem"] = ( 27 | f"https://pubchem.ncbi.nlm.nih.gov/compound/{drug_info.pubchem_cid}" 28 | ) 29 | 30 | if drug_info.chebi_id: 31 | chebi_id = drug_info.chebi_id.replace("CHEBI:", "") 32 | links["ChEBI"] = ( 33 | f"https://www.ebi.ac.uk/chebi/searchId.do?chebiId=CHEBI:{chebi_id}" 34 | ) 35 | 36 | if links: 37 | result["_links"] = links 38 | 39 | 40 | def _format_basic_info(drug_info, output_lines: list[str]) -> None: 41 | """Format basic drug information.""" 42 | if drug_info.formula: 43 | output_lines.append(f"- **Formula**: {drug_info.formula}") 44 | 45 | if drug_info.drugbank_id: 46 | output_lines.append(f"- **DrugBank ID**: {drug_info.drugbank_id}") 47 | 48 | if drug_info.chembl_id: 49 | output_lines.append(f"- **ChEMBL ID**: {drug_info.chembl_id}") 50 | 51 | if drug_info.pubchem_cid: 52 | output_lines.append(f"- **PubChem CID**: {drug_info.pubchem_cid}") 53 | 54 | if drug_info.chebi_id: 55 | output_lines.append(f"- **ChEBI ID**: {drug_info.chebi_id}") 56 | 57 | if drug_info.inchikey: 58 | output_lines.append(f"- **InChIKey**: {drug_info.inchikey}") 59 | 60 | 61 | def _format_clinical_info(drug_info, output_lines: list[str]) -> None: 62 | """Format clinical drug information.""" 63 | if drug_info.tradename: 64 | names = drug_info.tradename[:5] # Limit to first 5 65 | output_lines.append(f"- **Trade Names**: {', '.join(names)}") 66 | if len(drug_info.tradename) > 5: 67 | output_lines.append(f" (and {len(drug_info.tradename) - 5} more)") 68 | 69 | if drug_info.description: 70 | desc = drug_info.description[:500] 71 | if len(drug_info.description) > 500: 72 | desc += "..." 
73 | output_lines.append(f"\n### Description\n{desc}") 74 | 75 | if drug_info.indication: 76 | ind = drug_info.indication[:500] 77 | if len(drug_info.indication) > 500: 78 | ind += "..." 79 | output_lines.append(f"\n### Indication\n{ind}") 80 | 81 | if drug_info.mechanism_of_action: 82 | moa = drug_info.mechanism_of_action[:500] 83 | if len(drug_info.mechanism_of_action) > 500: 84 | moa += "..." 85 | output_lines.append(f"\n### Mechanism of Action\n{moa}") 86 | 87 | 88 | def _format_drug_output(drug_info, result: dict) -> None: 89 | """Format drug information for text output.""" 90 | output_lines = [f"## Drug: {drug_info.name or 'Unknown'}"] 91 | 92 | _format_basic_info(drug_info, output_lines) 93 | _format_clinical_info(drug_info, output_lines) 94 | 95 | if result.get("_links"): 96 | output_lines.append("\n### External Links") 97 | for name, url in result["_links"].items(): 98 | output_lines.append(f"- [{name}]({url})") 99 | 100 | result["_formatted"] = "\n".join(output_lines) 101 | 102 | 103 | async def get_drug(drug_id_or_name: str, output_json: bool = False) -> str: 104 | """Get drug information from MyChem.info. 105 | 106 | Args: 107 | drug_id_or_name: Drug ID (DrugBank, ChEMBL, etc.) 
or name 108 | output_json: Return JSON instead of formatted text 109 | 110 | Returns: 111 | Formatted drug information or JSON string 112 | """ 113 | try: 114 | client = BioThingsClient() 115 | drug_info = await client.get_drug_info(drug_id_or_name) 116 | 117 | if not drug_info: 118 | error_msg = f"Drug '{drug_id_or_name}' not found in MyChem.info" 119 | if output_json: 120 | return json.dumps({"error": error_msg}, indent=2) 121 | return error_msg 122 | 123 | # Build result dictionary 124 | result = drug_info.model_dump(by_alias=False, exclude_none=True) 125 | 126 | # Add external links 127 | _add_drug_links(drug_info, result) 128 | 129 | if output_json: 130 | return json.dumps(result, indent=2) 131 | 132 | # Format for text output 133 | _format_drug_output(drug_info, result) 134 | return result["_formatted"] 135 | 136 | except Exception as e: 137 | logger.error(f"Error getting drug info: {e}") 138 | error_msg = f"Error retrieving drug information: {e!s}" 139 | if output_json: 140 | return json.dumps({"error": error_msg}, indent=2) 141 | return error_msg 142 | 143 | 144 | # MCP tool function 145 | async def _drug_details(drug_id_or_name: str) -> str: 146 | """Get drug/chemical information from MyChem.info. 147 | 148 | This tool retrieves comprehensive drug information including: 149 | - Drug identifiers (DrugBank, ChEMBL, PubChem, etc.) 
150 | - Chemical properties (formula, InChIKey) 151 | - Trade names and synonyms 152 | - Clinical indications 153 | - Mechanism of action 154 | - Links to external databases 155 | 156 | Args: 157 | drug_id_or_name: Drug name (e.g., "aspirin") or ID (e.g., "DB00945", "CHEMBL25") 158 | 159 | Returns: 160 | Formatted drug information with external database links 161 | """ 162 | return await get_drug(drug_id_or_name, output_json=False) 163 | ``` -------------------------------------------------------------------------------- /src/biomcp/prefetch.py: -------------------------------------------------------------------------------- ```python 1 | """Prefetching system for common queries to improve performance. 2 | 3 | This module implements a prefetching mechanism that warms up caches with 4 | commonly searched biomedical entities during startup. This significantly 5 | improves response times for frequent queries. 6 | 7 | Key Features: 8 | - Prefetches common genes, diseases, and chemicals on startup 9 | - Runs asynchronously to avoid blocking server initialization 10 | - Includes timeout to prevent startup delays 11 | - Graceful error handling if prefetching fails 12 | 13 | The prefetching runs automatically when the MCP server starts via the 14 | lifespan hook in core.py. 15 | 16 | Configuration: 17 | The lists of entities to prefetch can be customized by modifying 18 | the COMMON_GENES, COMMON_DISEASES, and COMMON_CHEMICALS constants. 
19 | """ 20 | 21 | import asyncio 22 | import logging 23 | 24 | from .constants import ( 25 | PREFETCH_TIMEOUT, 26 | PREFETCH_TOP_CHEMICALS, 27 | PREFETCH_TOP_DISEASES, 28 | PREFETCH_TOP_GENES, 29 | ) 30 | 31 | logger = logging.getLogger(__name__) 32 | 33 | # Common genes that are frequently searched 34 | COMMON_GENES = [ 35 | "BRAF", 36 | "EGFR", 37 | "TP53", 38 | "KRAS", 39 | "ALK", 40 | "ROS1", 41 | "MET", 42 | "RET", 43 | "NTRK1", 44 | "NTRK2", 45 | "NTRK3", 46 | ] 47 | 48 | # Common cancer types 49 | COMMON_DISEASES = [ 50 | "lung cancer", 51 | "breast cancer", 52 | "colorectal cancer", 53 | "melanoma", 54 | "non-small cell lung cancer", 55 | "small cell lung cancer", 56 | ] 57 | 58 | # Common drug names 59 | COMMON_CHEMICALS = [ 60 | "osimertinib", 61 | "pembrolizumab", 62 | "nivolumab", 63 | "dabrafenib", 64 | "trametinib", 65 | "crizotinib", 66 | "alectinib", 67 | ] 68 | 69 | 70 | class PrefetchManager: 71 | """Manages prefetching of common queries.""" 72 | 73 | def __init__(self): 74 | self._prefetch_task: asyncio.Task | None = None 75 | self._is_prefetching = False 76 | self._prefetch_complete = False 77 | 78 | async def start_prefetching(self): 79 | """Start prefetching common queries in the background.""" 80 | if self._is_prefetching or self._prefetch_complete: 81 | return 82 | 83 | self._is_prefetching = True 84 | try: 85 | # Start prefetch task 86 | self._prefetch_task = asyncio.create_task( 87 | self._prefetch_common_queries() 88 | ) 89 | except Exception as e: 90 | logger.warning(f"Failed to start prefetching: {e}") 91 | self._is_prefetching = False 92 | 93 | async def _prefetch_common_queries(self): 94 | """Prefetch common queries to warm up the cache.""" 95 | try: 96 | # Import here to avoid circular imports 97 | from .articles.autocomplete import EntityRequest, autocomplete 98 | from .variants.cbioportal_search import CBioPortalSearchClient 99 | 100 | tasks = [] 101 | 102 | # Prefetch gene autocomplete 103 | for gene in COMMON_GENES[ 104 | 
:PREFETCH_TOP_GENES 105 | ]: # Limit to avoid overload 106 | request = EntityRequest(concept="gene", query=gene, limit=1) 107 | tasks.append(autocomplete(request)) 108 | 109 | # Prefetch disease autocomplete 110 | for disease in COMMON_DISEASES[:PREFETCH_TOP_DISEASES]: 111 | request = EntityRequest( 112 | concept="disease", query=disease, limit=1 113 | ) 114 | tasks.append(autocomplete(request)) 115 | 116 | # Prefetch chemical autocomplete 117 | for chemical in COMMON_CHEMICALS[:PREFETCH_TOP_CHEMICALS]: 118 | request = EntityRequest( 119 | concept="chemical", query=chemical, limit=1 120 | ) 121 | tasks.append(autocomplete(request)) 122 | 123 | # Execute all autocomplete prefetches 124 | if tasks: 125 | await asyncio.gather(*tasks, return_exceptions=True) 126 | 127 | # Prefetch cBioPortal summaries for common genes 128 | cbio_client = CBioPortalSearchClient() 129 | cbio_tasks = [] 130 | 131 | for gene in COMMON_GENES[:PREFETCH_TOP_GENES]: # Top genes 132 | cbio_tasks.append( 133 | cbio_client.get_gene_search_summary(gene, max_studies=5) 134 | ) 135 | 136 | if cbio_tasks: 137 | await asyncio.gather(*cbio_tasks, return_exceptions=True) 138 | 139 | logger.info("Prefetching completed successfully") 140 | 141 | except Exception as e: 142 | logger.warning(f"Error during prefetching: {e}") 143 | finally: 144 | self._is_prefetching = False 145 | self._prefetch_complete = True 146 | 147 | async def wait_for_prefetch(self, timeout: float = PREFETCH_TIMEOUT): 148 | """Wait for prefetch to complete with timeout.""" 149 | if not self._prefetch_task: 150 | return 151 | 152 | try: 153 | await asyncio.wait_for(self._prefetch_task, timeout=timeout) 154 | except asyncio.TimeoutError: 155 | # Prefetch taking too long, continue without waiting 156 | logger.debug("Prefetch timeout - continuing without waiting") 157 | except Exception as e: 158 | # Ignore prefetch errors 159 | logger.debug(f"Prefetch error ignored: {e}") 160 | 161 | 162 | # Global prefetch manager 163 | _prefetch_manager 
= PrefetchManager() 164 | 165 | 166 | async def start_prefetching(): 167 | """Start the prefetching process.""" 168 | await _prefetch_manager.start_prefetching() 169 | 170 | 171 | async def wait_for_prefetch(timeout: float = PREFETCH_TIMEOUT): 172 | """Wait for prefetch to complete.""" 173 | await _prefetch_manager.wait_for_prefetch(timeout) 174 | ``` -------------------------------------------------------------------------------- /docs/backend-services-reference/01-overview.md: -------------------------------------------------------------------------------- ```markdown 1 | # Backend Services Reference Overview 2 | 3 | BioMCP integrates with multiple biomedical databases and services to provide comprehensive research capabilities. This reference documents the underlying APIs and their capabilities. 4 | 5 | ## Service Categories 6 | 7 | ### Literature and Publications 8 | 9 | - **[PubTator3](06-pubtator3.md)**: Biomedical literature with entity annotations 10 | - **Europe PMC**: Preprints from bioRxiv and medRxiv 11 | 12 | ### Clinical Trials 13 | 14 | - **[ClinicalTrials.gov](04-clinicaltrials-gov.md)**: U.S. 
and international clinical trials registry 15 | - **[NCI CTS API](05-nci-cts-api.md)**: National Cancer Institute's enhanced trial search 16 | 17 | ### Biomedical Annotations 18 | 19 | - **[BioThings Suite](02-biothings-suite.md)**: 20 | - MyGene.info - Gene annotations 21 | - MyVariant.info - Variant annotations 22 | - MyDisease.info - Disease ontology 23 | - MyChem.info - Drug/chemical data 24 | 25 | ### Cancer Genomics 26 | 27 | - **[cBioPortal](03-cbioportal.md)**: Cancer genomics portal with mutation data 28 | - **TCGA**: The Cancer Genome Atlas (via MyVariant.info) 29 | 30 | ### Variant Effect Prediction 31 | 32 | - **[AlphaGenome](07-alphagenome.md)**: Google DeepMind's AI for regulatory predictions 33 | 34 | ## API Authentication 35 | 36 | | Service | Authentication Required | Type | Rate Limits | 37 | | ------------------ | ----------------------- | ------- | ------------------- | 38 | | PubTator3 | No | Public | 3 requests/second | 39 | | ClinicalTrials.gov | No | Public | 50,000 requests/day | 40 | | NCI CTS API | Yes | API Key | 1,000 requests/day | 41 | | BioThings APIs | No | Public | 1,000 requests/hour | 42 | | cBioPortal | Optional | Token | Higher with token | 43 | | AlphaGenome | Yes | API Key | Contact provider | 44 | 45 | ## Data Flow Architecture 46 | 47 | ``` 48 | User Query → BioMCP Tools → Backend APIs → Unified Response 49 | 50 | Example Flow: 51 | 1. User: "Find articles about BRAF mutations" 52 | 2. BioMCP: article_searcher tool 53 | 3. APIs Called: 54 | - PubTator3 (articles) 55 | - cBioPortal (mutation data) 56 | - Europe PMC (preprints) 57 | 4. 
Response: Integrated results with citations 58 | ``` 59 | 60 | ## Service Reliability 61 | 62 | ### Primary Services 63 | 64 | - **PubTator3**: 99.9% uptime, updated daily 65 | - **ClinicalTrials.gov**: 99.5% uptime, updated daily 66 | - **BioThings APIs**: 99.9% uptime, real-time data 67 | 68 | ### Fallback Strategies 69 | 70 | - Cache frequently accessed data 71 | - Implement exponential backoff 72 | - Use alternative endpoints when available 73 | 74 | ## Common Integration Patterns 75 | 76 | ### 1. Entity Recognition Enhancement 77 | 78 | ``` 79 | PubTator3 → Extract entities → BioThings → Get detailed annotations 80 | ``` 81 | 82 | ### 2. Variant to Trial Pipeline 83 | 84 | ``` 85 | MyVariant.info → Get gene → ClinicalTrials.gov → Find relevant trials 86 | ``` 87 | 88 | ### 3. Comprehensive Gene Analysis 89 | 90 | ``` 91 | MyGene.info → Basic info 92 | cBioPortal → Cancer mutations 93 | PubTator3 → Literature 94 | AlphaGenome → Predictions 95 | ``` 96 | 97 | ## Performance Considerations 98 | 99 | ### Response Times (typical) 100 | 101 | - PubTator3: 200-500ms 102 | - ClinicalTrials.gov: 300-800ms 103 | - BioThings APIs: 100-300ms 104 | - cBioPortal: 200-600ms 105 | - AlphaGenome: 1-3 seconds 106 | 107 | ### Optimization Strategies 108 | 109 | 1. **Batch requests** when APIs support it 110 | 2. **Cache static data** (gene names, ontologies) 111 | 3. **Parallelize independent** API calls 112 | 4. 
**Use pagination** for large result sets 113 | 114 | ## Error Handling 115 | 116 | ### Common Error Types 117 | 118 | - **Rate Limiting**: 429 errors, implement backoff 119 | - **Invalid Parameters**: 400 errors, validate inputs 120 | - **Service Unavailable**: 503 errors, retry with delay 121 | - **Authentication**: 401 errors, check API keys 122 | 123 | ### Error Response Format 124 | 125 | ```json 126 | { 127 | "error": { 128 | "code": "RATE_LIMIT_EXCEEDED", 129 | "message": "API rate limit exceeded", 130 | "retry_after": 3600 131 | } 132 | } 133 | ``` 134 | 135 | ## Data Formats 136 | 137 | ### Input Formats 138 | 139 | - **Identifiers**: HGNC symbols, rsIDs, NCT numbers, PMIDs 140 | - **Coordinates**: GRCh38 genomic positions 141 | - **Terms**: MeSH, MONDO, HPO ontologies 142 | 143 | ### Output Formats 144 | 145 | - **JSON**: Primary format for all APIs 146 | - **XML**: Available for some services 147 | - **TSV/CSV**: Export options for bulk data 148 | 149 | ## Update Frequencies 150 | 151 | | Service | Update Frequency | Data Lag | 152 | | ------------------ | ---------------- | ---------- | 153 | | PubTator3 | Daily | 1-2 days | 154 | | ClinicalTrials.gov | Daily | Real-time | 155 | | NCI CTS | Daily | 1 day | 156 | | BioThings | Real-time | Minutes | 157 | | cBioPortal | Quarterly | 3-6 months | 158 | 159 | ## Best Practices 160 | 161 | ### 1. API Key Management 162 | 163 | - Store keys securely 164 | - Rotate keys periodically 165 | - Monitor usage against limits 166 | 167 | ### 2. Error Recovery 168 | 169 | - Implement retry logic 170 | - Log failed requests 171 | - Provide fallback data 172 | 173 | ### 3. Data Validation 174 | 175 | - Verify gene symbols 176 | - Validate genomic coordinates 177 | - Check identifier formats 178 | 179 | ### 4. Performance 180 | 181 | - Cache when appropriate 182 | - Batch similar requests 183 | - Use appropriate page sizes 184 | 185 | ## Getting Started 186 | 187 | 1. Review individual service documentation 188 | 2. 
Obtain necessary API keys 189 | 3. Test endpoints with sample data 190 | 4. Implement error handling 191 | 5. Monitor usage and performance 192 | 193 | ## Support Resources 194 | 195 | - **PubTator3**: [Support Forum](https://www.ncbi.nlm.nih.gov/research/pubtator3/) 196 | - **ClinicalTrials.gov**: [Help Desk](https://clinicaltrials.gov/help) 197 | - **BioThings**: [Documentation](https://docs.biothings.io/) 198 | - **cBioPortal**: [User Guide](https://docs.cbioportal.org/) 199 | - **NCI**: [API Support](https://api.cancer.gov/support) 200 | ``` -------------------------------------------------------------------------------- /tests/tdd/test_concurrent_requests.py: -------------------------------------------------------------------------------- ```python 1 | """Test concurrent request handling in the HTTP client.""" 2 | 3 | import asyncio 4 | from unittest.mock import AsyncMock, patch 5 | 6 | import pytest 7 | 8 | from biomcp import http_client 9 | 10 | 11 | class TestConcurrentRequests: 12 | """Test concurrent request handling.""" 13 | 14 | @pytest.mark.asyncio 15 | async def test_concurrent_requests_same_domain(self): 16 | """Test multiple concurrent requests to the same domain.""" 17 | # Use patch instead of direct replacement 18 | with patch( 19 | "biomcp.http_client.call_http", new_callable=AsyncMock 20 | ) as mock_call: 21 | # Configure mock to return success 22 | mock_call.return_value = (200, '{"data": "response"}') 23 | 24 | # Make 10 concurrent requests with different URLs to avoid caching 25 | # and disable caching explicitly 26 | tasks = [ 27 | http_client.request_api( 28 | url=f"https://api.example.com/resource/{i}", 29 | request={}, 30 | domain="example", 31 | cache_ttl=0, # Disable caching 32 | ) 33 | for i in range(10) 34 | ] 35 | 36 | results = await asyncio.gather(*tasks) 37 | 38 | # All requests should succeed 39 | assert len(results) == 10 40 | for data, error in results: 41 | assert error is None 42 | assert data == {"data": "response"} 43 | 44 
| # Check that rate limiting was applied 45 | assert mock_call.call_count == 10 46 | 47 | @pytest.mark.asyncio 48 | async def test_concurrent_requests_different_domains(self): 49 | """Test concurrent requests to different domains.""" 50 | with patch( 51 | "biomcp.http_client.call_http", new_callable=AsyncMock 52 | ) as mock_call: 53 | # Return different responses based on URL 54 | async def side_effect(method, url, *args, **kwargs): 55 | if "domain1" in url: 56 | return (200, '{"source": "domain1"}') 57 | elif "domain2" in url: 58 | return (200, '{"source": "domain2"}') 59 | else: 60 | return (200, '{"source": "other"}') 61 | 62 | mock_call.side_effect = side_effect 63 | 64 | # Make requests to different domains 65 | tasks = [ 66 | http_client.request_api( 67 | "https://domain1.com/api", {}, domain="domain1" 68 | ), 69 | http_client.request_api( 70 | "https://domain2.com/api", {}, domain="domain2" 71 | ), 72 | http_client.request_api( 73 | "https://domain3.com/api", {}, domain="domain3" 74 | ), 75 | ] 76 | 77 | results = await asyncio.gather(*tasks) 78 | 79 | # Check results 80 | assert results[0][0] == {"source": "domain1"} 81 | assert results[1][0] == {"source": "domain2"} 82 | assert results[2][0] == {"source": "other"} 83 | 84 | @pytest.mark.asyncio 85 | async def test_concurrent_cache_access(self): 86 | """Test that concurrent requests properly use cache.""" 87 | with patch( 88 | "biomcp.http_client.call_http", new_callable=AsyncMock 89 | ) as mock_call: 90 | mock_call.return_value = (200, '{"data": "cached"}') 91 | 92 | # First request to populate cache 93 | await http_client.request_api( 94 | url="https://api.example.com/data", 95 | request={}, 96 | domain="example", 97 | cache_ttl=60, 98 | ) 99 | 100 | # Reset call count 101 | initial_calls = mock_call.call_count 102 | 103 | # Make 5 concurrent requests to same URL 104 | tasks = [ 105 | http_client.request_api( 106 | url="https://api.example.com/data", 107 | request={}, 108 | domain="example", 109 | 
cache_ttl=60, 110 | ) 111 | for _ in range(5) 112 | ] 113 | 114 | results = await asyncio.gather(*tasks) 115 | 116 | # All should get cached response 117 | assert len(results) == 5 118 | for data, _error in results: 119 | assert data == {"data": "cached"} 120 | 121 | # No additional HTTP calls should have been made 122 | assert mock_call.call_count == initial_calls 123 | 124 | @pytest.mark.asyncio 125 | async def test_concurrent_circuit_breaker(self): 126 | """Test circuit breaker behavior with concurrent failures.""" 127 | with patch( 128 | "biomcp.http_client.call_http", new_callable=AsyncMock 129 | ) as mock_call: 130 | # Simulate failures 131 | mock_call.return_value = (500, "Internal Server Error") 132 | 133 | # Make concurrent failing requests 134 | tasks = [ 135 | http_client.request_api( 136 | url=f"https://failing.com/api/{i}", 137 | request={}, 138 | domain="failing", 139 | ) 140 | for i in range(10) 141 | ] 142 | 143 | results = await asyncio.gather(*tasks, return_exceptions=True) 144 | 145 | # All should fail 146 | error_count = sum(1 for _, error in results if error is not None) 147 | assert error_count == 10 148 | 149 | # Circuit should be open now 150 | # Additional requests should fail immediately 151 | _, error = await http_client.request_api( 152 | url="https://failing.com/api/test", 153 | request={}, 154 | domain="failing", 155 | ) 156 | 157 | assert error is not None 158 | # Check that circuit breaker is preventing calls 159 | # (exact behavior depends on implementation details) 160 | ``` -------------------------------------------------------------------------------- /tests/tdd/test_connection_pool.py: -------------------------------------------------------------------------------- ```python 1 | """Tests for connection pool management.""" 2 | 3 | import asyncio 4 | import ssl 5 | import weakref 6 | from unittest.mock import patch 7 | 8 | import httpx 9 | import pytest 10 | 11 | from biomcp.connection_pool import ( 12 | EventLoopConnectionPools, 
13 | close_all_pools, 14 | get_connection_pool, 15 | ) 16 | 17 | 18 | @pytest.fixture 19 | def pool_manager(): 20 | """Create a fresh pool manager for testing.""" 21 | return EventLoopConnectionPools() 22 | 23 | 24 | @pytest.mark.asyncio 25 | async def test_get_pool_creates_new_pool(pool_manager): 26 | """Test that get_pool creates a new pool when none exists.""" 27 | timeout = httpx.Timeout(30) 28 | 29 | pool = await pool_manager.get_pool(verify=True, timeout=timeout) 30 | 31 | assert pool is not None 32 | assert isinstance(pool, httpx.AsyncClient) 33 | assert not pool.is_closed 34 | 35 | 36 | @pytest.mark.asyncio 37 | async def test_get_pool_reuses_existing_pool(pool_manager): 38 | """Test that get_pool reuses existing pools.""" 39 | timeout = httpx.Timeout(30) 40 | 41 | pool1 = await pool_manager.get_pool(verify=True, timeout=timeout) 42 | pool2 = await pool_manager.get_pool(verify=True, timeout=timeout) 43 | 44 | assert pool1 is pool2 45 | 46 | 47 | @pytest.mark.asyncio 48 | async def test_get_pool_different_verify_settings(pool_manager): 49 | """Test that different verify settings create different pools.""" 50 | timeout = httpx.Timeout(30) 51 | 52 | pool1 = await pool_manager.get_pool(verify=True, timeout=timeout) 53 | pool2 = await pool_manager.get_pool(verify=False, timeout=timeout) 54 | 55 | assert pool1 is not pool2 56 | 57 | 58 | @pytest.mark.asyncio 59 | async def test_get_pool_ssl_context(pool_manager): 60 | """Test pool creation with SSL context.""" 61 | ssl_context = ssl.create_default_context() 62 | timeout = httpx.Timeout(30) 63 | 64 | pool = await pool_manager.get_pool(verify=ssl_context, timeout=timeout) 65 | 66 | assert pool is not None 67 | assert isinstance(pool, httpx.AsyncClient) 68 | 69 | 70 | @pytest.mark.asyncio 71 | async def test_pool_cleanup_on_close_all(pool_manager): 72 | """Test that close_all properly closes all pools.""" 73 | timeout = httpx.Timeout(30) 74 | 75 | await pool_manager.get_pool(verify=True, timeout=timeout) 76 | await 
pool_manager.get_pool(verify=False, timeout=timeout) 77 | 78 | await pool_manager.close_all() 79 | 80 | # After close_all, pools should be cleared 81 | assert len(pool_manager._loop_pools) == 0 82 | 83 | 84 | @pytest.mark.asyncio 85 | async def test_no_event_loop_returns_single_use_client(pool_manager): 86 | """Test behavior when no event loop is running.""" 87 | with patch("asyncio.get_running_loop", side_effect=RuntimeError): 88 | timeout = httpx.Timeout(30) 89 | 90 | pool = await pool_manager.get_pool(verify=True, timeout=timeout) 91 | 92 | assert pool is not None 93 | # Single-use client should have no keepalive 94 | # Note: httpx client internal structure may vary 95 | 96 | 97 | @pytest.mark.asyncio 98 | async def test_pool_recreation_after_close(pool_manager): 99 | """Test that a new pool is created after the old one is closed.""" 100 | timeout = httpx.Timeout(30) 101 | 102 | pool1 = await pool_manager.get_pool(verify=True, timeout=timeout) 103 | await pool1.aclose() 104 | 105 | pool2 = await pool_manager.get_pool(verify=True, timeout=timeout) 106 | 107 | assert pool1 is not pool2 108 | assert pool1.is_closed 109 | assert not pool2.is_closed 110 | 111 | 112 | @pytest.mark.asyncio 113 | async def test_weak_reference_cleanup(): 114 | """Test that weak references are used for event loops.""" 115 | pool_manager = EventLoopConnectionPools() 116 | 117 | # Verify that the pool manager uses weak references 118 | assert isinstance(pool_manager._loop_pools, weakref.WeakKeyDictionary) 119 | 120 | # Create a pool 121 | timeout = httpx.Timeout(30) 122 | pool = await pool_manager.get_pool(verify=True, timeout=timeout) 123 | 124 | # Verify pool was created 125 | assert pool is not None 126 | 127 | # The current event loop should be in the weak key dict 128 | current_loop = asyncio.get_running_loop() 129 | assert current_loop in pool_manager._loop_pools 130 | 131 | 132 | @pytest.mark.asyncio 133 | async def test_global_get_connection_pool(): 134 | """Test the global 
get_connection_pool function.""" 135 | with patch.dict("os.environ", {"BIOMCP_USE_CONNECTION_POOL": "true"}): 136 | timeout = httpx.Timeout(30) 137 | 138 | pool = await get_connection_pool(verify=True, timeout=timeout) 139 | 140 | assert pool is not None 141 | assert isinstance(pool, httpx.AsyncClient) 142 | 143 | 144 | @pytest.mark.asyncio 145 | async def test_global_close_all_pools(): 146 | """Test the global close_all_pools function.""" 147 | # Create some pools 148 | timeout = httpx.Timeout(30) 149 | await get_connection_pool(verify=True, timeout=timeout) 150 | await get_connection_pool(verify=False, timeout=timeout) 151 | 152 | # Close all pools 153 | await close_all_pools() 154 | 155 | # Verify cleanup (this is implementation-specific) 156 | from biomcp.connection_pool import _pool_manager 157 | 158 | assert len(_pool_manager._loop_pools) == 0 159 | 160 | 161 | @pytest.mark.asyncio 162 | async def test_concurrent_pool_creation(pool_manager): 163 | """Test thread-safe pool creation under concurrent access.""" 164 | timeout = httpx.Timeout(30) 165 | 166 | async def get_pool(): 167 | return await pool_manager.get_pool(verify=True, timeout=timeout) 168 | 169 | # Create 10 concurrent requests for the same pool 170 | pools = await asyncio.gather(*[get_pool() for _ in range(10)]) 171 | 172 | # All should return the same pool instance 173 | assert all(pool is pools[0] for pool in pools) 174 | 175 | 176 | @pytest.mark.asyncio 177 | async def test_connection_pool_limits(): 178 | """Test that connection pools have proper limits set.""" 179 | pool_manager = EventLoopConnectionPools() 180 | timeout = httpx.Timeout(30) 181 | 182 | pool = await pool_manager.get_pool(verify=True, timeout=timeout) 183 | 184 | # Verify pool was created (actual limits are internal to httpx) 185 | assert pool is not None 186 | assert isinstance(pool, httpx.AsyncClient) 187 | ``` -------------------------------------------------------------------------------- 
/tests/data/myvariant/variants_part_braf_v600_multiple.json: -------------------------------------------------------------------------------- ```json 1 | [ 2 | { 3 | "_id": "chr7:g.140453136A>G", 4 | "_score": 19.419012, 5 | "cadd": { 6 | "_license": "http://bit.ly/2TIuab9", 7 | "phred": 21.2 8 | }, 9 | "chrom": "7", 10 | "clinvar": { 11 | "_license": "http://bit.ly/2SQdcI0", 12 | "rcv": { 13 | "clinical_significance": "Likely pathogenic" 14 | }, 15 | "variant_id": 376288 16 | }, 17 | "cosmic": { 18 | "_license": "http://bit.ly/2VMkY7R", 19 | "cosmic_id": "COSM18443" 20 | }, 21 | "dbnsfp": { 22 | "_license": "http://bit.ly/2VLnQBz", 23 | "genename": ["BRAF", "BRAF", "BRAF", "BRAF"], 24 | "hgvsc": ["c.620T>C", "c.1919T>C", "c.1799T>C"], 25 | "hgvsp": ["p.V600A", "p.Val600Ala", "p.Val640Ala", "p.Val207Ala"], 26 | "polyphen2": { 27 | "hdiv": { 28 | "pred": "B", 29 | "score": 0.207 30 | } 31 | } 32 | }, 33 | "dbsnp": { 34 | "_license": "http://bit.ly/2AqoLOc", 35 | "rsid": "rs113488022" 36 | }, 37 | "vcf": { 38 | "alt": "G", 39 | "position": "140453136", 40 | "ref": "A" 41 | } 42 | }, 43 | { 44 | "_id": "chr7:g.140453136A>T", 45 | "_score": 18.693962, 46 | "cadd": { 47 | "_license": "http://bit.ly/2TIuab9", 48 | "phred": 32 49 | }, 50 | "chrom": "7", 51 | "civic": { 52 | "_license": "http://bit.ly/2FqS871", 53 | "id": 12, 54 | "openCravatUrl": "https://run.opencravat.org/webapps/variantreport/index.html?alt_base=T&chrom=chr7&pos=140753336&ref_base=A" 55 | }, 56 | "clinvar": { 57 | "_license": "http://bit.ly/2SQdcI0", 58 | "rcv": [ 59 | { 60 | "clinical_significance": "Pathogenic" 61 | }, 62 | { 63 | "clinical_significance": "Pathogenic" 64 | }, 65 | { 66 | "clinical_significance": "Pathogenic" 67 | }, 68 | { 69 | "clinical_significance": "Pathogenic" 70 | }, 71 | { 72 | "clinical_significance": "Pathogenic" 73 | }, 74 | { 75 | "clinical_significance": "Pathogenic" 76 | }, 77 | { 78 | "clinical_significance": "Pathogenic" 79 | }, 80 | { 81 | "clinical_significance": 
"not provided" 82 | }, 83 | { 84 | "clinical_significance": "Likely pathogenic" 85 | }, 86 | { 87 | "clinical_significance": "Likely pathogenic" 88 | }, 89 | { 90 | "clinical_significance": "Likely pathogenic" 91 | }, 92 | { 93 | "clinical_significance": "Likely pathogenic" 94 | }, 95 | { 96 | "clinical_significance": "Likely pathogenic" 97 | }, 98 | { 99 | "clinical_significance": "Likely pathogenic" 100 | }, 101 | { 102 | "clinical_significance": "Likely pathogenic" 103 | }, 104 | { 105 | "clinical_significance": "Pathogenic" 106 | }, 107 | { 108 | "clinical_significance": "Pathogenic" 109 | }, 110 | { 111 | "clinical_significance": "Likely pathogenic" 112 | }, 113 | { 114 | "clinical_significance": "Pathogenic" 115 | }, 116 | { 117 | "clinical_significance": "Likely pathogenic" 118 | }, 119 | { 120 | "clinical_significance": "Likely pathogenic" 121 | }, 122 | { 123 | "clinical_significance": "Pathogenic" 124 | }, 125 | { 126 | "clinical_significance": "Pathogenic" 127 | }, 128 | { 129 | "clinical_significance": "Pathogenic" 130 | }, 131 | { 132 | "clinical_significance": "Pathogenic" 133 | }, 134 | { 135 | "clinical_significance": "Likely pathogenic" 136 | }, 137 | { 138 | "clinical_significance": "Pathogenic" 139 | }, 140 | { 141 | "clinical_significance": "Pathogenic" 142 | }, 143 | { 144 | "clinical_significance": "Likely pathogenic" 145 | } 146 | ], 147 | "variant_id": 13961 148 | }, 149 | "cosmic": { 150 | "_license": "http://bit.ly/2VMkY7R", 151 | "cosmic_id": "COSM476" 152 | }, 153 | "dbnsfp": { 154 | "_license": "http://bit.ly/2VLnQBz", 155 | "genename": ["BRAF", "BRAF", "BRAF", "BRAF"], 156 | "hgvsc": ["c.620T>A", "c.1919T>A", "c.1799T>A"], 157 | "hgvsp": ["p.Val640Glu", "p.Val207Glu", "p.Val600Glu", "p.V600E"], 158 | "polyphen2": { 159 | "hdiv": { 160 | "pred": "D", 161 | "score": 0.971 162 | } 163 | } 164 | }, 165 | "dbsnp": { 166 | "_license": "http://bit.ly/2AqoLOc", 167 | "rsid": "rs113488022" 168 | }, 169 | "exac": { 170 | "_license": 
"http://bit.ly/2H9c4hg", 171 | "af": 1.647e-5 172 | }, 173 | "gnomad_exome": { 174 | "_license": "http://bit.ly/2I1cl1I", 175 | "af": { 176 | "af": 3.97994e-6 177 | } 178 | }, 179 | "vcf": { 180 | "alt": "T", 181 | "position": "140453136", 182 | "ref": "A" 183 | } 184 | }, 185 | { 186 | "_id": "chr7:g.140453136A>C", 187 | "_score": 18.476965, 188 | "cadd": { 189 | "_license": "http://bit.ly/2TIuab9", 190 | "phred": 26.0 191 | }, 192 | "chrom": "7", 193 | "clinvar": { 194 | "_license": "http://bit.ly/2SQdcI0", 195 | "rcv": [ 196 | { 197 | "clinical_significance": "not provided" 198 | }, 199 | { 200 | "clinical_significance": "Pathogenic" 201 | }, 202 | { 203 | "clinical_significance": "Pathogenic" 204 | }, 205 | { 206 | "clinical_significance": "Uncertain significance" 207 | } 208 | ], 209 | "variant_id": 40389 210 | }, 211 | "cosmic": { 212 | "_license": "http://bit.ly/2VMkY7R", 213 | "cosmic_id": "COSM6137" 214 | }, 215 | "dbnsfp": { 216 | "_license": "http://bit.ly/2VLnQBz", 217 | "genename": ["BRAF", "BRAF", "BRAF", "BRAF"], 218 | "hgvsc": ["c.1919T>G", "c.1799T>G", "c.620T>G"], 219 | "hgvsp": ["p.Val640Gly", "p.Val207Gly", "p.Val600Gly", "p.V600G"], 220 | "polyphen2": { 221 | "hdiv": { 222 | "pred": "P", 223 | "score": 0.822 224 | } 225 | } 226 | }, 227 | "dbsnp": { 228 | "_license": "http://bit.ly/2AqoLOc", 229 | "rsid": "rs113488022" 230 | }, 231 | "vcf": { 232 | "alt": "C", 233 | "position": "140453136", 234 | "ref": "A" 235 | } 236 | } 237 | ] 238 | ``` -------------------------------------------------------------------------------- /src/biomcp/rate_limiter.py: -------------------------------------------------------------------------------- ```python 1 | """Rate limiting implementation for BioMCP API calls.""" 2 | 3 | import asyncio 4 | import time 5 | from collections import defaultdict 6 | from contextlib import asynccontextmanager 7 | 8 | from .constants import ( 9 | DEFAULT_BURST_SIZE, 10 | DEFAULT_RATE_LIMIT_PER_SECOND, 11 | ) 12 | from .exceptions 
import BioMCPError 13 | 14 | 15 | class RateLimitExceeded(BioMCPError): 16 | """Raised when rate limit is exceeded.""" 17 | 18 | def __init__(self, domain: str, limit: int, window: int): 19 | message = f"Rate limit exceeded for {domain}: {limit} requests per {window} seconds" 20 | super().__init__( 21 | message, {"domain": domain, "limit": limit, "window": window} 22 | ) 23 | 24 | 25 | class RateLimiter: 26 | """Token bucket rate limiter implementation.""" 27 | 28 | def __init__( 29 | self, 30 | requests_per_second: float = DEFAULT_RATE_LIMIT_PER_SECOND, 31 | burst_size: int = DEFAULT_BURST_SIZE, 32 | ): 33 | """Initialize rate limiter. 34 | 35 | Args: 36 | requests_per_second: Sustained request rate 37 | burst_size: Maximum burst capacity 38 | """ 39 | self.rate = requests_per_second 40 | self.burst_size = burst_size 41 | self.tokens = float(burst_size) 42 | self.last_update = time.monotonic() 43 | self._lock = asyncio.Lock() 44 | 45 | async def acquire(self, tokens: int = 1) -> None: 46 | """Acquire tokens from the bucket.""" 47 | async with self._lock: 48 | now = time.monotonic() 49 | elapsed = now - self.last_update 50 | self.last_update = now 51 | 52 | # Add tokens based on elapsed time 53 | self.tokens = min( 54 | self.burst_size, self.tokens + elapsed * self.rate 55 | ) 56 | 57 | if self.tokens < tokens: 58 | # Calculate wait time 59 | wait_time = (tokens - self.tokens) / self.rate 60 | await asyncio.sleep(wait_time) 61 | self.tokens = 0 62 | else: 63 | self.tokens -= tokens 64 | 65 | @asynccontextmanager 66 | async def limit(self): 67 | """Context manager for rate limiting.""" 68 | await self.acquire() 69 | yield 70 | 71 | 72 | class DomainRateLimiter: 73 | """Rate limiter with per-domain limits.""" 74 | 75 | def __init__(self, default_rps: float = 10.0, default_burst: int = 20): 76 | """Initialize domain rate limiter. 
77 | 78 | Args: 79 | default_rps: Default requests per second 80 | default_burst: Default burst size 81 | """ 82 | self.default_rps = default_rps 83 | self.default_burst = default_burst 84 | self.limiters: dict[str, RateLimiter] = {} 85 | self.domain_configs = { 86 | "article": {"rps": 20.0, "burst": 40}, # PubMed can handle more 87 | "trial": {"rps": 10.0, "burst": 20}, # ClinicalTrials.gov standard 88 | "thinking": {"rps": 50.0, "burst": 100}, # Local processing 89 | "mygene": {"rps": 10.0, "burst": 20}, # MyGene.info 90 | "mydisease": {"rps": 10.0, "burst": 20}, # MyDisease.info 91 | "mychem": {"rps": 10.0, "burst": 20}, # MyChem.info 92 | "myvariant": {"rps": 15.0, "burst": 30}, # MyVariant.info 93 | } 94 | 95 | def get_limiter(self, domain: str) -> RateLimiter: 96 | """Get or create rate limiter for domain.""" 97 | if domain not in self.limiters: 98 | config = self.domain_configs.get(domain, {}) 99 | rps = config.get("rps", self.default_rps) 100 | burst = config.get("burst", self.default_burst) 101 | self.limiters[domain] = RateLimiter(rps, int(burst)) 102 | return self.limiters[domain] 103 | 104 | @asynccontextmanager 105 | async def limit(self, domain: str): 106 | """Rate limit context manager for a domain.""" 107 | limiter = self.get_limiter(domain) 108 | async with limiter.limit(): 109 | yield 110 | 111 | 112 | class SlidingWindowRateLimiter: 113 | """Sliding window rate limiter for user/IP based limiting.""" 114 | 115 | def __init__(self, requests: int = 100, window_seconds: int = 60): 116 | """Initialize sliding window rate limiter. 
117 | 118 | Args: 119 | requests: Maximum requests per window 120 | window_seconds: Window size in seconds 121 | """ 122 | self.max_requests = requests 123 | self.window_seconds = window_seconds 124 | self.requests: dict[str, list[float]] = defaultdict(list) 125 | self._lock = asyncio.Lock() 126 | 127 | async def check_limit(self, key: str) -> bool: 128 | """Check if request is allowed for key.""" 129 | async with self._lock: 130 | now = time.time() 131 | cutoff = now - self.window_seconds 132 | 133 | # Remove old requests 134 | self.requests[key] = [ 135 | req_time 136 | for req_time in self.requests[key] 137 | if req_time > cutoff 138 | ] 139 | 140 | # Check limit 141 | if len(self.requests[key]) >= self.max_requests: 142 | return False 143 | 144 | # Add current request 145 | self.requests[key].append(now) 146 | return True 147 | 148 | async def acquire(self, key: str) -> None: 149 | """Acquire permission to make request.""" 150 | if not await self.check_limit(key): 151 | raise RateLimitExceeded( 152 | key, self.max_requests, self.window_seconds 153 | ) 154 | 155 | 156 | # Global instances 157 | domain_limiter = DomainRateLimiter() 158 | user_limiter = SlidingWindowRateLimiter( 159 | requests=1000, window_seconds=3600 160 | ) # 1000 req/hour 161 | 162 | 163 | async def rate_limit_domain(domain: str) -> None: 164 | """Apply rate limiting for a domain.""" 165 | async with domain_limiter.limit(domain): 166 | pass 167 | 168 | 169 | async def rate_limit_user(user_id: str | None = None) -> None: 170 | """Apply rate limiting for a user.""" 171 | if user_id: 172 | await user_limiter.acquire(user_id) 173 | ``` -------------------------------------------------------------------------------- /src/biomcp/http_client_simple.py: -------------------------------------------------------------------------------- ```python 1 | """Helper functions for simpler HTTP client operations.""" 2 | 3 | import asyncio 4 | import contextlib 5 | import json 6 | import os 7 | import ssl 8 | 9 | 
import httpx

# Global connection pools per SSL-verification setting. Each distinct
# `verify` value gets its own pooled AsyncClient so TLS contexts are
# never mixed between requests.
_connection_pools: dict[str, httpx.AsyncClient] = {}
_pool_lock = asyncio.Lock()


def close_all_pools() -> None:
    """Close all connection pools. Useful for cleanup in tests.

    Closing is best-effort: when an event loop is running, each close is
    scheduled as a task; without a loop, the underlying transport is
    closed synchronously.
    """
    for pool in _connection_pools.values():
        if pool and not pool.is_closed:
            try:
                # Keep a reference to the task so it is not garbage
                # collected before it completes.
                close_task = asyncio.create_task(pool.aclose())
                close_task.add_done_callback(lambda t: None)
            except RuntimeError:
                # No running event loop: fall back to closing the
                # transport synchronously (private httpx API, but the
                # only synchronous option available here).
                pool._transport.close()
    _connection_pools.clear()


async def get_connection_pool(
    verify: ssl.SSLContext | str | bool,
    timeout: httpx.Timeout,
) -> httpx.AsyncClient:
    """Get or create a shared connection pool for the given SSL context.

    Args:
        verify: SSL verification setting (context object, CA path, or bool).
        timeout: Timeout configuration applied when a new pool is created.

    Returns:
        A shared ``httpx.AsyncClient`` with keep-alive connection pooling.
    """
    # Key pools by the verify setting; SSLContext objects are keyed by
    # identity since they are not reliably hashable/comparable by value.
    if isinstance(verify, ssl.SSLContext):
        pool_key = f"ssl_{id(verify)}"
    else:
        pool_key = str(verify)

    async with _pool_lock:
        pool = _connection_pools.get(pool_key)
        if pool is None or pool.is_closed:
            # Create a new connection pool with optimized settings
            pool = httpx.AsyncClient(
                verify=verify,
                http2=False,  # HTTP/2 can add overhead for simple requests
                timeout=timeout,
                limits=httpx.Limits(
                    max_keepalive_connections=20,  # Reuse connections
                    max_connections=100,  # Total connection limit
                    keepalive_expiry=30,  # Keep connections alive for 30s
                ),
                # We handle retries at a higher level
                transport=httpx.AsyncHTTPTransport(
                    retries=0,
                ),
            )
            _connection_pools[pool_key] = pool
        return pool


async def execute_http_request(  # noqa: C901
    method: str,
    url: str,
    params: dict,
    verify: ssl.SSLContext | str | bool,
    headers: dict[str, str] | None = None,
) -> tuple[int, str]:
    """Execute the actual HTTP request using connection pooling.

    Args:
        method: HTTP method (GET or POST)
        url: Target URL
        params: Request parameters (mutated: a ``_headers`` key, if
            present, is popped and merged into the request headers)
        verify: SSL verification settings
        headers: Optional custom headers

    Returns:
        Tuple of (status_code, response_text)

    Raises:
        ConnectionError: For connection failures
        TimeoutError: For timeout errors
    """
    from .constants import HTTP_TIMEOUT_SECONDS

    try:
        # Extract custom headers from params if present
        custom_headers = headers or {}
        if "_headers" in params:
            with contextlib.suppress(json.JSONDecodeError, TypeError):
                custom_headers.update(json.loads(params.pop("_headers")))

        # Use the configured timeout from constants
        timeout = httpx.Timeout(HTTP_TIMEOUT_SECONDS)

        # Connection pooling is opt-out via BIOMCP_USE_CONNECTION_POOL
        use_pool = (
            os.getenv("BIOMCP_USE_CONNECTION_POOL", "true").lower() == "true"
        )

        if use_pool:
            try:
                # Use the shared connection pool manager.
                # BUGFIX: this was `from ..connection_pool import ...`,
                # which escapes the top-level `biomcp` package, always
                # raises ImportError, and was silently swallowed by the
                # except below - so pooling was never actually used.
                from .connection_pool import get_connection_pool as get_pool

                client = await get_pool(verify, timeout)
                should_close = False
            except Exception:
                # Fallback to creating a new client
                client = httpx.AsyncClient(
                    verify=verify, http2=False, timeout=timeout
                )
                should_close = True
        else:
            # Create a new client for each request
            client = httpx.AsyncClient(
                verify=verify, http2=False, timeout=timeout
            )
            should_close = True

        try:
            # Make the request
            if method.upper() == "GET":
                resp = await client.get(
                    url, params=params, headers=custom_headers
                )
            elif method.upper() == "POST":
                resp = await client.post(
                    url, json=params, headers=custom_headers
                )
            else:
                from .constants import HTTP_ERROR_CODE_UNSUPPORTED_METHOD

                return (
                    HTTP_ERROR_CODE_UNSUPPORTED_METHOD,
                    f"Unsupported method {method}",
                )

            # Normalize an empty body to "{}" so callers can always
            # json.loads the response text.
            if not resp.text:
                return resp.status_code, "{}"

            return resp.status_code, resp.text
        finally:
            # Only close if we created a new client
            if should_close:
                await client.aclose()

    except httpx.ConnectError as exc:
        raise ConnectionError(f"Failed to connect to {url}: {exc}") from exc
    except httpx.TimeoutException as exc:
        raise TimeoutError(f"Request to {url} timed out: {exc}") from exc
    except httpx.HTTPError as exc:
        error_msg = str(exc) if str(exc) else "Network connectivity error"
        from .constants import HTTP_ERROR_CODE_NETWORK

        return HTTP_ERROR_CODE_NETWORK, error_msg
This provides: 6 | 7 | - Consistent error handling and retry logic 8 | - Request/response caching 9 | - Rate limiting per domain 10 | - Circuit breaker for fault tolerance 11 | - Offline mode support 12 | - Comprehensive endpoint tracking 13 | 14 | ## Migration from Direct HTTP Libraries 15 | 16 | ### Before (Direct httpx usage): 17 | 18 | ```python 19 | import httpx 20 | 21 | async def fetch_gene(gene: str): 22 | async with httpx.AsyncClient() as client: 23 | response = await client.get(f"https://api.example.com/genes/{gene}") 24 | response.raise_for_status() 25 | return response.json() 26 | ``` 27 | 28 | ### After (Centralized client): 29 | 30 | ```python 31 | from biomcp import http_client 32 | 33 | async def fetch_gene(gene: str): 34 | data, error = await http_client.request_api( 35 | url=f"https://api.example.com/genes/{gene}", 36 | request={}, 37 | domain="example" 38 | ) 39 | if error: 40 | # Handle error consistently 41 | return None 42 | return data 43 | ``` 44 | 45 | ## Error Handling 46 | 47 | The centralized client uses a consistent error handling pattern: 48 | 49 | ```python 50 | result, error = await http_client.request_api(...) 51 | 52 | if error: 53 | # error is a RequestError object with: 54 | # - error.code: HTTP status code or error type 55 | # - error.message: Human-readable error message 56 | # - error.details: Additional context 57 | logger.error(f"Request failed: {error.message}") 58 | return None # or handle appropriately 59 | ``` 60 | 61 | ### Error Handling Guidelines 62 | 63 | 1. **For optional data**: Return `None` when the data is not critical 64 | 2. **For required data**: Raise an exception or return an error to the caller 65 | 3. **For batch operations**: Collect errors and report at the end 66 | 4. 
**For user-facing operations**: Provide clear, actionable error messages 67 | 68 | ## Creating Domain-Specific Adapters 69 | 70 | For complex APIs, create an adapter class: 71 | 72 | ```python 73 | from biomcp import http_client 74 | from biomcp.http_client import RequestError 75 | 76 | class MyAPIAdapter: 77 | """Adapter for MyAPI using centralized HTTP client.""" 78 | 79 | def __init__(self): 80 | self.base_url = "https://api.example.com" 81 | 82 | async def get_resource(self, resource_id: str) -> tuple[dict | None, RequestError | None]: 83 | """Fetch a resource by ID. 84 | 85 | Returns: 86 | Tuple of (data, error) where one is always None 87 | """ 88 | return await http_client.request_api( 89 | url=f"{self.base_url}/resources/{resource_id}", 90 | request={}, 91 | domain="example", 92 | endpoint_key="example_resources" 93 | ) 94 | ``` 95 | 96 | ## Configuration 97 | 98 | ### Cache TTL (Time To Live) 99 | 100 | ```python 101 | # Cache for 1 hour (3600 seconds) 102 | data, error = await http_client.request_api( 103 | url=url, 104 | request=request, 105 | cache_ttl=3600 106 | ) 107 | 108 | # Disable caching for this request 109 | data, error = await http_client.request_api( 110 | url=url, 111 | request=request, 112 | cache_ttl=0 113 | ) 114 | ``` 115 | 116 | ### Rate Limiting 117 | 118 | Rate limits are configured per domain in `http_client.py`: 119 | 120 | ```python 121 | # Default rate limits 122 | rate_limits = { 123 | "ncbi.nlm.nih.gov": 20, # 20 requests/second 124 | "clinicaltrials.gov": 10, # 10 requests/second 125 | "myvariant.info": 1000/3600, # 1000 requests/hour 126 | } 127 | ``` 128 | 129 | ### Circuit Breaker 130 | 131 | The circuit breaker prevents cascading failures: 132 | 133 | - **Closed**: Normal operation 134 | - **Open**: Failing fast after threshold exceeded 135 | - **Half-Open**: Testing if service recovered 136 | 137 | Configure thresholds: 138 | 139 | ```python 140 | CIRCUIT_BREAKER_FAILURE_THRESHOLD = 5 # Open after 5 failures 141 | 
CIRCUIT_BREAKER_RECOVERY_TIMEOUT = 60 # Try again after 60 seconds 142 | ``` 143 | 144 | ## Offline Mode 145 | 146 | Enable offline mode to only serve cached responses: 147 | 148 | ```bash 149 | export BIOMCP_OFFLINE=true 150 | biomcp run 151 | ``` 152 | 153 | In offline mode: 154 | 155 | - Only cached responses are returned 156 | - No external HTTP requests are made 157 | - Missing cache entries return None with appropriate error 158 | 159 | ## Performance Tuning 160 | 161 | ### Connection Pooling 162 | 163 | The HTTP client maintains connection pools per domain: 164 | 165 | ```python 166 | # Configure in http_client_simple.py 167 | limits = httpx.Limits( 168 | max_keepalive_connections=20, 169 | max_connections=100, 170 | keepalive_expiry=30 171 | ) 172 | ``` 173 | 174 | ### Concurrent Requests 175 | 176 | For parallel requests to the same API: 177 | 178 | ```python 179 | import asyncio 180 | 181 | # Fetch multiple resources concurrently 182 | tasks = [ 183 | http_client.request_api(f"/resource/{i}", {}, domain="example") 184 | for i in range(10) 185 | ] 186 | results = await asyncio.gather(*tasks) 187 | ``` 188 | 189 | ## Monitoring and Debugging 190 | 191 | ### Request Metrics 192 | 193 | The client tracks metrics per endpoint: 194 | 195 | - Request count 196 | - Error count 197 | - Cache hit/miss ratio 198 | - Average response time 199 | 200 | Access metrics: 201 | 202 | ```python 203 | from biomcp.http_client import get_metrics 204 | metrics = get_metrics() 205 | ``` 206 | 207 | ### Debug Logging 208 | 209 | Enable debug logging to see all HTTP requests: 210 | 211 | ```python 212 | import logging 213 | logging.getLogger("biomcp.http_client").setLevel(logging.DEBUG) 214 | ``` 215 | 216 | ## Best Practices 217 | 218 | 1. **Always use the centralized client** for external HTTP calls 219 | 2. **Register new endpoints** in the endpoint registry 220 | 3. **Set appropriate cache TTLs** based on data volatility 221 | 4. 
**Handle errors gracefully** with user-friendly messages 222 | 5. **Test with offline mode** to ensure cache coverage 223 | 6. **Monitor rate limits** to avoid API throttling 224 | 7. **Use domain-specific adapters** for complex APIs 225 | 226 | ## Endpoint Registration 227 | 228 | Register new endpoints in `endpoint_registry.py`: 229 | 230 | ```python 231 | registry.register( 232 | "my_api_endpoint", 233 | EndpointInfo( 234 | url="https://api.example.com/v1/data", 235 | category=EndpointCategory.BIOMEDICAL_LITERATURE, 236 | data_types=[DataType.RESEARCH_ARTICLES], 237 | description="My API for fetching data", 238 | compliance_notes="Public API, no PII", 239 | rate_limit="100 requests/minute" 240 | ) 241 | ) 242 | ``` 243 | 244 | This ensures the endpoint is documented and tracked properly. 245 | ``` -------------------------------------------------------------------------------- /tests/tdd/articles/test_cbioportal_integration.py: -------------------------------------------------------------------------------- ```python 1 | """Test cBioPortal integration with article searches.""" 2 | 3 | import json 4 | 5 | import pytest 6 | 7 | from biomcp.articles.search import PubmedRequest 8 | from biomcp.articles.unified import search_articles_unified 9 | 10 | 11 | class TestArticleCBioPortalIntegration: 12 | """Test that cBioPortal summaries appear in article searches.""" 13 | 14 | @pytest.mark.asyncio 15 | @pytest.mark.integration 16 | async def test_article_search_with_gene_includes_cbioportal(self): 17 | """Test that searching articles for a gene includes cBioPortal summary.""" 18 | request = PubmedRequest( 19 | genes=["BRAF"], 20 | keywords=["melanoma"], 21 | ) 22 | 23 | # Test markdown output 24 | result = await search_articles_unified( 25 | request, 26 | include_pubmed=True, 27 | include_preprints=False, 28 | output_json=False, 29 | ) 30 | 31 | # Should include cBioPortal summary 32 | assert "cBioPortal Summary for BRAF" in result 33 | assert "Mutation Frequency" in 
    @pytest.mark.asyncio
    @pytest.mark.integration
    async def test_article_search_json_with_gene(self):
        """Test JSON output includes cBioPortal summary."""
        # A gene filter plus output_json=True should produce a JSON
        # document with both the cBioPortal block and the article list.
        request = PubmedRequest(
            genes=["TP53"],
            keywords=["cancer"],
        )

        result = await search_articles_unified(
            request,
            include_pubmed=True,
            include_preprints=False,
            output_json=True,
        )

        # Parse JSON
        data = json.loads(result)

        # Should have both summary and articles
        assert "cbioportal_summary" in data
        assert "articles" in data
        assert "TP53" in data["cbioportal_summary"]
        assert isinstance(data["articles"], list)
        assert len(data["articles"]) > 0

    @pytest.mark.asyncio
    @pytest.mark.integration
    async def test_article_search_without_gene_no_cbioportal(self):
        """Test that searches without genes don't include cBioPortal summary."""
        # No gene in the request -> no cBioPortal lookup should occur.
        request = PubmedRequest(
            diseases=["hypertension"],
            keywords=["treatment"],
        )

        # Test markdown output
        result = await search_articles_unified(
            request,
            include_pubmed=True,
            include_preprints=False,
            output_json=False,
        )

        # Should NOT include cBioPortal summary
        assert "cBioPortal Summary" not in result
        assert "Mutation Frequency" not in result

    @pytest.mark.asyncio
    @pytest.mark.integration
    async def test_article_search_multiple_genes(self):
        """Test that searching with multiple genes uses the first one."""
        # Only the first listed gene drives the cBioPortal summary.
        request = PubmedRequest(
            genes=["KRAS", "NRAS", "BRAF"],
            diseases=["colorectal cancer"],
        )

        result = await search_articles_unified(
            request,
            include_pubmed=True,
            include_preprints=False,
            output_json=False,
        )

        # Should include cBioPortal summary for KRAS (first gene)
        assert "cBioPortal Summary for KRAS" in result
        # Common KRAS hotspot
        assert "G12" in result or "mutation" in result

    @pytest.mark.asyncio
    @pytest.mark.integration
    async def test_article_search_with_invalid_gene(self):
        """Test graceful handling of invalid gene names."""
        # NOTE(review): despite the name, this uses real genes - BRCA1
        # (likely in cBioPortal) and ACE2 (likely absent from cancer
        # studies) - to exercise the "no cBioPortal data" path without
        # tripping upstream gene-name validation.
        request = PubmedRequest(
            genes=["BRCA1"],  # Valid gene
            keywords=["cancer"],
        )

        # First check that we handle invalid genes gracefully
        # by using a real gene that might have cBioPortal data
        result = await search_articles_unified(
            request,
            include_pubmed=True,
            include_preprints=False,
            output_json=False,
        )

        # Should have some content - either cBioPortal summary or articles
        assert len(result) > 50  # Some reasonable content

        # Now test with a gene that's valid for search but not in cBioPortal
        request2 = PubmedRequest(
            genes=["ACE2"],  # Real gene but might not be in cancer studies
            keywords=["COVID-19"],
        )

        result2 = await search_articles_unified(
            request2,
            include_pubmed=True,
            include_preprints=False,
            output_json=False,
        )

        # Should return results even if cBioPortal data is not available
        assert len(result2) > 50

    @pytest.mark.asyncio
    @pytest.mark.integration
    async def test_article_search_with_preprints_and_cbioportal(self):
        """Test that cBioPortal summary works with preprint searches too."""
        request = PubmedRequest(
            genes=["EGFR"],
            keywords=["lung cancer", "osimertinib"],
        )

        result = await search_articles_unified(
            request,
            include_pubmed=True,
            include_preprints=True,  # also pull bioRxiv/medRxiv results
            output_json=False,
        )

        # Should include cBioPortal summary
        assert "cBioPortal Summary for EGFR" in result
        # Should include both peer-reviewed and preprint results
        assert ("pmid" in result or "Title" in result) and (
            "Preprint" in result
            or "bioRxiv" in result
            or "peer_reviewed" in result
        )
def _format_disease_output(disease_info, result: dict) -> None:
    """Reshape a disease result dict in place for compact display."""
    # Collapse the synonym list into one capped, comma-separated string.
    synonyms = disease_info.synonyms
    if synonyms:
        summary = ", ".join(synonyms[:10])  # show at most the first 10
        overflow = len(synonyms) - 10
        if overflow > 0:
            summary += f" (and {overflow} more)"
        result["synonyms"] = summary

    # Replace raw phenotype records with a short named preview.
    phenotypes = disease_info.phenotypes
    if phenotypes:
        names = [
            entry["phenotype"]
            for entry in phenotypes[:5]
            if isinstance(entry, dict) and "phenotype" in entry
        ]
        if names:
            preview = ", ".join(names)
            overflow = len(phenotypes) - 5
            if overflow > 0:
                preview += f" (and {overflow} more)"
            result["associated_phenotypes"] = preview
        # Drop the raw phenotypes data for cleaner output
        result.pop("phenotypes", None)


async def get_disease(
    disease_id_or_name: str,
    output_json: bool = False,
) -> str:
    """
    Get disease information from MyDisease.info.

    Args:
        disease_id_or_name: Disease ID (MONDO, DOID) or name (e.g., "melanoma", "MONDO:0016575")
        output_json: Return as JSON instead of markdown

    Returns:
        Disease information as markdown or JSON string
    """

    def render(payload: dict) -> str:
        # Serialize one payload in the caller's requested format.
        return (
            json.dumps(payload, indent=2)
            if output_json
            else to_markdown([payload])
        )

    client = BioThingsClient()

    try:
        disease_info = await client.get_disease_info(disease_id_or_name)

        if not disease_info:
            return render({
                "error": f"Disease '{disease_id_or_name}' not found",
                "suggestion": "Please check the disease name or ID (MONDO:, DOID:, OMIM:, MESH:)",
            })

        # Convert to dict, then enrich with links and prettify for display.
        result = disease_info.model_dump(exclude_none=True)
        _add_disease_links(disease_info, result)
        _format_disease_output(disease_info, result)
        return render(result)

    except Exception as e:
        logger.error(
            f"Error fetching disease info for {disease_id_or_name}: {e}"
        )
        return render({
            "error": "Failed to retrieve disease information",
            "details": str(e),
        })


async def _disease_details(
    call_benefit: Annotated[
        str,
        "Define and summarize why this function is being called and the intended benefit",
    ],
    disease_id_or_name: Annotated[
        str,
        Field(
            description="Disease name (e.g., melanoma, GIST) or ID (e.g., MONDO:0016575, DOID:1909)"
        ),
    ],
) -> str:
    """
    Retrieves detailed information for a disease from MyDisease.info.

    This tool provides real-time disease annotations including:
    - Official disease name and definition
    - Disease synonyms and alternative names
    - Ontology mappings (MONDO, DOID, OMIM, etc.)
    - Associated phenotypes
    - Links to disease databases

    Parameters:
    - call_benefit: Define why this function is being called
    - disease_id_or_name: Disease name or ontology ID

    Process: Queries MyDisease.info API for up-to-date disease information
    Output: Markdown formatted disease information with definition and metadata

    Note: For clinical trials about diseases, use trial_searcher. For articles about diseases, use article_searcher.
    """
    # Thin MCP-facing wrapper; markdown is the fixed output format here.
    return await get_disease(disease_id_or_name, output_json=False)