This is page 15 of 20. Use http://codebase.md/genomoncology/biomcp?lines=true&page={x} to view the full context.
# Directory Structure
```
├── .github
│ ├── actions
│ │ └── setup-python-env
│ │ └── action.yml
│ ├── dependabot.yml
│ └── workflows
│ ├── ci.yml
│ ├── deploy-docs.yml
│ ├── main.yml.disabled
│ ├── on-release-main.yml
│ └── validate-codecov-config.yml
├── .gitignore
├── .pre-commit-config.yaml
├── BIOMCP_DATA_FLOW.md
├── CHANGELOG.md
├── CNAME
├── codecov.yaml
├── docker-compose.yml
├── Dockerfile
├── docs
│ ├── apis
│ │ ├── error-codes.md
│ │ ├── overview.md
│ │ └── python-sdk.md
│ ├── assets
│ │ ├── biomcp-cursor-locations.png
│ │ ├── favicon.ico
│ │ ├── icon.png
│ │ ├── logo.png
│ │ ├── mcp_architecture.txt
│ │ └── remote-connection
│ │ ├── 00_connectors.png
│ │ ├── 01_add_custom_connector.png
│ │ ├── 02_connector_enabled.png
│ │ ├── 03_connect_to_biomcp.png
│ │ ├── 04_select_google_oauth.png
│ │ └── 05_success_connect.png
│ ├── backend-services-reference
│ │ ├── 01-overview.md
│ │ ├── 02-biothings-suite.md
│ │ ├── 03-cbioportal.md
│ │ ├── 04-clinicaltrials-gov.md
│ │ ├── 05-nci-cts-api.md
│ │ ├── 06-pubtator3.md
│ │ └── 07-alphagenome.md
│ ├── blog
│ │ ├── ai-assisted-clinical-trial-search-analysis.md
│ │ ├── images
│ │ │ ├── deep-researcher-video.png
│ │ │ ├── researcher-announce.png
│ │ │ ├── researcher-drop-down.png
│ │ │ ├── researcher-prompt.png
│ │ │ ├── trial-search-assistant.png
│ │ │ └── what_is_biomcp_thumbnail.png
│ │ └── researcher-persona-resource.md
│ ├── changelog.md
│ ├── CNAME
│ ├── concepts
│ │ ├── 01-what-is-biomcp.md
│ │ ├── 02-the-deep-researcher-persona.md
│ │ └── 03-sequential-thinking-with-the-think-tool.md
│ ├── developer-guides
│ │ ├── 01-server-deployment.md
│ │ ├── 02-contributing-and-testing.md
│ │ ├── 03-third-party-endpoints.md
│ │ ├── 04-transport-protocol.md
│ │ ├── 05-error-handling.md
│ │ ├── 06-http-client-and-caching.md
│ │ ├── 07-performance-optimizations.md
│ │ └── generate_endpoints.py
│ ├── faq-condensed.md
│ ├── FDA_SECURITY.md
│ ├── genomoncology.md
│ ├── getting-started
│ │ ├── 01-quickstart-cli.md
│ │ ├── 02-claude-desktop-integration.md
│ │ └── 03-authentication-and-api-keys.md
│ ├── how-to-guides
│ │ ├── 01-find-articles-and-cbioportal-data.md
│ │ ├── 02-find-trials-with-nci-and-biothings.md
│ │ ├── 03-get-comprehensive-variant-annotations.md
│ │ ├── 04-predict-variant-effects-with-alphagenome.md
│ │ ├── 05-logging-and-monitoring-with-bigquery.md
│ │ └── 06-search-nci-organizations-and-interventions.md
│ ├── index.md
│ ├── policies.md
│ ├── reference
│ │ ├── architecture-diagrams.md
│ │ ├── quick-architecture.md
│ │ ├── quick-reference.md
│ │ └── visual-architecture.md
│ ├── robots.txt
│ ├── stylesheets
│ │ ├── announcement.css
│ │ └── extra.css
│ ├── troubleshooting.md
│ ├── tutorials
│ │ ├── biothings-prompts.md
│ │ ├── claude-code-biomcp-alphagenome.md
│ │ ├── nci-prompts.md
│ │ ├── openfda-integration.md
│ │ ├── openfda-prompts.md
│ │ ├── pydantic-ai-integration.md
│ │ └── remote-connection.md
│ ├── user-guides
│ │ ├── 01-command-line-interface.md
│ │ ├── 02-mcp-tools-reference.md
│ │ └── 03-integrating-with-ides-and-clients.md
│ └── workflows
│ └── all-workflows.md
├── example_scripts
│ ├── mcp_integration.py
│ └── python_sdk.py
├── glama.json
├── LICENSE
├── lzyank.toml
├── Makefile
├── mkdocs.yml
├── package-lock.json
├── package.json
├── pyproject.toml
├── README.md
├── scripts
│ ├── check_docs_in_mkdocs.py
│ ├── check_http_imports.py
│ └── generate_endpoints_doc.py
├── smithery.yaml
├── src
│ └── biomcp
│ ├── __init__.py
│ ├── __main__.py
│ ├── articles
│ │ ├── __init__.py
│ │ ├── autocomplete.py
│ │ ├── fetch.py
│ │ ├── preprints.py
│ │ ├── search_optimized.py
│ │ ├── search.py
│ │ └── unified.py
│ ├── biomarkers
│ │ ├── __init__.py
│ │ └── search.py
│ ├── cbioportal_helper.py
│ ├── circuit_breaker.py
│ ├── cli
│ │ ├── __init__.py
│ │ ├── articles.py
│ │ ├── biomarkers.py
│ │ ├── diseases.py
│ │ ├── health.py
│ │ ├── interventions.py
│ │ ├── main.py
│ │ ├── openfda.py
│ │ ├── organizations.py
│ │ ├── server.py
│ │ ├── trials.py
│ │ └── variants.py
│ ├── connection_pool.py
│ ├── constants.py
│ ├── core.py
│ ├── diseases
│ │ ├── __init__.py
│ │ ├── getter.py
│ │ └── search.py
│ ├── domain_handlers.py
│ ├── drugs
│ │ ├── __init__.py
│ │ └── getter.py
│ ├── exceptions.py
│ ├── genes
│ │ ├── __init__.py
│ │ └── getter.py
│ ├── http_client_simple.py
│ ├── http_client.py
│ ├── individual_tools.py
│ ├── integrations
│ │ ├── __init__.py
│ │ ├── biothings_client.py
│ │ └── cts_api.py
│ ├── interventions
│ │ ├── __init__.py
│ │ ├── getter.py
│ │ └── search.py
│ ├── logging_filter.py
│ ├── metrics_handler.py
│ ├── metrics.py
│ ├── oncokb_helper.py
│ ├── openfda
│ │ ├── __init__.py
│ │ ├── adverse_events_helpers.py
│ │ ├── adverse_events.py
│ │ ├── cache.py
│ │ ├── constants.py
│ │ ├── device_events_helpers.py
│ │ ├── device_events.py
│ │ ├── drug_approvals.py
│ │ ├── drug_labels_helpers.py
│ │ ├── drug_labels.py
│ │ ├── drug_recalls_helpers.py
│ │ ├── drug_recalls.py
│ │ ├── drug_shortages_detail_helpers.py
│ │ ├── drug_shortages_helpers.py
│ │ ├── drug_shortages.py
│ │ ├── exceptions.py
│ │ ├── input_validation.py
│ │ ├── rate_limiter.py
│ │ ├── utils.py
│ │ └── validation.py
│ ├── organizations
│ │ ├── __init__.py
│ │ ├── getter.py
│ │ └── search.py
│ ├── parameter_parser.py
│ ├── query_parser.py
│ ├── query_router.py
│ ├── rate_limiter.py
│ ├── render.py
│ ├── request_batcher.py
│ ├── resources
│ │ ├── __init__.py
│ │ ├── getter.py
│ │ ├── instructions.md
│ │ └── researcher.md
│ ├── retry.py
│ ├── router_handlers.py
│ ├── router.py
│ ├── shared_context.py
│ ├── thinking
│ │ ├── __init__.py
│ │ ├── sequential.py
│ │ └── session.py
│ ├── thinking_tool.py
│ ├── thinking_tracker.py
│ ├── trials
│ │ ├── __init__.py
│ │ ├── getter.py
│ │ ├── nci_getter.py
│ │ ├── nci_search.py
│ │ └── search.py
│ ├── utils
│ │ ├── __init__.py
│ │ ├── cancer_types_api.py
│ │ ├── cbio_http_adapter.py
│ │ ├── endpoint_registry.py
│ │ ├── gene_validator.py
│ │ ├── metrics.py
│ │ ├── mutation_filter.py
│ │ ├── query_utils.py
│ │ ├── rate_limiter.py
│ │ └── request_cache.py
│ ├── variants
│ │ ├── __init__.py
│ │ ├── alphagenome.py
│ │ ├── cancer_types.py
│ │ ├── cbio_external_client.py
│ │ ├── cbioportal_mutations.py
│ │ ├── cbioportal_search_helpers.py
│ │ ├── cbioportal_search.py
│ │ ├── constants.py
│ │ ├── external.py
│ │ ├── filters.py
│ │ ├── getter.py
│ │ ├── links.py
│ │ ├── oncokb_client.py
│ │ ├── oncokb_models.py
│ │ └── search.py
│ └── workers
│ ├── __init__.py
│ ├── worker_entry_stytch.js
│ ├── worker_entry.js
│ └── worker.py
├── tests
│ ├── bdd
│ │ ├── cli_help
│ │ │ ├── help.feature
│ │ │ └── test_help.py
│ │ ├── conftest.py
│ │ ├── features
│ │ │ └── alphagenome_integration.feature
│ │ ├── fetch_articles
│ │ │ ├── fetch.feature
│ │ │ └── test_fetch.py
│ │ ├── get_trials
│ │ │ ├── get.feature
│ │ │ └── test_get.py
│ │ ├── get_variants
│ │ │ ├── get.feature
│ │ │ └── test_get.py
│ │ ├── search_articles
│ │ │ ├── autocomplete.feature
│ │ │ ├── search.feature
│ │ │ ├── test_autocomplete.py
│ │ │ └── test_search.py
│ │ ├── search_trials
│ │ │ ├── search.feature
│ │ │ └── test_search.py
│ │ ├── search_variants
│ │ │ ├── search.feature
│ │ │ └── test_search.py
│ │ └── steps
│ │ └── test_alphagenome_steps.py
│ ├── config
│ │ └── test_smithery_config.py
│ ├── conftest.py
│ ├── data
│ │ ├── ct_gov
│ │ │ ├── clinical_trials_api_v2.yaml
│ │ │ ├── trials_NCT04280705.json
│ │ │ └── trials_NCT04280705.txt
│ │ ├── myvariant
│ │ │ ├── myvariant_api.yaml
│ │ │ ├── myvariant_field_descriptions.csv
│ │ │ ├── variants_full_braf_v600e.json
│ │ │ ├── variants_full_braf_v600e.txt
│ │ │ └── variants_part_braf_v600_multiple.json
│ │ ├── oncokb_mock_responses.json
│ │ ├── openfda
│ │ │ ├── drugsfda_detail.json
│ │ │ ├── drugsfda_search.json
│ │ │ ├── enforcement_detail.json
│ │ │ └── enforcement_search.json
│ │ └── pubtator
│ │ ├── pubtator_autocomplete.json
│ │ └── pubtator3_paper.txt
│ ├── integration
│ │ ├── test_oncokb_integration.py
│ │ ├── test_openfda_integration.py
│ │ ├── test_preprints_integration.py
│ │ ├── test_simple.py
│ │ └── test_variants_integration.py
│ ├── tdd
│ │ ├── articles
│ │ │ ├── test_autocomplete.py
│ │ │ ├── test_cbioportal_integration.py
│ │ │ ├── test_fetch.py
│ │ │ ├── test_preprints.py
│ │ │ ├── test_search.py
│ │ │ └── test_unified.py
│ │ ├── conftest.py
│ │ ├── drugs
│ │ │ ├── __init__.py
│ │ │ └── test_drug_getter.py
│ │ ├── openfda
│ │ │ ├── __init__.py
│ │ │ ├── test_adverse_events.py
│ │ │ ├── test_device_events.py
│ │ │ ├── test_drug_approvals.py
│ │ │ ├── test_drug_labels.py
│ │ │ ├── test_drug_recalls.py
│ │ │ ├── test_drug_shortages.py
│ │ │ └── test_security.py
│ │ ├── test_biothings_integration_real.py
│ │ ├── test_biothings_integration.py
│ │ ├── test_circuit_breaker.py
│ │ ├── test_concurrent_requests.py
│ │ ├── test_connection_pool.py
│ │ ├── test_domain_handlers.py
│ │ ├── test_drug_approvals.py
│ │ ├── test_drug_recalls.py
│ │ ├── test_drug_shortages.py
│ │ ├── test_endpoint_documentation.py
│ │ ├── test_error_scenarios.py
│ │ ├── test_europe_pmc_fetch.py
│ │ ├── test_mcp_integration.py
│ │ ├── test_mcp_tools.py
│ │ ├── test_metrics.py
│ │ ├── test_nci_integration.py
│ │ ├── test_nci_mcp_tools.py
│ │ ├── test_network_policies.py
│ │ ├── test_offline_mode.py
│ │ ├── test_openfda_unified.py
│ │ ├── test_pten_r173_search.py
│ │ ├── test_render.py
│ │ ├── test_request_batcher.py.disabled
│ │ ├── test_retry.py
│ │ ├── test_router.py
│ │ ├── test_shared_context.py.disabled
│ │ ├── test_unified_biothings.py
│ │ ├── thinking
│ │ │ ├── __init__.py
│ │ │ └── test_sequential.py
│ │ ├── trials
│ │ │ ├── test_backward_compatibility.py
│ │ │ ├── test_getter.py
│ │ │ └── test_search.py
│ │ ├── utils
│ │ │ ├── test_gene_validator.py
│ │ │ ├── test_mutation_filter.py
│ │ │ ├── test_rate_limiter.py
│ │ │ └── test_request_cache.py
│ │ ├── variants
│ │ │ ├── constants.py
│ │ │ ├── test_alphagenome_api_key.py
│ │ │ ├── test_alphagenome_comprehensive.py
│ │ │ ├── test_alphagenome.py
│ │ │ ├── test_cbioportal_mutations.py
│ │ │ ├── test_cbioportal_search.py
│ │ │ ├── test_external_integration.py
│ │ │ ├── test_external.py
│ │ │ ├── test_extract_gene_aa_change.py
│ │ │ ├── test_filters.py
│ │ │ ├── test_getter.py
│ │ │ ├── test_links.py
│ │ │ ├── test_oncokb_client.py
│ │ │ ├── test_oncokb_helper.py
│ │ │ └── test_search.py
│ │ └── workers
│ │ └── test_worker_sanitization.js
│ └── test_pydantic_ai_integration.py
├── THIRD_PARTY_ENDPOINTS.md
├── tox.ini
├── uv.lock
└── wrangler.toml
```
# Files
--------------------------------------------------------------------------------
/tests/data/pubtator/pubtator3_paper.txt:
--------------------------------------------------------------------------------
```
1 | Nucleic Acids Research, 2024, 52, W540–W546
2 | https://doi.org/10.1093/nar/gkae235
3 | Advance access publication date: 4 April 2024
4 | Web Server issue
5 |
6 | PubTator 3.0: an AI-powered literature resource for
7 | unlocking biomedical knowledge
8 | Chih-Hsuan Wei † , Alexis Allot † , Po-Ting Lai , Robert Leaman , Shubo Tian , Ling Luo ,
9 | Qiao Jin , Zhizheng Wang , Qingyu Chen and Zhiyong Lu *
10 | National Center for Biotechnology Information (NCBI), National Library of Medicine (NLM), National Institutes of Health (NIH),
11 | Bethesda, MD 20894, USA
12 | To whom correspondence should be addressed. Tel: +1 301 594 7089; Email: [email protected]
13 | The first two authors should be regarded as Joint First Authors.
14 | Present addresses:
15 | Alexis Allot, The Neuro (Montreal Neurological Institute-Hospital), McGill University, Montreal, Quebec H3A 2B4, Canada.
16 | Ling Luo, School of Computer Science and Technology, Dalian University of Technology, 116024 Dalian, China.
17 | Qingyu Chen, Biomedical Informatics and Data Science, Yale School of Medicine, New Haven, CT 06510, USA.
18 | †
19 |
20 | Abstract
21 | PubTator 3.0 (https://www.ncbi.nlm.nih.gov/research/pubtator3/) is a biomedical literature resource using state-of-the-art AI techniques to offer
22 | semantic and relation searches for key concepts like proteins, genetic variants, diseases and chemicals. It currently provides over one billion
23 | entity and relation annotations across approximately 36 million PubMed abstracts and 6 million full-text articles from the PMC open access
24 | subset, updated weekly. PubTator 3.0’s online interface and API utilize these precomputed entity relations and synonyms to provide advanced
25 | search capabilities and enable large-scale analyses, streamlining many complex information needs. We showcase the retrieval quality of PubTator
26 | 3.0 using a series of entity pair queries, demonstrating that PubTator 3.0 retrieves a greater number of articles than either PubMed or Google
27 | Scholar, with higher precision in the top 20 results. We further show that integrating ChatGPT (GPT-4) with PubTator APIs dramatically improves
28 | the factuality and verifiability of its responses. In summary, PubTator 3.0 offers a comprehensive set of features and tools that allow researchers
29 | to navigate the ever-expanding wealth of biomedical literature, expediting research and unlocking valuable insights for scientific discovery.
30 |
31 | Graphical abstract
32 |
33 | Introduction
34 | The biomedical literature is a primary resource to address information needs across the biological and clinical sciences (1),
35 | however the requirements for literature search vary widely.
36 | Activities such as formulating a research hypothesis require
37 | an exploratory approach, whereas tasks like interpreting the
38 | clinical significance of genetic variants are more focused.
39 | Traditional keyword-based search methods have long
40 | formed the foundation of biomedical literature search (2).
41 | While generally effective for basic search, these methods also
42 | have significant limitations, such as missing relevant articles
43 |
44 | due to differing terminology or including irrelevant articles because surface-level term matches cannot adequately represent
45 | the required association between query terms. These limitations cost time and risk information needs remaining unmet.
46 | Natural language processing (NLP) methods provide substantial value for creating bioinformatics resources (3–5), and
47 | may improve literature search by enabling semantic and relation search (6). In semantic search, users indicate specific
48 | concepts of interest (entities) for which the system has precomputed matches regardless of the terminology used. Relation search increases precision by allowing users to specify the
49 |
50 | Received: January 18, 2024. Revised: March 2, 2024. Editorial Decision: March 16, 2024. Accepted: March 21, 2024
51 | Published by Oxford University Press on behalf of Nucleic Acids Research 2024.
52 | This work is written by (a) US Government employee(s) and is in the public domain in the US.
53 |
54 | Downloaded from https://academic.oup.com/nar/article/52/W1/W540/7640526 by guest on 10 March 2025
55 |
56 | *
57 |
58 | W541
59 |
60 | Nucleic Acids Research, 2024, Vol. 52, Web Server issue
61 |
62 | type of relationship desired between entities, such as whether
63 | a chemical enhances or reduces expression of a gene. In this regard, we present PubTator 3.0, a novel resource engineered to
64 | support semantic and relation search in the biomedical literature. Its search capabilities allow users to explore automated
65 | entity annotations for six key biomedical entities: genes, diseases, chemicals, genetic variants, species, and cell lines. PubTator 3.0 also identifies and makes searchable 12 common
66 | types of relations between entities, enhancing its utility for
67 | both targeted and exploratory searches. Focusing on relations
68 | and entity types of interest across the biomedical sciences allows PubTator 3.0 to retrieve information precisely while providing broad utility (see detailed comparisons with its predecessor in Supplementary Table S1).
69 |
70 | The PubTator 3.0 online interface, illustrated in Figure 1
71 | and Supplementary Figure S1, is designed for interactive literature exploration, supporting semantic, relation, keyword,
72 | and Boolean queries. An auto-complete function provides semantic search suggestions to assist users with query formulation. For example, it automatically suggests replacing either ‘COVID-19’ or ‘SARS-CoV-2 infection’ with the semantic term ‘@DISEASE_COVID_19’. Relation queries – new to
73 | PubTator 3.0 – provide increased precision, allowing users
74 | to target articles which discuss specific relationships between
75 | entities.
76 | PubTator 3.0 offers unified search results, simultaneously
77 | searching approximately 36 million PubMed abstracts and
78 | over 6 million full-text articles from the PMC Open Access Subset (PMC-OA), improving access to the substantial
79 | amount of relevant information present in the article full text
80 | (7). Search results are prioritized based on the depth of the relationship between the query terms: articles containing identifiable relations between semantic terms receive the highest
81 | priority, while articles where semantic or keyword terms co-occur nearby (e.g. within the same sentence) receive secondary
82 | priority. Search results are also prioritized based on the article
83 | section where the match appears (e.g. matches within the title receive higher priority). Users can further refine results by
84 | employing filters, narrowing articles returned to specific publication types, journals, or article sections.
85 | PubTator 3.0 is supported by an NLP pipeline, depicted in
86 | Figure 2A. This pipeline, run weekly, first identifies articles
87 | newly added to PubMed and PMC-OA. Articles are then processed through three major steps: (i) named entity recognition,
88 | provided by the recently developed deep-learning transformer
89 | model AIONER (8), (ii) identifier mapping and (iii) relation
90 | extraction, performed by BioREx (9) of 12 common types of
91 | relations (described in Supplementary Table S2).
92 | In total, PubTator 3.0 contains over 1.6 billion entity annotations (4.6 million unique identifiers) and 33 million relations
93 | (8.8 million unique pairs). It provides enhanced entity recognition and normalization performance over its previous version,
94 | PubTator 2 (10), also known as PubTator Central (Figure 2B
95 | and Supplementary Table S3). We show the relation extraction performance of PubTator 3.0 in Figure 2C and its comparison results to the previous state-of-the-art systems (11–13)
96 | on the BioCreative V Chemical-Disease Relation (14) corpus,
97 | finding that PubTator 3.0 provided substantially higher accuracy. Moreover, when evaluating a randomized sample of
98 | entity pair queries compared to PubMed and Google Scholar,
99 |
100 | Materials and methods
101 | Data sources and article processing
102 | PubTator 3.0 downloads new articles weekly from the BioC
103 | PubMed API (https://www.ncbi.nlm.nih.gov/research/bionlp/
104 | APIs/BioC-PubMed/) and the BioC PMC API (https://www.
105 | ncbi.nlm.nih.gov/research/bionlp/APIs/BioC-PMC/) in BioCXML format (16). Local abbreviations are identified using
106 | Ab3P (17). Article text and extracted data are stored internally using MongoDB and indexed for search with Solr, ensuring robust and scalable accessibility unconstrained by external
107 | dependencies such as the NCBI eUtils API.
108 |
109 | Entity recognition and normalization/linking
110 | PubTator 3.0 uses AIONER (8), a recently developed named
111 | entity recognition (NER) model, to recognize entities of six
112 | types: genes/proteins, chemicals, diseases, species, genetic
113 | variants, and cell lines. AIONER utilizes a flexible tagging
114 | scheme to integrate training data created separately into a
115 | single resource. These training datasets include NLM-Gene
116 | (18), NLM-Chem (19), NCBI-Disease (20), BC5CDR (14),
117 | tmVar3 (21), Species-800 (22), BioID (23) and BioRED (15).
118 | This consolidation creates a larger training set, improving
119 | the model’s ability to generalize to unseen data. Furthermore,
120 | it enables recognizing multiple entity types simultaneously,
121 | enhancing efficiency and simplifying the challenge of distinguishing boundaries between entities that reference others,
122 | such as the disorder ‘Alpha-1 antitrypsin deficiency’ and the
123 | protein ‘Alpha-1 antitrypsin’. We previously evaluated the performance of AIONER on 14 benchmark datasets (8), including the test sets for the aforementioned training sets. This evaluation demonstrated that AIONER’s performance surpasses
124 | or matches previous state-of-the-art methods.
125 | Entity mentions found by AIONER are normalized (linked)
126 | to a unique identifier in an appropriate entity database. Normalization is performed by a module designed for (or adapted
127 | to) each entity type, using the latest version. The recently upgraded GNorm2 system (24) normalizes genes to NCBI
128 | Gene identifiers and species mentions to NCBI Taxonomy.
129 | tmVar3 (21), also recently upgraded, normalizes genetic variants; it uses dbSNP identifiers for variants listed in dbSNP
130 | and HGVS format otherwise. Chemicals are normalized by
131 | the NLM-Chem tagger (19) to MeSH identifiers (25). TaggerOne (26) normalizes diseases to MeSH and cell lines to
132 | Cellosaurus (27) using a new normalization-only mode. This
133 | mode only applies the normalization model, which converts
134 | both mentions and lexicon names into high-dimensional TF-IDF vectors and learns a mapping, as before. However, it
135 | now augments the training data by mapping each lexicon
136 | name to itself, resulting in a large performance improvement for names present in the lexicon but not in the annotated training data. These enhancements provide a significant overall improvement in entity normalization performance (Supplementary Table S3).
137 |
138 | Relation extraction
139 | Relations for PubTator 3.0 are extracted by the unified relation extraction model BioREx (9), designed to simulta-
140 |
141 | Downloaded from https://academic.oup.com/nar/article/52/W1/W540/7640526 by guest on 10 March 2025
142 |
143 | System overview
144 |
145 | PubTator 3.0 consistently returns a greater number of articles with higher precision in the top 20 results (Figure 2D and
146 | Supplementary Table S4).
147 |
148 | W542
149 |
150 | Nucleic Acids Research, 2024, Vol. 52, Web Server issue
151 |
152 | neously extract 12 types of relations across eight entity
153 | type pairs: chemical–chemical, chemical–disease, chemical–
154 | gene, chemical–variant, disease–gene, disease–variant, gene–
155 | gene and variant–variant. Detailed definitions of these relation types and their corresponding entity pairs are presented in
156 | Supplementary Table S2. Deep-learning methods for relation
157 | extraction, such as BioREx, require ample training data. However, training data for relation extraction is fragmented into
158 | many datasets, often tailored to specific entity pairs. BioREx
159 | overcomes this limitation with a data-centric approach, reconciling discrepancies between disparate training datasets to
160 | construct a comprehensive, unified dataset.
161 | We evaluated the relations extracted by BioREx using performance on manually annotated relation extraction datasets
162 | as well as a comparative analysis between BioREx and notable
163 | comparable systems. BioREx established a new performance
164 | benchmark on the BioRED corpus test set (15), elevating the
165 | performance from 74.4% (F-score) to 79.6%, and demonstrating higher performance than alternative models such as
166 | transfer learning (TL), multi-task learning (MTL), and state-of-the-art models trained on isolated datasets (9). For PubTator 3.0, we replaced its deep learning module, PubMedBERT
167 | (28), with LinkBERT (29), further increasing the performance
168 | to 82.0%. Furthermore, we conducted a comparative analysis between BioREx and SemRep (11), a widely used rule-
169 |
170 | based method for extracting diverse relations, the CD-REST
171 | (13) system, and the previous state-of-the-art system (12), using the BioCreative V Chemical Disease Relation corpus test
172 | set (14). Our evaluation demonstrated that PubTator 3.0 provided substantially higher F-score than previous methods.
173 |
174 | Programmatic access and data formats
175 | PubTator 3.0 offers programmatic access through its
176 | API and bulk download. The API (https://www.ncbi.
177 | nlm.nih.gov/research/pubtator3/) supports keyword, entity and relation search, and also supports exporting
178 | annotations in XML and JSON-based BioC (16) formats and tab-delimited free text. The PubTator 3.0 FTP
179 | site (https://ftp.ncbi.nlm.nih.gov/pub/lu/PubTator3) provides bulk downloads of annotated articles and extraction
180 | summaries for entities and relations. Programmatic access supports more flexible query options; for example,
181 | the information need ‘what chemicals reduce expression
182 | of JAK1?’ can be answered directly via API (e.g. https:
183 | //www.ncbi.nlm.nih.gov/research/pubtator3-api/relations?
184 | e1=@GENE_JAK1&type=negative_correlate&e2=Chemical)
185 | or by filtering the bulk relations file. Additionally, the PubTator 3.0 API supports annotation of user-defined free
186 | text.
187 |
188 | Downloaded from https://academic.oup.com/nar/article/52/W1/W540/7640526 by guest on 10 March 2025
189 |
190 | Figure 1. PubTator 3.0 system overview and search results page: 1. Query auto-complete enhances search accuracy and synonym matching. 2. Natural
191 | language processing (NLP)-enhanced relevance: Search results are prioritized according to the strength of the relationship between the entities queried.
192 | 3. Users can further refine results with facet filters—section, journal and type. 4. Search results include highlighted entity snippets explaining relevance.
193 | 5. Histogram visualizes number of results by publication year. 6. Entity highlighting can be switched on or off according to user preference.
194 |
195 | Nucleic Acids Research, 2024, Vol. 52, Web Server issue
196 |
197 | W543
198 |
199 | Case study I: entity relation queries
200 | We analyzed the retrieval quality of PubTator 3.0 by preparing a series of 12 entity pairs to serve as case studies for
201 | comparison between PubTator 3.0, PubMed and Google
202 | Scholar. To provide an equal comparison, we filtered about
203 | 30% of the Google Scholar results for articles not present
204 | in PubMed. To ensure that the number of results would
205 | remain low enough to allow filtering Google Scholar results for articles not in PubMed, we identified entity pairs
206 | first discussed together in the literature in 2022 or later. We
207 | then randomly selected two entity pairs of each of the following types: disease/gene, chemical/disease, chemical/gene,
208 | chemical/chemical, gene/gene and disease/variant. None of
209 |
210 | the relation pairs selected appears in the training set. The
211 | comparison was performed with respect to a snapshot of the
212 | search results returned by all search engines on 19 May 2023.
213 | We manually evaluated the top 20 results for each system and
214 | each query; articles were judged to be relevant if they mentioned both entities in the query and supported a relationship
215 | between them. Two curators independently judged each article, and discrepancies were discussed until agreement. The
216 | curators were not blinded to the retrieval method but were
217 | required to record the text supporting the relationship, if relevant. This experiment evaluated the relevance of the top 20
218 | results for each retrieval method, regardless of whether the
219 | article appeared in PubMed.
220 |
221 | Downloaded from https://academic.oup.com/nar/article/52/W1/W540/7640526 by guest on 10 March 2025
222 |
223 | Figure 2. (A) The PubTator 3.0 processing pipeline: AIONER (8) identifies six types of entities in PubMed abstracts and PMC-OA full-text articles. Entity
224 | annotations are associated with database identifiers by specialized mappers and BioREx (9) identifies relations between entities. Extracted data is
225 | stored in MongoDB and made searchable using Solr. (B) Entity recognition performance for each entity type compared with PubTator2 (also known as
226 | PubTatorCentral) (13) on the BioRED corpus (15). (C) Relation extraction performance compared with SemRep (11) and notable previous best systems
227 | (12,13) on the BioCreative V Chemical-Disease Relation (14) corpus. (D) Comparison of information retrieval for PubTator 3.0, PubMed, and Google
228 | Scholar for entity pair queries, with respect to total article count and top-20 article precision.
229 |
230 | W544
231 |
232 | Case study II: retrieval-augmented generation
233 | In the era of large language models (LLMs), PubTator 3.0 can
234 | also enhance their factual accuracy via retrieval augmented
235 | generation. Despite their strong language ability, LLMs are
236 | prone to generating incorrect assertions, sometimes known
237 | as hallucinations (30,31). For example, when requested to
238 | cite sources for questions such as ‘which diseases can doxorubicin treat’, GPT-4 frequently provides seemingly plausible but nonexistent references. Augmenting GPT-4 with PubTator 3.0 APIs can anchor the model’s response to verifiable
239 | references via the extracted relations, significantly reducing
240 | hallucinations.
241 | We assessed the citation accuracy of responses from three
242 | GPT-4 variations: PubTator-augmented GPT-4, PubMed-augmented GPT-4 and standard GPT-4. We performed a qualitative evaluation based on eight questions selected as follows. We identified entities mentioned in the PubMed query
243 | logs and randomly selected from entities searched both frequently and rarely. We then identified the common queries for
244 | each entity that request relational information and adapted
245 | one into a natural language question. Each question is therefore grounded on common information needs of real PubMed
246 | users. For example, the questions ‘What can be caused by
247 | tocilizumab?’ and ‘What can be treated by doxorubicin?’
248 | are adapted from the user queries ‘tocilizumab side effects’
249 | and ‘doxorubicin treatment’ respectively. Such questions typically require extracting information from multiple articles
250 | and an understanding of biomedical entities and relationship descriptions. Supplementary Table S5 lists the questions
251 | chosen.
252 | We augmented the GPT-4 large language model (LLM) with
253 | PubTator 3.0 via the function calling mechanism of the OpenAI ChatCompletion API. This integration involved prompt-
254 |
255 | ing GPT-4 with descriptions of three PubTator APIs: (i) find
256 | entity ID, which retrieves PubTator entity identifiers; (ii) find
257 | related entities, which identifies related entities based on an
258 | input entity and specified relations and (iii) export relevant
259 | search results, which returns PubMed article identifiers containing textual evidence for specific entity relationships. Our
260 | instructions prompted GPT-4 to decompose user questions
261 | into sub-questions addressable by these APIs, execute the
262 | function calls, and synthesize the responses into a coherent final answer. Our prompt promoted a summarized response by
263 | instructing GPT-4 to start its message with ‘Summary:’ and requested the response include citations to the articles providing
264 | evidence. The PubMed augmentation experiments provided
265 | GPT-4 with access to PubMed database search via the National Center for Biotechnology Information (NCBI) E-utils
266 | APIs (32). We used Azure OpenAI Services (version 2023-07-01-preview) and GPT-4 (version 2023-06-13) and set the decoding temperature to zero to obtain deterministic outputs.
267 | The full prompts are provided in Supplementary Table S6.
268 | PubTator-augmented GPT-4 generally processed the questions in three steps: (i) finding the standard entity identifiers, (ii) finding its related entity identifiers and (iii) searching PubMed articles. For example, to answer ‘What drugs can
269 | treat breast cancer?’, GPT-4 first found the PubTator entity
270 | identifier for breast cancer (@DISEASE_Breast_Cancer) using
271 | the Find Entity ID API. It then used the Find Related Entities
272 | API to identify entities related to @DISEASE_Breast_Cancer
273 | through a ‘treat’ relation. For demonstration purposes, we
274 | limited the maximum number of output entities to five. Finally,
275 | GPT-4 called the Export Relevant Search Results API for the
276 | PubMed article identifiers containing evidence for these relationships. The raw responses to each prompt for each method
277 | are provided in Supplementary Table S6.
278 | We manually evaluated the accuracy of the citations in
279 | the responses by reviewing each PubMed article and verifying whether each PubMed article cited supported the
280 | stated relationship (e.g. Tamoxifen treating breast cancer).
281 | Supplementary Table S5 reports the proportion of the cited
282 | articles with valid supporting evidence for each method. GPT-4 frequently generated fabricated citations, widely known
283 | as the hallucination issue. While PubMed-augmented GPT-4
284 | showed a higher proportion of accurate citations, some articles cited did not support the relation claims. This is likely
285 | because PubMed is based on keyword and Boolean search and
286 | does not support queries for specific relationships. Responses
287 | generated by PubTator-augmented GPT-4 demonstrated the
288 | highest level of citation accuracy, underscoring the potential of PubTator 3.0 as a high-quality knowledge source for
289 | addressing biomedical information needs through retrieval-augmented generation with LLMs such as GPT-4. In our experiment, using Azure for ChatGPT, the cost was approximately $1 for two questions with GPT-4-Turbo, or 40 questions when downgraded to GPT-3.5-Turbo, including the cost
290 | of input/output tokens.
291 |
292 | Discussion
293 | Previous versions of PubTator have fulfilled over one billion
294 | API requests since 2015, supporting a wide range of research
295 | applications. Numerous studies have harnessed PubTator annotations for disease-specific gene research, including efforts
296 | to prioritize candidate genes (33), determine gene–phenotype
297 | associations (34), and identify the genetic underpinnings of
298 |
299 | Downloaded from https://academic.oup.com/nar/article/52/W1/W540/7640526 by guest on 10 March 2025
300 |
301 | Our analysis is summarized in Figure 2D, and
302 | Supplementary Table S4 presents a detailed comparison
303 | of the quality of retrieved results between PubTator 3.0,
304 | PubMed and Google Scholar. Our results demonstrate that
305 | PubTator 3.0 retrieves a greater number of articles than the
306 | comparison systems and its precision is higher for the top
307 | 20 results. For instance, PubTator 3.0 returned 346 articles
308 | for the query ‘GLPG0634 + ulcerative colitis’, and manual
309 | review of the top 20 articles showed that all contained
310 | statements about an association between GLPG0634 and
311 | ulcerative colitis. In contrast, PubMed only returned a total
312 | of 18 articles, with only 12 mentioning an association. Moreover, when searching for ‘COVID19 + PON1’, PubTator 3.0
313 | returns 212 articles in PubMed, surpassing the 43 articles
314 | obtained from Google Scholar, only 29 of which are sourced
315 | from PubMed. These disparities can be attributed to several
316 | factors: (i) PubTator 3.0’s search includes full texts available
317 | in PMC-OA, resulting in significantly broader coverage of
318 | articles, (ii) entity normalization improves recall, for example,
319 | by matching ‘paraoxonase 1’ to ‘PON1’, (iii) PubTator 3.0
320 | prioritizes articles containing relations between the query
321 | entities, (iv) PubTator 3.0 prioritizes articles where the entities
322 | appear nearby, rather than distant paragraphs. Across the 12
323 | information retrieval case studies, PubTator 3.0 demonstrated
324 | an overall precision of 90.0% for the top 20 articles (216 out
325 | of 240), which is significantly higher than PubMed’s precision
326 | of 81.6% (84 out of 103) and Google Scholar’s precision of
327 | 48.5% (98 out of 202).
328 |
329 | Nucleic Acids Research, 2024, Vol. 52, Web Server issue
330 |
331 | W545
332 |
333 | Nucleic Acids Research, 2024, Vol. 52, Web Server issue
334 |
335 | Conclusion
336 | PubTator 3.0 offers a comprehensive set of features and tools
337 | that allow researchers to navigate the ever-expanding wealth
338 | of biomedical literature, expediting research and unlocking
339 | valuable insights for scientific discovery. The PubTator 3.0 interface, API, and bulk file downloads are available at
340 | https://www.ncbi.nlm.nih.gov/research/pubtator3/.
341 |
342 | Data availability
343 | Data is available through the online interface at
344 | https://www.ncbi.nlm.nih.gov/research/pubtator3/, through the API
345 | at https://www.ncbi.nlm.nih.gov/research/pubtator3/api or
346 | bulk FTP download at
347 | https://ftp.ncbi.nlm.nih.gov/pub/lu/PubTator3/.
348 | The source code for each component of PubTator 3.0
349 | is openly accessible. The AIONER named entity recognizer
350 | is available at https://github.com/ncbi/AIONER. GNorm2,
351 | for gene name normalization, is available at
352 | https://github.com/ncbi/GNorm2. The tmVar3 variant name normalizer
353 | is available at https://github.com/ncbi/tmVar3. The NLMChem Tagger, for chemical name normalization, is available
354 | at https://ftp.ncbi.nlm.nih.gov/pub/lu/NLMChem. The TaggerOne system, for disease and cell line normalization, is available at
355 | https://www.ncbi.nlm.nih.gov/research/bionlp/Tools/taggerone. The BioREx relation extraction system is available
356 | at https://github.com/ncbi/BioREx. The code for customizing
357 | ChatGPT with the PubTator 3.0 API is available at
358 | https://github.com/ncbi-nlp/pubtator-gpt. The details of the applications, performance, evaluation data, and citations for each
359 | tool are shown in Supplementary Table S7. All source code is
360 | also available at https://doi.org/10.5281/zenodo.10839630.
361 |
362 | Supplementary data
363 | Supplementary Data are available at NAR Online.
364 |
365 | Funding
366 | Intramural Research Program of the National Library of
367 | Medicine (NLM), National Institutes of Health; ODSS Support of the Exploration of Cloud in NIH Intramural Research.
368 | Funding for open access charge: Intramural Research Program
369 | of the National Library of Medicine, National Institutes of
370 | Health.
371 |
372 | Conflict of interest statement
373 | None declared.
374 |
```
--------------------------------------------------------------------------------
/tests/data/ct_gov/clinical_trials_api_v2.yaml:
--------------------------------------------------------------------------------
```yaml
1 | openapi: "3.0.3"
2 | info:
3 | title: "ClinicalTrials.gov REST API"
4 | description:
5 | "This API is made available to provide users meta data, statistics,\
6 | \ and the most recent version of the clinical trials available on ClinicalTrials.gov."
7 | version: "2.0.3"
8 | tags:
9 | - name: "Studies"
10 | description: "Related to clinical trial studies"
11 | - name: "Stats"
12 | description: "Data statistics"
13 | - name: "Version"
14 | description: "Version info"
15 | servers:
16 | - url: "https://clinicaltrials.gov/api/v2"
17 | description: "This server"
18 | paths:
19 | /studies:
20 | get:
21 | summary: "Studies"
22 | description:
23 | "Returns data of studies matching query and filter parameters.\
24 | \ The studies are returned page by page.\nIf response contains `nextPageToken`,\
25 | \ use its value in `pageToken` to get next page.\nThe last page will not contain\
26 | \ `nextPageToken`. A page may have empty `studies` array.\nRequest for each\
27 | \ subsequent page **must** have the same parameters as for the first page,\
28 | \ except\n`countTotal`, `pageSize`, and `pageToken` parameters.\n\nIf neither\
29 | \ queries nor filters are set, all studies will be returned.\nIf any query\
30 | \ parameter contains only NCT IDs (comma- and/or space-separated), filters\
31 | \ are ignored.\n\n`query.*` parameters are in [Essie expression syntax](/find-studies/constructing-complex-search-queries).\n\
32 | Those parameters affect ranking of studies, if sorted by relevance. See `sort`\
33 | \ parameter for details.\n\n`filter.*` and `postFilter.*` parameters have\
34 | \ same effect as there is no aggregation calculation. \nBoth are available\
35 | \ just to simplify applying parameters from search request.\nBoth do not affect\
36 | \ ranking of studies.\n\nNote: When trying JSON format in your browser, do\
37 | \ not set too large `pageSize` parameter, if `fields` is\nunlimited. That\
38 | \ may return too much data for the browser to parse and render."
39 | tags:
40 | - "Studies"
41 | operationId: "listStudies"
42 | parameters:
43 | - name: "format"
44 | in: "query"
45 | description:
46 | "Must be one of the following:\n* `csv`- return CSV table with\
47 | \ one page of study data; first page will contain header with column names;\
48 | \ available fields are listed on [CSV Download](/data-api/about-api/csv-download)\
49 | \ page\n* `json`- return JSON with one page of study data; every study object\
50 | \ is placed in a separate line; `markup` type fields format depends on `markupFormat`\
51 | \ parameter"
52 | required: false
53 | schema:
54 | type: "string"
55 | enum:
56 | - "csv"
57 | - "json"
58 | default: "json"
59 | - name: "markupFormat"
60 | in: "query"
61 | description:
62 | "Format of `markup` type fields:\n* `markdown`- [markdown](https://spec.commonmark.org/0.28/)\
63 | \ format\n* `legacy`- compatible with classic PRS\n\nApplicable only to\
64 | \ `json` format."
65 | required: false
66 | schema:
67 | type: "string"
68 | enum:
69 | - "markdown"
70 | - "legacy"
71 | default: "markdown"
72 | - name: "query.cond"
73 | in: "query"
74 | description:
75 | "\"Conditions or disease\" query in [Essie expression syntax](/find-studies/constructing-complex-search-queries).\
76 | \ See \"ConditionSearch Area\" on [Search Areas](/data-api/about-api/search-areas#ConditionSearch)\
77 | \ for more details."
78 | required: false
79 | schema:
80 | type: "string"
81 | examples:
82 | example1:
83 | value: "lung cancer"
84 | example2:
85 | value: "(head OR neck) AND pain"
86 | - name: "query.term"
87 | in: "query"
88 | description:
89 | "\"Other terms\" query in [Essie expression syntax](/find-studies/constructing-complex-search-queries).\
90 | \ See \"BasicSearch Area\" on [Search Areas](/data-api/about-api/search-areas#BasicSearch)\
91 | \ for more details."
92 | required: false
93 | schema:
94 | type: "string"
95 | examples:
96 | example1:
97 | value: "AREA[LastUpdatePostDate]RANGE[2023-01-15,MAX]"
98 | - name: "query.locn"
99 | in: "query"
100 | description:
101 | "\"Location terms\" query in [Essie expression syntax](/find-studies/constructing-complex-search-queries).\
102 | \ See \"LocationSearch Area\" on [Search Areas](/data-api/about-api/search-areas#LocationSearch)\
103 | \ for more details."
104 | required: false
105 | schema:
106 | type: "string"
107 | - name: "query.titles"
108 | in: "query"
109 | description:
110 | "\"Title / acronym\" query in [Essie expression syntax](/find-studies/constructing-complex-search-queries).\
111 | \ See \"TitleSearch Area\" on [Search Areas](/data-api/about-api/search-areas#TitleSearch)\
112 | \ for more details."
113 | required: false
114 | schema:
115 | type: "string"
116 | - name: "query.intr"
117 | in: "query"
118 | description:
119 | "\"Intervention / treatment\" query in [Essie expression syntax](/find-studies/constructing-complex-search-queries).\
120 | \ See \"InterventionSearch Area\" on [Search Areas](/data-api/about-api/search-areas#InterventionSearch)\
121 | \ for more details."
122 | required: false
123 | schema:
124 | type: "string"
125 | - name: "query.outc"
126 | in: "query"
127 | description:
128 | "\"Outcome measure\" query in [Essie expression syntax](/find-studies/constructing-complex-search-queries).\
129 | \ See \"OutcomeSearch Area\" on [Search Areas](/data-api/about-api/search-areas#OutcomeSearch)\
130 | \ for more details."
131 | required: false
132 | schema:
133 | type: "string"
134 | - name: "query.spons"
135 | in: "query"
136 | description:
137 | "\"Sponsor / collaborator\" query in [Essie expression syntax](/find-studies/constructing-complex-search-queries).\
138 | \ See \"SponsorSearch Area\" on [Search Areas](/data-api/about-api/search-areas#SponsorSearch)\
139 | \ for more details."
140 | required: false
141 | schema:
142 | type: "string"
143 | - name: "query.lead"
144 | in: "query"
145 | description:
146 | "Searches in \"LeadSponsorName\" field. See [Study Data Structure](/data-api/about-api/study-data-structure#LeadSponsorName)\
147 | \ for more details. The query is in [Essie expression syntax](/find-studies/constructing-complex-search-queries)."
148 | required: false
149 | schema:
150 | type: "string"
151 | - name: "query.id"
152 | in: "query"
153 | description:
154 | "\"Study IDs\" query in [Essie expression syntax](/find-studies/constructing-complex-search-queries).\
155 | \ See \"IdSearch Area\" on [Search Areas](/data-api/about-api/search-areas#IdSearch)\
156 | \ for more details."
157 | required: false
158 | schema:
159 | type: "string"
160 | - name: "query.patient"
161 | in: "query"
162 | description:
163 | "See \"PatientSearch Area\" on [Search Areas](/data-api/about-api/search-areas#PatientSearch)\
164 | \ for more details."
165 | required: false
166 | schema:
167 | type: "string"
168 | - name: "filter.overallStatus"
169 | in: "query"
170 | style: "pipeDelimited"
171 | explode: false
172 | description: "Filter by comma- or pipe-separated list of statuses"
173 | required: false
174 | schema:
175 | type: "array"
176 | items:
177 | $ref: "#/components/schemas/Status"
178 | examples:
179 | example1:
180 | value:
181 | - "NOT_YET_RECRUITING"
182 | - "RECRUITING"
183 | example2:
184 | value:
185 | - "COMPLETED"
186 | - name: "filter.geo"
187 | in: "query"
188 | description:
189 | "Filter by geo-function. Currently only distance function is\
190 | \ supported.\nFormat: `distance(latitude,longitude,distance)`"
191 | required: false
192 | schema:
193 | type: "string"
194 | pattern:
195 | "^distance\\(-?\\d+(\\.\\d+)?,-?\\d+(\\.\\d+)?,\\d+(\\.\\d+)?(km|mi)?\\\
196 | )$"
197 | examples:
198 | example1:
199 | value: "distance(39.0035707,-77.1013313,50mi)"
200 | - name: "filter.ids"
201 | in: "query"
202 | style: "pipeDelimited"
203 | explode: false
204 | description:
205 | "Filter by comma- or pipe-separated list of NCT IDs (a.k.a. ClinicalTrials.gov\
206 | \ identifiers).\nThe provided IDs will be searched in [NCTId](data-api/about-api/study-data-structure#NCTId)\
207 | \ and\n[NCTIdAlias](data-api/about-api/study-data-structure#NCTIdAlias)\
208 | \ fields."
209 | required: false
210 | schema:
211 | type: "array"
212 | items:
213 | type: "string"
214 | pattern: "^[Nn][Cc][Tt]0*[1-9]\\d{0,7}$"
215 | examples:
216 | example1:
217 | value:
218 | - "NCT04852770"
219 | - "NCT01728545"
220 | - "NCT02109302"
221 | - name: "filter.advanced"
222 | in: "query"
223 | description: "Filter by query in [Essie expression syntax](/find-studies/constructing-complex-search-queries)"
224 | required: false
225 | schema:
226 | type: "string"
227 | examples:
228 | example1:
229 | value: "AREA[StartDate]2022"
230 | example2:
231 | value:
232 | "AREA[MinimumAge]RANGE[MIN, 16 years] AND AREA[MaximumAge]RANGE[16\
233 | \ years, MAX]"
234 | - name: "filter.synonyms"
235 | in: "query"
236 | style: "pipeDelimited"
237 | explode: false
238 | description:
239 | "Filter by comma- or pipe-separated list of `area`:`synonym_id`\
240 | \ pairs"
241 | required: false
242 | schema:
243 | type: "array"
244 | items:
245 | type: "string"
246 | examples:
247 | example1:
248 | value:
249 | - "ConditionSearch:1651367"
250 | - "BasicSearch:2013558"
251 | - name: "postFilter.overallStatus"
252 | in: "query"
253 | style: "pipeDelimited"
254 | explode: false
255 | description: "Filter by comma- or pipe-separated list of statuses"
256 | required: false
257 | schema:
258 | type: "array"
259 | items:
260 | $ref: "#/components/schemas/Status"
261 | examples:
262 | example1:
263 | value:
264 | - "NOT_YET_RECRUITING"
265 | - "RECRUITING"
266 | example2:
267 | value:
268 | - "COMPLETED"
269 | - name: "postFilter.geo"
270 | in: "query"
271 | description:
272 | "Filter by geo-function. Currently only distance function is\
273 | \ supported.\nFormat: `distance(latitude,longitude,distance)`"
274 | required: false
275 | schema:
276 | type: "string"
277 | pattern:
278 | "^distance\\(-?\\d+(\\.\\d+)?,-?\\d+(\\.\\d+)?,\\d+(\\.\\d+)?(km|mi)?\\\
279 | )$"
280 | examples:
281 | example1:
282 | value: "distance(39.0035707,-77.1013313,50mi)"
283 | - name: "postFilter.ids"
284 | in: "query"
285 | style: "pipeDelimited"
286 | explode: false
287 | description:
288 | "Filter by comma- or pipe-separated list of NCT IDs (a.k.a. ClinicalTrials.gov\
289 | \ identifiers).\nThe provided IDs will be searched in [NCTId](data-api/about-api/study-data-structure#NCTId)\
290 | \ and\n[NCTIdAlias](data-api/about-api/study-data-structure#NCTIdAlias)\
291 | \ fields."
292 | required: false
293 | schema:
294 | type: "array"
295 | items:
296 | type: "string"
297 | pattern: "^[Nn][Cc][Tt]0*[1-9]\\d{0,7}$"
298 | examples:
299 | example1:
300 | value:
301 | - "NCT04852770"
302 | - "NCT01728545"
303 | - "NCT02109302"
304 | - name: "postFilter.advanced"
305 | in: "query"
306 | description: "Filter by query in [Essie expression syntax](/find-studies/constructing-complex-search-queries)"
307 | required: false
308 | schema:
309 | type: "string"
310 | examples:
311 | example1:
312 | value: "AREA[StartDate]2022"
313 | example2:
314 | value:
315 | "AREA[MinimumAge]RANGE[MIN, 16 years] AND AREA[MaximumAge]RANGE[16\
316 | \ years, MAX]"
317 | - name: "postFilter.synonyms"
318 | in: "query"
319 | style: "pipeDelimited"
320 | explode: false
321 | description:
322 | "Filter by comma- or pipe-separated list of `area`:`synonym_id`\
323 | \ pairs"
324 | required: false
325 | schema:
326 | type: "array"
327 | items:
328 | type: "string"
329 | examples:
330 | example1:
331 | value:
332 | - "ConditionSearch:1651367"
333 | - "BasicSearch:2013558"
334 | - name: "aggFilters"
335 | in: "query"
336 | description:
337 | "Apply aggregation filters, aggregation counts will not be provided.\n\
338 | The value is comma- or pipe-separated list of pairs `filter_id`:`space-separated\
339 | \ list of option keys` for the checked options."
340 | required: false
341 | schema:
342 | type: "string"
343 | examples:
344 | example1:
345 | value: "results:with,status:com"
346 | example2:
347 | value: "status:not rec,sex:f,healthy:y"
348 | - name: "geoDecay"
349 | in: "query"
350 | description:
351 | "Set proximity factor by distance from `filter.geo` location\
352 | \ to the closest [LocationGeoPoint](/data-api/about-api/study-data-structure#LocationGeoPoint)\
353 | \ of a study.\nIgnored, if `filter.geo` parameter is not set or response\
354 | \ contains more than 10,000 studies."
355 | required: false
356 | schema:
357 | type: "string"
358 | pattern:
359 | "^func:(gauss|exp|linear),scale:(\\d+(\\.\\d+)?(km|mi)),offset:(\\\
360 | d+(\\.\\d+)?(km|mi)),decay:(\\d+(\\.\\d+)?)$"
361 | default: "func:exp,scale:300mi,offset:0mi,decay:0.5"
362 | examples:
363 | example1:
364 | value: "func:linear,scale:100km,offset:10km,decay:0.1"
365 | example2:
366 | value: "func:gauss,scale:500mi,offset:0mi,decay:0.3"
367 | - name: "fields"
368 | in: "query"
369 | style: "pipeDelimited"
370 | explode: false
371 | description:
372 | "If specified, must be non-empty comma- or pipe-separated list\
373 | \ of fields to return. If unspecified, all fields will be returned.\nOrder\
374 | \ of the fields does not matter.\n\nFor `csv` format, specify list of columns.\
375 | \ The column names are available on [CSV Download](/data-api/about-api/csv-download).\n\
376 | \nFor `json` format, every list item is either area name, piece name, field\
377 | \ name, or special name.\nIf a piece or a field is a branch node, all descendant\
378 | \ fields will be included.\nAll area names are available on [Search Areas](/data-api/about-api/search-areas),\n\
379 | the piece and field names — on [Data Structure](/data-api/about-api/study-data-structure)\
380 | \ and also can be retrieved at `/studies/metadata` endpoint.\nThere is a\
381 | \ special name, `@query`, which expands to all fields queried by search."
382 | required: false
383 | schema:
384 | type: "array"
385 | minItems: 1
386 | items:
387 | type: "string"
388 | pattern: "^([a-zA-Z][a-zA-Z0-9\\-. ]*)|(@query)$"
389 | examples:
390 | example1:
391 | value:
392 | - "NCTId"
393 | - "BriefTitle"
394 | - "OverallStatus"
395 | - "HasResults"
396 | example2:
397 | value: "ProtocolSection"
398 | - name: "sort"
399 | in: "query"
400 | style: "pipeDelimited"
401 | explode: false
402 | description:
403 | "Comma- or pipe-separated list of sorting options of the studies.\
404 | \ The returning studies are not sorted by default for a performance reason.\n\
405 | Every list item contains a field/piece name and an optional sort direction\
406 | \ (`asc` for ascending or `desc` for descending)\nafter colon character.\n\
407 | \nAll piece and field names can be found on [Data Structure](/data-api/about-api/study-data-structure)\
408 | \ and also can be retrieved\nat `/studies/metadata` endpoint. Currently,\
409 | \ only date and numeric fields are allowed for sorting.\nThere is a special\
410 | \ \"field\" `@relevance` to sort by relevance to a search query.\n\nStudies\
411 | \ missing sort field are always last. Default sort direction:\n* Date field\
412 | \ - `desc`\n* Numeric field - `asc`\n* `@relevance` - `desc`"
413 | required: false
414 | schema:
415 | type: "array"
416 | maxItems: 2
417 | default: []
418 | items:
419 | type: "string"
420 | pattern: "^(([a-zA-Z][a-zA-Z0-9\\-. ]*)|(@relevance))(:(asc|desc))?$"
421 | examples:
422 | example1:
423 | value:
424 | - "@relevance"
425 | example2:
426 | value:
427 | - "LastUpdatePostDate"
428 | example3:
429 | value:
430 | - "EnrollmentCount:desc"
431 | - "NumArmGroups"
432 | - name: "countTotal"
433 | in: "query"
434 | description:
435 | "Count total number of studies in all pages and return `totalCount`\
436 | \ field with first page, if `true`.\nFor CSV, the result can be found in\
437 | \ `x-total-count` response header.\nThe parameter is ignored for the subsequent\
438 | \ pages."
439 | required: false
440 | schema:
441 | type: "boolean"
442 | default: false
443 | - name: "pageSize"
444 | in: "query"
445 | description:
446 | "Page size is maximum number of studies to return in response.\
447 | \ It does not have to be the same for every page.\nIf not specified or set\
448 | \ to 0, the default value will be used. It will be coerced down to 1,000,\
449 | \ if greater than that."
450 | required: false
451 | schema:
452 | type: "integer"
453 | format: "int32"
454 | minimum: 0
455 | default: 10
456 | examples:
457 | example1:
458 | value: 2
459 | example2:
460 | value: 100
461 | - name: "pageToken"
462 | in: "query"
463 | description:
464 | "Token to get next page. Set it to a `nextPageToken` value returned\
465 | \ with the previous page in JSON format.\nFor CSV, it can be found in `x-next-page-token`\
466 | \ response header.\nDo not specify it for first page."
467 | required: false
468 | schema:
469 | type: "string"
470 | responses:
471 | "200":
472 | description: "OK"
473 | content:
474 | application/json:
475 | schema:
476 | $ref: "#/components/schemas/PagedStudies"
477 | example:
478 | totalCount: 438897
479 | studies:
480 | - protocolSection:
481 | identificationModule:
482 | nctId: "NCT03540771"
483 | briefTitle:
484 | "Introducing Palliative Care (PC) Within the Treatment\
485 | \ of End Stage Liver Disease (ESLD)"
486 | statusModule:
487 | overallStatus: "RECRUITING"
488 | hasResults: false
489 | - protocolSection:
490 | identificationModule:
491 | nctId: "NCT03630471"
492 | briefTitle:
493 | "Effectiveness of a Problem-solving Intervention\
494 | \ for Common Adolescent Mental Health Problems in India"
495 | statusModule:
496 | overallStatus: "COMPLETED"
497 | hasResults: false
498 | - protocolSection:
499 | identificationModule:
500 | nctId: "NCT00587795"
501 | briefTitle:
502 | "Orthopedic Study of the Aircast StabilAir Wrist\
503 | \ Fracture Brace"
504 | statusModule:
505 | overallStatus: "TERMINATED"
506 | hasResults: true
507 | nextPageToken: "abracadabra"
508 | "400":
509 | description: "Bad Request"
510 | content:
511 | text/plain:
512 | schema:
513 | $ref: "#/components/schemas/errorMessage"
514 | /studies/{nctId}:
515 | get:
516 | summary: "Single Study"
517 | description: "Returns data of a single study."
518 | tags:
519 | - "Studies"
520 | operationId: "fetchStudy"
521 | parameters:
522 | - name: "nctId"
523 | in: "path"
524 | description:
525 | "NCT Number of a study. If found in [NCTIdAlias](data-api/about-api/study-data-structure#NCTIdAlias)\
526 | \ field,\n301 HTTP redirect to the actual study will be returned."
527 | required: true
528 | schema:
529 | type: "string"
530 | pattern: "^[Nn][Cc][Tt]0*[1-9]\\d{0,7}$"
531 | examples:
532 | example1:
533 | value: "NCT00841061"
534 | example2:
535 | value: "NCT04000165"
536 | - name: "format"
537 | in: "query"
538 | description:
539 | "Must be one of the following:\n* `csv`- return CSV table; available\
540 | \ fields are listed on [CSV Download](/data-api/about-api/csv-download)\n\
541 | * `json`- return JSON object; format of `markup` fields depends on `markupFormat`\
542 | \ parameter\n* `json.zip`- put JSON object into a .json file and download\
543 | \ it as zip archive; field values of type `markup` are in [markdown](https://spec.commonmark.org/0.28/)\
544 | \ format\n* `fhir.json` - return FHIR JSON; fields are not customizable;\
545 | \ see [Access Data in FHIR](/data-api/fhir)\n* `ris`- return RIS record;\
546 | \ available tags are listed on [RIS Download](/data-api/about-api/ris-download)"
547 | required: false
548 | schema:
549 | type: "string"
550 | enum:
551 | - "csv"
552 | - "json"
553 | - "json.zip"
554 | - "fhir.json"
555 | - "ris"
556 | default: "json"
557 | - name: "markupFormat"
558 | in: "query"
559 | description:
560 | "Format of `markup` type fields:\n* `markdown`- [markdown](https://spec.commonmark.org/0.28/)\
561 | \ format\n* `legacy`- compatible with classic PRS\n\nApplicable only to\
562 | \ `json` format."
563 | required: false
564 | schema:
565 | type: "string"
566 | enum:
567 | - "markdown"
568 | - "legacy"
569 | default: "markdown"
570 | - name: "fields"
571 | in: "query"
572 | style: "pipeDelimited"
573 | explode: false
574 | description:
575 | "If specified, must be non-empty comma- or pipe-separated list\
576 | \ of fields to return. If unspecified, all fields will be returned.\nOrder\
577 | \ of the fields does not matter.\n\nFor `csv` format, specify list of columns.\
578 | \ The column names are available on [CSV Download](/data-api/about-api/csv-download).\n\
579 | \nFor `json` and `json.zip` formats, every list item is either area name,\
580 | \ piece name, or field name.\nIf a piece or a field is a branch node, all\
581 | \ descendant fields will be included.\nAll area names are available on [Search\
582 | \ Areas](/data-api/about-api/search-areas),\nthe piece and field names -\
583 | \ on [Data Structure](/data-api/about-api/study-data-structure) and also\
584 | \ can be retrieved at `/studies/metadata` endpoint.\n\nFor `fhir.json` format,\
585 | \ all available fields are returned and this parameter must be unspecified.\n\
586 | \nFor `ris` format, specify list of tags. The tag names are available on\
587 | \ [RIS Download](/data-api/about-api/ris-download)."
588 | required: false
589 | schema:
590 | type: "array"
591 | minItems: 1
592 | items:
593 | type: "string"
594 | pattern: "^[a-zA-Z][a-zA-Z0-9\\-. ]*$"
595 | examples:
596 | example1:
597 | value:
598 | - "NCTId"
599 | - "BriefTitle"
600 | - "Reference"
601 | example2:
602 | value:
603 | - "ConditionsModule"
604 | - "EligibilityModule"
605 | responses:
606 | "200":
607 | description: "OK"
608 | content:
609 | text/csv:
610 | schema:
611 | $ref: "#/components/schemas/StudiesCsv"
612 | application/json:
613 | schema:
614 | $ref: "#/components/schemas/Study"
615 | application/zip:
616 | schema:
617 | $ref: "#/components/schemas/StudiesZip"
618 | application/fhir+json:
619 | schema:
620 | $ref: "#/components/schemas/StudyFhir"
621 | "301":
622 | description: "Moved Permanently"
623 | content: {}
624 | "400":
625 | description: "Bad Request"
626 | content:
627 | text/plain:
628 | schema:
629 | $ref: "#/components/schemas/errorMessage"
630 | "404":
631 | description: "Not Found"
632 | content:
633 | text/plain:
634 | schema:
635 | $ref: "#/components/schemas/errorMessage"
636 | /studies/metadata:
637 | get:
638 | summary: "Data Model Fields"
639 | description: "Returns study data model fields."
640 | tags:
641 | - "Studies"
642 | operationId: "studiesMetadata"
643 | parameters:
644 | - name: "includeIndexedOnly"
645 | in: "query"
646 | description: "Include indexed-only fields, if `true`"
647 | required: false
648 | schema:
649 | type: "boolean"
650 | default: false
651 | - name: "includeHistoricOnly"
652 | in: "query"
653 | description: "Include fields available only in historic data, if `true`"
654 | required: false
655 | schema:
656 | type: "boolean"
657 | default: false
658 | responses:
659 | "200":
660 | description: "OK"
661 | content:
662 | application/json:
663 | schema:
664 | $ref: "#/components/schemas/FieldNodeList"
665 | "400":
666 | description: "Bad Request"
667 | content:
668 | text/plain:
669 | schema:
670 | $ref: "#/components/schemas/errorMessage"
671 |
```
--------------------------------------------------------------------------------
/src/biomcp/variants/external.py:
--------------------------------------------------------------------------------
```python
1 | """External data sources for enhanced variant annotations."""
2 |
3 | import asyncio
4 | import json
5 | import logging
6 | import re
7 | from typing import Any
8 | from urllib.parse import quote
9 |
10 | from pydantic import BaseModel, Field
11 |
12 | from .. import http_client
13 |
14 | # Import CBioPortalVariantData from the new module
15 | from .cbio_external_client import CBioPortalVariantData
16 | from .oncokb_client import OncoKBClient
17 |
18 | logger = logging.getLogger(__name__)
19 |
20 | # TCGA/GDC API endpoints
21 | GDC_BASE = "https://api.gdc.cancer.gov"
22 | GDC_SSMS_ENDPOINT = f"{GDC_BASE}/ssms" # Simple Somatic Mutations
23 |
24 | # 1000 Genomes API endpoints (hosted on the Ensembl REST API)
25 | ENSEMBL_REST_BASE = "https://rest.ensembl.org"
26 | ENSEMBL_VARIATION_ENDPOINT = f"{ENSEMBL_REST_BASE}/variation/human"
27 |
28 | # Import constants -- NOTE(review): no import statement follows this comment; it looks stale, confirm and remove
29 |
30 |
31 | class TCGAVariantData(BaseModel):  # pydantic model; every field below has a default, so TCGAVariantData() is valid
32 | """TCGA/GDC variant annotation data."""
33 |
34 | cosmic_id: str | None = None  # presumably a COSMIC identifier -- TODO confirm against the code that fills it
35 | tumor_types: list[str] = Field(default_factory=list)  # default_factory=list: each instance gets its own empty list
36 | mutation_frequency: float | None = None  # NOTE(review): scale (fraction vs. percent) is not visible here -- confirm
37 | mutation_count: int | None = None
38 | affected_cases: int | None = None
39 | consequence_type: str | None = None
40 | clinical_significance: str | None = None
41 |
42 |
43 | class ThousandGenomesData(BaseModel):  # pydantic model; every field defaults to None
44 | """1000 Genomes variant annotation data."""
45 |
46 | global_maf: float | None = Field(  # first positional argument to Field (next line) is the default value
47 | None, description="Global minor allele frequency"
48 | )
49 | afr_maf: float | None = Field(None, description="African population MAF")
50 | amr_maf: float | None = Field(None, description="American population MAF")
51 | eas_maf: float | None = Field(
52 | None, description="East Asian population MAF"
53 | )
54 | eur_maf: float | None = Field(None, description="European population MAF")
55 | sas_maf: float | None = Field(
56 | None, description="South Asian population MAF"
57 | )
58 | ancestral_allele: str | None = None  # plain default; carries no Field description, unlike the MAF fields
59 | most_severe_consequence: str | None = None  # plain default; carries no Field description
60 |
61 |
62 | # CBioPortalVariantData is now imported from cbio_external_client.py
63 |
64 |
class OncoKBVariantData(BaseModel):
    """OncoKB variant annotation data.

    A simplified view of an OncoKB annotation: the key classifications plus
    counts of the longer evidence lists rather than the lists themselves.
    """

    # "oncogenic" field of the OncoKB annotation
    oncogenic: str | None = None
    # "knownEffect" of the annotation's "mutationEffect" object
    mutation_effect: str | None = None
    # "highestSensitiveLevel" / "highestResistanceLevel" of the annotation
    highest_sensitive_level: str | None = None
    highest_resistance_level: str | None = None
    # Lengths of the "treatments", "diagnosticImplications" and
    # "prognosticImplications" lists of the annotation
    treatments_count: int = 0
    diagnostic_implications_count: int = 0
    prognostic_implications_count: int = 0
    # "hotspot" flag of the annotation
    is_hotspot: bool = False
76 |
77 |
class EnhancedVariantAnnotation(BaseModel):
    """Enhanced variant annotation combining multiple sources.

    Each source attribute stays ``None`` when that source was not queried or
    returned no data.
    """

    # Identifier the annotations were requested for
    variant_id: str
    tcga: TCGAVariantData | None = None
    thousand_genomes: ThousandGenomesData | None = None
    cbioportal: CBioPortalVariantData | None = None
    oncokb: OncoKBVariantData | None = None
    # Names of the sources ("tcga", "thousand_genomes", "cbioportal",
    # "oncokb") whose query raised an exception
    error_sources: list[str] = Field(default_factory=list)
87 |
88 |
class TCGAClient:
    """Client for TCGA/GDC API.

    A variant is first looked up in the GDC simple-somatic-mutation index
    (``/ssms``); its occurrences are then fetched from ``/ssm_occurrences``
    and tallied per project.
    """

    @staticmethod
    def _first_cosmic_id(cosmic_id: Any) -> str | None:
        """Reduce the GDC ``cosmic_id`` field to a single identifier.

        The field may arrive as a list; its first entry is used. An empty
        list yields ``None`` instead of being passed through, because a list
        is not a valid value for ``TCGAVariantData.cosmic_id`` and would
        make the whole lookup fail validation.
        """
        if isinstance(cosmic_id, list):
            return cosmic_id[0] if cosmic_id else None
        return cosmic_id

    @staticmethod
    def _summarize_projects(occ_hits: list[dict]) -> tuple[list[str], int]:
        """Tally occurrence hits per project.

        Args:
            occ_hits: Hits of an ``/ssm_occurrences`` response, each expected
                to carry ``case.project.project_id``.

        Returns:
            ``(tumor_types, total)`` where ``tumor_types`` lists each project
            once, in order of first appearance, and ``total`` is the number
            of hits that carried a project ID.
        """
        project_counts: dict[str, int] = {}
        for occ in occ_hits:
            # "or {}" guards against explicit nulls in the response
            project = (occ.get("case") or {}).get("project") or {}
            if project_id := project.get("project_id"):
                project_counts[project_id] = (
                    project_counts.get(project_id, 0) + 1
                )

        # TCGA format: "TCGA-LUAD" -> "LUAD"
        # Other formats are kept in full: "MMRF-COMMPASS", "CPTAC-3"
        tumor_types = [
            project_id.split("-")[-1]
            if project_id.startswith("TCGA-")
            else project_id
            for project_id in project_counts
        ]
        return tumor_types, sum(project_counts.values())

    async def get_variant_data(
        self, variant_id: str
    ) -> TCGAVariantData | None:
        """Fetch variant data from TCGA/GDC.

        Args:
            variant_id: Can be gene AA change (e.g., "BRAF V600E") or genomic coordinates

        Returns:
            The annotation, or ``None`` when the request fails, nothing
            matches, or the response cannot be interpreted.
        """
        try:
            # "GENE AA_CHANGE" identifiers contain a space and do not start
            # with "chr"; anything else is treated as a genomic DNA change
            if " " in variant_id and not variant_id.startswith("chr"):
                search_field = "gene_aa_change"
            else:
                search_field = "genomic_dna_change"
            search_value = variant_id

            # First, search for the variant
            params = {
                "filters": json.dumps({
                    "op": "in",
                    "content": {
                        "field": search_field,
                        "value": [search_value],
                    },
                }),
                "fields": "cosmic_id,genomic_dna_change,gene_aa_change,ssm_id",
                "format": "json",
                "size": "5",  # Get a few in case of multiple matches
            }

            response, error = await http_client.request_api(
                url=GDC_SSMS_ENDPOINT,
                method="GET",
                request=params,
                domain="gdc",
            )

            if error or not response:
                return None

            hits = (response.get("data") or {}).get("hits") or []
            if not hits:
                return None

            # Only the first hit is considered
            hit = hits[0]
            ssm_id = hit.get("ssm_id")
            cosmic_id = self._first_cosmic_id(hit.get("cosmic_id"))

            # For gene_aa_change searches, verify we have the right variant
            if search_field == "gene_aa_change":
                gene_aa_changes = hit.get("gene_aa_change", [])
                if (
                    isinstance(gene_aa_changes, list)
                    and search_value not in gene_aa_changes
                ):
                    # This SSM has multiple AA changes, but not the one we're looking for
                    return None

            if not ssm_id:
                return None

            # Now query SSM occurrences to get project information
            occ_params = {
                "filters": json.dumps({
                    "op": "in",
                    "content": {"field": "ssm.ssm_id", "value": [ssm_id]},
                }),
                "fields": "case.project.project_id",
                "format": "json",
                # NOTE(review): counts are capped at this page size
                "size": "2000",
            }

            occ_response, occ_error = await http_client.request_api(
                url=f"{GDC_BASE}/ssm_occurrences",
                method="GET",
                request=occ_params,
                domain="gdc",
            )

            # NOTE(review): consequence_type is a hard-coded default in both
            # return paths; it is not read from the GDC response.
            if occ_error or not occ_response:
                # Return basic info without occurrence data
                return TCGAVariantData(
                    cosmic_id=cosmic_id,
                    tumor_types=[],
                    affected_cases=0,
                    consequence_type="missense_variant",
                )

            occ_hits = (occ_response.get("data") or {}).get("hits") or []
            tumor_types, total_cases = self._summarize_projects(occ_hits)

            return TCGAVariantData(
                cosmic_id=cosmic_id,
                tumor_types=tumor_types,
                affected_cases=total_cases,
                consequence_type="missense_variant",
            )

        except (KeyError, ValueError, TypeError, IndexError) as e:
            # Log the error for debugging while gracefully handling API response issues
            # KeyError: Missing expected fields in API response
            # ValueError: Invalid data format or conversion issues
            # TypeError: Unexpected data types in response
            # IndexError: Array access issues with response data
            logger.warning(
                f"Failed to fetch TCGA variant data for {variant_id}: {type(e).__name__}: {e}"
            )
            return None
244 |
245 |
246 | class ThousandGenomesClient:
247 | """Client for 1000 Genomes data via Ensembl REST API."""
248 |
249 | def _extract_population_frequencies(
250 | self, populations: list[dict]
251 | ) -> dict[str, Any]:
252 | """Extract population frequencies from Ensembl response."""
253 | # Note: Multiple entries per population (one per allele), we want the alternate allele frequency
254 | # The reference allele will have higher frequency for rare variants
255 | pop_data: dict[str, float] = {}
256 |
257 | for pop in populations:
258 | pop_name = pop.get("population", "")
259 | frequency = pop.get("frequency", 0)
260 |
261 | # Map 1000 Genomes population codes - taking the minor allele frequency
262 | if pop_name == "1000GENOMES:phase_3:ALL":
263 | if "global_maf" not in pop_data or frequency < pop_data.get(
264 | "global_maf", 1
265 | ):
266 | pop_data["global_maf"] = frequency
267 | elif pop_name == "1000GENOMES:phase_3:AFR":
268 | if "afr_maf" not in pop_data or frequency < pop_data.get(
269 | "afr_maf", 1
270 | ):
271 | pop_data["afr_maf"] = frequency
272 | elif pop_name == "1000GENOMES:phase_3:AMR":
273 | if "amr_maf" not in pop_data or frequency < pop_data.get(
274 | "amr_maf", 1
275 | ):
276 | pop_data["amr_maf"] = frequency
277 | elif pop_name == "1000GENOMES:phase_3:EAS":
278 | if "eas_maf" not in pop_data or frequency < pop_data.get(
279 | "eas_maf", 1
280 | ):
281 | pop_data["eas_maf"] = frequency
282 | elif pop_name == "1000GENOMES:phase_3:EUR":
283 | if "eur_maf" not in pop_data or frequency < pop_data.get(
284 | "eur_maf", 1
285 | ):
286 | pop_data["eur_maf"] = frequency
287 | elif pop_name == "1000GENOMES:phase_3:SAS" and (
288 | "sas_maf" not in pop_data
289 | or frequency < pop_data.get("sas_maf", 1)
290 | ):
291 | pop_data["sas_maf"] = frequency
292 |
293 | return pop_data
294 |
295 | async def get_variant_data(
296 | self, variant_id: str
297 | ) -> ThousandGenomesData | None:
298 | """Fetch variant data from 1000 Genomes via Ensembl."""
299 | try:
300 | # Try to get rsID or use the variant ID directly
301 | encoded_id = quote(variant_id, safe="")
302 | url = f"{ENSEMBL_VARIATION_ENDPOINT}/{encoded_id}"
303 |
304 | # Request with pops=1 to get population data
305 | params = {"content-type": "application/json", "pops": "1"}
306 |
307 | response, error = await http_client.request_api(
308 | url=url,
309 | method="GET",
310 | request=params,
311 | domain="ensembl",
312 | )
313 |
314 | if error or not response:
315 | return None
316 |
317 | # Extract population frequencies
318 | populations = response.get("populations", [])
319 | pop_data = self._extract_population_frequencies(populations)
320 |
321 | # Get most severe consequence
322 | consequence = None
323 | if mappings := response.get("mappings", []):
324 | # Extract consequences from transcript consequences
325 | all_consequences = []
326 | for mapping in mappings:
327 | if transcript_consequences := mapping.get(
328 | "transcript_consequences", []
329 | ):
330 | for tc in transcript_consequences:
331 | if consequence_terms := tc.get(
332 | "consequence_terms", []
333 | ):
334 | all_consequences.extend(consequence_terms)
335 |
336 | if all_consequences:
337 | # Take the first unique consequence
338 | seen = set()
339 | unique_consequences = []
340 | for c in all_consequences:
341 | if c not in seen:
342 | seen.add(c)
343 | unique_consequences.append(c)
344 | consequence = (
345 | unique_consequences[0] if unique_consequences else None
346 | )
347 |
348 | # Only return data if we found population frequencies
349 | if pop_data:
350 | return ThousandGenomesData(
351 | **pop_data,
352 | ancestral_allele=response.get("ancestral_allele"),
353 | most_severe_consequence=consequence,
354 | )
355 | else:
356 | # No population data found
357 | return None
358 |
359 | except (KeyError, ValueError, TypeError, AttributeError) as e:
360 | # Log the error for debugging while gracefully handling API response issues
361 | # KeyError: Missing expected fields in API response
362 | # ValueError: Invalid data format or conversion issues
363 | # TypeError: Unexpected data types in response
364 | # AttributeError: Missing attributes on response objects
365 | logger.warning(
366 | f"Failed to fetch 1000 Genomes data for {variant_id}: {type(e).__name__}: {e}"
367 | )
368 | return None
369 |
370 |
class ExternalVariantAggregator:
    """Aggregates variant data from multiple external sources."""

    # Three-letter -> one-letter amino-acid codes of HGVS protein notation
    _AA_THREE_TO_ONE = {
        "Ala": "A",
        "Arg": "R",
        "Asn": "N",
        "Asp": "D",
        "Cys": "C",
        "Gln": "Q",
        "Glu": "E",
        "Gly": "G",
        "His": "H",
        "Ile": "I",
        "Leu": "L",
        "Lys": "K",
        "Met": "M",
        "Phe": "F",
        "Pro": "P",
        "Ser": "S",
        "Thr": "T",
        "Trp": "W",
        "Tyr": "Y",
        "Val": "V",
        "Sec": "U",
        "Pyl": "O",
        "Ter": "*",
    }
    # Candidate three-letter tokens: one capital followed by two lowercase
    _AA_TOKEN = re.compile(r"[A-Z][a-z]{2}")

    def __init__(self):
        self.tcga_client = TCGAClient()
        self.thousand_genomes_client = ThousandGenomesClient()
        # Import here to avoid circular imports
        from .cbio_external_client import CBioPortalExternalClient

        self.cbioportal_client = CBioPortalExternalClient()
        self.oncokb_client = OncoKBClient()

    @staticmethod
    def _first_item(value: Any) -> Any:
        """Return the first element of a list, or ``value`` itself otherwise.

        An empty list yields ``None``. Used because annotation fields may be
        delivered either as a single value or as a list of values.
        """
        if isinstance(value, list):
            return value[0] if value else None
        return value

    @classmethod
    def _as_dict(cls, value: Any) -> dict[str, Any]:
        """Return ``value`` (or its first list element) if it is a dict, else ``{}``."""
        candidate = cls._first_item(value)
        return candidate if isinstance(candidate, dict) else {}

    @classmethod
    def _to_short_aa_change(cls, protein_change: str) -> str:
        """Normalise a protein change to the short form, e.g. ``V600E``.

        Strips any accession prefix ("NP_004324.2:"), the "p." marker and
        surrounding parentheses, then replaces every three-letter amino-acid
        code with its one-letter code ("Val600Glu" -> "V600E"). Text that is
        already in the short form is returned unchanged apart from the
        stripped prefix.
        """
        change = protein_change.rsplit(":", 1)[-1].replace("p.", "")
        change = change.strip("()")
        return cls._AA_TOKEN.sub(
            lambda m: cls._AA_THREE_TO_ONE.get(m.group(), m.group()), change
        )

    def _extract_gene_aa_change(
        self, variant_data: dict[str, Any]
    ) -> str | None:
        """Extract gene and AA change in format like 'BRAF V600A' from variant data.

        The gene name is taken from ``cadd.gene.genename``, then ``docm``
        (``gene`` or ``genename``), then ``dbnsfp.genename``. The protein
        change is taken from ``docm.aa_change``, then ``hgvsp``, then built
        from the CADD ``oaa``/``protpos``/``naa`` fields.

        Returns:
            ``"<gene> <change>"`` or ``None`` when either part is missing or
            the data cannot be interpreted.
        """
        logger.info("_extract_gene_aa_change called")
        try:
            first = self._first_item
            cadd = self._as_dict(variant_data.get("cadd"))
            cadd_gene = self._as_dict(cadd.get("gene"))
            docm = self._as_dict(variant_data.get("docm"))
            dbnsfp = self._as_dict(variant_data.get("dbnsfp"))

            # Gene name: CADD first, then docm, then dbnsfp
            gene_name = (
                first(cadd_gene.get("genename"))
                or first(docm.get("gene"))
                or first(docm.get("genename"))
                or first(dbnsfp.get("genename"))
            )
            if not gene_name:
                return None

            aa_change = None

            # Try to get from docm first (it has clean p.V600A format)
            docm_aa = first(docm.get("aa_change"))
            if isinstance(docm_aa, str) and docm_aa:
                aa_change = self._to_short_aa_change(docm_aa)

            # Try hgvsp if not found (accepted as a string or a list)
            if not aa_change:
                hgvsp = first(variant_data.get("hgvsp"))
                if isinstance(hgvsp, str) and hgvsp:
                    aa_change = self._to_short_aa_change(hgvsp)

            # Try CADD data
            if not aa_change:
                prot = self._as_dict(cadd_gene.get("prot"))
                protpos = prot.get("protpos")
                if protpos and cadd.get("oaa") and cadd.get("naa"):
                    aa_change = f"{cadd['oaa']}{protpos}{cadd['naa']}"

            if gene_name and aa_change:
                result = f"{gene_name} {aa_change}"
                logger.info(f"Extracted gene/AA change: {result}")
                return result

            logger.warning(
                f"Failed to extract gene/AA change: gene_name={gene_name}, aa_change={aa_change}"
            )
            return None
        except (
            KeyError,
            ValueError,
            TypeError,
            AttributeError,
            re.error,
        ) as e:
            # Log the error for debugging while gracefully handling data extraction issues
            # KeyError: Missing expected fields in variant data
            # ValueError: Invalid data format or conversion issues
            # TypeError: Unexpected data types in variant data
            # AttributeError: Missing attributes on data objects
            # re.error: Regular expression matching errors
            logger.warning(
                f"Failed to extract gene/AA change from variant data: {type(e).__name__}: {e}"
            )
            return None

    async def get_enhanced_annotations(
        self,
        variant_id: str,
        include_tcga: bool = True,
        include_1000g: bool = True,
        include_cbioportal: bool = True,
        include_oncokb: bool = True,
        variant_data: dict[str, Any] | None = None,
    ) -> EnhancedVariantAnnotation:
        """Fetch and aggregate variant annotations from external sources.

        All selected sources are queried concurrently. A source that raises
        is recorded in ``error_sources``; a source that returns ``None`` is
        simply left unset.

        Args:
            variant_id: The variant identifier (rsID or HGVS)
            include_tcga: Whether to include TCGA data
            include_1000g: Whether to include 1000 Genomes data
            include_cbioportal: Whether to include cBioPortal data
            include_oncokb: Whether to include OncoKB data
            variant_data: Optional variant data from MyVariant.info to extract gene/protein info

        Returns:
            The combined annotation for ``variant_id``.
        """
        logger.info(
            f"get_enhanced_annotations called for {variant_id}, include_cbioportal={include_cbioportal}"
        )
        # tasks[i] is the query whose result is stored under task_names[i]
        tasks: list[Any] = []
        task_names: list[str] = []

        # Extract gene/AA change once for sources that need it
        gene_aa_change = None
        if variant_data:
            logger.info(
                f"Extracting gene/AA from variant_data keys: {list(variant_data.keys())}"
            )
            gene_aa_change = self._extract_gene_aa_change(variant_data)
        else:
            logger.warning("No variant_data provided for gene/AA extraction")

        if include_tcga:
            # TCGA prefers the gene/AA form, falling back to the raw ID
            tcga_id = gene_aa_change if gene_aa_change else variant_id
            tasks.append(self.tcga_client.get_variant_data(tcga_id))
            task_names.append("tcga")

        if include_1000g:
            tasks.append(
                self.thousand_genomes_client.get_variant_data(variant_id)
            )
            task_names.append("thousand_genomes")

        if include_cbioportal:
            if gene_aa_change:
                # cBioPortal requires gene/AA format
                logger.info(
                    f"Adding cBioPortal task with gene_aa_change: {gene_aa_change}"
                )
                tasks.append(
                    self.cbioportal_client.get_variant_data(gene_aa_change)
                )
                task_names.append("cbioportal")
            else:
                logger.warning(
                    "Skipping cBioPortal: no gene/AA change could be extracted"
                )

        if include_oncokb:
            if gene_aa_change:
                # OncoKB requires gene/AA format
                logger.info(
                    f"Adding OncoKB task with gene_aa_change: {gene_aa_change}"
                )
                # Split gene_aa_change into gene and protein_change
                parts = gene_aa_change.split(" ", 1)
                if len(parts) == 2:
                    gene, protein_change = parts
                    tasks.append(
                        self._get_oncokb_variant_data(gene, protein_change)
                    )
                    task_names.append("oncokb")
            else:
                logger.warning(
                    "Skipping OncoKB: no gene/AA change could be extracted"
                )

        # Run all queries in parallel
        results = await asyncio.gather(*tasks, return_exceptions=True)

        # Build the enhanced annotation
        annotation = EnhancedVariantAnnotation(variant_id=variant_id)

        for result, name in zip(results, task_names, strict=False):
            if isinstance(result, Exception):
                annotation.error_sources.append(name)
            elif result is not None:
                setattr(annotation, name, result)
            # A None result means the source had no data for this variant

        return annotation

    async def _get_oncokb_variant_data(
        self, gene: str, protein_change: str
    ) -> OncoKBVariantData | None:
        """Get OncoKB variant data and convert to simplified format.

        Args:
            gene: Gene symbol (e.g., "BRAF")
            protein_change: Protein change notation (e.g., "V600E")

        Returns:
            OncoKBVariantData or None if unavailable
        """
        try:
            (
                annotation,
                error,
            ) = await self.oncokb_client.get_variant_annotation(
                gene, protein_change
            )

            if error or not annotation:
                logger.warning(
                    f"Failed to get OncoKB annotation for {gene} {protein_change}"
                )
                return None

            # Extract key fields into simplified model
            mutation_effect_obj = annotation.get("mutationEffect", {})
            mutation_effect = (
                mutation_effect_obj.get("knownEffect")
                if isinstance(mutation_effect_obj, dict)
                else None
            )

            # "or []" / bool(): a null list or flag in the response must not
            # discard the rest of the annotation
            return OncoKBVariantData(
                oncogenic=annotation.get("oncogenic"),
                mutation_effect=mutation_effect,
                highest_sensitive_level=annotation.get(
                    "highestSensitiveLevel"
                ),
                highest_resistance_level=annotation.get(
                    "highestResistanceLevel"
                ),
                treatments_count=len(annotation.get("treatments") or []),
                diagnostic_implications_count=len(
                    annotation.get("diagnosticImplications") or []
                ),
                prognostic_implications_count=len(
                    annotation.get("prognosticImplications") or []
                ),
                is_hotspot=bool(annotation.get("hotspot", False)),
            )

        except Exception as e:
            # Best effort: any failure here only drops the OncoKB source
            logger.warning(
                f"Error getting OncoKB data for {gene} {protein_change}: {e}"
            )
            return None
631 |
632 |
def format_enhanced_annotations(
    annotation: EnhancedVariantAnnotation,
) -> dict[str, Any]:
    """Render an enhanced annotation as a plain dictionary for display.

    The result has the keys ``variant_id`` and ``external_annotations``.
    The latter holds one entry per source that has data ("tcga",
    "1000_genomes", "cbioportal", "oncokb"), plus "errors" when any source
    failed. Optional cBioPortal and OncoKB details are included only when
    they are set or non-zero.
    """
    sources: dict[str, Any] = {}

    tcga = annotation.tcga
    if tcga:
        sources["tcga"] = {
            "tumor_types": tcga.tumor_types,
            "affected_cases": tcga.affected_cases,
            "cosmic_id": tcga.cosmic_id,
            "consequence": tcga.consequence_type,
        }

    genomes = annotation.thousand_genomes
    if genomes:
        per_population = {
            "african": genomes.afr_maf,
            "american": genomes.amr_maf,
            "east_asian": genomes.eas_maf,
            "european": genomes.eur_maf,
            "south_asian": genomes.sas_maf,
        }
        sources["1000_genomes"] = {
            "global_maf": genomes.global_maf,
            "population_frequencies": per_population,
            "ancestral_allele": genomes.ancestral_allele,
            "consequence": genomes.most_severe_consequence,
        }

    cbio = annotation.cbioportal
    if cbio:
        cbio_entry: dict[str, Any] = {
            "studies": cbio.studies,
            "total_cases": cbio.total_cases,
        }
        # (output key, value, include?) in output order
        cbio_optional = (
            (
                "cancer_types",
                cbio.cancer_type_distribution,
                bool(cbio.cancer_type_distribution),
            ),
            ("mutation_types", cbio.mutation_types, bool(cbio.mutation_types)),
            ("hotspot_samples", cbio.hotspot_count, cbio.hotspot_count > 0),
            ("mean_vaf", cbio.mean_vaf, cbio.mean_vaf is not None),
            ("sample_types", cbio.sample_types, bool(cbio.sample_types)),
        )
        for key, value, wanted in cbio_optional:
            if wanted:
                cbio_entry[key] = value
        sources["cbioportal"] = cbio_entry

    oncokb = annotation.oncokb
    if oncokb:
        oncokb_entry: dict[str, Any] = {
            "oncogenic": oncokb.oncogenic,
            "mutation_effect": oncokb.mutation_effect,
        }
        # Evidence levels and the hotspot flag are kept when truthy,
        # counts when greater than zero
        oncokb_optional = (
            (
                "highest_sensitive_level",
                oncokb.highest_sensitive_level,
                bool(oncokb.highest_sensitive_level),
            ),
            (
                "highest_resistance_level",
                oncokb.highest_resistance_level,
                bool(oncokb.highest_resistance_level),
            ),
            (
                "treatments_count",
                oncokb.treatments_count,
                oncokb.treatments_count > 0,
            ),
            (
                "diagnostic_implications_count",
                oncokb.diagnostic_implications_count,
                oncokb.diagnostic_implications_count > 0,
            ),
            (
                "prognostic_implications_count",
                oncokb.prognostic_implications_count,
                oncokb.prognostic_implications_count > 0,
            ),
            ("is_hotspot", oncokb.is_hotspot, bool(oncokb.is_hotspot)),
        )
        for key, value, wanted in oncokb_optional:
            if wanted:
                oncokb_entry[key] = value
        sources["oncokb"] = oncokb_entry

    if annotation.error_sources:
        sources["errors"] = annotation.error_sources

    return {
        "variant_id": annotation.variant_id,
        "external_annotations": sources,
    }
736 |
```
--------------------------------------------------------------------------------
/src/biomcp/utils/endpoint_registry.py:
--------------------------------------------------------------------------------
```python
1 | """Registry for tracking all external HTTP endpoints used by BioMCP."""
2 |
3 | from dataclasses import dataclass, field
4 | from enum import Enum
5 | from pathlib import Path
6 | from typing import Any
7 | from urllib.parse import urlparse
8 |
9 |
class EndpointCategory(str, Enum):
    """Categories of external endpoints.

    Members are also ``str`` instances, so each one compares equal to its
    string value.
    """

    BIOMEDICAL_LITERATURE = "biomedical_literature"
    CLINICAL_TRIALS = "clinical_trials"
    VARIANT_DATABASES = "variant_databases"
    CANCER_GENOMICS = "cancer_genomics"
    HEALTH_MONITORING = "health_monitoring"
    REGULATORY_DATA = "regulatory_data"
19 |
20 |
class DataType(str, Enum):
    """Types of data accessed from endpoints.

    Members are also ``str`` instances, so each one compares equal to its
    string value. An endpoint may be tagged with several data types.
    """

    RESEARCH_ARTICLES = "research_articles"
    CLINICAL_TRIAL_DATA = "clinical_trial_data"
    GENETIC_VARIANTS = "genetic_variants"
    CANCER_MUTATIONS = "cancer_mutations"
    GENE_ANNOTATIONS = "gene_annotations"
    SERVICE_STATUS = "service_status"
    ADVERSE_EVENTS = "adverse_events"
    DRUG_LABELS = "drug_labels"
    DEVICE_EVENTS = "device_events"
33 |
34 |
@dataclass
class EndpointInfo:
    """Descriptive record for one external HTTP endpoint.

    Attributes:
        url: Full endpoint URL.
        category: Broad category the endpoint belongs to.
        data_types: Kinds of data obtained from the endpoint.
        description: Human-readable summary of the endpoint.
        compliance_notes: Free-text compliance remarks.
        rate_limit: Free-text rate limit (e.g. "20 requests/second"), if known.
        authentication: Free-text authentication note, if any.
    """

    url: str
    category: EndpointCategory
    data_types: list[DataType] = field(default_factory=list)
    description: str = ""
    compliance_notes: str = ""
    rate_limit: str | None = None
    authentication: str | None = None

    @property
    def domain(self) -> str:
        """Network location (host and optional port) of ``url``."""
        return urlparse(self.url).netloc
52 |
53 |
54 | class EndpointRegistry:
55 | """Registry for tracking all external endpoints."""
56 |
57 | def __init__(self):
58 | self._endpoints: dict[str, EndpointInfo] = {}
59 | self._initialize_known_endpoints()
60 |
61 | def _initialize_known_endpoints(self):
62 | """Initialize registry with known endpoints."""
63 | # PubMed/PubTator3
64 | self.register(
65 | "pubtator3_search",
66 | EndpointInfo(
67 | url="https://www.ncbi.nlm.nih.gov/research/pubtator3-api/search/",
68 | category=EndpointCategory.BIOMEDICAL_LITERATURE,
69 | data_types=[DataType.RESEARCH_ARTICLES],
70 | description="PubTator3 API for searching biomedical literature with entity annotations",
71 | compliance_notes="Public NIH/NCBI service, no PII transmitted",
72 | rate_limit="20 requests/second",
73 | ),
74 | )
75 |
76 | self.register(
77 | "pubtator3_export",
78 | EndpointInfo(
79 | url="https://www.ncbi.nlm.nih.gov/research/pubtator3-api/publications/export/biocjson",
80 | category=EndpointCategory.BIOMEDICAL_LITERATURE,
81 | data_types=[DataType.RESEARCH_ARTICLES],
82 | description="PubTator3 API for fetching full article annotations in BioC-JSON format",
83 | compliance_notes="Public NIH/NCBI service, no PII transmitted",
84 | rate_limit="20 requests/second",
85 | ),
86 | )
87 |
88 | self.register(
89 | "pubtator3_autocomplete",
90 | EndpointInfo(
91 | url="https://www.ncbi.nlm.nih.gov/research/pubtator3-api/entity/autocomplete/",
92 | category=EndpointCategory.BIOMEDICAL_LITERATURE,
93 | data_types=[DataType.GENE_ANNOTATIONS],
94 | description="PubTator3 API for entity name autocomplete suggestions",
95 | compliance_notes="Public NIH/NCBI service, no PII transmitted",
96 | rate_limit="20 requests/second",
97 | ),
98 | )
99 |
100 | # ClinicalTrials.gov
101 | self.register(
102 | "clinicaltrials_search",
103 | EndpointInfo(
104 | url="https://clinicaltrials.gov/api/v2/studies",
105 | category=EndpointCategory.CLINICAL_TRIALS,
106 | data_types=[DataType.CLINICAL_TRIAL_DATA],
107 | description="ClinicalTrials.gov API v2 for searching clinical trials",
108 | compliance_notes="Public NIH service, may contain trial participant criteria",
109 | rate_limit="10 requests/second",
110 | ),
111 | )
112 |
113 | # MyVariant.info
114 | self.register(
115 | "myvariant_query",
116 | EndpointInfo(
117 | url="https://myvariant.info/v1/query",
118 | category=EndpointCategory.VARIANT_DATABASES,
119 | data_types=[DataType.GENETIC_VARIANTS],
120 | description="MyVariant.info API for querying genetic variants",
121 | compliance_notes="Public service aggregating variant databases, no patient data",
122 | rate_limit="1000 requests/hour (anonymous)",
123 | ),
124 | )
125 |
126 | self.register(
127 | "myvariant_variant",
128 | EndpointInfo(
129 | url="https://myvariant.info/v1/variant",
130 | category=EndpointCategory.VARIANT_DATABASES,
131 | data_types=[DataType.GENETIC_VARIANTS],
132 | description="MyVariant.info API for fetching specific variant details",
133 | compliance_notes="Public service aggregating variant databases, no patient data",
134 | rate_limit="1000 requests/hour (anonymous)",
135 | ),
136 | )
137 |
138 | # Preprint servers
139 | self.register(
140 | "biorxiv_api",
141 | EndpointInfo(
142 | url="https://api.biorxiv.org/details/biorxiv",
143 | category=EndpointCategory.BIOMEDICAL_LITERATURE,
144 | data_types=[DataType.RESEARCH_ARTICLES],
145 | description="bioRxiv API for searching biology preprints",
146 | compliance_notes="Public preprint server, no PII transmitted",
147 | rate_limit="Not specified",
148 | ),
149 | )
150 |
151 | self.register(
152 | "medrxiv_api",
153 | EndpointInfo(
154 | url="https://api.biorxiv.org/details/medrxiv",
155 | category=EndpointCategory.BIOMEDICAL_LITERATURE,
156 | data_types=[DataType.RESEARCH_ARTICLES],
157 | description="medRxiv API for searching medical preprints",
158 | compliance_notes="Public preprint server, no PII transmitted",
159 | rate_limit="Not specified",
160 | ),
161 | )
162 |
163 | self.register(
164 | "europe_pmc",
165 | EndpointInfo(
166 | url="https://www.ebi.ac.uk/europepmc/webservices/rest/search",
167 | category=EndpointCategory.BIOMEDICAL_LITERATURE,
168 | data_types=[DataType.RESEARCH_ARTICLES],
169 | description="Europe PMC REST API for searching biomedical literature",
170 | compliance_notes="Public EMBL-EBI service, no PII transmitted",
171 | rate_limit="Not specified",
172 | ),
173 | )
174 |
175 | # External variant sources
176 | self.register(
177 | "gdc_ssms",
178 | EndpointInfo(
179 | url="https://api.gdc.cancer.gov/ssms",
180 | category=EndpointCategory.VARIANT_DATABASES,
181 | data_types=[DataType.CANCER_MUTATIONS],
182 | description="NCI GDC API for somatic mutations",
183 | compliance_notes="Public NCI service, aggregate cancer genomics data",
184 | rate_limit="Not specified",
185 | ),
186 | )
187 |
188 | self.register(
189 | "gdc_ssm_occurrences",
190 | EndpointInfo(
191 | url="https://api.gdc.cancer.gov/ssm_occurrences",
192 | category=EndpointCategory.VARIANT_DATABASES,
193 | data_types=[DataType.CANCER_MUTATIONS],
194 | description="NCI GDC API for mutation occurrences in cancer samples",
195 | compliance_notes="Public NCI service, aggregate cancer genomics data",
196 | rate_limit="Not specified",
197 | ),
198 | )
199 |
200 | self.register(
201 | "ensembl_variation",
202 | EndpointInfo(
203 | url="https://rest.ensembl.org/variation/human",
204 | category=EndpointCategory.VARIANT_DATABASES,
205 | data_types=[DataType.GENETIC_VARIANTS],
206 | description="Ensembl REST API for human genetic variation data",
207 | compliance_notes="Public EMBL-EBI service, population genetics data",
208 | rate_limit="15 requests/second",
209 | ),
210 | )
211 |
212 | self.register(
213 | "cbioportal_api",
214 | EndpointInfo(
215 | url="https://www.cbioportal.org/api",
216 | category=EndpointCategory.CANCER_GENOMICS,
217 | data_types=[
218 | DataType.CANCER_MUTATIONS,
219 | DataType.CLINICAL_TRIAL_DATA,
220 | ],
221 | description="cBioPortal API for cancer genomics data",
222 | compliance_notes="Public MSKCC/Dana-Farber service, aggregate cancer genomics",
223 | rate_limit="5 requests/second",
224 | authentication="Optional API token for increased rate limits",
225 | ),
226 | )
227 |
228 | # Specific cBioPortal endpoints
229 | self.register(
230 | "cbioportal_genes",
231 | EndpointInfo(
232 | url="https://www.cbioportal.org/api/genes",
233 | category=EndpointCategory.CANCER_GENOMICS,
234 | data_types=[DataType.GENE_ANNOTATIONS],
235 | description="cBioPortal API for gene information",
236 | compliance_notes="Public MSKCC/Dana-Farber service, gene metadata",
237 | rate_limit="5 requests/second",
238 | ),
239 | )
240 |
241 | self.register(
242 | "cbioportal_cancer_types",
243 | EndpointInfo(
244 | url="https://www.cbioportal.org/api/cancer-types",
245 | category=EndpointCategory.CANCER_GENOMICS,
246 | data_types=[DataType.CANCER_MUTATIONS],
247 | description="cBioPortal API for cancer type hierarchy",
248 | compliance_notes="Public MSKCC/Dana-Farber service, cancer type metadata",
249 | rate_limit="5 requests/second",
250 | ),
251 | )
252 |
253 | self.register(
254 | "cbioportal_molecular_profiles",
255 | EndpointInfo(
256 | url="https://www.cbioportal.org/api/molecular-profiles",
257 | category=EndpointCategory.CANCER_GENOMICS,
258 | data_types=[DataType.CANCER_MUTATIONS],
259 | description="cBioPortal API for molecular profiles",
260 | compliance_notes="Public MSKCC/Dana-Farber service, study metadata",
261 | rate_limit="5 requests/second",
262 | ),
263 | )
264 |
265 | self.register(
266 | "cbioportal_mutations",
267 | EndpointInfo(
268 | url="https://www.cbioportal.org/api/mutations",
269 | category=EndpointCategory.CANCER_GENOMICS,
270 | data_types=[DataType.CANCER_MUTATIONS],
271 | description="cBioPortal API for mutation data",
272 | compliance_notes="Public MSKCC/Dana-Farber service, aggregate mutation data",
273 | rate_limit="5 requests/second",
274 | ),
275 | )
276 |
277 | self.register(
278 | "cbioportal_studies",
279 | EndpointInfo(
280 | url="https://www.cbioportal.org/api/studies",
281 | category=EndpointCategory.CANCER_GENOMICS,
282 | data_types=[
283 | DataType.CLINICAL_TRIAL_DATA,
284 | DataType.CANCER_MUTATIONS,
285 | ],
286 | description="cBioPortal API for cancer studies",
287 | compliance_notes="Public MSKCC/Dana-Farber service, study metadata",
288 | rate_limit="5 requests/second",
289 | ),
290 | )
291 |
292 | # OncoKB API endpoints (demo server by default)
293 | self.register(
294 | "oncokb_curated_genes",
295 | EndpointInfo(
296 | url="https://demo.oncokb.org/api/v1/utils/allCuratedGenes",
297 | category=EndpointCategory.CANCER_GENOMICS,
298 | data_types=[DataType.GENE_ANNOTATIONS],
299 | description="OncoKB demo API for retrieving curated cancer genes (BRAF, ROS1, TP53)",
300 | compliance_notes="Public MSK OncoKB demo service, no authentication required. Production (www.oncokb.org) requires ONCOKB_TOKEN.",
301 | rate_limit="Not specified",
302 | authentication="None (demo server). Set ONCOKB_TOKEN for production access.",
303 | ),
304 | )
305 |
306 | self.register(
307 | "oncokb_gene_annotation",
308 | EndpointInfo(
309 | url="https://demo.oncokb.org/api/v1/genes/{gene}",
310 | category=EndpointCategory.CANCER_GENOMICS,
311 | data_types=[DataType.GENE_ANNOTATIONS],
312 | description="OncoKB demo API for gene-level annotations and therapeutic implications",
313 | compliance_notes="Public MSK OncoKB demo service, limited to 3 genes. Production (www.oncokb.org) requires ONCOKB_TOKEN.",
314 | rate_limit="Not specified",
315 | authentication="None (demo server). Set ONCOKB_TOKEN for production access.",
316 | ),
317 | )
318 |
319 | self.register(
320 | "oncokb_variant_annotation",
321 | EndpointInfo(
322 | url="https://demo.oncokb.org/api/v1/annotate/mutations/byProteinChange",
323 | category=EndpointCategory.CANCER_GENOMICS,
324 | data_types=[
325 | DataType.GENE_ANNOTATIONS,
326 | DataType.CANCER_MUTATIONS,
327 | ],
328 | description="OncoKB demo API for variant-level annotations with clinical actionability",
329 | compliance_notes="Public MSK OncoKB demo service, works for BRAF/ROS1/TP53 variants. Production (www.oncokb.org) requires ONCOKB_TOKEN.",
330 | rate_limit="Not specified",
331 | authentication="None (demo server). Set ONCOKB_TOKEN for production access.",
332 | ),
333 | )
334 |
335 | # BioThings Suite APIs
336 | self.register(
337 | "mygene_query",
338 | EndpointInfo(
339 | url="https://mygene.info/v3/query",
340 | category=EndpointCategory.VARIANT_DATABASES,
341 | data_types=[DataType.GENE_ANNOTATIONS],
342 | description="MyGene.info API for querying gene information",
343 | compliance_notes="Public BioThings service, gene annotation data",
344 | rate_limit="10 requests/second",
345 | ),
346 | )
347 |
348 | self.register(
349 | "mygene_gene",
350 | EndpointInfo(
351 | url="https://mygene.info/v3/gene",
352 | category=EndpointCategory.VARIANT_DATABASES,
353 | data_types=[DataType.GENE_ANNOTATIONS],
354 | description="MyGene.info API for fetching specific gene details",
355 | compliance_notes="Public BioThings service, gene annotation data",
356 | rate_limit="10 requests/second",
357 | ),
358 | )
359 |
360 | self.register(
361 | "mydisease_query",
362 | EndpointInfo(
363 | url="https://mydisease.info/v1/query",
364 | category=EndpointCategory.VARIANT_DATABASES,
365 | data_types=[DataType.GENE_ANNOTATIONS],
366 | description="MyDisease.info API for querying disease information",
367 | compliance_notes="Public BioThings service, disease ontology data",
368 | rate_limit="10 requests/second",
369 | ),
370 | )
371 |
372 | self.register(
373 | "mydisease_disease",
374 | EndpointInfo(
375 | url="https://mydisease.info/v1/disease",
376 | category=EndpointCategory.VARIANT_DATABASES,
377 | data_types=[DataType.GENE_ANNOTATIONS],
378 | description="MyDisease.info API for fetching specific disease details",
379 | compliance_notes="Public BioThings service, disease ontology data",
380 | rate_limit="10 requests/second",
381 | ),
382 | )
383 |
384 | self.register(
385 | "mychem_query",
386 | EndpointInfo(
387 | url="https://mychem.info/v1/query",
388 | category=EndpointCategory.VARIANT_DATABASES,
389 | data_types=[DataType.GENE_ANNOTATIONS],
390 | description="MyChem.info API for querying drug/chemical information",
391 | compliance_notes="Public BioThings service, drug/chemical annotation data",
392 | rate_limit="10 requests/second",
393 | ),
394 | )
395 |
396 | self.register(
397 | "mychem_chem",
398 | EndpointInfo(
399 | url="https://mychem.info/v1/chem",
400 | category=EndpointCategory.VARIANT_DATABASES,
401 | data_types=[DataType.GENE_ANNOTATIONS],
402 | description="MyChem.info API for fetching specific drug/chemical details",
403 | compliance_notes="Public BioThings service, drug/chemical annotation data",
404 | rate_limit="10 requests/second",
405 | ),
406 | )
407 |
408 | # NCI Clinical Trials Search API
409 | self.register(
410 | "nci_trials",
411 | EndpointInfo(
412 | url="https://clinicaltrialsapi.cancer.gov/api/v2/trials",
413 | category=EndpointCategory.CLINICAL_TRIALS,
414 | data_types=[DataType.CLINICAL_TRIAL_DATA],
415 | description="NCI Clinical Trials Search API for cancer trials",
416 | compliance_notes="Public NCI service, cancer trial data",
417 | rate_limit="Not specified",
418 | authentication="Optional NCI_API_KEY for increased access",
419 | ),
420 | )
421 |
422 | self.register(
423 | "nci_organizations",
424 | EndpointInfo(
425 | url="https://clinicaltrialsapi.cancer.gov/api/v2/organizations",
426 | category=EndpointCategory.CLINICAL_TRIALS,
427 | data_types=[DataType.CLINICAL_TRIAL_DATA],
428 | description="NCI API for cancer research organizations",
429 | compliance_notes="Public NCI service, organization metadata",
430 | rate_limit="Not specified",
431 | authentication="Optional NCI_API_KEY for increased access",
432 | ),
433 | )
434 |
435 | self.register(
436 | "nci_diseases",
437 | EndpointInfo(
438 | url="https://clinicaltrialsapi.cancer.gov/api/v2/diseases",
439 | category=EndpointCategory.CLINICAL_TRIALS,
440 | data_types=[DataType.CLINICAL_TRIAL_DATA],
441 | description="NCI API for cancer disease vocabulary",
442 | compliance_notes="Public NCI service, disease ontology",
443 | rate_limit="Not specified",
444 | authentication="Optional NCI_API_KEY for increased access",
445 | ),
446 | )
447 |
448 | self.register(
449 | "nci_interventions",
450 | EndpointInfo(
451 | url="https://clinicaltrialsapi.cancer.gov/api/v2/interventions",
452 | category=EndpointCategory.CLINICAL_TRIALS,
453 | data_types=[DataType.CLINICAL_TRIAL_DATA],
454 | description="NCI API for cancer treatment interventions",
455 | compliance_notes="Public NCI service, intervention metadata",
456 | rate_limit="Not specified",
457 | authentication="Optional NCI_API_KEY for increased access",
458 | ),
459 | )
460 |
461 | self.register(
462 | "nci_biomarkers",
463 | EndpointInfo(
464 | url="https://clinicaltrialsapi.cancer.gov/api/v2/biomarkers",
465 | category=EndpointCategory.CLINICAL_TRIALS,
466 | data_types=[DataType.CLINICAL_TRIAL_DATA],
467 | description="NCI API for biomarkers used in clinical trials",
468 | compliance_notes="Public NCI service, biomarker metadata",
469 | rate_limit="Not specified",
470 | authentication="Optional NCI_API_KEY for increased access",
471 | ),
472 | )
473 |
474 | # OpenFDA APIs
475 | self.register(
476 | "openfda_drug_events",
477 | EndpointInfo(
478 | url="https://api.fda.gov/drug/event.json",
479 | category=EndpointCategory.REGULATORY_DATA,
480 | data_types=[DataType.ADVERSE_EVENTS],
481 | description="FDA Adverse Event Reporting System (FAERS) for drug safety data",
482 | compliance_notes="Public FDA service, voluntary adverse event reports, no PII",
483 | rate_limit="40 requests/minute (240 with API key)",
484 | authentication="Optional OPENFDA_API_KEY for increased rate limits",
485 | ),
486 | )
487 |
488 | self.register(
489 | "openfda_drug_labels",
490 | EndpointInfo(
491 | url="https://api.fda.gov/drug/label.json",
492 | category=EndpointCategory.REGULATORY_DATA,
493 | data_types=[DataType.DRUG_LABELS],
494 | description="FDA Structured Product Labeling (SPL) for drug prescribing information",
495 | compliance_notes="Public FDA service, official drug labeling data",
496 | rate_limit="40 requests/minute (240 with API key)",
497 | authentication="Optional OPENFDA_API_KEY for increased rate limits",
498 | ),
499 | )
500 |
501 | self.register(
502 | "openfda_device_events",
503 | EndpointInfo(
504 | url="https://api.fda.gov/device/event.json",
505 | category=EndpointCategory.REGULATORY_DATA,
506 | data_types=[DataType.DEVICE_EVENTS],
507 | description="FDA MAUDE database for medical device adverse events",
508 | compliance_notes="Public FDA service, device malfunction and adverse event reports",
509 | rate_limit="40 requests/minute (240 with API key)",
510 | authentication="Optional OPENFDA_API_KEY for increased rate limits",
511 | ),
512 | )
513 |
514 | self.register(
515 | "openfda_drugsfda",
516 | EndpointInfo(
517 | url="https://api.fda.gov/drug/drugsfda.json",
518 | category=EndpointCategory.REGULATORY_DATA,
519 | data_types=[DataType.DRUG_LABELS],
520 | description="FDA Drugs@FDA database for drug approval information",
521 | compliance_notes="Public FDA service, official drug approval records",
522 | rate_limit="40 requests/minute (240 with API key)",
523 | authentication="Optional OPENFDA_API_KEY for increased rate limits",
524 | ),
525 | )
526 |
527 | self.register(
528 | "openfda_drug_enforcement",
529 | EndpointInfo(
530 | url="https://api.fda.gov/drug/enforcement.json",
531 | category=EndpointCategory.REGULATORY_DATA,
532 | data_types=[DataType.ADVERSE_EVENTS],
533 | description="FDA Enforcement database for drug recall information",
534 | compliance_notes="Public FDA service, drug recall and enforcement actions",
535 | rate_limit="40 requests/minute (240 with API key)",
536 | authentication="Optional OPENFDA_API_KEY for increased rate limits",
537 | ),
538 | )
539 |
540 | # Note: Drug shortage endpoint is not yet available via OpenFDA
541 | # Using placeholder for future migration when FDA provides official endpoint
542 | self.register(
543 | "fda_drug_shortages",
544 | EndpointInfo(
545 | url="https://www.fda.gov/media/169066/download",
546 | category=EndpointCategory.REGULATORY_DATA,
547 | data_types=[DataType.DRUG_LABELS],
548 | description="FDA Drug Shortages database (cached locally)",
549 | compliance_notes="Public FDA service, drug shortage status information",
550 | rate_limit="Cached with 24-hour TTL",
551 | authentication="None required",
552 | ),
553 | )
554 |
def register(self, key: str, endpoint: EndpointInfo):
    """Store an endpoint in the registry under the given key.

    Registering a key that is already present replaces the earlier entry.

    Args:
        key: Identifier the endpoint is stored under
        endpoint: Metadata object describing the endpoint
    """
    registry = self._endpoints
    registry[key] = endpoint
563 |
def get_all_endpoints(self) -> dict[str, EndpointInfo]:
    """Return a shallow copy of the endpoint registry.

    Returns:
        A new dictionary mapping endpoint keys to their metadata; adding
        or removing keys on it does not affect the registry itself.
    """
    return dict(self._endpoints)
571 |
def get_endpoints_by_category(
    self, category: EndpointCategory
) -> dict[str, EndpointInfo]:
    """Select the registered endpoints whose category equals `category`.

    Args:
        category: The category to match against each endpoint

    Returns:
        Dictionary of matching endpoints, keyed by endpoint key, in
        registration order (empty when nothing matches)
    """
    matching: dict[str, EndpointInfo] = {}
    for name, endpoint in self._endpoints.items():
        if endpoint.category == category:
            matching[name] = endpoint
    return matching
588 |
def get_unique_domains(self) -> set[str]:
    """Collect the distinct `domain` values of all registered endpoints.

    Returns:
        Set of domain names (e.g., 'api.ncbi.nlm.nih.gov'); endpoints
        sharing a domain contribute a single entry.
    """
    domains: set[str] = set()
    for endpoint in self._endpoints.values():
        domains.add(endpoint.domain)
    return domains
596 |
def generate_markdown_report(self) -> str:
    """Render the endpoint registry as a markdown document.

    The report has an overview line, one section per non-empty
    category (endpoints sorted by key), a per-domain summary table,
    and fixed compliance / network-control sections.

    Returns:
        The full report as a single newline-joined string.
    """
    report = [
        "# Third-Party Endpoints Used by BioMCP",
        "",
        "_This file is auto-generated from the endpoint registry._",
        "",
        "## Overview",
        "",
        f"BioMCP connects to {len(self.get_unique_domains())} external domains across {len(self._endpoints)} endpoints.",
        "",
        "## Endpoints by Category",
        "",
    ]

    # One section per category, in enum declaration order; categories
    # without any registered endpoint are left out entirely.
    for category in EndpointCategory:
        in_category = self.get_endpoints_by_category(category)
        if in_category:
            heading = category.value.replace("_", " ").title()
            report += [f"### {heading}", ""]

            for key in sorted(in_category):
                info = in_category[key]
                report += [
                    f"#### {key}",
                    "",
                    f"- **URL**: `{info.url}`",
                    f"- **Description**: {info.description}",
                    f"- **Data Types**: {', '.join(dt.value for dt in info.data_types)}",
                    f"- **Rate Limit**: {info.rate_limit or 'Not specified'}",
                ]

                # Optional bullets appear only when the value is truthy.
                if info.authentication:
                    report.append(
                        f"- **Authentication**: {info.authentication}"
                    )
                if info.compliance_notes:
                    report.append(
                        f"- **Compliance Notes**: {info.compliance_notes}"
                    )

                report.append("")

    # Domain summary table
    report += [
        "## Domain Summary",
        "",
        "| Domain | Category | Endpoints |",
        "| -------------------- | --------------------- | --------- |",
    ]

    # A domain is labelled with the category of the first endpoint
    # registered for it; the count covers every endpoint on that domain.
    first_category: dict[str, str] = {}
    endpoint_counts: dict[str, int] = {}
    for info in self._endpoints.values():
        domain = info.domain
        if domain not in first_category:
            first_category[domain] = info.category.value
        endpoint_counts[domain] = endpoint_counts.get(domain, 0) + 1

    for domain in sorted(endpoint_counts):
        report.append(
            f"| {domain} | {first_category[domain]} | {endpoint_counts[domain]} |"
        )

    report += [
        "",
        "## Compliance and Privacy",
        "",
        "All endpoints accessed by BioMCP:",
        "",
        "- Use publicly available APIs",
        "- Do not transmit personally identifiable information (PII)",
        "- Access only aggregate or de-identified data",
        "- Comply with respective terms of service",
        "",
        "## Network Control",
        "",
        "For air-gapped or restricted environments, BioMCP supports:",
        "",
        "- Offline mode via `BIOMCP_OFFLINE=true` environment variable",
        "- Custom proxy configuration via standard HTTP(S)\\_PROXY variables",
        "- SSL certificate pinning for enhanced security",
        "",
    ]

    return "\n".join(report)
692 |
def save_markdown_report(self, output_path: Path | None = None) -> Path:
    """Write the markdown endpoint report to disk.

    Args:
        output_path: Destination file. When None, the report is written
            to THIRD_PARTY_ENDPOINTS.md three directory levels above
            this module's file.

    Returns:
        The path the report was written to.
    """
    if output_path is None:
        output_path = (
            Path(__file__).parent.parent.parent
            / "THIRD_PARTY_ENDPOINTS.md"
        )

    # Pin the encoding: without it write_text() uses the locale's
    # preferred encoding, so the generated file could differ between
    # platforms or fail on non-ASCII endpoint metadata.
    output_path.write_text(self.generate_markdown_report(), encoding="utf-8")
    return output_path
703 |
704 |
# Module-level registry instance, constructed once when this module is
# imported and handed out by get_registry().
_registry = EndpointRegistry()


def get_registry() -> EndpointRegistry:
    """Return the module-level endpoint registry.

    Returns:
        The single EndpointRegistry instance created at import time; the
        same object is returned on every call.
    """
    return _registry
712 |
```
--------------------------------------------------------------------------------
/src/biomcp/trials/search.py:
--------------------------------------------------------------------------------
```python
1 | import json
2 | import logging
3 | from ssl import TLSVersion
4 | from typing import Annotated
5 |
6 | from pydantic import BaseModel, Field, field_validator, model_validator
7 |
8 | from .. import StrEnum, ensure_list, http_client, render
9 | from ..constants import CLINICAL_TRIALS_BASE_URL
10 | from ..integrations import BioThingsClient
11 |
12 | logger = logging.getLogger(__name__)
13 |
14 |
class SortOrder(StrEnum):
    """Sort orders selectable for a trial search.

    Every member has a corresponding entry in CTGOV_SORT_MAPPING.
    """

    RELEVANCE = "RELEVANCE"
    LAST_UPDATE = "LAST_UPDATE"
    ENROLLMENT = "ENROLLMENT"
    START_DATE = "START_DATE"
    COMPLETION_DATE = "COMPLETION_DATE"
    SUBMITTED_DATE = "SUBMITTED_DATE"
22 |
23 |
class TrialPhase(StrEnum):
    """Trial phase filter values.

    Every member has a corresponding entry in CTGOV_PHASE_MAPPING.
    """

    EARLY_PHASE1 = "EARLY_PHASE1"
    PHASE1 = "PHASE1"
    PHASE2 = "PHASE2"
    PHASE3 = "PHASE3"
    PHASE4 = "PHASE4"
    NOT_APPLICABLE = "NOT_APPLICABLE"
31 |
32 |
class RecruitingStatus(StrEnum):
    """Coarse recruitment-status filter.

    CTGOV_RECRUITING_STATUS_MAPPING expands OPEN and CLOSED into groups
    of status strings and maps ANY to None.
    """

    OPEN = "OPEN"
    CLOSED = "CLOSED"
    ANY = "ANY"
37 |
38 |
class StudyType(StrEnum):
    """Study type filter values (see CTGOV_STUDY_TYPE_MAPPING)."""

    INTERVENTIONAL = "INTERVENTIONAL"
    OBSERVATIONAL = "OBSERVATIONAL"
    EXPANDED_ACCESS = "EXPANDED_ACCESS"
    OTHER = "OTHER"
44 |
45 |
class InterventionType(StrEnum):
    """Intervention type filter values (see CTGOV_INTERVENTION_TYPE_MAPPING)."""

    DRUG = "DRUG"
    DEVICE = "DEVICE"
    BIOLOGICAL = "BIOLOGICAL"
    PROCEDURE = "PROCEDURE"
    RADIATION = "RADIATION"
    BEHAVIORAL = "BEHAVIORAL"
    GENETIC = "GENETIC"
    DIETARY = "DIETARY"
    DIAGNOSTIC_TEST = "DIAGNOSTIC_TEST"
    OTHER = "OTHER"
57 |
58 |
class SponsorType(StrEnum):
    """Sponsor type filter values (see CTGOV_SPONSOR_TYPE_MAPPING)."""

    INDUSTRY = "INDUSTRY"
    GOVERNMENT = "GOVERNMENT"
    ACADEMIC = "ACADEMIC"
    OTHER = "OTHER"
64 |
65 |
class StudyDesign(StrEnum):
    """Study design filter values (see CTGOV_STUDY_DESIGN_MAPPING)."""

    RANDOMIZED = "RANDOMIZED"
    NON_RANDOMIZED = "NON_RANDOMIZED"
    OBSERVATIONAL = "OBSERVATIONAL"
70 |
71 |
class DateField(StrEnum):
    """Date fields a date-range filter can target.

    Every member has a corresponding field name in
    CTGOV_DATE_FIELD_MAPPING.
    """

    LAST_UPDATE = "LAST_UPDATE"
    STUDY_START = "STUDY_START"
    PRIMARY_COMPLETION = "PRIMARY_COMPLETION"
    OUTCOME_POSTING = "OUTCOME_POSTING"
    COMPLETION = "COMPLETION"
    FIRST_POSTING = "FIRST_POSTING"
    SUBMITTED_DATE = "SUBMITTED_DATE"
80 |
81 |
class PrimaryPurpose(StrEnum):
    """Primary purpose filter values (see CTGOV_PRIMARY_PURPOSE_MAPPING)."""

    TREATMENT = "TREATMENT"
    PREVENTION = "PREVENTION"
    DIAGNOSTIC = "DIAGNOSTIC"
    SUPPORTIVE_CARE = "SUPPORTIVE_CARE"
    SCREENING = "SCREENING"
    HEALTH_SERVICES = "HEALTH_SERVICES"
    BASIC_SCIENCE = "BASIC_SCIENCE"
    DEVICE_FEASIBILITY = "DEVICE_FEASIBILITY"
    OTHER = "OTHER"
92 |
93 |
class AgeGroup(StrEnum):
    """Age group filter values.

    CTGOV_AGE_GROUP_MAPPING maps CHILD, ADULT and SENIOR to label tuples
    and ALL to None.
    """

    CHILD = "CHILD"
    ADULT = "ADULT"
    SENIOR = "SENIOR"
    ALL = "ALL"
99 |
100 |
class LineOfTherapy(StrEnum):
    """Line-of-therapy filter values, using short codes as their values.

    Each member keys a list of search phrases in LINE_OF_THERAPY_PATTERNS.
    """

    FIRST_LINE = "1L"
    SECOND_LINE = "2L"
    THIRD_LINE_PLUS = "3L+"
105 |
106 |
# Maps each SortOrder member to a sort expression string; all orders other
# than relevance sort descending (":desc" suffix).
# NOTE(review): START_DATE uses "StudyStartDate" here, while
# CTGOV_DATE_FIELD_MAPPING uses "StartDate" for DateField.STUDY_START —
# confirm both field names are accepted by the ClinicalTrials.gov API.
CTGOV_SORT_MAPPING = {
    SortOrder.RELEVANCE: "@relevance",
    SortOrder.LAST_UPDATE: "LastUpdatePostDate:desc",
    SortOrder.ENROLLMENT: "EnrollmentCount:desc",
    SortOrder.START_DATE: "StudyStartDate:desc",
    SortOrder.COMPLETION_DATE: "PrimaryCompletionDate:desc",
    SortOrder.SUBMITTED_DATE: "StudyFirstSubmitDate:desc",
}

# Maps each TrialPhase member to a one-element tuple holding the same
# string as the member's value.
CTGOV_PHASE_MAPPING = {
    TrialPhase.EARLY_PHASE1: ("EARLY_PHASE1",),
    TrialPhase.PHASE1: ("PHASE1",),
    TrialPhase.PHASE2: ("PHASE2",),
    TrialPhase.PHASE3: ("PHASE3",),
    TrialPhase.PHASE4: ("PHASE4",),
    TrialPhase.NOT_APPLICABLE: ("NOT_APPLICABLE",),
}
124 |
# Status strings grouped under RecruitingStatus.OPEN.
OPEN_STATUSES = (
    "AVAILABLE",
    "ENROLLING_BY_INVITATION",
    "NOT_YET_RECRUITING",
    "RECRUITING",
)
# Status strings grouped under RecruitingStatus.CLOSED.
CLOSED_STATUSES = (
    "ACTIVE_NOT_RECRUITING",
    "COMPLETED",
    "SUSPENDED",
    "TERMINATED",
    "WITHDRAWN",
)
# Expands the coarse RecruitingStatus values into the status groups above;
# ANY maps to None (presumably meaning "apply no status filter" — confirm
# against the code that consumes this mapping).
CTGOV_RECRUITING_STATUS_MAPPING = {
    RecruitingStatus.OPEN: OPEN_STATUSES,
    RecruitingStatus.CLOSED: CLOSED_STATUSES,
    RecruitingStatus.ANY: None,
}
143 |
# The mappings below translate enum members into human-readable labels.
# Except for CTGOV_DATE_FIELD_MAPPING (plain field-name strings) each value
# is a one-element tuple, or None where no label applies.

# StudyType -> study type label.
CTGOV_STUDY_TYPE_MAPPING = {
    StudyType.INTERVENTIONAL: ("Interventional",),
    StudyType.OBSERVATIONAL: ("Observational",),
    StudyType.EXPANDED_ACCESS: ("Expanded Access",),
    StudyType.OTHER: ("Other",),
}

# InterventionType -> intervention type label.
CTGOV_INTERVENTION_TYPE_MAPPING = {
    InterventionType.DRUG: ("Drug",),
    InterventionType.DEVICE: ("Device",),
    InterventionType.BIOLOGICAL: ("Biological",),
    InterventionType.PROCEDURE: ("Procedure",),
    InterventionType.RADIATION: ("Radiation",),
    InterventionType.BEHAVIORAL: ("Behavioral",),
    InterventionType.GENETIC: ("Genetic",),
    InterventionType.DIETARY: ("Dietary",),
    InterventionType.DIAGNOSTIC_TEST: ("Diagnostic Test",),
    InterventionType.OTHER: ("Other",),
}

# SponsorType -> sponsor type label.
CTGOV_SPONSOR_TYPE_MAPPING = {
    SponsorType.INDUSTRY: ("Industry",),
    SponsorType.GOVERNMENT: ("Government",),
    SponsorType.ACADEMIC: ("Academic",),
    SponsorType.OTHER: ("Other",),
}

# StudyDesign -> study design label.
CTGOV_STUDY_DESIGN_MAPPING = {
    StudyDesign.RANDOMIZED: ("Randomized",),
    StudyDesign.NON_RANDOMIZED: ("Non-Randomized",),
    StudyDesign.OBSERVATIONAL: ("Observational",),
}

# DateField -> date field name (a plain string, not a tuple).
CTGOV_DATE_FIELD_MAPPING = {
    DateField.LAST_UPDATE: "LastUpdatePostDate",
    DateField.STUDY_START: "StartDate",
    DateField.PRIMARY_COMPLETION: "PrimaryCompletionDate",
    DateField.OUTCOME_POSTING: "ResultsFirstPostDate",
    DateField.COMPLETION: "CompletionDate",
    DateField.FIRST_POSTING: "StudyFirstPostDate",
    DateField.SUBMITTED_DATE: "StudyFirstSubmitDate",
}

# PrimaryPurpose -> primary purpose label.
CTGOV_PRIMARY_PURPOSE_MAPPING = {
    PrimaryPurpose.TREATMENT: ("Treatment",),
    PrimaryPurpose.PREVENTION: ("Prevention",),
    PrimaryPurpose.DIAGNOSTIC: ("Diagnostic",),
    PrimaryPurpose.SUPPORTIVE_CARE: ("Supportive Care",),
    PrimaryPurpose.SCREENING: ("Screening",),
    PrimaryPurpose.HEALTH_SERVICES: ("Health Services",),
    PrimaryPurpose.BASIC_SCIENCE: ("Basic Science",),
    PrimaryPurpose.DEVICE_FEASIBILITY: ("Device Feasibility",),
    PrimaryPurpose.OTHER: ("Other",),
}

# AgeGroup -> age group label; SENIOR is labelled "Older Adult" and ALL
# maps to None.
CTGOV_AGE_GROUP_MAPPING = {
    AgeGroup.CHILD: ("Child",),
    AgeGroup.ADULT: ("Adult",),
    AgeGroup.SENIOR: ("Older Adult",),
    AgeGroup.ALL: None,
}
205 |
# Line of therapy patterns for EligibilityCriteria search.
# Each phrase already carries its own double quotes, so
# _build_line_of_therapy_essie can join a list with " OR " directly.
LINE_OF_THERAPY_PATTERNS = {
    LineOfTherapy.FIRST_LINE: [
        '"first line"',
        '"first-line"',
        '"1st line"',
        '"frontline"',
        '"treatment naive"',
        '"previously untreated"',
    ],
    LineOfTherapy.SECOND_LINE: [
        '"second line"',
        '"second-line"',
        '"2nd line"',
        '"one prior line"',
        '"1 prior line"',
    ],
    LineOfTherapy.THIRD_LINE_PLUS: [
        '"third line"',
        '"third-line"',
        '"3rd line"',
        '"≥2 prior"',
        '"at least 2 prior"',
        '"heavily pretreated"',
    ],
}
232 |
# Values convert_query places in the "format" and "markupFormat" request
# parameters.
DEFAULT_FORMAT = "csv"
DEFAULT_MARKUP = "markdown"

# Column names for search results (presumably the fields requested from
# the API — confirm against the code that sends SEARCH_FIELDS_PARAM).
SEARCH_FIELDS = [
    "NCT Number",
    "Study Title",
    "Study URL",
    "Study Status",
    "Brief Summary",
    "Study Results",
    "Conditions",
    "Interventions",
    "Sponsor",
    "Phases",
    "Enrollment",
    "Study Type",
    "Study Design",
    "Start Date",
    "Completion Date",
]

# SEARCH_FIELDS joined into one comma-separated string and wrapped in a
# single-element list, matching the list-of-strings shape of the values in
# convert_query's params dict.
SEARCH_FIELDS_PARAM = [",".join(SEARCH_FIELDS)]
255 |
256 |
class TrialQuery(BaseModel):
    """Parameters for querying clinical trial data from ClinicalTrials.gov.

    Every field is optional. List-valued fields also accept a plain
    string, which is converted to a list before validation (see
    convert_list_fields), and recruiting_status accepts a few common
    aliases (see normalize_recruiting_status).
    """

    conditions: list[str] | None = Field(
        default=None,
        description="List of condition terms.",
    )
    terms: list[str] | None = Field(
        default=None,
        description="General search terms that don't fit specific categories.",
    )
    interventions: list[str] | None = Field(
        default=None,
        description="Intervention names.",
    )
    lead_sponsor: list[str] | None = Field(
        default=None,
        description="Lead sponsor organization names to filter by.",
    )
    recruiting_status: RecruitingStatus | None = Field(
        default=None,
        description="Study recruitment status. Use 'OPEN' for actively recruiting trials, 'CLOSED' for completed/terminated trials, or 'ANY' for all trials. Common aliases like 'recruiting', 'active', 'enrolling' map to 'OPEN'.",
    )
    study_type: StudyType | None = Field(
        default=None,
        description="Type of study.",
    )
    nct_ids: list[str] | None = Field(
        default=None,
        description="Clinical trial NCT IDs",
    )
    lat: float | None = Field(
        default=None,
        description="Latitude for location search. AI agents should geocode city/location names (e.g., 'Cleveland' → 41.4993, -81.6944) before using this parameter.",
    )
    long: float | None = Field(
        default=None,
        description="Longitude for location search. AI agents should geocode city/location names (e.g., 'Cleveland' → 41.4993, -81.6944) before using this parameter.",
    )
    distance: int | None = Field(
        default=None,
        description="Distance from lat/long in miles (default: 50 miles if lat/long provided but distance not specified)",
    )
    min_date: str | None = Field(
        default=None,
        description="Minimum date for filtering",
    )
    max_date: str | None = Field(
        default=None,
        description="Maximum date for filtering",
    )
    date_field: DateField | None = Field(
        default=None,
        description="Date field to filter on",
    )
    phase: TrialPhase | None = Field(
        default=None,
        description="Trial phase filter",
    )
    age_group: AgeGroup | None = Field(
        default=None,
        description="Age group filter",
    )
    primary_purpose: PrimaryPurpose | None = Field(
        default=None,
        description="Primary purpose of the trial",
    )
    intervention_type: InterventionType | None = Field(
        default=None,
        description="Type of intervention",
    )
    sponsor_type: SponsorType | None = Field(
        default=None,
        description="Type of sponsor",
    )
    study_design: StudyDesign | None = Field(
        default=None,
        description="Study design",
    )
    sort: SortOrder | None = Field(
        default=None,
        description="Sort order for results",
    )
    next_page_hash: str | None = Field(
        default=None,
        description="Token to retrieve the next page of results",
    )
    # New eligibility-focused fields
    prior_therapies: list[str] | None = Field(
        default=None,
        description="Prior therapies to search for in eligibility criteria",
    )
    progression_on: list[str] | None = Field(
        default=None,
        description="Therapies the patient has progressed on",
    )
    required_mutations: list[str] | None = Field(
        default=None,
        description="Required mutations in eligibility criteria",
    )
    excluded_mutations: list[str] | None = Field(
        default=None,
        description="Excluded mutations in eligibility criteria",
    )
    biomarker_expression: dict[str, str] | None = Field(
        default=None,
        description="Biomarker expression requirements (e.g., {'PD-L1': '≥50%'})",
    )
    line_of_therapy: LineOfTherapy | None = Field(
        default=None,
        description="Line of therapy filter",
    )
    allow_brain_mets: bool | None = Field(
        default=None,
        description="Whether to allow trials that accept brain metastases",
    )
    return_fields: list[str] | None = Field(
        default=None,
        description="Specific fields to return in the response",
    )
    page_size: int | None = Field(
        default=None,
        description="Number of results per page",
        ge=1,
        le=1000,
    )
    expand_synonyms: bool = Field(
        default=True,
        description="Expand condition searches with disease synonyms from MyDisease.info",
    )

    @field_validator("recruiting_status", mode="before")
    @classmethod
    def normalize_recruiting_status(cls, v):
        """Normalize common recruiting status aliases to enum values.

        String input is matched case-insensitively against a small alias
        table; unrecognised strings and non-string values are returned
        unchanged for the regular enum validation to handle.
        """
        if isinstance(v, str):
            v_lower = v.lower()
            # Map common aliases
            alias_map = {
                "recruiting": "OPEN",
                "active": "OPEN",
                "enrolling": "OPEN",
                "closed": "CLOSED",
                "completed": "CLOSED",
                "terminated": "CLOSED",
            }
            return alias_map.get(v_lower, v)
        return v

    # Field validators for list fields
    @model_validator(mode="before")
    @classmethod
    def convert_list_fields(cls, data):
        """Convert string values to lists for list fields.

        Runs before field validation. Non-dict input is passed through
        untouched. For dict input a shallow copy is normalised and
        returned, so a dict handed to model_validate() by the caller is
        not modified as a side effect of validation.
        """
        if not isinstance(data, dict):
            return data

        # Work on a copy: the input mapping belongs to the caller.
        data = dict(data)
        for field_name in [
            "conditions",
            "terms",
            "interventions",
            "lead_sponsor",
            "nct_ids",
            "prior_therapies",
            "progression_on",
            "required_mutations",
            "excluded_mutations",
            "return_fields",
        ]:
            if field_name in data and data[field_name] is not None:
                data[field_name] = ensure_list(
                    data[field_name], split_strings=True
                )
        return data
428 |
429 |
430 | def _inject_ids(
431 | params: dict[str, list[str]], ids: list[str], has_other_filters: bool
432 | ) -> None:
433 | """Inject NCT IDs into params using intersection or id-only semantics.
434 |
435 | Args:
436 | params: The parameter dictionary to modify
437 | ids: List of NCT IDs to inject
438 | has_other_filters: Whether other filters are present
439 | """
440 | ids_csv = ",".join(ids)
441 | if has_other_filters: # intersection path
442 | params["filter.ids"] = [ids_csv]
443 | elif len(ids_csv) < 1800: # pure-ID & small
444 | params["query.id"] = [ids_csv]
445 | else: # pure-ID & large
446 | params["filter.ids"] = [ids_csv]
447 |
448 |
449 | def _build_prior_therapy_essie(therapies: list[str]) -> list[str]:
450 | """Build Essie fragments for prior therapy search."""
451 | fragments = []
452 | for therapy in therapies:
453 | if therapy.strip(): # Skip empty strings
454 | fragment = f'AREA[EligibilityCriteria]("{therapy}" AND (prior OR previous OR received))'
455 | fragments.append(fragment)
456 | return fragments
457 |
458 |
459 | def _build_progression_essie(therapies: list[str]) -> list[str]:
460 | """Build Essie fragments for progression on therapy search."""
461 | fragments = []
462 | for therapy in therapies:
463 | if therapy.strip(): # Skip empty strings
464 | fragment = f'AREA[EligibilityCriteria]("{therapy}" AND (progression OR resistant OR refractory))'
465 | fragments.append(fragment)
466 | return fragments
467 |
468 |
469 | def _build_required_mutations_essie(mutations: list[str]) -> list[str]:
470 | """Build Essie fragments for required mutations."""
471 | fragments = []
472 | for mutation in mutations:
473 | if mutation.strip(): # Skip empty strings
474 | fragment = f'AREA[EligibilityCriteria]("{mutation}")'
475 | fragments.append(fragment)
476 | return fragments
477 |
478 |
479 | def _build_excluded_mutations_essie(mutations: list[str]) -> list[str]:
480 | """Build Essie fragments for excluded mutations."""
481 | fragments = []
482 | for mutation in mutations:
483 | if mutation.strip(): # Skip empty strings
484 | fragment = f'AREA[EligibilityCriteria](NOT "{mutation}")'
485 | fragments.append(fragment)
486 | return fragments
487 |
488 |
489 | def _build_biomarker_expression_essie(biomarkers: dict[str, str]) -> list[str]:
490 | """Build Essie fragments for biomarker expression requirements."""
491 | fragments = []
492 | for marker, expression in biomarkers.items():
493 | if marker.strip() and expression.strip(): # Skip empty values
494 | fragment = (
495 | f'AREA[EligibilityCriteria]("{marker}" AND "{expression}")'
496 | )
497 | fragments.append(fragment)
498 | return fragments
499 |
500 |
def _build_line_of_therapy_essie(line: LineOfTherapy) -> str:
    """Return a single EligibilityCriteria fragment for a line of therapy.

    The phrases listed for `line` in LINE_OF_THERAPY_PATTERNS are OR-ed
    together inside one AREA block. An empty string is returned when no
    phrases are registered for `line`.
    """
    phrases = LINE_OF_THERAPY_PATTERNS.get(line, [])
    if not phrases:
        return ""
    # All alternatives share one AREA block rather than one block each.
    alternatives = " OR ".join(phrases)
    return f"AREA[EligibilityCriteria]({alternatives})"
509 |
510 |
511 | def _build_brain_mets_essie(allow: bool) -> str:
512 | """Build Essie fragment for brain metastases filter."""
513 | if allow is False:
514 | return 'AREA[EligibilityCriteria](NOT "brain metastases")'
515 | return ""
516 |
517 |
async def convert_query(query: TrialQuery) -> dict[str, list[str]]:  # noqa: C901
    """Convert a TrialQuery into query params for the ClinicalTrials.gov API (v2).

    Args:
        query: The search criteria to translate.

    Returns:
        A dict in which each key maps to a list of one or more strings,
        consistent with ``parse_qs`` outputs. ``format``, ``markupFormat``,
        ``sort``, ``fields`` and ``pageSize`` are always present; every other
        key is added only when the corresponding criteria are set.

    Notes:
        - When synonym expansion is enabled, a lookup that fails *or returns
          nothing* falls back to the condition as typed, so a user-supplied
          condition is never silently dropped from ``query.cond``.
        - Eligibility-criteria (Essie) fragments are AND-ed together and
          appended to ``query.term``.
        - ``fields`` receives a copy of the default field list so callers may
          mutate the returned dict without touching the module-level default.
    """
    # Start with required fields
    params: dict[str, list[str]] = {
        "format": [DEFAULT_FORMAT],
        "markupFormat": [DEFAULT_MARKUP],
    }

    # Track whether we have other filters (for NCT ID intersection logic)
    has_other_filters = False

    # Handle conditions with optional synonym expansion
    if query.conditions:
        has_other_filters = True
        expanded_conditions: list[str] = []

        if query.expand_synonyms:
            # Expand each condition with synonyms
            client = BioThingsClient()
            for condition in query.conditions:
                try:
                    synonyms = await client.get_disease_synonyms(condition)
                    # An empty lookup result must not erase the user's term.
                    expanded_conditions.extend(synonyms or [condition])
                except Exception as e:
                    # Best effort: fall back to the term exactly as typed.
                    logger.warning(
                        f"Failed to get synonyms for {condition}: {e}"
                    )
                    expanded_conditions.append(condition)
        else:
            expanded_conditions = list(query.conditions)

        # Remove case-insensitive duplicates while preserving order
        seen: set[str] = set()
        unique_conditions: list[str] = []
        for cond in expanded_conditions:
            lowered = cond.lower()
            if lowered not in seen:
                seen.add(lowered)
                unique_conditions.append(cond)

        params["query.cond"] = [_join_with_or(unique_conditions)]

    # Handle terms, interventions and lead sponsors (no synonym expansion)
    for key, val in (
        ("query.term", query.terms),
        ("query.intr", query.interventions),
        ("query.lead", query.lead_sponsor),
    ):
        if val:
            has_other_filters = True
            params[key] = [_join_with_or(val)]

    # Collect Essie fragments for eligibility criteria
    essie_fragments: list[str] = []

    # Prior therapies
    if query.prior_therapies:
        has_other_filters = True
        essie_fragments.extend(
            _build_prior_therapy_essie(query.prior_therapies)
        )

    # Progression on therapies
    if query.progression_on:
        has_other_filters = True
        essie_fragments.extend(_build_progression_essie(query.progression_on))

    # Required mutations
    if query.required_mutations:
        has_other_filters = True
        essie_fragments.extend(
            _build_required_mutations_essie(query.required_mutations)
        )

    # Excluded mutations
    if query.excluded_mutations:
        has_other_filters = True
        essie_fragments.extend(
            _build_excluded_mutations_essie(query.excluded_mutations)
        )

    # Biomarker expression
    if query.biomarker_expression:
        has_other_filters = True
        essie_fragments.extend(
            _build_biomarker_expression_essie(query.biomarker_expression)
        )

    # Line of therapy
    if query.line_of_therapy:
        has_other_filters = True
        line_fragment = _build_line_of_therapy_essie(query.line_of_therapy)
        if line_fragment:
            essie_fragments.append(line_fragment)

    # Brain metastases filter (counts as a filter whenever explicitly set,
    # even though only False produces a fragment)
    if query.allow_brain_mets is not None:
        has_other_filters = True
        brain_fragment = _build_brain_mets_essie(query.allow_brain_mets)
        if brain_fragment:
            essie_fragments.append(brain_fragment)

    # Combine all Essie fragments with AND and append to query.term
    if essie_fragments:
        combined_essie = " AND ".join(essie_fragments)
        if "query.term" in params:
            # Append to existing terms with AND
            params["query.term"][0] = (
                f"{params['query.term'][0]} AND {combined_essie}"
            )
        else:
            params["query.term"] = [combined_essie]

    # Geospatial (only when both coordinates are present)
    if query.lat is not None and query.long is not None:
        has_other_filters = True
        geo_val = f"distance({query.lat},{query.long},{query.distance}mi)"
        params["filter.geo"] = [geo_val]

    # Collect advanced filters in a list
    advanced_filters: list[str] = []

    # Date filter: an open end of the range is sent as MIN / MAX
    if query.date_field and (query.min_date or query.max_date):
        has_other_filters = True
        date_field = CTGOV_DATE_FIELD_MAPPING[query.date_field]
        min_val = query.min_date or "MIN"
        max_val = query.max_date or "MAX"
        advanced_filters.append(
            f"AREA[{date_field}]RANGE[{min_val},{max_val}]",
        )

    # Prepare a map of "AREA[...] -> (query_value, mapping_dict)"
    advanced_map = {
        "DesignPrimaryPurpose": (
            query.primary_purpose,
            CTGOV_PRIMARY_PURPOSE_MAPPING,
        ),
        "StudyType": (query.study_type, CTGOV_STUDY_TYPE_MAPPING),
        "InterventionType": (
            query.intervention_type,
            CTGOV_INTERVENTION_TYPE_MAPPING,
        ),
        "SponsorType": (query.sponsor_type, CTGOV_SPONSOR_TYPE_MAPPING),
        "StudyDesign": (query.study_design, CTGOV_STUDY_DESIGN_MAPPING),
        "Phase": (query.phase, CTGOV_PHASE_MAPPING),
    }

    # Append advanced filters
    for area, (qval, mapping) in advanced_map.items():
        if qval:
            has_other_filters = True
            # Check if mapping is a dict before using get method
            mapped = (
                mapping.get(qval)
                if mapping and isinstance(mapping, dict)
                else None
            )
            # Use the first mapped value if available, otherwise the literal
            value = mapped[0] if mapped else qval
            advanced_filters.append(f"AREA[{area}]{value}")

    # Age group ("ALL" means no restriction, so no filter is emitted)
    if query.age_group and query.age_group != "ALL":
        has_other_filters = True
        mapped = CTGOV_AGE_GROUP_MAPPING[query.age_group]
        if mapped:
            advanced_filters.append(f"AREA[StdAge]{mapped[0]}")
        else:
            advanced_filters.append(f"AREA[StdAge]{query.age_group}")

    # If we collected any advanced filters, join them with AND
    if advanced_filters:
        params["filter.advanced"] = [" AND ".join(advanced_filters)]

    # NCT IDs - must be injected BEFORE recruiting status is handled so that
    # has_other_filters reflects only the filters collected above.
    if query.nct_ids:
        _inject_ids(params, query.nct_ids, has_other_filters)

    # Recruiting status - applied AFTER NCT ID injection
    if query.recruiting_status not in (None, RecruitingStatus.OPEN):
        # User explicitly set a non-default status
        if query.recruiting_status is not None:  # Type guard for mypy
            statuses = CTGOV_RECRUITING_STATUS_MAPPING.get(
                query.recruiting_status
            )
            if statuses:
                params["filter.overallStatus"] = [",".join(statuses)]
    elif not query.nct_ids or has_other_filters:
        # Apply default OPEN status only if:
        # 1. No NCT IDs provided, OR
        # 2. NCT IDs provided with other filters (intersection mode)
        params["filter.overallStatus"] = [",".join(OPEN_STATUSES)]

    # Sort & paging
    if query.sort is None:
        sort_val = CTGOV_SORT_MAPPING[SortOrder.RELEVANCE]
    else:
        sort_val = CTGOV_SORT_MAPPING.get(query.sort, query.sort)

    params["sort"] = [sort_val]
    if query.next_page_hash:
        params["pageToken"] = [query.next_page_hash]

    # Finally, add fields to limit payload size
    if query.return_fields:
        # Use custom fields if specified
        params["fields"] = [",".join(query.return_fields)]
    else:
        # Copy the default so callers cannot mutate the shared constant
        params["fields"] = list(SEARCH_FIELDS_PARAM)

    # Set page size (a missing or zero value falls back to 40)
    if query.page_size:
        params["pageSize"] = [str(query.page_size)]
    else:
        params["pageSize"] = ["40"]

    return params


def _join_with_or(values: list[str]) -> str:
    """Return a lone value unchanged, or several values OR-joined in parentheses."""
    if len(values) == 1:
        return values[0]
    return f"({' OR '.join(values)})"
749 |
750 |
async def search_trials(
    query: TrialQuery,
    output_json: bool = False,
) -> str:
    """Search ClinicalTrials.gov for clinical trials.

    The query is translated with ``convert_query`` and sent as a GET request
    to ``CLINICAL_TRIALS_BASE_URL``.

    Args:
        query: Search criteria.
        output_json: When true, return the payload as indented JSON instead
            of markdown.

    Returns:
        Markdown rendered from the response, or a JSON string when
        ``output_json`` is set or the payload is empty. A request error is
        reported as an ``{"error": ...}`` payload in the same formats.
    """
    params = await convert_query(query)

    # Debug-log which filter mode applies when NCT IDs are present.
    # Only explicit user-set filters count here, not defaults.
    if query.nct_ids:
        explicit_filters = (
            query.conditions,
            query.terms,
            query.interventions,
            query.lead_sponsor,
            query.lat is not None and query.long is not None,
            query.date_field and (query.min_date or query.max_date),
            query.primary_purpose,
            query.study_type,
            query.intervention_type,
            query.sponsor_type,
            query.study_design,
            query.phase,
            query.age_group and query.age_group != AgeGroup.ALL,
            query.recruiting_status not in (None, RecruitingStatus.OPEN),
            query.prior_therapies,
            query.progression_on,
            query.required_mutations,
            query.excluded_mutations,
            query.biomarker_expression,
            query.line_of_therapy,
            query.allow_brain_mets is not None,
        )
        mode_message = (
            "Filter mode: intersection (NCT IDs AND other filters)"
            if any(explicit_filters)
            else "Filter mode: id-only (NCT IDs only)"
        )
        logger.debug(mode_message)

    response, error = await http_client.request_api(
        url=CLINICAL_TRIALS_BASE_URL,
        request=params,
        method="GET",
        tls_version=TLSVersion.TLSv1_2,
        domain="trial",
    )

    # An error replaces the response with a small error payload.
    payload = (
        {"error": f"Error {error.code}: {error.message}"}
        if error
        else response
    )

    # JSON is used both on request and as the fallback for an empty payload.
    if output_json or not payload:
        return json.dumps(payload, indent=2)
    return render.to_markdown(payload)
809 |
810 |
async def _trial_searcher(
    call_benefit: Annotated[
        str,
        "Define and summarize why this function is being called and the intended benefit",
    ],
    conditions: Annotated[
        list[str] | str | None,
        "Condition terms (e.g., 'breast cancer') - list or comma-separated string",
    ] = None,
    terms: Annotated[
        list[str] | str | None,
        "General search terms - list or comma-separated string",
    ] = None,
    interventions: Annotated[
        list[str] | str | None,
        "Intervention names (e.g., 'pembrolizumab') - list or comma-separated string",
    ] = None,
    lead_sponsor: Annotated[
        list[str] | str | None,
        "Lead sponsor organization names (e.g., 'Pfizer', 'National Cancer Institute') - list or comma-separated string",
    ] = None,
    recruiting_status: Annotated[
        RecruitingStatus | str | None,
        "Study recruitment status (OPEN, CLOSED, ANY)",
    ] = None,
    study_type: Annotated[StudyType | str | None, "Type of study"] = None,
    nct_ids: Annotated[
        list[str] | str | None,
        "Clinical trial NCT IDs - list or comma-separated string",
    ] = None,
    lat: Annotated[
        float | None,
        "Latitude for location search. AI agents should geocode city/location names (e.g., 'Cleveland' → 41.4993, -81.6944) before using this parameter.",
    ] = None,
    long: Annotated[
        float | None,
        "Longitude for location search. AI agents should geocode city/location names (e.g., 'Cleveland' → 41.4993, -81.6944) before using this parameter.",
    ] = None,
    distance: Annotated[
        float | None,
        "Distance from lat/long in miles (default: 50 miles if lat/long provided but distance not specified)",
    ] = None,
    min_date: Annotated[
        str | None, "Minimum date for filtering (YYYY-MM-DD)"
    ] = None,
    max_date: Annotated[
        str | None, "Maximum date for filtering (YYYY-MM-DD)"
    ] = None,
    date_field: Annotated[
        DateField | str | None, "Date field to filter on"
    ] = None,
    phase: Annotated[TrialPhase | str | None, "Trial phase filter"] = None,
    age_group: Annotated[AgeGroup | str | None, "Age group filter"] = None,
    primary_purpose: Annotated[
        PrimaryPurpose | str | None, "Primary purpose of the trial"
    ] = None,
    intervention_type: Annotated[
        InterventionType | str | None, "Type of intervention"
    ] = None,
    sponsor_type: Annotated[
        SponsorType | str | None, "Type of sponsor"
    ] = None,
    study_design: Annotated[StudyDesign | str | None, "Study design"] = None,
    sort: Annotated[SortOrder | str | None, "Sort order for results"] = None,
    next_page_hash: Annotated[
        str | None, "Token to retrieve the next page of results"
    ] = None,
    prior_therapies: Annotated[
        list[str] | str | None,
        "Prior therapies to search for in eligibility criteria - list or comma-separated string",
    ] = None,
    progression_on: Annotated[
        list[str] | str | None,
        "Therapies the patient has progressed on - list or comma-separated string",
    ] = None,
    required_mutations: Annotated[
        list[str] | str | None,
        "Required mutations in eligibility criteria - list or comma-separated string",
    ] = None,
    excluded_mutations: Annotated[
        list[str] | str | None,
        "Excluded mutations in eligibility criteria - list or comma-separated string",
    ] = None,
    biomarker_expression: Annotated[
        dict[str, str] | None,
        "Biomarker expression requirements (e.g., {'PD-L1': '≥50%'})",
    ] = None,
    line_of_therapy: Annotated[
        LineOfTherapy | str | None,
        "Line of therapy filter (1L, 2L, 3L+)",
    ] = None,
    allow_brain_mets: Annotated[
        bool | None,
        "Whether to allow trials that accept brain metastases",
    ] = None,
    return_fields: Annotated[
        list[str] | str | None,
        "Specific fields to return in the response - list or comma-separated string",
    ] = None,
    page_size: Annotated[
        int | None,
        "Number of results per page (1-1000)",
    ] = None,
    expand_synonyms: Annotated[
        bool,
        "Expand condition searches with disease synonyms from MyDisease.info",
    ] = True,
) -> str:
    """
    Search for clinical trials matching the given criteria.

    The individual arguments are packed into a ``TrialQuery`` and handed to
    ``search_trials``, which always renders markdown here.

    Parameters accepting "list or comma-separated string" are normalized
    with ``ensure_list(..., split_strings=True)`` before the query is built:
    conditions, terms, interventions, lead_sponsor, nct_ids,
    prior_therapies, progression_on, required_mutations,
    excluded_mutations and return_fields.

    Parameters:
    - call_benefit: Why this function is being called and the intended
      benefit (not forwarded to the query)
    - conditions: Condition terms (e.g., "breast cancer")
    - terms: General search terms
    - interventions: Intervention names (e.g., "pembrolizumab")
    - lead_sponsor: Lead sponsor organization names (e.g., "Pfizer")
    - recruiting_status: Study recruitment status (OPEN, CLOSED, ANY)
    - study_type: Type of study
    - nct_ids: Clinical trial NCT IDs
    - lat / long: Coordinates for location search
    - distance: Distance from lat/long in miles
    - min_date / max_date: Date bounds for filtering (YYYY-MM-DD)
    - date_field: Date field to filter on
    - phase: Trial phase filter
    - age_group: Age group filter
    - primary_purpose: Primary purpose of the trial
    - intervention_type: Type of intervention
    - sponsor_type: Type of sponsor
    - study_design: Study design
    - sort: Sort order for results
    - next_page_hash: Token to retrieve the next page of results
    - prior_therapies: Prior therapies to look for in eligibility criteria
    - progression_on: Therapies the patient has progressed on
    - required_mutations: Mutations required in eligibility criteria
    - excluded_mutations: Mutations excluded in eligibility criteria
    - biomarker_expression: Biomarker expression requirements
      (e.g., {'PD-L1': '≥50%'})
    - line_of_therapy: Line of therapy filter (1L, 2L, 3L+)
    - allow_brain_mets: Whether to allow trials that accept brain metastases
    - return_fields: Specific fields to return in the response
    - page_size: Number of results per page (1-1000)
    - expand_synonyms: Expand condition searches with disease synonyms from
      MyDisease.info

    Returns:
        Markdown formatted list of clinical trials
    """
    # Arguments that may arrive as a list or as a comma-separated string.
    multi_valued = {
        "conditions": conditions,
        "terms": terms,
        "interventions": interventions,
        "lead_sponsor": lead_sponsor,
        "nct_ids": nct_ids,
        "prior_therapies": prior_therapies,
        "progression_on": progression_on,
        "required_mutations": required_mutations,
        "excluded_mutations": excluded_mutations,
        "return_fields": return_fields,
    }
    normalized_lists = {
        field_name: ensure_list(raw_value, split_strings=True)
        for field_name, raw_value in multi_valued.items()
    }

    # Everything else is forwarded to the query unchanged.
    query = TrialQuery(
        **normalized_lists,
        recruiting_status=recruiting_status,
        study_type=study_type,
        lat=lat,
        long=long,
        distance=distance,
        min_date=min_date,
        max_date=max_date,
        date_field=date_field,
        phase=phase,
        age_group=age_group,
        primary_purpose=primary_purpose,
        intervention_type=intervention_type,
        sponsor_type=sponsor_type,
        study_design=study_design,
        sort=sort,
        next_page_hash=next_page_hash,
        biomarker_expression=biomarker_expression,
        line_of_therapy=line_of_therapy,
        allow_brain_mets=allow_brain_mets,
        page_size=page_size,
        expand_synonyms=expand_synonyms,
    )
    return await search_trials(query, output_json=False)
994 |
995 |
async def search_trials_unified(
    query: TrialQuery,
    source: str = "clinicaltrials",
    api_key: str | None = None,
    output_json: bool = False,
) -> str:
    """
    Search for clinical trials via ClinicalTrials.gov or the NCI CTS API.

    Args:
        query: TrialQuery object with search parameters
        source: Data source - "nci" selects the NCI backend; any other value
            (default "clinicaltrials") uses ClinicalTrials.gov
        api_key: API key forwarded to the NCI search (only used when
            source="nci")
        output_json: Return raw JSON instead of formatted markdown

    Returns:
        Formatted markdown or JSON string with results
    """
    # Everything except "nci" goes to ClinicalTrials.gov.
    if source != "nci":
        return await search_trials(query, output_json)

    # Import here to avoid circular imports
    from .nci_search import format_nci_trial_results, search_trials_nci

    nci_results = await search_trials_nci(query, api_key)
    return (
        json.dumps(nci_results, indent=2)
        if output_json
        else format_nci_trial_results(nci_results)
    )
1027 |
```