# Directory Structure

```
├── .gitignore
├── .python-version
├── biorxiv_server.py
├── biorxiv_web_search.py
├── pyproject.toml
├── README.md
└── requirements.txt
```

# Files

--------------------------------------------------------------------------------
/.python-version:
--------------------------------------------------------------------------------
```
3.10
```

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
```
# Python-generated files
__pycache__/
*.py[oc]
build/
dist/
wheels/
*.egg-info

# Virtual environments
.venv
```

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
```markdown
# bioRxiv MCP Server

🔍 Enable AI assistants to search and access bioRxiv papers through a simple MCP interface.

The bioRxiv MCP Server provides a bridge between AI assistants and bioRxiv's preprint repository through the Model Context Protocol (MCP). It allows AI models to search for biology preprints and access their metadata programmatically.

🤝 Contribute • 📝 Report Bug

## ✨ Core Features (✅ implemented, 📝 planned)

- 🔎 Paper Search: Query bioRxiv papers with keywords or advanced search ✅
- 🚀 Efficient Retrieval: Fast access to paper metadata ✅
- 📊 Metadata Access: Retrieve detailed metadata for specific papers ✅
- 📊 Research Support: Facilitate biological sciences research and analysis ✅
- 📄 Paper Access: Download and read paper content 📝
- 📋 Paper Listing: View all downloaded papers 📝
- 🗃️ Local Storage: Papers are saved locally for faster access 📝
- 📝 Research Prompts: A set of specialized prompts for paper analysis 📝

## 🚀 Quick Start

### Prerequisites

- Python 3.10+
- FastMCP library

### Installation

1. Clone the repository:
```
git clone https://github.com/JackKuo666/bioRxiv-MCP-Server.git
cd bioRxiv-MCP-Server
```

2. Install the required dependencies:
```
pip install -r requirements.txt
```

### Installing via Smithery

To install bioRxiv Server for Claude Desktop automatically via [Smithery](https://smithery.ai/server/@JackKuo666/biorxiv-mcp-server):

#### Claude Desktop

```bash
npx -y @smithery/cli@latest install @JackKuo666/biorxiv-mcp-server --client claude --config "{}"
```

#### Cursor

Paste the following into Settings → Cursor Settings → MCP → Add new server:

- Mac/Linux

```sh
npx -y @smithery/cli@latest run @JackKuo666/biorxiv-mcp-server --client cursor --config "{}"
```

#### Windsurf

```sh
npx -y @smithery/cli@latest install @JackKuo666/biorxiv-mcp-server --client windsurf --config "{}"
```

#### Cline

```sh
npx -y @smithery/cli@latest install @JackKuo666/biorxiv-mcp-server --client cline --config "{}"
```

#### Usage with Claude Desktop

Add this configuration to your `claude_desktop_config.json`:

(macOS)

```json
{
  "mcpServers": {
    "biorxiv": {
      "command": "python",
      "args": ["-m", "biorxiv-mcp-server"]
    }
  }
}
```

(Windows)

```json
{
  "mcpServers": {
    "biorxiv": {
      "command": "C:\\Users\\YOUR_USERNAME\\AppData\\Local\\Programs\\Python\\Python311\\python.exe",
      "args": [
        "-m",
        "biorxiv-mcp-server"
      ]
    }
  }
}
```

#### Usage with Cline

```json
{
  "mcpServers": {
    "biorxiv": {
      "command": "bash",
      "args": [
        "-c",
        "source /home/YOUR/PATH/mcp-server-bioRxiv/.venv/bin/activate && python /home/YOUR/PATH/mcp-server-bioRxiv/biorxiv_server.py"
      ],
      "env": {},
      "disabled": false,
      "autoApprove": []
    }
  }
}
```

## 📊 Usage

Start the MCP server:

```bash
python biorxiv_server.py
```

## 🛠 MCP Tools

The bioRxiv MCP Server provides the following tools:

1. `search_biorxiv_key_words`: Search for articles on bioRxiv using keywords.
2. `search_biorxiv_advanced`: Perform an advanced search for articles on bioRxiv with multiple parameters.
3. `get_biorxiv_metadata`: Fetch metadata for a bioRxiv article using its DOI.

### Searching Papers

You can ask the AI assistant to search for papers using queries like:
```
Can you search bioRxiv for recent papers about genomics?
```

### Getting Paper Details

Once you have a DOI, you can ask for more details:
```
Can you show me the metadata for the paper with DOI 10.1101/123456?
```

## 📁 Project Structure

- `biorxiv_server.py`: The main MCP server implementation using FastMCP
- `biorxiv_web_search.py`: The web-scraping logic for searching bioRxiv

## 🔧 Dependencies

- Python 3.10+
- FastMCP (`mcp`)
- requests
- beautifulsoup4
- asyncio and logging (Python standard library)

## 🤝 Contributing

Contributions are welcome! Please feel free to submit a Pull Request.

## 📄 License

This project is licensed under the MIT License.

## ⚠️ Disclaimer

This tool is for research purposes only. Please respect bioRxiv's terms of service and use this tool responsibly.
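
## 🧪 Sanity Check

To verify that the bioRxiv metadata API used by `biorxiv_web_search.py` is reachable from your machine, you can run a minimal sketch like the one below. It mirrors the endpoint called by `doi_get_biorxiv_metadata`; the DOI is just the example used in the module's test block:

```python
import requests

# Same endpoint that doi_get_biorxiv_metadata calls.
doi = "10.1101/2024.06.25.600517"  # example DOI from the repo's test block
url = f"https://api.biorxiv.org/details/biorxiv/{doi}/na/json"
resp = requests.get(url, timeout=30)
resp.raise_for_status()
collection = resp.json().get("collection", [])
print(collection[0]["title"] if collection else "No record found")
```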
```

--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
```
requests
beautifulsoup4
mcp
```

--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
```toml
[project]
name = "mcp-server-bioRxiv"
version = "0.1.0"
description = "An MCP server for searching and retrieving articles from bioRxiv"
readme = "README.md"
requires-python = ">=3.10"
dependencies = [
    "mcp[cli]>=1.4.1",
    "requests>=2.25.1",
    "beautifulsoup4>=4.9.3",
]
```

--------------------------------------------------------------------------------
/biorxiv_server.py:
--------------------------------------------------------------------------------
```python
from typing import Any, List, Dict, Optional
import asyncio
import logging
from mcp.server.fastmcp import FastMCP
from biorxiv_web_search import search_key_words, search_advanced, doi_get_biorxiv_metadata

# Set up logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# Initialize FastMCP server
mcp = FastMCP("biorxiv")

@mcp.tool()
async def search_biorxiv_key_words(key_words: str, num_results: int = 10) -> List[Dict[str, Any]]:
    """
    Search for articles on bioRxiv using key words.

    Args:
        key_words: Search query string
        num_results: Number of results to return (default: 10)

    Returns:
        List of dictionaries containing article information
    """
    # The docstring must be the first statement in the function body so that
    # FastMCP picks it up as the tool description; log only after it.
    logging.info(f"Searching for articles with key words: {key_words}, num_results: {num_results}")
    try:
        results = await asyncio.to_thread(search_key_words, key_words, num_results)
        return results
    except Exception as e:
        return [{"error": f"An error occurred while searching: {str(e)}"}]

@mcp.tool()
async def search_biorxiv_advanced(
    term: Optional[str] = None,
    title: Optional[str] = None,
    author1: Optional[str] = None,
    author2: Optional[str] = None,
    abstract_title: Optional[str] = None,
    text_abstract_title: Optional[str] = None,
    section: Optional[str] = None,
    start_date: Optional[str] = None,
    end_date: Optional[str] = None,
    num_results: int = 10
) -> List[Dict[str, Any]]:
    """
    Perform an advanced search for articles on bioRxiv.

    Args:
        term: General search term
        title: Search in title
        author1: First author
        author2: Second author
        abstract_title: Search in abstract and title
        text_abstract_title: Search in full text, abstract, and title
        section: Section of bioRxiv
        start_date: Start date for search range (format: YYYY-MM-DD)
        end_date: End date for search range (format: YYYY-MM-DD)
        num_results: Number of results to return (default: 10)

    Returns:
        List of dictionaries containing article information
    """
    logging.info(f"Performing advanced search with parameters: {locals()}")
    try:
        results = await asyncio.to_thread(
            search_advanced,
            term, title, author1, author2, abstract_title, text_abstract_title,
            section, start_date, end_date, num_results
        )
        return results
    except Exception as e:
        return [{"error": f"An error occurred while performing advanced search: {str(e)}"}]

@mcp.tool()
async def get_biorxiv_metadata(doi: str) -> Dict[str, Any]:
    """
    Fetch metadata for a bioRxiv article using its DOI.

    Args:
        doi: DOI of the article

    Returns:
        Dictionary containing article metadata
    """
    logging.info(f"Fetching metadata for DOI: {doi}")
    try:
        metadata = await asyncio.to_thread(doi_get_biorxiv_metadata, doi)
        return metadata if metadata else {"error": f"No metadata found for DOI: {doi}"}
    except Exception as e:
        return {"error": f"An error occurred while fetching metadata: {str(e)}"}

if __name__ == "__main__":
    logging.info("Starting bioRxiv MCP server")
    # Initialize and run the server
    mcp.run(transport='stdio')
```

--------------------------------------------------------------------------------
/biorxiv_web_search.py:
--------------------------------------------------------------------------------
```python
import requests
from bs4 import BeautifulSoup
from urllib.parse import quote

def generate_biorxiv_search_url(term=None, title=None, author1=None, author2=None, abstract_title=None,
                                text_abstract_title=None, journal_code="biorxiv", section=None,
                                start_date=None, end_date=None, num_results=10, sort="relevance-rank"):
    """Generate a bioRxiv search URL from the user-provided fields."""

    base_url = "https://www.biorxiv.org/search/"
    query_parts = []
    if term:
        query_parts.append(f"{quote(term)}")
    if title:
        query_parts.append(f"title%3A{quote(title)} title_flags%3Amatch-all")
    if author1:
        query_parts.append(f"author1%3A{quote(author1)}")
    if author2:
        query_parts.append(f"author2%3A{quote(author2)}")
    if abstract_title:
        query_parts.append(f"abstract_title%3A{quote(abstract_title)} abstract_title_flags%3Amatch-all")
    if text_abstract_title:
        query_parts.append(f"text_abstract_title%3A{quote(text_abstract_title)} text_abstract_title_flags%3Amatch-all")
    if journal_code:
        query_parts.append(f"jcode%3A{quote(journal_code)}")
    if section:
        query_parts.append(f"toc_section%3A{quote(section)}")
    if start_date and end_date:
        query_parts.append(f"limit_from%3A{start_date} limit_to%3A{end_date}")

    query_parts.append(f"numresults%3A{num_results}")
    query_parts.append(f"sort%3A{quote(sort)} format_result%3Astandard")

    return base_url + "%20".join(query_parts)

def scrape_biorxiv_results(search_url):
    """Parse article information, including DOIs, from a bioRxiv search results page."""
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36'
    }
    response = requests.get(search_url, headers=headers)

    if response.status_code == 200:
        soup = BeautifulSoup(response.text, 'html.parser')
        articles = soup.find_all('li', class_='search-result')

        results = []
        for article in articles:
            title_tag = article.find('span', class_='highwire-cite-title')
            title = title_tag.text.strip() if title_tag else "No title"

            authors_tag = article.find('span', class_='highwire-citation-authors')
            authors = authors_tag.text.strip() if authors_tag else "No authors"

            abstract_tag = article.find('div', class_='highwire-cite-snippet')
            abstract = abstract_tag.text.strip() if abstract_tag else "No abstract"

            link_tag = article.find('a', class_='highwire-cite-linked-title')
            link = "https://www.biorxiv.org" + link_tag['href'] if link_tag else "No link"

            doi_tag = article.find('span', class_='highwire-cite-metadata-doi')
            doi_link = doi_tag.text.strip().replace("doi:", "").strip() if doi_tag else "No DOI"

            result = {
                "Title": title,
                "Authors": authors,
                # The scraped snippet serves as a fallback; it is overwritten by
                # the full abstract from the API metadata when a DOI is available.
                "Abstract": abstract,
                "DOI_link": doi_link,
                "Link": link
            }
            if doi_link != "No DOI":
                metadata = doi_get_biorxiv_metadata(doi_link.replace("https://doi.org/", ""))
                if metadata:
                    result.update(metadata)

            results.append(result)

        return results
    else:
        print(f"Error: Unable to fetch data (status code: {response.status_code})")
        return None

def doi_get_biorxiv_metadata(doi, server="biorxiv"):
    """Fetch detailed article metadata from the bioRxiv API by DOI."""
    url = f"https://api.biorxiv.org/details/{server}/{doi}/na/json"
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36'
    }

    response = requests.get(url, headers=headers)

    if response.status_code == 200:
        data = response.json()
        if 'collection' in data and len(data['collection']) > 0:
            article = data['collection'][0]
            return {
                "DOI": article.get("doi", "No DOI"),
                "Title": article.get("title", "No title"),
                "Authors": article.get("authors", "No authors"),
                "Corresponding Author": article.get("author_corresponding", "No corresponding author"),
                "Corresponding Institution": article.get("author_corresponding_institution", "No institution"),
                "Date": article.get("date", "No date"),
                "Version": article.get("version", "No version"),
                "Category": article.get("category", "No category"),
                "JATS XML Path": article.get("jats xml path", "No XML path"),
                "Abstract": article.get("abstract", "No abstract")
            }
        else:
            print("No data found for DOI:", doi)
            return None
    else:
        print(f"Error: Unable to fetch metadata (status code: {response.status_code})")
        return None

def search_key_words(key_words, num_results=10):
    # Generate the search URL
    search_url = generate_biorxiv_search_url(term=key_words, num_results=num_results)

    print("Generated URL:", search_url)

    # Fetch and parse the search results
    articles = scrape_biorxiv_results(search_url)

    return articles


def search_advanced(term, title, author1, author2, abstract_title, text_abstract_title, section, start_date, end_date, num_results):
    # Generate the search URL
    search_url = generate_biorxiv_search_url(term, title=title, author1=author1, author2=author2,
                                             abstract_title=abstract_title,
                                             text_abstract_title=text_abstract_title,
                                             section=section, start_date=start_date,
                                             end_date=end_date, num_results=num_results)

    print("Generated URL:", search_url)

    # Fetch and parse the search results
    articles = scrape_biorxiv_results(search_url)

    return articles



if __name__ == "__main__":
    # 1. search_key_words
    key_words = "COVID-19"
    articles = search_key_words(key_words, num_results=5)
    print(articles)

    # 2. search_advanced
    # Example: user-supplied search parameters
    term = "CRISPR"
    title = "CRISPR"
    author1 = "Doudna"
    author2 = None
    abstract_title = "genome"
    text_abstract_title = None
    section = "New Results"
    start_date = "2025-02-27"
    end_date = "2025-03-18"
    num_results = 5
    articles = search_advanced(term, title, author1, author2, abstract_title, text_abstract_title, section, start_date, end_date, num_results)
    print(articles)

    # 3. doi_get_biorxiv_metadata
    doi = "10.1101/2024.06.25.600517"
    metadata = doi_get_biorxiv_metadata(doi)
    print(metadata)
```
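For quick experimentation outside the MCP server, the module can also be imported directly. A minimal sketch, assuming it is run from the repository root with network access (`CRISPR` is just an example keyword; `Title` and `Link` are keys of the result dictionaries built by `scrape_biorxiv_results`):

```python
# Standalone usage of biorxiv_web_search, independent of the MCP server.
from biorxiv_web_search import generate_biorxiv_search_url, search_key_words

# Inspect the search URL the scraper will request.
print(generate_biorxiv_search_url(term="CRISPR", num_results=5))

# Fetch the results and print the first title/link pair, if any.
articles = search_key_words("CRISPR", num_results=5)
if articles:
    print(articles[0]["Title"], articles[0]["Link"])
```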