# Directory Structure ``` ├── .gitignore ├── .python-version ├── pyproject.toml ├── README.md ├── src │ └── pdf_extraction │ ├── __init__.py │ ├── __main__.py │ ├── pdf_extractor.py │ └── server.py └── uv.lock ``` # Files -------------------------------------------------------------------------------- /.python-version: -------------------------------------------------------------------------------- ``` 1 | 3.11 2 | ``` -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- ``` 1 | # Python-generated files 2 | __pycache__/ 3 | *.py[oc] 4 | build/ 5 | dist/ 6 | wheels/ 7 | *.egg-info 8 | 9 | # Virtual environments 10 | .venv 11 | ``` -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- ```markdown 1 | # PDF Extraction MCP Server (Claude Code Fork) 2 | 3 | MCP server to extract contents from PDF files, with fixes for Claude Code CLI installation. 4 | 5 | This fork includes critical fixes for installing and running the server with Claude Code (the CLI version). 6 | 7 | ## What's Different in This Fork 8 | 9 | 1. **Added `__main__.py`** - Enables the package to be run as a module with `python -m pdf_extraction` 10 | 2. **Claude Code specific instructions** - Clear installation steps that work with Claude Code CLI 11 | 3. 
**Tested installation process** - Verified working with `claude mcp add` command 12 | 13 | ## Components 14 | 15 | ### Tools 16 | 17 | The server implements one tool: 18 | - **extract-pdf-contents**: Extract contents from a local PDF file 19 | - Takes `pdf_path` as a required string argument (local file path) 20 | - Takes `pages` as an optional string argument (comma-separated page numbers, supports negative indexing like `-1` for last page) 21 | - Supports both PDF text extraction and OCR for scanned documents 22 | 23 | ## Installation for Claude Code CLI 24 | 25 | ### Prerequisites 26 | 27 | - Python 3.11 or higher 28 | - pip or conda 29 | - Claude Code CLI installed (`claude` command) 30 | 31 | ### Step 1: Clone and Install 32 | 33 | ```bash 34 | # Clone this fork 35 | git clone https://github.com/lh/mcp-pdf-extraction-server.git 36 | cd mcp-pdf-extraction-server 37 | 38 | # Install in development mode 39 | pip install -e . 40 | ``` 41 | 42 | ### Step 2: Find the Installed Command 43 | 44 | ```bash 45 | # Check where pdf-extraction was installed 46 | which pdf-extraction 47 | # Example output: /opt/homebrew/Caskroom/miniconda/base/bin/pdf-extraction 48 | ``` 49 | 50 | ### Step 3: Add to Claude Code 51 | 52 | ```bash 53 | # Add the server using the full path from above 54 | claude mcp add pdf-extraction /opt/homebrew/Caskroom/miniconda/base/bin/pdf-extraction 55 | 56 | # Verify it was added 57 | claude mcp list 58 | ``` 59 | 60 | ### Step 4: Use in Claude 61 | 62 | ```bash 63 | # Start a new Claude session 64 | claude 65 | 66 | # In Claude, type: 67 | /mcp 68 | 69 | # You should see: 70 | # MCP Server Status 71 | # • pdf-extraction: connected 72 | ``` 73 | 74 | ## Usage Example 75 | 76 | Once connected, you can ask Claude to extract PDF contents: 77 | 78 | ``` 79 | "Can you extract the content from the PDF at /path/to/document.pdf?" 
80 | 81 | "Extract pages 1-3 and the last page from /path/to/document.pdf" 82 | ``` 83 | 84 | ## Troubleshooting 85 | 86 | ### Server Not Connecting 87 | 88 | 1. Make sure you started a NEW Claude session after adding the server 89 | 2. Verify the command path is correct: `ls -la $(which pdf-extraction)` 90 | 3. Test the command directly (it should hang waiting for input): `pdf-extraction` 91 | 92 | ### Module Not Found Errors 93 | 94 | If you get Python import errors: 95 | 1. Make sure you're using the same Python environment where you installed the package 96 | 2. Try using the full Python path (the `--` keeps `-m` from being parsed as an option of `claude` itself): `claude mcp add pdf-extraction -- /path/to/python -m pdf_extraction` 97 | 98 | ### Installation Issues 99 | 100 | If `pip install -e .` fails: 101 | 1. Make sure you have Python 3.11+: `python --version` 102 | 2. Try creating a fresh virtual environment: 103 | ```bash 104 | python -m venv venv 105 | source venv/bin/activate # On Windows: venv\Scripts\activate 106 | pip install -e . 107 | ``` 108 | 109 | ## For Claude Desktop Users 110 | 111 | This fork is specifically for Claude Code CLI. If you're using Claude Desktop (the GUI app), please refer to the [original repository](https://github.com/xraywu/mcp-pdf-extraction-server) for installation instructions. 112 | 113 | ## Dependencies 114 | 115 | - mcp>=1.2.0 116 | - pypdf2>=3.0.1 117 | - pytesseract>=0.3.10 (for OCR support) 118 | - Pillow>=10.0.0 119 | - pydantic>=2.10.1,<3.0.0 120 | - pymupdf>=1.24.0 121 | 122 | ## Contributing 123 | 124 | Contributions are welcome! The main change in this fork is the addition of `__main__.py` to make the package runnable as a module. 125 | 126 | ## License 127 | 128 | Same as the original repository. 
129 | 130 | ## Credits 131 | 132 | Original server by [@xraywu](https://github.com/xraywu) 133 | Claude Code fixes by [@lh](https://github.com/lh) ``` -------------------------------------------------------------------------------- /src/pdf_extraction/__main__.py: -------------------------------------------------------------------------------- ```python 1 | from . import main 2 | 3 | if __name__ == "__main__": 4 | main() ``` -------------------------------------------------------------------------------- /src/pdf_extraction/__init__.py: -------------------------------------------------------------------------------- ```python 1 | from . import server 2 | import asyncio 3 | 4 | def main(): 5 | """Main entry point for the package.""" 6 | asyncio.run(server.main()) 7 | 8 | # Optionally expose other important items at package level 9 | __all__ = ['main', 'server'] ``` -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- ```toml 1 | [project] 2 | name = "pdf-extraction" 3 | version = "0.1.0" 4 | description = "MCP server to extract contents from PDF files" 5 | readme = "README.md" 6 | requires-python = ">=3.11" 7 | dependencies = [ 8 | "mcp>=1.2.0", 9 | "pypdf2>=3.0.1", 10 | "pytesseract>=0.3.10", 11 | "Pillow>=10.0.0", 12 | "pydantic>=2.10.1,<3.0.0", 13 | "pymupdf>=1.24.0" 14 | ] 15 | 16 | [build-system] 17 | requires = [ "hatchling",] 18 | build-backend = "hatchling.build" 19 | 20 | [project.scripts] 21 | pdf-extraction = "pdf_extraction:main" 22 | ``` -------------------------------------------------------------------------------- /src/pdf_extraction/server.py: -------------------------------------------------------------------------------- ```python 1 | from mcp.server.models import InitializationOptions 2 | import mcp.types as types 3 | from mcp.server import NotificationOptions, Server 4 | import mcp.server.stdio 5 | from .pdf_extractor 
# MCP server instance; the decorators below register its tool handlers.
server = Server("pdf_extraction")


# MCP tool registration
@server.list_tools()
async def handle_list_tools() -> list[types.Tool]:
    """Return the list of tools this server exposes.

    A single tool, ``extract-pdf-contents``, is advertised. Its input schema
    requires ``pdf_path`` and accepts an optional ``pages`` string.
    """
    return [
        types.Tool(
            name="extract-pdf-contents",
            description="Extract contents from a local PDF file, given page numbers separated in comma. Negative page index number supported.",
            inputSchema={
                "type": "object",
                "properties": {
                    "pdf_path": {
                        "type": "string",
                        "description": "Path to a PDF file on the local filesystem.",
                    },
                    "pages": {
                        "type": "string",
                        "description": (
                            "Optional comma-separated page numbers, 1-based, "
                            "e.g. '1,2,-1'. Negative numbers count from the "
                            "end (-1 is the last page). Omit to extract all pages."
                        ),
                    },
                },
                "required": ["pdf_path"],
            },
        )
    ]


@server.call_tool()
async def handle_call_tool(
    name: str, arguments: dict | None
) -> list[types.TextContent | types.ImageContent | types.EmbeddedResource]:
    """Dispatch a tool call and return its result as text content.

    Args:
        name: Tool name; only ``"extract-pdf-contents"`` is handled.
        arguments: Tool arguments. ``pdf_path`` is required, ``pages`` is
            optional.

    Returns:
        A one-element list holding a ``TextContent`` with the extracted text.

    Raises:
        ValueError: If the tool name is unknown, the arguments are missing or
            empty, ``pdf_path`` is missing or empty, or extraction fails.
    """
    import asyncio

    if name != "extract-pdf-contents":
        raise ValueError(f"Unknown tool: {name}")
    if not arguments:
        raise ValueError("Missing arguments")

    pdf_path = arguments.get("pdf_path")
    if not pdf_path:
        raise ValueError("Missing file path")

    # The schema asks for a string, but clients sometimes send a bare number
    # or a list of numbers; normalise those to the comma-separated form the
    # extractor parses instead of failing later on ``.split``.
    pages = arguments.get("pages")
    if isinstance(pages, (list, tuple)):
        pages = ",".join(str(item) for item in pages)
    elif pages is not None and not isinstance(pages, str):
        pages = str(pages)

    extractor = PDFExtractor()
    # Extraction is synchronous and can be slow (OCR); run it in a worker
    # thread so the event loop keeps servicing the stdio transport.
    extracted_text = await asyncio.to_thread(
        extractor.extract_content, pdf_path, pages
    )
    return [
        types.TextContent(
            type="text",
            text=extracted_text,
        )
    ]


# Server entry point
async def main():
    """Run the MCP server over stdin/stdout until the streams close."""
    async with mcp.server.stdio.stdio_server() as (read_stream, write_stream):
        await server.run(
            read_stream,
            write_stream,
            InitializationOptions(
                server_name="pdf_extraction",
                server_version="0.1.0",
                capabilities=server.get_capabilities(
                    notification_options=NotificationOptions(),
                    experimental_capabilities={},
                ),
            ),
        )
-------------------------------------------------------------------------------- ```python 1 | from PyPDF2 import PdfReader 2 | from pytesseract import image_to_string 3 | from PIL import Image 4 | import fitz # PyMuPDF 5 | import io 6 | from typing import List, Optional 7 | 8 | 9 | class PDFExtractor: 10 | """PDF内容提取器,支持普通PDF和扫描件""" 11 | 12 | def __init__(self): 13 | pass 14 | 15 | def is_scanned_pdf(self, pdf_path: str) -> bool: 16 | """检查PDF是否为扫描件(图片格式)""" 17 | reader = PdfReader(pdf_path) 18 | for page in reader.pages: 19 | if page.extract_text().strip(): 20 | return False 21 | return True 22 | 23 | def extract_text_from_scanned(self, pdf_path: str, pages: List[int]) -> str: 24 | """使用OCR从扫描件PDF中提取文本""" 25 | doc = fitz.open(pdf_path) 26 | extracted_text = [] 27 | 28 | for page_num in pages: 29 | page = doc.load_page(page_num) 30 | pix = page.get_pixmap() 31 | img = Image.open(io.BytesIO(pix.tobytes())) 32 | 33 | # OCR支持中文和英文 34 | text = image_to_string(img, lang='chi_sim+eng') 35 | extracted_text.append(f"Page {page_num + 1}:\n{text}") 36 | 37 | return "\n\n".join(extracted_text) 38 | 39 | def extract_text_from_normal(self, pdf_path: str, pages: List[int]) -> str: 40 | """从普通PDF中提取文本""" 41 | reader = PdfReader(pdf_path) 42 | extracted_text = [] 43 | 44 | for page_num in pages: 45 | page = reader.pages[page_num] 46 | extracted_text.append(f"Page {page_num + 1}:\n{page.extract_text()}") 47 | 48 | return "\n\n".join(extracted_text) 49 | 50 | def parse_pages(self, pages_str: Optional[str], total_pages: int) -> List[int]: 51 | """解析页码字符串""" 52 | if not pages_str: 53 | return list(range(total_pages)) 54 | 55 | pages = [] 56 | for part in pages_str.split(','): 57 | if not part.strip(): 58 | continue 59 | try: 60 | page_num = int(part.strip()) 61 | if page_num < 0: 62 | page_num = total_pages + page_num 63 | elif page_num > 0: 64 | page_num = page_num - 1 65 | else: 66 | raise ValueError("PDF页码不能为0") 67 | if 0 <= page_num < total_pages: 68 | pages.append(page_num) 69 | 
except ValueError: 70 | continue 71 | return sorted(set(pages)) 72 | 73 | def extract_content(self, pdf_path: str, pages: Optional[str]) -> List[str]: 74 | """提取PDF内容的主方法""" 75 | if not pdf_path: 76 | raise ValueError("PDF路径不能为空") 77 | 78 | try: 79 | # 检查是否为扫描件 80 | is_scanned = self.is_scanned_pdf(pdf_path) 81 | 82 | # 解析页码 83 | reader = PdfReader(pdf_path) 84 | total_pages = len(reader.pages) 85 | selected_pages = self.parse_pages(pages, total_pages) 86 | 87 | # 根据PDF类型选择提取方式 88 | if is_scanned: 89 | text = self.extract_text_from_scanned(pdf_path, selected_pages) 90 | else: 91 | text = self.extract_text_from_normal(pdf_path, selected_pages) 92 | 93 | return text 94 | except Exception as e: 95 | raise ValueError(f"提取PDF内容失败: {str(e)}") ```