# Directory Structure ``` ├── .gitignore ├── .python-version ├── .vscode │ └── settings.json ├── cursor-rule-example.mdc ├── data │ └── .gitignore ├── LICENSE ├── pdf_converter.py ├── pyproject.toml ├── README.md ├── server.py └── uv.lock ``` # Files -------------------------------------------------------------------------------- /.python-version: -------------------------------------------------------------------------------- ``` 3.12.10 ``` -------------------------------------------------------------------------------- /data/.gitignore: -------------------------------------------------------------------------------- ``` * !.gitignore ``` -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- ``` # Python-generated files __pycache__/ *.py[oc] build/ dist/ wheels/ *.egg-info # Virtual environments .venv .embeddings/ .DS_Store .env mlruns/ mlartifacts/ .ruff_cache/ ``` -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- ```markdown # MCP PDF Reader A Model Context Protocol (MCP) server that provides tools for reading and processing PDF documents. Built with Docling for document conversion and text extraction. ## Features - **MCP Server** with tools for PDF document processing - **Document Text Extraction**: Convert PDF content to clean Markdown format - **Document Discovery**: List and access available PDF files ## Tools The server provides two main tools: - **`get_document_list`**: Returns a list of all available PDF files in the data directory - **`get_document_text`**: Extracts and returns the full text content of a specified PDF file in Markdown format ## Install Make sure you have [`uv` installed](https://docs.astral.sh/uv/getting-started/installation/). 
Clone the repository: ```bash git clone git@github.com:mlexpertio/mcp-pdf-reader.git cd mcp-pdf-reader ``` Install Python: ```bash uv python install 3.12.10 ``` Create and activate a virtual environment: ```bash uv venv source .venv/bin/activate ``` Install dependencies: ```bash uv sync ``` ## Usage ### Add PDF Documents Place your PDF files in the `data/` directory. The server will automatically detect and make them available through the tools. ### Run MCP Server Start the MCP server: ```bash python server.py ``` The server runs using stdio transport and can be integrated with any MCP-compatible client. ### Development and Testing Use the MCP inspector to test the server: ```bash mcp dev server.py ``` This will open a web interface where you can test the available tools and inspect their responses. ## Use in VSCode/Cursor You can use the MCP integration in your editor. Go to `Tools & Integrations` -> `New MCP Server` and edit the `mcp.json` file to include the following: ```json { "mcpServers": { "pdf-reader": { "command": "/opt/homebrew/bin/uv", // path to your uv binary "args": ["run", "--directory", "PATH_TO_YOUR_PROJECT", "server.py"] } } } ``` ## License See LICENSE file for details. 
``` -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- ```toml [project] name = "mcp-pdf-reader" version = "0.1.0" description = "Local MCP server that extract text from PDF files" readme = "README.md" requires-python = ">=3.12.10" dependencies = [ "docling>=2.43.0", "mcp[cli]>=1.12.3", "pypdfium2>=4.30.0", ] [dependency-groups] dev = [ "ruff>=0.12.7", ] ``` -------------------------------------------------------------------------------- /.vscode/settings.json: -------------------------------------------------------------------------------- ```json { "[python]": { "editor.formatOnSave": true, "editor.defaultFormatter": "charliermarsh.ruff", "editor.codeActionsOnSave": { "source.fixAll": "explicit", "source.organizeImports": "explicit" } }, "notebook.formatOnSave.enabled": true, "notebook.codeActionsOnSave": { "notebook.source.fixAll": "explicit", "notebook.source.organizeImports": "explicit", }, "terminal.integrated.fontFamily": "MesloLGS NF", "terminal.integrated.fontLigatures.enabled": true, "jupyter.interactiveWindow.textEditor.executeSelection": true, "python.analysis.autoImportCompletions": true, "python.analysis.typeCheckingMode": "basic" } ``` -------------------------------------------------------------------------------- /server.py: -------------------------------------------------------------------------------- ```python import os from pathlib import Path from mcp.server.fastmcp import FastMCP from pdf_converter import convert_to_markdown, create_pdf_converter mcp = FastMCP() APP_HOME = Path(os.getenv("APP_HOME", Path(__file__).parent)) DATA_DIR = APP_HOME / "data" doc_converter = create_pdf_converter() @mcp.tool() def get_document_text(filename: str) -> str: """ Use this tool to get the content of a document given its filename. Args: filename (str): The filename of the document to get. 
Returns: str: The content of the document in Markdown format. """ doc_path = DATA_DIR / filename return convert_to_markdown(doc_path, doc_converter) @mcp.tool() def get_document_list() -> list[str]: """ Use this tool to get the list of documents. Returns: list[str]: The list of document filenames. """ return sorted([str(path.name) for path in DATA_DIR.glob("*.pdf")]) if __name__ == "__main__": mcp.run(transport="stdio") ``` -------------------------------------------------------------------------------- /pdf_converter.py: -------------------------------------------------------------------------------- ```python from pathlib import Path from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend from docling.datamodel.base_models import InputFormat from docling.datamodel.pipeline_options import PdfPipelineOptions, TableStructureOptions from docling.document_converter import DocumentConverter, PdfFormatOption def create_pdf_converter() -> DocumentConverter: return DocumentConverter( allowed_formats=[InputFormat.PDF], format_options={ InputFormat.PDF: PdfFormatOption( pipeline_options=PdfPipelineOptions( do_ocr=False, do_table_structure=False, table_structure_options=TableStructureOptions( do_cell_matching=False ), ), backend=PyPdfiumDocumentBackend, ) }, ) def convert_to_markdown(pdf_path: Path, converter: DocumentConverter) -> str: document = converter.convert(pdf_path).document return document.export_to_markdown() ```