# Directory Structure ``` ├── .github │ ├── dependabot.yaml │ └── workflows │ └── build.yml ├── .gitignore ├── docker-compose.yml ├── docs │ ├── images │ │ └── web-usage.png │ ├── search.md │ └── web.md ├── LICENSE ├── README.md └── server ├── .dockerignore ├── .python-version ├── Dockerfile ├── entrypoint.sh ├── mcp_server │ ├── __init__.py │ ├── __main__.py │ ├── server.py │ └── tools │ ├── __init__.py │ ├── helpers.py │ ├── search.py │ └── web.py ├── pyproject.toml ├── tests │ ├── __init__.py │ ├── conftest.py │ ├── test_server.py │ └── test_web.py ├── tools.yaml └── uv.lock ``` # Files -------------------------------------------------------------------------------- /server/.python-version: -------------------------------------------------------------------------------- ``` 1 | 3.13 2 | ``` -------------------------------------------------------------------------------- /server/.dockerignore: -------------------------------------------------------------------------------- ``` 1 | .dockerignore 2 | .git 3 | .github 4 | .gitignore 5 | .venv 6 | .ruff_cache 7 | .pytest_cache 8 | __pycache__ 9 | docs 10 | Dockerfile 11 | ``` -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- ``` 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | .ruff_cache/ 4 | 5 | # Environments 6 | .env 7 | .venv 8 | env/ 9 | venv/ 10 | ENV/ 11 | env.bak/ 12 | venv.bak/ 13 | 14 | # Temporary logs 15 | *.log 16 | 17 | # Sphinx documentation 18 | docs/_build/ 19 | ``` -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- ```markdown 1 | # MCP Server 2 | 3 | Give your AI assistants the power to help you more effectively. 
This server lets them safely access 4 | websites and search the web - with clear feedback about what's happening and helpful error messages 5 | when things go wrong. 6 | 7 | - [🛠️ What tools does this server offer?](#️-what-tools-does-this-server-offer) 8 | - [🏎️ How can I run it?](#️-how-can-i-run-it) 9 | - [🐋 Using Docker (recommended)](#-using-docker-recommended) 10 | - [💻 Running locally](#-running-locally) 11 | - [🔌 How to connect](#-how-to-connect) 12 | - [📚 Learn more about MCP](#-learn-more-about-mcp) 13 | - [📄 License](#-license) 14 | 15 | ## 🛠️ What tools does this server offer? 16 | 17 | The server provides two powerful tools that help AI assistants solve real-world problems: 18 | 19 | | Tool | What it can do | 20 | | ------------------ | --------------------------------------------------------------------------- | 21 | | [Search](docs/search.md) | Search the web via SearXNG for current information, specific resources, or to perform calculations. | 22 | | [Web](docs/web.md) | Access websites and process their content. Can convert pages to markdown for easy reading, get the raw content, or extract links. | 23 | 24 | ## 🏎️ How can I run it? 25 | 26 | ### 🐋 Using Docker (recommended) 27 | 28 | The server runs in Docker containers to keep things safe and simple. Here's how to get started: 29 | 30 | 1. [Install Docker](https://docs.docker.com/engine/install/) if you haven't already 31 | 2. Create a file called `docker-compose.yml` with: 32 | 33 | ```yaml:docker-compose.yml 34 | services: 35 | mcp-server: 36 | environment: 37 | # Required: URL for your SearXNG instance's Search API 38 | - SEARXNG_QUERY_URL=http://searxng:8080 39 | # Optional: Configure network mode (SSE) for LibreChat etc. 
40 | - SSE_HOST=0.0.0.0 41 | - SSE_PORT=8080 42 | # Optional: Set a custom User-Agent for web requests 43 | - USER_AGENT=MCP-Server/1.0 (github.com/tcpipuk/mcp-server) 44 | image: ghcr.io/tcpipuk/mcp-server/server:latest 45 | ports: # Only needed if using SSE_HOST/SSE_PORT 46 | - "8080:8080" # Expose port 8080 on host 47 | restart: unless-stopped 48 | stop_grace_period: 1s 49 | 50 | # Example SearXNG service (optional, adapt as needed) 51 | # searxng: 52 | # environment: 53 | # - SEARXNG_BASE_URL=http://searxng:8080 # Ensure SearXNG knows its own URL 54 | # image: searxng/searxng:latest 55 | # restart: unless-stopped 56 | # volumes: 57 | # - ./searxng:/etc/searxng:rw 58 | ``` 59 | 60 | > **Important**: You *must* provide the `SEARXNG_QUERY_URL` environment variable, pointing to 61 | > the Search API endpoint of your SearXNG instance (usually ending in `/` or `/search`). 62 | > 63 | > Setting `SSE_HOST` and `SSE_PORT` enables network mode (Server-Sent Events), recommended for 64 | > multi-container setups like LibreChat. If omitted, the server uses standard I/O. 65 | 66 | 3. Run `docker compose up -d` to start the server container (and optionally SearXNG). 67 | 68 | Most people use this with either: 69 | 70 | - [Claude Desktop](https://modelcontextprotocol.io/quickstart/user) - connects directly via stdio 71 | (omit `SSE_HOST`/`SSE_PORT` in `docker-compose.yml`). 72 | - [LibreChat](https://www.librechat.ai/docs/local) - connects over the network via SSE. 73 | 74 | For LibreChat, add this to your `librechat.yaml` (assuming `SSE_PORT=8080`): 75 | 76 | ```yaml:librechat.yaml 77 | mcpServers: 78 | mcp-server: 79 | iconPath: "/path/to/icon.png" # Optional: Custom icon 80 | label: "MCP Web/Search" # Optional: Custom label shown in UI 81 | type: sse 82 | url: http://mcp-server:8080/sse # Adjust host/port if needed 83 | ``` 84 | 85 | ### 💻 Running locally 86 | 87 | 1. 
Install `uv` (requires Python 3.13+): 88 | 89 | ```bash 90 | curl -LsSf https://astral.sh/uv/install.sh | sh 91 | ``` 92 | 93 | > **Note:** If you already have `uv` installed, update it with `uv self update`. 94 | 95 | 2. Create and activate a virtual environment: 96 | 97 | ```bash 98 | uv venv 99 | source .venv/bin/activate # Linux/macOS 100 | # or 101 | .venv\Scripts\activate # Windows 102 | ``` 103 | 104 | 3. Install dependencies from the lockfile: 105 | 106 | ```bash 107 | uv sync 108 | ``` 109 | 110 | 4. Set required environment variables: 111 | 112 | ```bash 113 | # Required: URL for your SearXNG instance's Search API 114 | export SEARXNG_QUERY_URL="http://your-searxng-instance.local:8080" 115 | # Optional: Custom User-Agent 116 | export USER_AGENT="CustomAgent/1.0" 117 | ``` 118 | 119 | 5. Run the server: 120 | 121 | ```bash 122 | # For network (SSE) mode (e.g., for LibreChat) 123 | mcp-server --sse-host 0.0.0.0 --sse-port 3001 124 | 125 | # For direct stdio mode (e.g., for Claude Desktop) 126 | mcp-server 127 | ``` 128 | 129 | Available arguments: 130 | 131 | - `--sse-host`: SSE listening address (e.g., `0.0.0.0`). Enables SSE mode. 132 | - `--sse-port`: SSE listening port (e.g., `3001`). Enables SSE mode. 133 | - `--user-agent`: Custom User-Agent string (overrides `USER_AGENT` env var). 134 | 135 | > **Note**: If neither `--sse-host` nor `--sse-port` are provided (and `SSE_HOST`/`SSE_PORT` env 136 | > vars are not set), the server defaults to `stdio` mode. The `SEARXNG_QUERY_URL` environment 137 | > variable is *always* required. 138 | 139 | ## 🔌 How to connect 140 | 141 | You can connect to the server in two ways: 142 | 143 | | Method | What it means | When to use it | 144 | | ------------------------- | ------------------------------------------------------- | ----------------------------------------------- | 145 | | Network connection (SSE) | The server listens on a network port for connections. | Best for LibreChat or other networked clients. 
| 146 | | Direct connection (stdio) | The server communicates directly via standard input/out. | Useful for local testing or Claude Desktop. | 147 | 148 | ## 📚 Learn more about MCP 149 | 150 | Here are a few resources to get you started: 151 | 152 | - [MCP Specification](https://spec.modelcontextprotocol.io/) 153 | - [MCP Python SDK](https://github.com/modelcontextprotocol/python-sdk) 154 | - [MCP Example Servers](https://github.com/modelcontextprotocol/servers) 155 | 156 | ## 📄 License 157 | 158 | This project is licensed under the GPLv3. See the [LICENSE](LICENSE) file for full details. 159 | ``` -------------------------------------------------------------------------------- /server/tests/__init__.py: -------------------------------------------------------------------------------- ```python 1 | """Test suite for MCP Server.""" 2 | ``` -------------------------------------------------------------------------------- /server/entrypoint.sh: -------------------------------------------------------------------------------- ```bash 1 | #!/bin/bash 2 | set -e 3 | 4 | # Then run the main command 5 | if [ "$BUILD_ENV" = "dev" ]; then 6 | pytest -v --log-cli-level=INFO tests/ 7 | else 8 | exec mcp-server 9 | fi 10 | ``` -------------------------------------------------------------------------------- /.github/dependabot.yaml: -------------------------------------------------------------------------------- ```yaml 1 | version: 2 2 | updates: 3 | - package-ecosystem: "pip" 4 | directory: "/" 5 | schedule: 6 | interval: "weekly" 7 | 8 | - package-ecosystem: "docker" 9 | directory: "/" 10 | schedule: 11 | interval: "weekly" 12 | 13 | - package-ecosystem: "github-actions" 14 | directory: "/" 15 | schedule: 16 | interval: "weekly" 17 | ``` -------------------------------------------------------------------------------- /server/mcp_server/__init__.py: -------------------------------------------------------------------------------- ```python 1 | """MCP Fetch Server module for 
handling web content retrieval. 2 | 3 | This module provides HTTP fetching capabilities for the Model Context Protocol (MCP) framework, 4 | allowing models to retrieve and process web content in a controlled manner. 5 | """ 6 | 7 | from __future__ import annotations 8 | 9 | from .__main__ import main 10 | from .server import MCPServer 11 | 12 | __all__ = ["MCPServer", "main"] 13 | ``` -------------------------------------------------------------------------------- /server/mcp_server/tools/__init__.py: -------------------------------------------------------------------------------- ```python 1 | """Tools submodule package for mcp_server. 2 | 3 | Provides tools that let AI assistants safely interact with external systems: 4 | 5 | - search: Use SearXNG's search API to find information on the web 6 | - web: Access and process web content with support for markdown conversion and link extraction 7 | 8 | Each tool is designed to handle errors gracefully and provide clear feedback to help AI 9 | assistants solve problems independently. 
10 | """
11 | 
12 | from __future__ import annotations
13 | 
14 | from .search import tool_search
15 | from .web import tool_web
16 | 
17 | __all__ = ["tool_search", "tool_web"]
18 | 
```
--------------------------------------------------------------------------------
/docker-compose.yml:
--------------------------------------------------------------------------------
```yaml
1 | services:
2 |   mcp-server:
3 |     build:
4 |       context: ./server
5 |       dockerfile: Dockerfile
6 |     environment:
7 |       - SSE_HOST=0.0.0.0
8 |       - SSE_PORT=8080
9 |       - SANDBOX_SOCKET=/run/sandbox/shell.sock
10 |       - USER_AGENT=CustomAgent/1.0
11 |     volumes:
12 |       - sandbox_sockets:/run/sandbox
13 |     image: ghcr.io/tcpipuk/mcp-server/server:latest
14 |     networks:
15 |       - mcp_net
16 |     restart: unless-stopped
17 |     stop_grace_period: 1s
18 | 
19 |   sandbox:
20 |     build:
21 |       context: ./sandbox
22 |       dockerfile: Dockerfile
23 |     environment:
24 |       - SANDBOX_SOCKET=/run/sandbox/shell.sock
25 |     image: ghcr.io/tcpipuk/mcp-sandbox:latest
26 |     volumes:
27 |       - sandbox_home:/home/sandbox
28 |       - sandbox_sockets:/run/sandbox
29 |     networks:
30 |       - mcp_net
31 |     restart: unless-stopped
32 | 
33 | # Every network and named volume mounted above must also be declared here:
34 | # `mcp_net` and `sandbox_sockets` were referenced but never defined, which
35 | # makes `docker compose up` fail validation.
36 | networks:
37 |   mcp_net:
38 | 
39 | volumes:
40 |   sandbox_home:
41 |   sandbox_sockets:
```
--------------------------------------------------------------------------------
/server/tests/conftest.py:
--------------------------------------------------------------------------------
```python
1 | """Configure pytest for the test suite."""
2 | 
3 | from __future__ import annotations
4 | 
5 | from asyncio import create_subprocess_exec, sleep as asyncio_sleep
6 | from os import setsid as os_setsid
7 | from typing import TYPE_CHECKING
8 | 
9 | import pytest
10 | import pytest_asyncio
11 | 
12 | if TYPE_CHECKING:
13 |     from collections.abc import AsyncGenerator
14 | 
15 | 
16 | @pytest.fixture(autouse=True)
17 | def _setup_test_env() -> None:
18 |     """Set up test environment variables and cleanup."""
19 | 
20 | 
21 | @pytest_asyncio.fixture
22 | async def sandbox_server(unused_tcp_port: int) -> AsyncGenerator[tuple[str, 
int]]:
23 |     """Create a socat-based TCP server for sandbox testing.
24 | 
25 |     Yields:
26 |         Tuple of (host, port) for the test server
27 |     """
28 |     # Start socat in the background, echoing input back
29 |     process = await create_subprocess_exec(
30 |         "/usr/bin/socat",
31 |         f"TCP-LISTEN:{unused_tcp_port},reuseaddr,fork",
32 |         "EXEC:'bash -i',pty,stderr,setsid,sigint,sane",
33 |         preexec_fn=os_setsid,
34 |     )
35 | 
36 |     # Give socat a moment to start up
37 |     await asyncio_sleep(0.2)
38 | 
39 |     try:
40 |         yield "127.0.0.1", unused_tcp_port
41 |     finally:
42 |         process.terminate()
43 |         await process.wait()
44 | 
```
--------------------------------------------------------------------------------
/server/Dockerfile:
--------------------------------------------------------------------------------
```dockerfile
1 | # Build stage using uv with a frozen lockfile and dependency caching
2 | FROM ghcr.io/astral-sh/uv:python3.13-bookworm-slim AS uv
3 | WORKDIR /app
4 | 
  | # BUILD_ENV must be declared in this stage too: an ARG declared only in a
  | # later stage is out of scope here, so the CI `build-args: BUILD_ENV=dev`
  | # never reached the `uv sync` calls and dev deps (pytest) were not installed.
  | ARG BUILD_ENV=prod
  | 
5 | # Enable bytecode compilation and copy mode
6 | ENV UV_COMPILE_BYTECODE=1 \
7 |     UV_LINK_MODE=copy
8 | 
9 | # Install dependencies using the lockfile and settings
10 | COPY pyproject.toml uv.lock ./
11 | RUN --mount=type=cache,target=/root/.cache/uv \
12 |     uv sync --frozen --no-install-project $(if [ "$BUILD_ENV" = "dev" ]; then echo --dev; fi) --no-editable
13 | 
14 | # Add the source code and install main project dependencies
15 | COPY . . 
16 | RUN --mount=type=cache,target=/root/.cache/uv \
17 |     uv sync --frozen $(if [ "$BUILD_ENV" = "dev" ]; then echo --dev; fi) --no-editable
18 | 
19 | # Prepare runtime image
20 | FROM python:3.13-slim-bookworm AS runtime
21 | WORKDIR /app
22 | 
23 | # Set default build environment
24 | ARG BUILD_ENV=prod
  | # Persist the build-arg for entrypoint.sh: ARG is build-time only, so the
  | # runtime script would otherwise only see BUILD_ENV when passed via `-e`.
  | ENV BUILD_ENV=${BUILD_ENV}
25 | 
26 | # Install minimal system dependencies and create runtime user
27 | RUN apt-get update \
28 |     && apt-get install -y --no-install-recommends socat \
29 |     && rm -rf /var/lib/apt/lists/* \
30 |     && groupadd -g 1000 appuser \
31 |     && useradd -u 1000 -g 1000 -m appuser
32 | 
33 | # Copy only necessary files from build stage
34 | COPY --from=uv --chown=appuser:appuser /app/ .
35 | 
36 | # Switch to non-root user
37 | USER appuser
38 | 
39 | # Set environment variables for runtime
40 | ENV PATH="/app/.venv/bin:$PATH" \
41 |     PYTHONDONTWRITEBYTECODE=1 \
42 |     PYTHONUNBUFFERED=1
43 | 
44 | # Use wrapper script to handle startup
45 | ENTRYPOINT ["/app/entrypoint.sh"]
```
--------------------------------------------------------------------------------
/server/pyproject.toml:
--------------------------------------------------------------------------------
```toml
1 | [project]
2 | name = "mcp-server"
3 | version = "0.1.0"
4 | description = "Provides tools to clients over the Model Context Protocol, supporting both stdio and SSE"
5 | requires-python = ">=3.13"
6 | authors = [{ name = "Tom Foster" }]
7 | maintainers = [{ name = "Tom Foster", email = "[email protected]" }]
8 | keywords = ["http", "mcp", "llm", "automation"]
9 | license = { text = "GPLv3" }
10 | classifiers = [
11 |     "Development Status :: 4 - Beta",
12 |     "Intended Audience :: Developers",
13 |     "License :: OSI Approved :: GNU General Public License v3 (GPLv3)",
14 |     "Programming Language :: Python :: 3",
15 |     "Programming Language :: Python :: 3.13",
16 | ]
17 | dependencies = [
18 |     "aiohttp>=3.11.12",
19 |     "beautifulsoup4>=4.13.3",
20 |     "mcp>=1.2.1",
21 |     "pyyaml>=6.0.2",
22 |     "trafilatura>=2.0.0",
23 |     "uvicorn>=0.34.0",
24 | ]
25 | 
26 | [project.scripts] 27 | mcp-server = "mcp_server:main" 28 | 29 | [build-system] 30 | requires = ["hatchling"] 31 | build-backend = "hatchling.build" 32 | 33 | [tool.pytest.ini_options] 34 | addopts = "-ra -v" 35 | asyncio_mode = "strict" 36 | asyncio_default_fixture_loop_scope = "session" 37 | cache_dir = "/tmp/.pytest_cache" 38 | filterwarnings = [ 39 | "ignore:assertions not in test modules or plugins will be ignored:pytest.PytestConfigWarning", 40 | ] 41 | testpaths = "tests" 42 | 43 | [tool.ruff] 44 | cache-dir = "/tmp/.cache/ruff" 45 | fix = true 46 | line-length = 110 47 | target-version = "py313" 48 | unsafe-fixes = true 49 | 50 | [tool.ruff.format] 51 | skip-magic-trailing-comma = true 52 | 53 | [tool.ruff.lint] 54 | select = ["ALL"] 55 | ignore = ["COM812", "CPY", "D203", "D213", "FBT", "RUF029"] 56 | 57 | [tool.ruff.lint.isort] 58 | combine-as-imports = true 59 | required-imports = ["from __future__ import annotations"] 60 | split-on-trailing-comma = false 61 | 62 | [tool.ruff.lint.per-file-ignores] 63 | "tests/*" = ["ARG001"] 64 | 65 | [tool.ruff.lint.pydocstyle] 66 | convention = "google" 67 | 68 | [tool.uv] 69 | dev-dependencies = [ 70 | "psutil>=7.0.0", 71 | "pytest>=8.3.4", 72 | "pytest-asyncio>=0.25.3", 73 | "ruff>=0.9.6", 74 | ] 75 | ``` -------------------------------------------------------------------------------- /server/mcp_server/__main__.py: -------------------------------------------------------------------------------- ```python 1 | """Command-line entry point for the MCP fetch server. 2 | 3 | Provides configuration options for running the fetch server, including customisation 4 | of the User-Agent string for HTTP requests. The server runs asynchronously to handle 5 | concurrent requests efficiently. 
6 | """ 7 | 8 | from __future__ import annotations 9 | 10 | from argparse import ArgumentParser 11 | from asyncio import CancelledError, run as asyncio_run 12 | from contextlib import suppress as contextlib_suppress 13 | from os import environ as os_environ 14 | from pathlib import Path 15 | 16 | from yaml import safe_load as yaml_safe_load 17 | 18 | from .server import MCPServer 19 | from .tools import tool_search, tool_web 20 | 21 | 22 | def main() -> None: 23 | """Provide command-line entrypoint for the MCP fetch server.""" 24 | parser = ArgumentParser(description="Give your LLM access to external tools") 25 | parser.add_argument("--sandbox", type=str, help="TCP host:port pair (e.g. mcp-sandbox:8080)") 26 | parser.add_argument("--sse-host", type=str, help="SSE listening address (e.g. 0.0.0.0)") 27 | parser.add_argument("--sse-port", type=int, help="SSE listening port (e.g. 3001)") 28 | parser.add_argument("--user-agent", type=str, help="Custom User-Agent string") 29 | parser.add_argument("--searxng-query-url", type=str, help="URL for SearXNG search endpoint") 30 | args = parser.parse_args() 31 | 32 | if args.sandbox: 33 | os_environ["SANDBOX"] = args.sandbox 34 | if args.sse_host: 35 | os_environ["SSE_HOST"] = args.sse_host 36 | if args.sse_port: 37 | os_environ["SSE_PORT"] = str(args.sse_port) 38 | if args.user_agent: 39 | os_environ["USER_AGENT"] = args.user_agent 40 | if args.searxng_query_url: 41 | os_environ["SEARXNG_QUERY_URL"] = args.searxng_query_url 42 | 43 | config = yaml_safe_load(Path("tools.yaml").read_text(encoding="utf-8")) 44 | config["tools"]["search"]["method"] = tool_search 45 | config["tools"]["web"]["method"] = tool_web 46 | # Remove the sandbox tool if there's no sandbox 47 | if not os_environ.get("SANDBOX") and "sandbox" in config["tools"]: 48 | del config["tools"]["sandbox"] 49 | server = MCPServer(config) 50 | with contextlib_suppress(KeyboardInterrupt, CancelledError): 51 | asyncio_run(server.serve()) 52 | 53 | 54 | if __name__ == 
"__main__": 55 | main() 56 | ``` -------------------------------------------------------------------------------- /docs/search.md: -------------------------------------------------------------------------------- ```markdown 1 | # Search Tool 2 | 3 | - [Capabilities](#capabilities) 4 | - [Refining the Search](#refining-the-search) 5 | - [The Query (`q`)](#the-query-q) 6 | - [Filtering by Time (`time_range`)](#filtering-by-time-time_range) 7 | - [Content Safety (`safesearch`)](#content-safety-safesearch) 8 | - [Technical Details](#technical-details) 9 | 10 | Provides the AI assistant with web search capabilities via a SearXNG instance. It allows the AI to 11 | fetch current information, look up specific resources, and perform other search-related tasks. 12 | 13 | ## Capabilities 14 | 15 | The tool enables the AI assistant to perform tasks requiring external information lookup, such as: 16 | 17 | - Finding details on current events or recent developments. 18 | - Retrieving specific technical documentation or code examples. 19 | - Searching for various online content types (e.g., images, news). 20 | - Accessing specialised resources like scientific papers, package repositories (PyPI, npm), or Q&A 21 | sites (Stack Exchange). 22 | - Using WolframAlpha for calculations or fetching random data (UUIDs, numbers). 23 | - Calculating text hashes. 24 | 25 | ## Refining the Search 26 | 27 | The AI can tailor searches using the available parameters: 28 | 29 | ### The Query (`q`) 30 | 31 | The primary search input. Supports standard queries and SearXNG's specific syntax: 32 | 33 | - **Bang Prefixes (`!`):** Focuses the search on categories or engines (e.g. `!news`, `!images`, 34 | `!it`, `!repos`, `!pypi`, `!wa`, `!re`). Prefixes can be chained (e.g., `!it !q&a python async`). 35 | - **Keywords (No `!`):** Executes specific actions like calculations (`avg 1 2 3`), random data 36 | generation (`random uuid`), or hashing (`sha512 text`). 
37 | 38 | ### Filtering by Time (`time_range`) 39 | 40 | Restricts results to a specific period (`day`, `month`, `year`), where supported by the underlying 41 | SearXNG engines. 42 | 43 | ### Content Safety (`safesearch`) 44 | 45 | Adjusts the filtering level for potentially explicit content: `0` (Off), `1` (Moderate - default), 46 | or `2` (Strict), engine permitting. 47 | 48 | ## Technical Details 49 | 50 | Key aspects of the tool's operation: 51 | 52 | - **Backend:** Relies on the SearXNG instance specified by the server's `SEARXNG_QUERY_URL` 53 | environment variable. 54 | - **Output Format:** Returns results exclusively in JSON format for straightforward parsing by the AI. 55 | - **Request Handling:** Uses the common `get_request` helper function (shared with the `web` tool) 56 | for managing HTTP requests, including redirects, timeouts, and connection errors. Errors are 57 | reported back to the AI. 58 | - **Parameter Exposure:** Only the parameters defined in `tools.yaml` (`q`, `time_range`, 59 | `safesearch`) are available to the AI. 60 | 61 | This tool gives the AI assistant a mechanism to query a SearXNG instance, enabling access to 62 | real-time web information and specialised search functions. 
63 | 
```
--------------------------------------------------------------------------------
/.github/workflows/build.yml:
--------------------------------------------------------------------------------
```yaml
1 | name: Build MCP Server
2 | 
3 | concurrency:
4 |   group: ${{ github.workflow }}-${{ github.ref }}
5 |   cancel-in-progress: true
6 | 
7 | on:
8 |   pull_request:
9 |   push:
10 |     paths:
11 |       - "server/**"
12 |       - ".github/workflows/build.yml"
13 |   workflow_dispatch:
14 | 
15 | permissions:
16 |   contents: read
17 |   packages: write
18 |   pull-requests: write
19 |   actions: write
20 | 
21 | jobs:
22 |   test:
23 |     name: Pytest
24 |     runs-on: ubuntu-latest
25 |     steps:
26 |       - name: Checkout repository
27 |         uses: actions/checkout@v5
28 | 
29 |       - name: Set up Docker Buildx
30 |         uses: docker/setup-buildx-action@v3
31 | 
32 |       - name: Build test image
33 |         uses: docker/build-push-action@v6
34 |         env:
35 |           DOCKER_BUILD_SUMMARY: false
36 |           DOCKER_BUILD_RECORD_UPLOAD: false
37 |         with:
38 |           context: server
39 |           load: true
40 |           build-args: |
41 |             BUILD_ENV=dev
42 |           tags: mcp-server:test
43 |           cache-from: type=gha
44 |           cache-to: type=gha,mode=max
45 | 
46 |       - name: Run tests and output results
47 |         run: |
48 |           set -o pipefail
49 |           docker run --rm -e BUILD_ENV=dev mcp-server:test | tee pytest_output.txt
50 |           exit_code=${PIPESTATUS[0]}
51 |           echo '```' >> "$GITHUB_STEP_SUMMARY"
52 |           cat pytest_output.txt >> "$GITHUB_STEP_SUMMARY"
53 |           echo '```' >> "$GITHUB_STEP_SUMMARY"
54 |           exit $exit_code
55 | 
56 |   build:
57 |     name: Docker build
58 |     needs: test
59 |     runs-on: ubuntu-latest
60 |     steps:
61 |       - name: Checkout repository
62 |         uses: actions/checkout@v5
63 | 
64 |       - name: Set up Docker Buildx
65 |         uses: docker/setup-buildx-action@v3
66 | 
67 |       - name: Generate Docker metadata
68 |         id: meta
69 |         uses: docker/metadata-action@v5
70 |         with:
71 |           context: workflow
72 |           images: |
73 |             name=ghcr.io/${{ github.repository }}/server
74 |           tags: |
75 |             type=raw,value=latest,enable={{is_default_branch}}
76 
| type=ref,event=branch 77 | type=ref,event=pr 78 | type=ref,event=tag 79 | type=sha,enable={{is_default_branch}},prefix=${{ github.event_name == 'pull_request' && 'pr-' || github.ref_name }}- 80 | 81 | - name: Log in to GitHub Container Registry 82 | if: github.event_name != 'pull_request' 83 | uses: docker/login-action@v3 84 | with: 85 | registry: ghcr.io 86 | username: ${{ github.actor }} 87 | password: ${{ secrets.GITHUB_TOKEN }} 88 | 89 | - name: Build and push production image 90 | uses: docker/build-push-action@v6 91 | env: 92 | DOCKER_BUILD_RECORD_UPLOAD: false 93 | with: 94 | context: server 95 | push: ${{ github.event_name != 'pull_request' }} 96 | tags: ${{ steps.meta.outputs.tags }} 97 | labels: ${{ steps.meta.outputs.labels }} 98 | build-args: | 99 | BUILD_ENV=prod 100 | cache-from: type=gha 101 | cache-to: type=gha,mode=max 102 | ``` -------------------------------------------------------------------------------- /server/mcp_server/tools/helpers.py: -------------------------------------------------------------------------------- ```python 1 | """Helper functions for the MCP fetch server tools. 2 | 3 | Provides shared utilities for error handling and web content retrieval: 4 | 5 | - Error formatting: Consistent XML-style error messages for AI parsing 6 | - HTTP client: Robust web content fetching with configurable User-Agent 7 | - Error handling: Detailed error messages for common network issues 8 | 9 | All functions handle errors gracefully and provide clear feedback to help 10 | AI assistants understand and resolve issues independently. 
11 | """ 12 | 13 | from __future__ import annotations 14 | 15 | from os import getenv as os_getenv 16 | 17 | from aiohttp import ( 18 | ClientConnectionError, 19 | ClientError, 20 | ClientResponseError, 21 | ClientSession as AiohttpClientSession, 22 | ServerTimeoutError, 23 | TooManyRedirects, 24 | ) 25 | from mcp.shared.exceptions import McpError 26 | from mcp.types import INTERNAL_ERROR, ErrorData 27 | 28 | 29 | def add_error(text: str, error: str, append: bool = True) -> str: 30 | """Append an error message to the string. 31 | 32 | Args: 33 | text: The string to append the error to. 34 | error: The error message to append. 35 | append: Whether to append or prepend the error. 36 | 37 | Returns: 38 | The string with the error message appended. 39 | 40 | """ 41 | return f"{text}\n\n<error>{error}</error>" if append else f"<error>{error}</error>\n\n{text}" 42 | 43 | 44 | async def get_request(url: str) -> str: 45 | """Fetch content from a URL asynchronously. 46 | 47 | Args: 48 | url: The URL to fetch. 49 | 50 | Returns: 51 | The fetched content as a string. 52 | 53 | Raises: 54 | McpError: If fetching or processing fails. 
55 | 56 | """ 57 | errmsg: str = "" 58 | try: 59 | async with AiohttpClientSession( 60 | headers={ 61 | "User-Agent": os_getenv("USER_AGENT") 62 | or "Mozilla/5.0 (X11; Linux i686; rv:135.0) Gecko/20100101 Firefox/135.0" 63 | } 64 | ) as session: 65 | response = await session.get(url) 66 | response_text = (await response.text()).strip() 67 | if response.ok: 68 | if response_text: 69 | return response_text 70 | errmsg = f"Failed to fetch {url}: HTTP {response.status} with empty body" 71 | else: 72 | errmsg = f"Failed to fetch {url}: HTTP {response.status} ({response.reason})" 73 | except ServerTimeoutError as err: 74 | errmsg = f"Timeout while fetching {url}: {str(err)!r}" 75 | except ClientConnectionError as err: 76 | errmsg = f"Failed to connect to {url}: {str(err)!r}" 77 | except TooManyRedirects as err: 78 | errmsg = f"Too many redirects while fetching {url}: {str(err)!r}" 79 | except ClientResponseError as err: 80 | errmsg = f"HTTP error while fetching {url}: {err.status} - {err.message}" 81 | except ClientError as err: 82 | errmsg = f"Network error while fetching {url}: {str(err)!r}" 83 | except Exception as err: # noqa: BLE001 84 | errmsg = f"Unexpected error while fetching {url}: {str(err)!r}" 85 | 86 | raise McpError(ErrorData(code=INTERNAL_ERROR, message=errmsg)) 87 | ``` -------------------------------------------------------------------------------- /server/mcp_server/tools/search.py: -------------------------------------------------------------------------------- ```python 1 | """Provide a tool to query a SearXNG instance. 2 | 3 | Allows AI assistants to search the web using a configured SearXNG instance, 4 | leveraging its API for targeted and filtered searches. 
5 | """ 6 | 7 | from __future__ import annotations 8 | 9 | from os import getenv as os_getenv 10 | from typing import Any 11 | from urllib.parse import urlencode 12 | 13 | from mcp.shared.exceptions import McpError 14 | from mcp.types import INTERNAL_ERROR, INVALID_PARAMS, ErrorData 15 | 16 | from .helpers import get_request 17 | 18 | # Allowed parameters for the SearXNG API, excluding 'q' which is handled separately. 19 | ALLOWED_PARAMS: set[str] = { 20 | "categories", 21 | "engines", 22 | "language", 23 | "pageno", 24 | "time_range", 25 | "format", 26 | "safesearch", 27 | } 28 | 29 | 30 | async def tool_search(q: str, **kwargs: Any) -> str: 31 | """Query a SearXNG instance using its Search API. 32 | 33 | Args: 34 | q: The search query string. 35 | **kwargs: Additional optional parameters for the SearXNG API 36 | (categories, engines, language, pageno, time_range, format, safesearch). 37 | 38 | Returns: 39 | The search results as a string (content depends on the 'format' parameter). 40 | 41 | Raises: 42 | McpError: If the SEARXNG_QUERY_URL environment variable is not set, 43 | if invalid parameters are provided, or if the request fails. 44 | """ 45 | searxng_url = os_getenv("SEARXNG_QUERY_URL") 46 | if not searxng_url: 47 | raise McpError( 48 | ErrorData(code=INTERNAL_ERROR, message="SearXNG query URL is not configured on the server.") 49 | ) 50 | 51 | # Filter out any provided kwargs that are not valid SearXNG parameters 52 | search_params = {k: v for k, v in kwargs.items() if k in ALLOWED_PARAMS and v is not None} 53 | search_params["q"] = q # Add the mandatory query 54 | 55 | # Default format to json if not specified, as it's often easiest for programmatic use 56 | if "format" not in search_params: 57 | search_params["format"] = "json" 58 | 59 | # Validate format if provided 60 | if search_params["format"] not in ("json", "csv", "rss"): 61 | raise McpError( 62 | ErrorData( 63 | code=INVALID_PARAMS, 64 | message=f"Invalid format '{search_params['format']}'. 
Must be 'json', 'csv', or 'rss'.", 65 | ) 66 | ) 67 | 68 | query_string = urlencode(search_params) 69 | full_url = f"{searxng_url}?{query_string}" 70 | 71 | try: 72 | # Use the existing get_request helper 73 | result = await get_request(full_url) 74 | # Simple check for empty result which might indicate no results found 75 | # depending on the format requested. SearXNG JSON format includes metadata even for no results. 76 | if not result and search_params["format"] != "json": 77 | return f"No results found for query '{q}' with specified parameters." 78 | except McpError as e: 79 | # Re-raise McpError to ensure it's handled correctly by the server 80 | raise McpError(ErrorData(code=e.data.code, message=f"SearXNG query failed: {e.data.message}")) from e 81 | except Exception as e: 82 | # Catch any other unexpected errors during the request 83 | raise McpError( 84 | ErrorData(code=INTERNAL_ERROR, message=f"Unexpected error during SearXNG query: {e!r}") 85 | ) from e 86 | else: 87 | return result 88 | ``` -------------------------------------------------------------------------------- /server/tools.yaml: -------------------------------------------------------------------------------- ```yaml 1 | tools: 2 | search: 3 | description: > 4 | Use this tool to access SearXNG to search the internet for current information or to perform 5 | calculations. Use this tool when the user asks about recent events, technical details, to find 6 | content, or your task requires calculations. If the search summary doesn't clearly answer the 7 | question, you can read one of the search results by providing the URL to the `web` tool, or use 8 | this `search` tool again to make further narrower requests to gain context to help your answer. 9 | inputSchema: 10 | type: object 11 | properties: 12 | q: 13 | type: string 14 | description: | 15 | SearXNG search query. 
Use `!` prefixes for categories/engines (chainable to search multiple sources) followed by your query:
16 |             - General: `!news <query>`, `!map <place>`, `!images <keywords>`
17 |             - Multimedia: `!videos` (PeerTube/Vimeo/YouTube), `!music` (Bandcamp/SoundCloud/YouTube), `!lyrics`, `!yt` (YouTube specific)
18 |             - Files: `!files` (books/apps/torrents), `!1337x` or `!kc` or `!solid` or `!tpb` (Torrents), `!gpa` (Google Play), `!wcf` (Wikimedia Commons)
19 |             - IT/Dev: `!it` (all tech), `!repos` (Git repos), `!dh` (Docker Hub), `!q&a` (Stack Ex.), `!mdn` (Web Docs), `!software_wikis` (Linux/dev wikis)
20 |             - Packages: `!pypi` (Python), `!npm` (Node), `!crates` or `!lrs` (Rust), `!alp` (Alpine Linux)
21 |             - Science/Compute: `!scientific_publications` (arXiv/PubMed/etc), `!wa` (WolframAlpha calculations/facts/definitions)
22 |             - Social: `!re` (Reddit)
23 |             Special keywords (no `!`):
24 |             - Stats: `avg 1 2 3`, `max`, `min`, `sum`, `prod`
25 |             - Random: `random color`, `random int`, `random string`, `random uuid`
26 |             - Hash: `sha512 text`
27 |         time_range:
28 |           type: string
29 |           enum: ["day", "month", "year"]
30 |           description: Filter results by time range if supported
31 |         safesearch:
32 |           type: integer
33 |           enum: [0, 1, 2]
34 |           description: Safe search level (0=Off, 1=Moderate, 2=Strict) if supported
35 |           default: 1
36 |       required: ["q"]
37 |   web:
38 |     description: >
39 |       Use this tool to access live web pages using their URL. This is crucial for providing users
40 |       with accurate information from up-to-date sources. You will typically want to use `markdown`
41 |       to read content, or use 'links' mode to extract hyperlinks to find related pages on a site,
42 |       e.g. for navigating documentation.
43 |     inputSchema:
44 |       type: object
45 |       properties:
46 |         url:
47 |           type: string
48 |           description: URL to access - must be a complete and valid web address.
49 |         mode:
50 |           type: string
51 |           enum:
52 |             - markdown
53 |             - raw
54 |             - links
55 |           description: |
56 |             Processing mode:
57 |             - `markdown` (default) for clean readable text
58 |             - `links` to list all hyperlinks
59 |             - `raw` for unprocessed content (code, JSON, etc)
60 |           default: markdown
61 |         max_length:
62 |           type: integer
63 |           description: Optional character limit for the response (0 = no limit).
64 |           default: 0
65 |       required: ["url"]
66 | 
```
--------------------------------------------------------------------------------
/server/tests/test_web.py:
--------------------------------------------------------------------------------
```python
  1 | """Test the web content retrieval and processing tools."""
  2 | 
  3 | from __future__ import annotations
  4 | 
  5 | import pytest
  6 | from mcp.shared.exceptions import McpError
  7 | 
  8 | from mcp_server.tools.web import ProcessingMode, WebProcessor, tool_web
  9 | 
 10 | 
 11 | @pytest.fixture
 12 | def mock_html_content() -> str:
 13 |     """Return sample HTML content for testing.
14 | 15 | Returns: 16 | Sample HTML content as a string 17 | """ 18 | return """ 19 | <html> 20 | <body> 21 | <h1>Test Page</h1> 22 | <p>This is a test paragraph.</p> 23 | <a href="https://example.com">Example Link</a> 24 | <a href="/relative/path">Relative Link</a> 25 | <a href="#skip">Skip Link</a> 26 | <a href="javascript:void(0)">JavaScript Link</a> 27 | </body> 28 | </html> 29 | """ 30 | 31 | 32 | def test_processing_mode_from_str() -> None: 33 | """Test conversion of strings to ProcessingMode enum values.""" 34 | if ProcessingMode.from_str("markdown") != ProcessingMode.MARKDOWN: 35 | pytest.fail("Failed to convert 'markdown' to ProcessingMode.MARKDOWN") 36 | if ProcessingMode.from_str("raw") != ProcessingMode.RAW: 37 | pytest.fail("Failed to convert 'raw' to ProcessingMode.RAW") 38 | if ProcessingMode.from_str("links") != ProcessingMode.LINKS: 39 | pytest.fail("Failed to convert 'links' to ProcessingMode.LINKS") 40 | if ProcessingMode.from_str("invalid") != ProcessingMode.RAW: 41 | pytest.fail("Failed to convert invalid mode to ProcessingMode.RAW") 42 | 43 | 44 | @pytest.mark.asyncio 45 | async def test_web_processor_links(monkeypatch: pytest.MonkeyPatch, mock_html_content: str) -> None: 46 | """Test extraction and formatting of links from web content.""" 47 | 48 | async def mock_get_request(_url: str) -> str: 49 | return mock_html_content 50 | 51 | monkeypatch.setattr("mcp_server.tools.web.get_request", mock_get_request) 52 | 53 | processor = WebProcessor("https://test.com", mode=ProcessingMode.LINKS) 54 | result = await processor.process() 55 | 56 | if "Example Link: https://example.com" not in result: 57 | pytest.fail(f"Missing absolute link in output: {result}") 58 | if "https://test.com/relative/path" not in result: 59 | pytest.fail(f"Missing resolved relative link in output: {result}") 60 | if "#skip" in result: 61 | pytest.fail(f"Found invalid anchor link in output: {result}") 62 | if "javascript:void(0)" in result: 63 | pytest.fail(f"Found invalid 
JavaScript link in output: {result}") 64 | 65 | 66 | @pytest.mark.asyncio 67 | async def test_web_processor_markdown(monkeypatch: pytest.MonkeyPatch) -> None: 68 | """Test conversion of HTML content to markdown format.""" 69 | 70 | async def mock_get_request(_url: str) -> str: 71 | return """ 72 | <!DOCTYPE html> 73 | <html> 74 | <head><title>Test Page</title></head> 75 | <body> 76 | <article> 77 | <h1>Test Heading</h1> 78 | <p>This is a test paragraph with some <strong>bold text</strong>.</p> 79 | <p>And another paragraph for good measure.</p> 80 | </article> 81 | </body> 82 | </html> 83 | """ 84 | 85 | monkeypatch.setattr("mcp_server.tools.web.get_request", mock_get_request) 86 | 87 | processor = WebProcessor("https://test.com", mode=ProcessingMode.MARKDOWN) 88 | result = await processor.process() 89 | 90 | if "Test Heading" not in result: 91 | pytest.fail(f"Missing heading content in output: {result}") 92 | if "test paragraph" not in result: 93 | pytest.fail(f"Missing paragraph content in output: {result}") 94 | 95 | 96 | @pytest.mark.asyncio 97 | async def test_max_length_limit() -> None: 98 | """Test truncation of content based on max_length parameter.""" 99 | processor = WebProcessor("https://test.com", max_length=10) 100 | content = "This is a very long text that should be truncated" 101 | 102 | truncated = processor._format_links({"https://test.com": content}) # noqa: SLF001 103 | if len(truncated) > processor.max_length + 100: # Allow for header text 104 | pytest.fail(f"Content exceeds max length: {len(truncated)} > {processor.max_length + 100}") 105 | 106 | 107 | @pytest.mark.asyncio 108 | async def test_invalid_url() -> None: 109 | """Test error handling for invalid URLs.""" 110 | try: 111 | await tool_web("not-a-url") 112 | pytest.fail("Expected McpError for invalid URL") 113 | except McpError: 114 | pass 115 | 116 | 117 | @pytest.mark.asyncio 118 | async def test_empty_links() -> None: 119 | """Test error handling when no links are found.""" 120 | 
processor = WebProcessor("https://test.com", mode=ProcessingMode.LINKS) 121 | try: 122 | processor._format_links({}) # noqa: SLF001 123 | pytest.fail("Expected McpError for empty links") 124 | except McpError: 125 | pass 126 | ``` -------------------------------------------------------------------------------- /docs/web.md: -------------------------------------------------------------------------------- ```markdown 1 | # Web Tool 2 | 3 | 1. [What can it do?](#what-can-it-do) 4 | 2. [Processing Modes](#processing-modes) 5 | 1. [Markdown Mode (default)](#markdown-mode-default) 6 | 2. [Links Mode](#links-mode) 7 | 3. [Raw Mode](#raw-mode) 8 | 3. [Features and Limits](#features-and-limits) 9 | 1. [Content Management](#content-management) 10 | 2. [Safety Features](#safety-features) 11 | 12 | A tool that lets AI assistants access and process web content safely. It can convert pages to 13 | markdown, extract links, or get raw content - helping the AI give you more accurate, up-to-date 14 | information. 15 | 16 |  17 | 18 | ## What can it do? 19 | 20 | When you're discussing documentation, researching solutions, or need current information, the AI 21 | can access web content to help. It's particularly useful when you want to: 22 | 23 | - Get the latest documentation for a library or tool 24 | - Find code examples that match your specific needs 25 | - Navigate through complex documentation structures 26 | - Verify that advice is current and accurate 27 | 28 | The tool handles all the technical details like following redirects, handling errors, and cleaning 29 | up messy HTML. You just point the AI at a URL, and it'll bring back the information in a format 30 | that's easy to work with. 31 | 32 | ## Processing Modes 33 | 34 | The AI can process web content in three different ways, each designed for specific needs: 35 | 36 | ### Markdown Mode (default) 37 | 38 | Most of the time, you'll want clean, readable content without the clutter of web formatting. 
 39 | Markdown mode automatically removes adverts, navigation menus, and other distractions, focusing on
 40 | the actual content you care about. It preserves important elements like headings, lists, tables,
 41 | and images, converting them into clean markdown that's easy to read.
 42 | 
 43 | If something goes wrong with the conversion, the tool automatically falls back to raw content,
 44 | letting the AI still help you even if the page is unusually formatted.
 45 | 
 46 | Example output:
 47 | 
 48 | ```markdown
 49 | Contents of https://example.com/article:
 50 | 
 51 | # Main Heading
 52 | 
 53 | Article content in clean markdown format...
 54 | 
 55 | ## Subheadings preserved
 56 | 
 57 | * Lists kept intact
 58 | * With proper formatting
 59 | 
 60 | 
 61 | 
 62 | | Tables | Converted |
 63 | |--------|-----------|
 64 | | To     | Markdown  |
 65 | ```
 66 | 
 67 | ### Links Mode
 68 | 
 69 | When you're exploring documentation or need to navigate through a website, links mode helps map
 70 | out the available paths. It finds all the links on a page, converts them to absolute URLs so they
 71 | always work, and shows you the text used to describe each link. This is particularly helpful when
 72 | you need to:
 73 | 
 74 | - Navigate through multi-page documentation
 75 | - Find related articles or resources
 76 | - Locate specific sections in large documents
 77 | - Build a map of available information
 78 | 
 79 | The AI orders links by relevance, filters out noise like social media buttons, and gives you a
 80 | clean list of where you can go next.
 81 | 
 82 | Example output:
 83 | 
 84 | ```markdown
 85 | All 45 links found on https://example.com
 86 | 
 87 | - Home: https://example.com/
 88 | - Products: https://example.com/products
 89 | - About Us: https://example.com/about
 90 | ...
 91 | ```
 92 | 
 93 | ### Raw Mode
 94 | 
 95 | Sometimes you need the original, unprocessed content - particularly when working with APIs,
 96 | downloading code, or accessing structured data. Raw mode gives you exactly what the server sends,
 97 | while still handling things like authentication, redirects, and error handling behind the scenes.
 98 | 
 99 | ## Features and Limits
100 | 
101 | The tool includes several features to make web access both powerful and safe:
102 | 
103 | ### Content Management
104 | 
105 | The AI can handle content of any size, but you can control how much it processes at once. Setting
106 | a length limit helps when you're working with large documents or want to focus on specific
107 | sections. You'll always get complete sentences and properly formatted content, with clear warnings
108 | if anything gets truncated.
109 | 
110 | If something goes wrong - whether it's a network issue, an authentication problem, or just an
111 | unusually formatted page - you'll get clear, actionable error messages explaining what happened
112 | and often suggesting how to fix it.
113 | 
114 | ### Safety Features
115 | 
116 | Behind the scenes, the tool uses industrial-strength libraries like `trafilatura` and
117 | `BeautifulSoup` to handle web content safely. It carefully processes URLs, headers, and content to
118 | prevent common issues, while giving you the flexibility to access the resources you need.
119 | 
120 | The tool strikes a careful balance - giving AI assistants broad access to web content while
121 | maintaining security and providing clear feedback. This means you can confidently point the AI at
122 | documentation or resources, knowing it'll handle the technical details and bring back exactly what
123 | you need.
124 | 
```
--------------------------------------------------------------------------------
/server/mcp_server/server.py:
--------------------------------------------------------------------------------
```python
  1 | """Core MCPServer implementation for the MCP fetch service.
  2 | 
  3 | Provides a generic MCPServer class for serving MCP requests.
Allows drop-in tool support by mapping 4 | tool functions to configuration loaded from an external YAML file. 5 | """ 6 | 7 | from __future__ import annotations 8 | 9 | from dataclasses import dataclass, field 10 | from os import getenv as os_getenv 11 | from pathlib import Path 12 | from typing import TYPE_CHECKING, Any 13 | 14 | from mcp.server import Server as BaseMCPServer 15 | from mcp.server.sse import SseServerTransport 16 | from mcp.server.stdio import stdio_server 17 | from mcp.shared.exceptions import McpError 18 | from mcp.types import INVALID_PARAMS, ErrorData, TextContent, Tool 19 | from starlette.applications import Starlette 20 | from starlette.routing import Mount, Route 21 | from uvicorn import Config as UvicornConfig, Server as UvicornServer 22 | 23 | if TYPE_CHECKING: 24 | from starlette.requests import Request 25 | from starlette.responses import Response 26 | 27 | # Default path for tool configuration YAML file 28 | DEFAULT_TOOL_CONFIG_PATH = Path(__file__).parent / "tools.yaml" 29 | 30 | 31 | @dataclass(slots=True) 32 | class MCPServer: 33 | """Define a generic MCP server class with drop-in tool support.""" 34 | 35 | config: dict[str, Any] 36 | server: BaseMCPServer = field(init=False) 37 | server_name: str = field(default="mcp-server") 38 | tools: list[Tool] = field(default_factory=list) 39 | 40 | def __post_init__(self) -> None: 41 | """Initialise the MCPServer.""" 42 | if self.config.get("server", {}).get("name"): 43 | self.server_name = self.config["server"]["name"] 44 | # Create MCP server instance 45 | self.server = BaseMCPServer(self.server_name) 46 | # Build the tool registry and tool list 47 | self.tools = [ 48 | Tool(name=name, **{k: v for k, v in tool.items() if k != "method"}) 49 | for name, tool in self.config["tools"].items() 50 | ] 51 | # Register the tool listing/calling methods 52 | self.server.list_tools()(self.list_tools) 53 | self.server.call_tool()(self.call_tool) 54 | 55 | async def list_tools(self) -> list[Tool]: 56 | 
"""Return a list of available tools. 57 | 58 | Returns: 59 | A list of Tool objects representing the available tools. 60 | """ 61 | return self.tools 62 | 63 | async def call_tool(self, name: str, arguments: dict) -> list[TextContent]: 64 | """Call the tool specified by name with provided arguments. 65 | 66 | Returns: 67 | A list of TextContent objects containing the tool's result 68 | 69 | Raises: 70 | McpError: If the tool is unknown or fails to execute 71 | """ 72 | if name not in self.config["tools"]: 73 | raise McpError( 74 | ErrorData( 75 | code=INVALID_PARAMS, message=f"Tool '{name}' isn't available on this server anymore" 76 | ) 77 | ) 78 | if "method" not in self.config["tools"][name]: 79 | raise McpError( 80 | ErrorData( 81 | code=INVALID_PARAMS, 82 | message=( 83 | f"Tool '{name}' has no registered method: inform the user that their MCP " 84 | "server requires configuration to provide a function for this tool." 85 | ), 86 | ) 87 | ) 88 | try: 89 | result = await self.config["tools"][name]["method"](**arguments) 90 | return [TextContent(type="text", text=result)] 91 | except McpError as err: 92 | raise McpError(ErrorData(code=INVALID_PARAMS, message=str(err))) from err 93 | 94 | async def serve(self) -> None: 95 | """Run the MCP server, using either SSE or stdio mode.""" 96 | options = self.server.create_initialization_options() 97 | sse_host, sse_port = os_getenv("SSE_HOST"), os_getenv("SSE_PORT") 98 | if sse_host and sse_port: 99 | sse = SseServerTransport("/messages/") 100 | 101 | async def _handle_sse(request: Request) -> Response | None: 102 | """Handle incoming SSE connection.""" 103 | async with sse.connect_sse( 104 | request.scope, 105 | request.receive, 106 | request._send, # noqa: SLF001 107 | ) as streams: 108 | await self.server.run(streams[0], streams[1], options, raise_exceptions=True) 109 | 110 | starlette_app = Starlette( 111 | debug=True, 112 | routes=[ 113 | Route("/sse", endpoint=_handle_sse), 114 | Mount("/messages/", 
app=sse.handle_post_message), 115 | ], 116 | ) 117 | 118 | config = UvicornConfig(app=starlette_app, host=sse_host, port=int(sse_port), log_level="info") 119 | server_instance = UvicornServer(config) 120 | await server_instance.serve() 121 | else: 122 | async with stdio_server() as (read_stream, write_stream): 123 | await self.server.run(read_stream, write_stream, options, raise_exceptions=True) 124 | ``` -------------------------------------------------------------------------------- /server/mcp_server/tools/web.py: -------------------------------------------------------------------------------- ```python 1 | """Provide tools to retrieve and process web content. 2 | 3 | Helps AI assistants access and understand web content through three processing modes: 4 | 5 | - markdown: Converts HTML to clean, readable markdown (default) 6 | - links: Extracts and formats hyperlinks with their anchor text 7 | - raw: Returns unprocessed content for APIs or non-HTML resources 8 | 9 | Features include: 10 | - Smart content extraction focusing on main text 11 | - Link processing with relative URL resolution 12 | - Configurable length limits 13 | - Detailed error messages for common issues 14 | """ 15 | 16 | from __future__ import annotations 17 | 18 | from collections import Counter 19 | from dataclasses import dataclass, field 20 | from enum import Enum 21 | from typing import Final 22 | from urllib.parse import urljoin 23 | 24 | from bs4 import BeautifulSoup, Tag 25 | from bs4.filter import SoupStrainer 26 | from mcp.shared.exceptions import McpError 27 | from mcp.types import INTERNAL_ERROR, ErrorData 28 | from trafilatura import extract as trafilatura_extract 29 | 30 | from .helpers import add_error, get_request 31 | 32 | 33 | class ProcessingMode(Enum): 34 | """Define valid content processing modes.""" 35 | 36 | MARKDOWN = "markdown" 37 | RAW = "raw" 38 | LINKS = "links" 39 | 40 | @classmethod 41 | def from_str(cls, mode: str) -> ProcessingMode: 42 | """Create ProcessingMode 
from string, defaulting to RAW if invalid. 43 | 44 | Args: 45 | mode: String representation of the processing mode 46 | 47 | Returns: 48 | ProcessingMode enum value 49 | """ 50 | try: 51 | return cls(mode.lower()) 52 | except ValueError: 53 | return cls.RAW 54 | 55 | 56 | SKIP_HREF_PREFIXES: Final = ("#", "javascript:") 57 | 58 | 59 | @dataclass(slots=True) 60 | class WebProcessor: 61 | """Handle web content retrieval and processing.""" 62 | 63 | url: str 64 | mode: ProcessingMode | str = field(default=ProcessingMode.MARKDOWN) 65 | max_length: int = field(default=0) 66 | 67 | def __post_init__(self) -> None: 68 | """Validate and correct inputs as needed.""" 69 | if isinstance(self.mode, str): 70 | self.mode = ProcessingMode.from_str(self.mode) 71 | self.max_length = max(self.max_length, 0) 72 | 73 | async def process(self) -> str: 74 | """Fetch and process the content according to the specified mode. 75 | 76 | Returns: 77 | Processed content as a string 78 | """ 79 | content = await get_request(self.url) 80 | 81 | match self.mode: 82 | case ProcessingMode.LINKS: 83 | return self._format_links(self._extract_links(content)) 84 | 85 | case ProcessingMode.MARKDOWN: 86 | extracted = trafilatura_extract( 87 | content, 88 | favor_recall=True, 89 | include_formatting=True, 90 | include_images=True, 91 | include_links=True, 92 | include_tables=True, 93 | output_format="markdown", 94 | with_metadata=True, 95 | ) or add_error(content, "Extraction to markdown failed; returning raw content", append=False) 96 | 97 | case ProcessingMode.RAW: 98 | extracted = content 99 | 100 | if self.max_length > 0 and len(extracted) > self.max_length: 101 | extracted = add_error( 102 | extracted[: self.max_length], 103 | f"Content truncated to {self.max_length} characters", 104 | append=True, 105 | ) 106 | 107 | return f"Contents of {self.url}:\n\n{extracted}" 108 | 109 | def _get_absolute_url(self, href: str) -> str | None: 110 | """Get the absolute URL from a relative or absolute href. 
111 | 112 | Returns: 113 | Absolute URL or None if invalid 114 | """ 115 | stripped = href.strip() 116 | if not stripped or any(stripped.startswith(prefix) for prefix in SKIP_HREF_PREFIXES): 117 | return None 118 | return stripped if stripped.startswith(("http://", "https://")) else urljoin(self.url, stripped) 119 | 120 | def _extract_links(self, content: str) -> dict[str, str]: 121 | """Extract all valid links with their anchor text. 122 | 123 | Returns: 124 | Dictionary mapping each unique absolute URL to its first-found anchor text 125 | """ 126 | soup = BeautifulSoup(content, "html.parser", parse_only=SoupStrainer("a", href=True)) 127 | 128 | anchors = [a for a in soup.find_all("a", href=True) if isinstance(a, Tag)] 129 | valid_anchors = [ 130 | (a, url) 131 | for a in anchors 132 | if (href := a.get("href")) and isinstance(href, str) and (url := self._get_absolute_url(href)) 133 | ] 134 | 135 | url_counts = Counter(url for _, url in valid_anchors) 136 | 137 | return dict( 138 | sorted( 139 | { 140 | url: next(a.get_text(strip=True) for a, anchor_url in valid_anchors if anchor_url == url) 141 | for url in url_counts 142 | }.items(), 143 | key=lambda x: (-url_counts[x[0]], x[0]), 144 | ) 145 | ) 146 | 147 | def _format_links(self, links: dict[str, str]) -> str: 148 | """Format extracted links into a readable string. 
149 | 150 | Args: 151 | links: Dictionary of URLs and their titles 152 | 153 | Returns: 154 | Formatted string of links 155 | 156 | Raises: 157 | McpError: If no links are found 158 | """ 159 | if not links: 160 | raise McpError( 161 | ErrorData( 162 | code=INTERNAL_ERROR, 163 | message=f"No links found on {self.url} - it may require JavaScript or auth.", 164 | ) 165 | ) 166 | 167 | total_links = len(links) 168 | formatted_links = [] 169 | length = 0 170 | 171 | for url, title in links.items(): 172 | link_text = f"- {title}: {url}" if title else f"- {url}" 173 | new_length = length + len(link_text) + 1 174 | 175 | if self.max_length > 0 and new_length > self.max_length: 176 | break 177 | 178 | formatted_links.append(link_text) 179 | length = new_length 180 | 181 | added_count = len(formatted_links) 182 | header = ( 183 | f"{added_count} of {total_links} links found on {self.url}" 184 | if added_count < total_links 185 | else f"All {total_links} links found on {self.url}" 186 | ) 187 | 188 | return f"{header}\n" + "\n".join(formatted_links) 189 | 190 | 191 | async def tool_web(url: str, mode: str = "markdown", max_length: int = 0) -> str: 192 | """Access and process web content from a given URL. 
193 | 194 | Returns: 195 | Processed content as a string 196 | """ 197 | processor = WebProcessor(url=url, mode=mode, max_length=max_length) 198 | return await processor.process() 199 | ``` -------------------------------------------------------------------------------- /server/tests/test_server.py: -------------------------------------------------------------------------------- ```python 1 | """Test the MCP server initialization and configuration.""" 2 | 3 | from __future__ import annotations 4 | 5 | from os import environ as os_environ 6 | from pathlib import Path 7 | from typing import TYPE_CHECKING 8 | 9 | import pytest 10 | import pytest_asyncio 11 | from yaml import dump as yaml_dump, safe_load as yaml_safe_load 12 | 13 | from mcp_server.server import MCPServer 14 | from mcp_server.tools import tool_search, tool_web 15 | 16 | if TYPE_CHECKING: 17 | from collections.abc import Generator 18 | 19 | # Constants for testing 20 | MAX_DESCRIPTION_LENGTH = 1024 21 | 22 | 23 | @pytest.fixture 24 | def mock_yaml_file(tmp_path: Path) -> Path: 25 | """Create a temporary tools.yaml file for testing. 
26 | 27 | Args: 28 | tmp_path: Pytest fixture providing temporary directory 29 | 30 | Returns: 31 | Path to the temporary YAML file 32 | """ 33 | yaml_content = { 34 | "tools": { 35 | "search": { 36 | "description": "Test Search tool", 37 | "inputSchema": {"type": "object", "properties": {"query": {"type": "string"}}}, 38 | }, 39 | "web": { 40 | "description": "Test Web tool", 41 | "inputSchema": {"type": "object", "properties": {"url": {"type": "string"}}}, 42 | }, 43 | } 44 | } 45 | 46 | yaml_path = tmp_path / "tools.yaml" 47 | yaml_path.write_text(yaml_dump(yaml_content), encoding="utf-8") 48 | return yaml_path 49 | 50 | 51 | @pytest.fixture 52 | def server_env() -> Generator[None]: 53 | """Set up server environment variables for testing.""" 54 | os_environ["SANDBOX"] = "127.0.0.1:8080" 55 | os_environ["SSE_HOST"] = "127.0.0.1" 56 | os_environ["SSE_PORT"] = "3001" 57 | os_environ["USER_AGENT"] = "TestAgent/1.0" 58 | yield 59 | for key in ["SANDBOX", "SSE_HOST", "SSE_PORT", "USER_AGENT"]: 60 | if key in os_environ: 61 | del os_environ[key] 62 | 63 | 64 | @pytest_asyncio.fixture 65 | async def server(mock_yaml_file: Path) -> MCPServer: 66 | """Create a test server instance. 
67 | 68 | Args: 69 | mock_yaml_file: Path to test YAML configuration 70 | 71 | Returns: 72 | Configured MCPServer instance 73 | """ 74 | config = yaml_safe_load(mock_yaml_file.read_text(encoding="utf-8")) 75 | config["tools"]["search"]["method"] = tool_search 76 | config["tools"]["web"]["method"] = tool_web 77 | return MCPServer(config) 78 | 79 | 80 | def test_yaml_loading(mock_yaml_file: Path) -> None: 81 | """Test that the YAML configuration can be loaded correctly.""" 82 | config = yaml_safe_load(mock_yaml_file.read_text(encoding="utf-8")) 83 | 84 | if "tools" not in config: 85 | pytest.fail("Missing 'tools' section in config") 86 | if "search" not in config["tools"]: 87 | pytest.fail("Missing 'search' tool in config") 88 | if "web" not in config["tools"]: 89 | pytest.fail("Missing 'web' tool in config") 90 | 91 | for tool_name in ("search", "web"): 92 | if "description" not in config["tools"][tool_name]: 93 | pytest.fail(f"Missing 'description' in {tool_name} tool config") 94 | 95 | description_length = len(config["tools"][tool_name]["description"]) 96 | if description_length > MAX_DESCRIPTION_LENGTH: 97 | pytest.fail( 98 | f"Description for tool '{tool_name}' is too long: " 99 | f"{description_length} characters (max {MAX_DESCRIPTION_LENGTH})" 100 | ) 101 | 102 | 103 | def test_server_initialisation(server: MCPServer) -> None: 104 | """Test that the server initializes with the correct tools.""" 105 | if not hasattr(server, "tools"): 106 | pytest.fail("Server missing tools attribute") 107 | tool_names = {tool.name for tool in server.tools} 108 | if "search" not in tool_names: 109 | pytest.fail("Server missing search tool") 110 | if "web" not in tool_names: 111 | pytest.fail("Server missing web tool") 112 | 113 | search_tool_config = server.config["tools"]["search"] 114 | web_tool_config = server.config["tools"]["web"] 115 | 116 | if search_tool_config.get("method") != tool_search: 117 | pytest.fail("Search tool has incorrect method") 118 | if 
web_tool_config.get("method") != tool_web: 119 | pytest.fail("Web tool has incorrect method") 120 | 121 | 122 | @pytest.mark.asyncio 123 | @pytest.mark.usefixtures("server_env") 124 | async def test_server_environment() -> None: 125 | """Test that environment variables are correctly set.""" 126 | if os_environ["SANDBOX"] != "127.0.0.1:8080": 127 | pytest.fail(f"Incorrect SANDBOX: {os_environ['SANDBOX']}") 128 | if os_environ["SSE_HOST"] != "127.0.0.1": 129 | pytest.fail(f"Incorrect SSE_HOST: {os_environ['SSE_HOST']}") 130 | if os_environ["SSE_PORT"] != "3001": 131 | pytest.fail(f"Incorrect SSE_PORT: {os_environ['SSE_PORT']}") 132 | if os_environ["USER_AGENT"] != "TestAgent/1.0": 133 | pytest.fail(f"Incorrect USER_AGENT: {os_environ['USER_AGENT']}") 134 | 135 | 136 | def test_live_tools_yaml_file() -> None: 137 | """Test that the live tools.yaml file is readable and contains required keys.""" 138 | # Determine the project root (assumed one level above the tests directory) 139 | project_root = Path(__file__).parent.parent 140 | tools_yaml_path = project_root / "tools.yaml" 141 | if not tools_yaml_path.exists(): 142 | pytest.fail(f"tools.yaml file not found at {tools_yaml_path}") 143 | 144 | config = yaml_safe_load(tools_yaml_path.read_text(encoding="utf-8")) 145 | 146 | if "tools" not in config: 147 | pytest.fail("Missing 'tools' section in live tools.yaml") 148 | 149 | for tool in ("search", "web"): 150 | if tool not in config["tools"]: 151 | pytest.fail(f"Missing '{tool}' configuration in live tools.yaml") 152 | if "inputSchema" not in config["tools"][tool]: 153 | pytest.fail(f"Missing 'inputSchema' for tool '{tool}' in live tools.yaml") 154 | 155 | 156 | def test_tool_description_length() -> None: 157 | """Test that tool descriptions don't exceed the OpenAI API limit of 1024 characters.""" 158 | # Determine the project root (assumed one level above the tests directory) 159 | project_root = Path(__file__).parent.parent 160 | tools_yaml_path = project_root / 
"tools.yaml" 161 | if not tools_yaml_path.exists(): 162 | pytest.fail(f"tools.yaml file not found at {tools_yaml_path}") 163 | 164 | config = yaml_safe_load(tools_yaml_path.read_text(encoding="utf-8")) 165 | 166 | if "tools" not in config: 167 | pytest.fail("Missing 'tools' section in tools.yaml") 168 | 169 | for tool_name, tool_config in config["tools"].items(): 170 | if "description" not in tool_config: 171 | pytest.fail(f"Missing 'description' for tool '{tool_name}' in tools.yaml") 172 | 173 | description_length = len(tool_config["description"]) 174 | if description_length > MAX_DESCRIPTION_LENGTH: 175 | pytest.fail( 176 | f"Description for tool '{tool_name}' is too long: " 177 | f"{description_length} characters (max {MAX_DESCRIPTION_LENGTH})" 178 | ) 179 | ```