# Directory Structure
```
├── README.md
└── scrape_mcp_server.py
```
# Files
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
```markdown
1 | # Claude Web Scraper MCP
2 |
3 | A simple Model Context Protocol (MCP) server that connects Claude for Desktop to a locally running eGet web scraper. This allows Claude to scrape website content through your local API.
4 |
5 | ## Prerequisites
6 |
7 | - Claude for Desktop
8 | - Python 3.10+ (required by the MCP Python SDK)
9 | - eGet web scraper (from https://github.com/vishwajeetdabholkar/eGet-Crawler-for-ai)
10 |
11 | ## Setup Instructions
12 |
13 | ### 1. Set up eGet Web Scraper
14 |
15 | First, make sure you have the eGet web scraper running:
16 |
17 | ```bash
18 | # Clone the eGet repository
19 | git clone https://github.com/vishwajeetdabholkar/eGet-Crawler-for-ai
20 | cd eGet-Crawler-for-ai
21 |
22 | # Set up and run eGet according to its instructions
23 | # (typically using Docker or local Python installation)
24 |
25 | # Verify the API is running (default: http://localhost:8000/api/v1/scrape)
26 | ```
27 |
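To confirm the endpoint is reachable before wiring it into Claude, you can send a quick test request. This mirrors the payload the MCP server below sends; adjust the port and fields if your eGet configuration differs:

```bash
# Expect a JSON response with "success": true and a "data" object
curl -X POST http://localhost:8000/api/v1/scrape \
  -H "Content-Type: application/json" \
  -d '{"url": "https://example.com", "formats": ["markdown"], "onlyMainContent": true}'
```
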
28 | ### 2. Set up the MCP Server
29 |
30 | ```bash
31 | # Create project directory
32 | mkdir claude-scraper-mcp
33 | cd claude-scraper-mcp
34 |
35 | # Create and activate a virtual environment
36 | uv venv
37 | source .venv/bin/activate # On Windows: .venv\Scripts\activate
38 |
39 | # Install dependencies
40 | uv add "mcp[cli]" httpx
41 |
42 | # Create the MCP server script
43 | touch scrape_mcp_server.py
44 | ```
45 |
46 | Copy the `scrape_mcp_server.py` code into the file.
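
If you installed the `mcp[cli]` extra as above, you can optionally exercise the server with the MCP Inspector before connecting Claude (this assumes Node.js is available, since the Inspector UI runs via npx):

```bash
# Opens the MCP Inspector so you can call scrape_url / scrape_advanced interactively
mcp dev scrape_mcp_server.py
```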
47 |
48 | ### 3. Configure Claude for Desktop
49 |
50 | 1. Create the Claude configuration directory if it does not already exist:
51 |
52 | ```bash
53 | # On macOS
54 | mkdir -p ~/Library/Application\ Support/Claude/
55 | ```
56 |
57 | 2. Add this configuration to `~/Library/Application Support/Claude/claude_desktop_config.json`:
58 |
59 | ```json
60 | {
61 | "mcpServers": {
62 | "scrape-service": {
63 | "command": "/absolute/path/to/claude-scraper-mcp/.venv/bin/python",
64 | "args": [
65 | "/absolute/path/to/claude-scraper-mcp/scrape_mcp_server.py"
66 | ]
67 | }
68 | }
69 | }
70 | ```
71 |
72 | Replace the paths with the actual absolute paths to your virtual environment and script.
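
If you are unsure of the interpreter path, print it from inside the activated virtual environment and paste it into `command`:

```bash
# Run with the virtual environment activated
which python   # On Windows: where python
```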
73 |
74 | 3. Restart Claude for Desktop
75 |
76 | ## Usage
77 |
78 | Once set up, the server exposes two tools (`scrape_url` and `scrape_advanced`), and you can ask Claude to scrape websites with prompts like:
79 |
80 | - "Scrape the content from https://example.com and summarize it"
81 | - "Get information about the website at https://news.ycombinator.com"
82 |
83 | ## Troubleshooting
84 |
85 | If you encounter issues:
86 |
87 | 1. Check that the eGet scraper is running and reachable at its configured port
88 | 2. Verify that the API endpoint in the script matches your eGet configuration
89 | 3. Make sure Claude for Desktop is using the correct Python interpreter (the log check below usually pinpoints this)
90 | 4. Restart Claude for Desktop after making changes to the configuration
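
On macOS, Claude for Desktop writes MCP logs that usually show why a server failed to start or why a tool call errored (the path may differ on other platforms):

```bash
# Follow Claude's MCP logs while reproducing the problem
tail -n 50 -f ~/Library/Logs/Claude/mcp*.log
```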
```
--------------------------------------------------------------------------------
/scrape_mcp_server.py:
--------------------------------------------------------------------------------
```python
1 | from typing import Any, Dict, Optional
2 | import httpx
3 | from mcp.server.fastmcp import FastMCP
4 |
5 | # Initialize FastMCP server
6 | mcp = FastMCP("scrape-service")
7 |
8 | # Constants
9 | API_ENDPOINT = "http://localhost:8000/api/v1/scrape"  # Adjust if your eGet instance runs on a different host/port
10 | TIMEOUT = 30 # seconds
11 |
12 | async def make_scrape_request(params: Dict[str, Any]) -> Dict[str, Any]:
13 | """Make a request to the scrape API with proper error handling."""
14 | async with httpx.AsyncClient() as client:
15 | try:
16 | response = await client.post(
17 | API_ENDPOINT,
18 | json=params,
19 | timeout=TIMEOUT
20 | )
21 | response.raise_for_status()
22 | return response.json()
23 |         except Exception as e:  # network, HTTP status, or JSON decode failures
24 | return {"success": False, "error": str(e)}
25 |
26 | @mcp.tool()
27 | async def scrape_url(
28 | url: str,
29 | get_full_content: bool = True,
30 | only_main_content: bool = True
31 | ) -> str:
32 | """Scrape content from a URL and return the content.
33 |
34 | Args:
35 | url: The URL to scrape
36 | get_full_content: Whether to get full content or just metadata
37 | only_main_content: Whether to extract only the main content
38 | """
39 | # Configure parameters
40 | params = {
41 | "url": url,
42 | "formats": ["markdown", "html"],
43 | "onlyMainContent": only_main_content,
44 | "includeRawHtml": False,
45 | "includeScreenshot": False
46 | }
47 |
48 | # Make the API call
49 | result = await make_scrape_request(params)
50 |
51 | if not result.get("success", False):
52 | error_msg = result.get("error", "Unknown error")
53 | return f"Error scraping {url}: {error_msg}"
54 |
55 | data = result.get("data", {})
56 | metadata = data.get("metadata", {})
57 | title = metadata.get("title", "No title")
58 | description = metadata.get("description", "")
59 |
60 | # If only metadata is requested
61 | if not get_full_content:
62 | # Format links for display
63 | links = data.get("links", [])
64 | formatted_links = '\n'.join([f"- {link}" for link in links[:5]])
65 | if len(links) > 5:
66 | formatted_links += f"\n... and {len(links) - 5} more links"
67 |
68 | return f"""
69 | # {title}
70 |
71 | {description}
72 |
73 | ## Links
74 | {formatted_links if links else "No links found."}
75 | """.strip()
76 |
77 | # Return full content
78 | markdown_content = data.get("markdown", "No content available")
79 | return f"""
80 | # {title}
81 |
82 | {description}
83 |
84 | ## Content
85 |
86 | {markdown_content}
87 | """.strip()
88 |
89 | @mcp.tool()
90 | async def scrape_advanced(
91 | url: str,
92 | mobile: bool = False,
93 | include_raw_html: bool = False,
94 | wait_time: Optional[int] = None,
95 | custom_headers: Optional[Dict[str, str]] = None
96 | ) -> str:
97 | """Advanced web scraping with additional options.
98 |
99 | Args:
100 | url: The URL to scrape
101 | mobile: Whether to use mobile user agent
102 | include_raw_html: Whether to include raw HTML in response
103 | wait_time: Time to wait after page load in milliseconds
104 | custom_headers: Custom HTTP headers to send with request
105 | """
106 | # Configure parameters
107 | params = {
108 | "url": url,
109 | "formats": ["markdown", "html"],
110 | "onlyMainContent": True,
111 | "includeRawHtml": include_raw_html,
112 | "mobile": mobile
113 | }
114 |
115 | if wait_time is not None:
116 | params["waitFor"] = wait_time
117 |
118 | if custom_headers is not None:
119 | params["headers"] = custom_headers
120 |
121 | # Make the API call
122 | result = await make_scrape_request(params)
123 |
124 | if not result.get("success", False):
125 | error_msg = result.get("error", "Unknown error")
126 | return f"Error scraping {url}: {error_msg}"
127 |
128 | data = result.get("data", {})
129 | metadata = data.get("metadata", {})
130 | title = metadata.get("title", "No title")
131 | description = metadata.get("description", "")
132 | markdown_content = data.get("markdown", "No content available")
133 |
134 | return f"""
135 | # {title}
136 |
137 | {description}
138 |
139 | ## Content
140 |
141 | {markdown_content}
142 | """.strip()
143 |
144 | if __name__ == "__main__":
145 | # Initialize and run the server
146 | mcp.run(transport='stdio')
```