# Directory Structure
```
├── .gitignore
├── .python-version
├── main.py
├── pyproject.toml
├── README.md
├── tools
│   ├── crawl.py
│   ├── scrape.py
│   └── utils.py
└── uv.lock
```
# Files
--------------------------------------------------------------------------------
/.python-version:
--------------------------------------------------------------------------------
```
1 | 3.13
2 |
```
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
```
1 | # Python-generated files
2 | __pycache__/
3 | *.py[oc]
4 | build/
5 | dist/
6 | wheels/
7 | *.egg-info
8 |
9 | # Virtual environments
10 | .venv
11 | __pycache__/
12 |
13 | .DS_Store
```
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
```markdown
1 | # Crawl4AI MCP Server
2 |
3 | A Model Context Protocol (MCP) server implementation that integrates Crawl4AI with Cursor AI, providing web scraping and crawling capabilities as tools for LLMs in Cursor Composer's agent mode.
4 |
5 | ## System Requirements
6 |
7 | Python 3.13 or higher (see `requires-python` in `pyproject.toml` and `.python-version`).
8 |
9 | ## Current Features
10 |
11 | - Single page scraping
12 | - Website crawling
13 |
14 | ## Installation
15 |
16 | Basic setup instructions are also available in the [Official Docs for the MCP Server Quickstart](https://modelcontextprotocol.io/quickstart/server#why-claude-for-desktop-and-not-claude-ai).
17 |
18 | ### Set up your environment
19 |
20 | First, let's install `uv` and set up our Python project and environment:
21 |
22 | macOS/Linux:
23 |
24 | ```bash
25 | curl -LsSf https://astral.sh/uv/install.sh | sh
26 | ```
27 |
28 | Windows:
29 |
30 | ```bash
31 | powershell -ExecutionPolicy ByPass -c "irm https://astral.sh/uv/install.ps1 | iex"
32 | ```
33 |
34 | Make sure to restart your terminal afterwards so that the `uv` command is picked up.
35 |
36 | After that:
37 |
38 | 1. Clone the repository
39 |
40 | 2. Install dependencies using `uv`:
41 |
42 | ```bash
43 | # Navigate to the crawl4ai-mcp directory
44 | cd crawl4ai-mcp
45 |
46 | # Install dependencies (first time only)
47 | uv venv
48 | uv sync
49 |
50 | # Activate the venv
51 | source .venv/bin/activate
52 |
53 | # Run the server
54 | python main.py
55 | ```
56 |
57 | 3. Add the server to Cursor's or Claude Desktop's MCP server configuration
58 |
59 | You may need to put the full path to the `uv` executable in the `command` field. You can get this by running `which uv` on macOS/Linux or `where uv` on Windows.
60 |
61 | ```json
62 | {
63 | "mcpServers": {
64 | "Crawl4AI": {
65 | "command": "uv",
66 | "args": [
67 | "--directory",
68 | "/ABSOLUTE/PATH/TO/PARENT/FOLDER/crawl4ai-mcp",
69 | "run",
70 | "main.py"
71 | ]
72 | }
73 | }
74 | }
75 | ```
76 |
77 | ## Tools Provided
78 |
79 | This MCP server exposes the following tools to the LLM:
80 |
81 | 1. **`scrape_webpage(url: str)`**
82 |
83 | - **Description:** Scrapes the content and metadata from a single webpage using Crawl4AI.
84 | - **Parameters:**
85 | - `url` (string, required): The URL of the webpage to scrape.
86 |    - **Returns:** A list containing a `TextContent` object whose text is a JSON object with the scraped markdown content, or an error description if the scrape fails.
87 |
88 | 2. **`crawl_website(url: str, crawl_depth: int = 1, max_pages: int = 5)`**
89 | - **Description:** Crawls a website starting from the given URL up to a specified depth and page limit using Crawl4AI.
90 | - **Parameters:**
91 | - `url` (string, required): The starting URL to crawl.
92 | - `crawl_depth` (integer, optional, default: 1): The maximum depth to crawl relative to the starting URL.
93 | - `max_pages` (integer, optional, default: 5): The maximum number of pages to scrape during the crawl.
94 | - **Returns:** A list containing a `TextContent` object with a JSON array of results for the crawled pages (including URL, success status, markdown content, or error).
95 |
```
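
For reference, the two tools can be exercised end-to-end with the MCP Python SDK's stdio client. The snippet below is a minimal sketch, not a file in this repository; it assumes the `mcp` package listed in `pyproject.toml` is installed and that it is run from the project root with the virtual environment active.

```python
import asyncio
import json

from mcp import ClientSession, StdioServerParameters
from mcp.client.stdio import stdio_client

# Launch main.py as a stdio MCP server (assumes the project venv is active).
server = StdioServerParameters(command="python", args=["main.py"])


async def main() -> None:
    async with stdio_client(server) as (read, write):
        async with ClientSession(read, write) as session:
            await session.initialize()

            # scrape_webpage returns a single TextContent whose text is JSON.
            scraped = await session.call_tool(
                "scrape_webpage", {"url": "https://example.com"}
            )
            print(json.loads(scraped.content[0].text))

            # crawl_website returns a JSON object with a "results" array.
            crawled = await session.call_tool(
                "crawl_website",
                {"url": "https://example.com", "crawl_depth": 1, "max_pages": 3},
            )
            print(json.loads(crawled.content[0].text)["results"])


asyncio.run(main())
```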
--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
```toml
1 | [project]
2 | name = "crawl4ai-mcp"
3 | version = "0.1.0"
4 | description = "MCP server that exposes Crawl4AI web scraping and crawling as tools"
5 | readme = "README.md"
6 | requires-python = ">=3.13"
7 | dependencies = [
8 | "crawl4ai>=0.5.0.post8",
9 | "httpx>=0.28.1",
10 | "mcp[cli]>=1.6.0",
11 | ]
12 |
13 | [tool.uv.sources]
14 | crawl4ai = { git = "https://github.com/unclecode/crawl4ai.git", rev = "2025-MAR-ALPHA-1" }
15 | # The pinned revision above is needed for the bug fixes from the crawl4ai-mcp branch
16 |
```
--------------------------------------------------------------------------------
/tools/utils.py:
--------------------------------------------------------------------------------
```python
1 | import re
2 |
3 |
4 | def validate_and_normalize_url(url: str) -> str | None:
5 | """Validate and normalize a URL.
6 |
7 | Args:
8 | url: The URL string to validate.
9 |
10 | Returns:
11 | The normalized URL with https scheme if valid, otherwise None.
12 | """
13 | # Simple validation for domains/subdomains with http(s)
14 | # Allows for optional paths
15 | url_pattern = re.compile(
16 | r"^(?:https?://)?(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+(?:[A-Z]{2,6}\.?|[A-Z0-9-]{2,}\.?)|" # domain...
17 | r"localhost|" # localhost...
18 | r"\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})" # ...or ip
19 | r"(?::\d+)?" # optional port
20 | r"(?:/?|[/?]\S+)$",
21 | re.IGNORECASE,
22 | )
23 |
24 | if not url_pattern.match(url):
25 | return None
26 |
27 | # Add https:// if missing
28 | if not url.startswith("http://") and not url.startswith("https://"):
29 | url = f"https://{url}"
30 |
31 | return url
32 |
```
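
A quick usage sketch for the helper above (illustrative, not part of the repository): bare domains are normalized to `https`, existing schemes are preserved, and strings that fail the pattern return `None`.

```python
from tools.utils import validate_and_normalize_url

# Bare domains get an https:// prefix.
assert validate_and_normalize_url("example.com") == "https://example.com"

# Existing schemes, ports, and paths are preserved.
assert validate_and_normalize_url("http://localhost:8080/docs") == "http://localhost:8080/docs"

# Anything that does not look like a URL is rejected.
assert validate_and_normalize_url("not a url") is None
```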
--------------------------------------------------------------------------------
/main.py:
--------------------------------------------------------------------------------
```python
1 | import mcp.types as types
2 | from mcp.server.fastmcp import FastMCP
3 | from tools.scrape import scrape_url
4 | from tools.crawl import crawl_website_async
5 |
6 | # Initialize FastMCP server
7 | mcp = FastMCP("crawl4ai")
8 |
9 |
10 | @mcp.tool()
11 | async def scrape_webpage(
12 | url: str,
13 | ) -> list[types.TextContent | types.ImageContent | types.EmbeddedResource]:
14 | """
15 | Scrape content and metadata from a single webpage using Crawl4AI.
16 |
17 | Args:
18 | url: The URL of the webpage to scrape
19 |
20 | Returns:
21 | List containing TextContent with the result as JSON.
22 | """
23 | return await scrape_url(url)
24 |
25 |
26 | @mcp.tool()
27 | async def crawl_website(
28 | url: str,
29 | crawl_depth: int = 1,
30 | max_pages: int = 5,
31 | ) -> list[types.TextContent | types.ImageContent | types.EmbeddedResource]:
32 | """
33 | Crawl a website starting from the given URL up to a specified depth and page limit.
34 |
35 | Args:
36 | url: The starting URL to crawl.
37 | crawl_depth: The maximum depth to crawl relative to the starting URL (default: 1).
38 | max_pages: The maximum number of pages to scrape during the crawl (default: 5).
39 |
40 | Returns:
41 | List containing TextContent with a JSON array of results for crawled pages.
42 | """
43 | return await crawl_website_async(url, crawl_depth, max_pages)
44 |
45 |
46 | if __name__ == "__main__":
47 | # Initialize and run the server
48 | mcp.run(transport="stdio")
49 |
```
--------------------------------------------------------------------------------
/tools/scrape.py:
--------------------------------------------------------------------------------
```python
1 | import asyncio
2 | import mcp.types as types
3 | from typing import Any, List
4 | import json
5 | import re
6 | from crawl4ai import AsyncWebCrawler, CacheMode
7 | from crawl4ai.async_configs import BrowserConfig, CrawlerRunConfig
8 |
9 |
10 | async def scrape_url(url: str) -> List[Any]:
11 | """Scrape a webpage using crawl4ai with simple implementation.
12 |
13 | Args:
14 | url: The URL to scrape
15 |
16 | Returns:
17 | A list containing TextContent object with the result as JSON
18 | """
19 |
20 | try:
21 | # Simple validation for domains/subdomains with http(s)
22 | url_pattern = re.compile(
23 | r"^(?:https?://)?(?:[A-Za-z0-9](?:[A-Za-z0-9-]{0,61}[A-Za-z0-9])?\.)+[A-Za-z]{2,}(?:/[^/\s]*)*$"
24 | )
25 |
26 | if not url_pattern.match(url):
27 | return [
28 | types.TextContent(
29 | type="text",
30 | text=json.dumps(
31 | {
32 | "success": False,
33 | "url": url,
34 | "error": "Invalid URL format",
35 | }
36 | ),
37 | )
38 | ]
39 |
40 | # Add https:// if missing
41 | if not url.startswith("http://") and not url.startswith("https://"):
42 | url = f"https://{url}"
43 |
44 | # Use default configurations with minimal customization
45 | browser_config = BrowserConfig(
46 | browser_type="chromium",
47 | headless=True,
48 | ignore_https_errors=True,
49 | verbose=False,
50 | extra_args=[
51 | "--no-sandbox",
52 | "--disable-setuid-sandbox",
53 | "--disable-dev-shm-usage",
54 | ],
55 | )
56 | run_config = CrawlerRunConfig(
57 | cache_mode=CacheMode.BYPASS,
58 | verbose=False,
59 | page_timeout=30 * 1000, # Convert to milliseconds
60 | )
61 |
62 | async with AsyncWebCrawler(config=browser_config) as crawler:
63 | result = await asyncio.wait_for(
64 | crawler.arun(
65 | url=url,
66 | config=run_config,
67 | ),
68 | timeout=30,
69 | )
70 |
71 | # Create response in the format requested
72 | return [
73 | types.TextContent(
74 | type="text", text=json.dumps({"markdown": result.markdown})
75 | )
76 | ]
77 |
78 | except Exception as e:
79 | return [
80 | types.TextContent(
81 | type="text",
82 | text=json.dumps({"success": False, "url": url, "error": str(e)}),
83 | )
84 | ]
85 |
```
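
For a quick local check, `scrape_url` can also be driven directly with `asyncio`, outside of any MCP client. This is a sketch, assuming it is run from the project root and that Crawl4AI's browser dependencies are already set up.

```python
import asyncio
import json

from tools.scrape import scrape_url


async def main() -> None:
    # scrape_url returns a single TextContent; its text field is a JSON payload.
    contents = await scrape_url("https://example.com")
    payload = json.loads(contents[0].text)

    if "markdown" in payload:
        print(payload["markdown"][:500])  # preview of the scraped markdown
    else:
        print("Scrape failed:", payload.get("error"))


asyncio.run(main())
```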
--------------------------------------------------------------------------------
/tools/crawl.py:
--------------------------------------------------------------------------------
```python
1 | import asyncio
2 | import mcp.types as types
3 | from typing import Any, List
4 | import json
5 | from crawl4ai import AsyncWebCrawler, CacheMode, CrawlResult
6 | from crawl4ai.async_configs import BrowserConfig, CrawlerRunConfig
7 | from crawl4ai.deep_crawling import BFSDeepCrawlStrategy
8 | from .utils import validate_and_normalize_url
9 |
10 | CRAWL_TIMEOUT_SECONDS = 300 # Overall timeout for the crawl operation
11 |
12 |
13 | async def crawl_website_async(url: str, crawl_depth: int, max_pages: int) -> List[Any]:
14 | """Crawl a website using crawl4ai.
15 |
16 | Args:
17 | url: The starting URL to crawl.
18 | crawl_depth: The maximum depth to crawl.
19 | max_pages: The maximum number of pages to crawl.
20 |
21 | Returns:
22 | A list containing TextContent objects with the results as JSON.
23 | """
24 |
25 | normalized_url = validate_and_normalize_url(url)
26 | if not normalized_url:
27 | return [
28 | types.TextContent(
29 | type="text",
30 | text=json.dumps(
31 | {
32 | "success": False,
33 | "url": url,
34 | "error": "Invalid URL format",
35 | }
36 | ),
37 | )
38 | ]
39 |
40 | try:
41 | # Use default configurations with minimal customization
42 | browser_config = BrowserConfig(
43 | browser_type="chromium",
44 | headless=True,
45 | ignore_https_errors=True,
46 | verbose=False,
47 | extra_args=[
48 | "--no-sandbox",
49 | "--disable-setuid-sandbox",
50 | "--disable-dev-shm-usage",
51 | ],
52 | )
53 |
54 | # 1. Create the deep crawl strategy with depth and page limits
55 | crawl_strategy = BFSDeepCrawlStrategy(
56 | max_depth=crawl_depth, max_pages=max_pages
57 | )
58 |
59 | # 2. Create the run config, passing the strategy
60 | run_config = CrawlerRunConfig(
61 | cache_mode=CacheMode.BYPASS,
62 | verbose=False,
63 | page_timeout=30 * 1000, # 30 seconds per page
64 | deep_crawl_strategy=crawl_strategy, # Pass the strategy here
65 | )
66 |
67 | results_list = []
68 | async with AsyncWebCrawler(config=browser_config) as crawler:
69 | # 3. Use arun and wrap in asyncio.wait_for for overall timeout
70 | crawl_results: List[CrawlResult] = await asyncio.wait_for(
71 | crawler.arun(
72 | url=normalized_url,
73 | config=run_config,
74 | ),
75 | timeout=CRAWL_TIMEOUT_SECONDS,
76 | )
77 |
78 | # Process results, checking 'success' attribute
79 | for result in crawl_results:
80 | if result.success: # Check .success instead of .status
81 | results_list.append(
82 | {
83 | "url": result.url,
84 | "success": True,
85 | "markdown": result.markdown,
86 | }
87 | )
88 | else:
89 | results_list.append(
90 | {
91 | "url": result.url,
92 | "success": False,
93 | "error": result.error, # Assume .error holds the message
94 | }
95 | )
96 |
97 | # Return a single TextContent with a JSON array of results
98 | return [
99 | types.TextContent(
100 | type="text", text=json.dumps({"results": results_list})
101 | )
102 | ]
103 |
104 | except asyncio.TimeoutError:
105 | return [
106 | types.TextContent(
107 | type="text",
108 | text=json.dumps(
109 | {
110 | "success": False,
111 | "url": normalized_url,
112 | "error": f"Crawl operation timed out after {CRAWL_TIMEOUT_SECONDS} seconds.",
113 | }
114 | ),
115 | )
116 | ]
117 | except Exception as e:
118 | return [
119 | types.TextContent(
120 | type="text",
121 | text=json.dumps(
122 | {"success": False, "url": normalized_url, "error": str(e)}
123 | ),
124 | )
125 | ]
126 |
```
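
A matching sketch for the crawler (again illustrative, not part of the repository): on success the payload wraps a `results` array with one entry per visited page, while validation failures and timeouts come back as a single error object, so a consumer should handle both shapes.

```python
import asyncio
import json

from tools.crawl import crawl_website_async


async def main() -> None:
    contents = await crawl_website_async(
        "https://example.com", crawl_depth=1, max_pages=3
    )
    payload = json.loads(contents[0].text)

    # Error payloads have no "results" key, so fall back to the object itself.
    for page in payload.get("results", [payload]):
        status = "ok" if page.get("success") else f"failed: {page.get('error')}"
        print(f"{page.get('url')}: {status}")


asyncio.run(main())
```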