# Directory Structure
```
├── README.md
└── web-crawler-backup.ts
```
# Files
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
````markdown
# Web Crawler MCP Server Deployment Guide

## Prerequisites
- Node.js (v18+)
- npm (v9+)

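
You can confirm the installed versions with:

```bash
node --version   # should report v18 or newer
npm --version    # should report v9 or newer
```
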
## Installation
1. Clone the repository:
```bash
git clone https://github.com/jitsmaster/web-crawler-mcp.git
cd web-crawler-mcp
```

2. Install dependencies:
```bash
npm install
```

3. Build the project:
```bash
npm run build
```

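
If the build succeeds, the compiled entry point referenced by the MCP configuration below should exist (assuming the TypeScript build emits to `build/`):

```bash
ls build/index.js
```
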
## Configuration
Create a `.env` file with the following environment variables:

```env
CRAWL_LINKS=false
MAX_DEPTH=3
REQUEST_DELAY=1000
TIMEOUT=5000
MAX_CONCURRENT=5
```

Any variable left unset falls back to the default listed in the Configuration Options table below.
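
Note that `web-crawler-backup.ts` reads these values straight from `process.env` and does not itself load a `.env` file; if the built entry point behaves the same way, make sure the variables are exported in the environment that launches the server, for example:

```bash
export CRAWL_LINKS=false
export MAX_DEPTH=3
export REQUEST_DELAY=1000
export TIMEOUT=5000
export MAX_CONCURRENT=5
```
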
## Running the Server
Start the MCP server:
```bash
npm start
```

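
On a successful start, the server logs a readiness message to stderr (see `run()` in `web-crawler-backup.ts`):

```
Web Crawler MCP server running on stdio
```
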
## MCP Configuration
Add the following to your MCP settings file, replacing `/path/to/web-crawler` with the absolute path to your clone (`build/index.js` is produced by `npm run build`):

```json
{
  "mcpServers": {
    "web-crawler": {
      "command": "node",
      "args": ["/path/to/web-crawler/build/index.js"],
      "env": {
        "CRAWL_LINKS": "false",
        "MAX_DEPTH": "3",
        "REQUEST_DELAY": "1000",
        "TIMEOUT": "5000",
        "MAX_CONCURRENT": "5"
      }
    }
  }
}
```

## Usage
The server exposes a single `crawl` tool over MCP. Example tool arguments:

```json
{
  "url": "https://example.com",
  "depth": 1
}
```

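
A successful call returns a single text content item whose `text` field is the crawl result serialized as JSON (see the `CallToolRequestSchema` handler in `web-crawler-backup.ts`). The values below are purely illustrative:

```json
{
  "url": "https://example.com",
  "content": "Example Domain This domain is for use in illustrative examples in documents.",
  "links": []
}
```
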
## Configuration Options
| Environment Variable | Default | Description |
|----------------------|---------|-------------|
| CRAWL_LINKS | false | Whether to extract links from crawled pages |
| MAX_DEPTH | 3 | Maximum crawl depth |
| REQUEST_DELAY | 1000 | Delay between requests (ms) |
| TIMEOUT | 5000 | Request timeout (ms) |
| MAX_CONCURRENT | 5 | Maximum concurrent requests |

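
For a one-off run you can also override individual options inline, leaving the rest at their defaults:

```bash
CRAWL_LINKS=true MAX_DEPTH=2 npm start
```
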
````
--------------------------------------------------------------------------------
/web-crawler-backup.ts:
--------------------------------------------------------------------------------
```typescript
#!/usr/bin/env node
import { Server } from '@modelcontextprotocol/sdk/server/index.js';
import { StdioServerTransport } from '@modelcontextprotocol/sdk/server/stdio.js';
import {
  CallToolRequestSchema,
  ErrorCode,
  ListResourcesRequestSchema,
  ListResourceTemplatesRequestSchema,
  ListToolsRequestSchema,
  McpError,
  ReadResourceRequestSchema,
} from '@modelcontextprotocol/sdk/types.js';
import axios from 'axios';
import * as cheerio from 'cheerio';
import RobotsParserModule from 'robots-parser';
// robots-parser is a CommonJS package; this cast is an ESM/CJS interop
// workaround so the default-exported factory can be reached via `.default`.
const RobotsParser = RobotsParserModule as unknown as {
  default: (robotsUrl: string, content: string) => any;
};
import pLimit from 'p-limit';
type Limit = ReturnType<typeof pLimit>;

interface CrawlerConfig {
  crawlLinks: boolean;
  maxDepth: number;
  requestDelay: number;
  timeout: number;
  maxConcurrent: number;
}

interface CrawlResult {
  url: string;
  content: string;
  links: string[];
}

class WebCrawlerServer {
  private server: Server;
  private config: CrawlerConfig;
  private limit: Limit;
  // Visited URLs persist for the lifetime of the server process, so a
  // repeat crawl of the same URL returns an empty result.
  private visitedUrls: Set<string> = new Set();

  constructor() {
    // Read configuration from environment variables, falling back to the
    // defaults documented in the README.
    this.config = {
      crawlLinks: process.env.CRAWL_LINKS === 'true',
      maxDepth: parseInt(process.env.MAX_DEPTH || '3', 10),
      requestDelay: parseInt(process.env.REQUEST_DELAY || '1000', 10),
      timeout: parseInt(process.env.TIMEOUT || '5000', 10),
      maxConcurrent: parseInt(process.env.MAX_CONCURRENT || '5', 10)
    };

    this.limit = pLimit(this.config.maxConcurrent);

    this.server = new Server(
      {
        name: 'web-crawler',
        version: '0.1.0',
      },
      {
        capabilities: {
          resources: {},
          tools: {},
        },
      }
    );

    this.setupResourceHandlers();
    this.setupToolHandlers();

    this.server.onerror = (error) => console.error('[MCP Error]', error);
    // Shut down cleanly on Ctrl-C.
    process.on('SIGINT', async () => {
      await this.server.close();
      process.exit(0);
    });
  }

  // Fetch and parse robots.txt for the page's origin; returns null when it
  // cannot be retrieved, in which case the crawl proceeds unrestricted.
  private async fetchRobotsTxt(url: string): Promise<any | null> {
    try {
      const robotsUrl = new URL('/robots.txt', url).toString();
      const response = await axios.get(robotsUrl, { timeout: this.config.timeout });
      return RobotsParser.default(robotsUrl, response.data);
    } catch (error) {
      return null;
    }
  }

  private async crawlPage(url: string, depth = 0): Promise<CrawlResult> {
    // Skip pages beyond the configured depth or already visited in this process.
    if (depth > this.config.maxDepth || this.visitedUrls.has(url)) {
      return { url, content: '', links: [] };
    }

    this.visitedUrls.add(url);

    try {
      // Check robots.txt
      const robots = await this.fetchRobotsTxt(url);
      if (robots && !robots.isAllowed(url, 'MCPCrawler')) {
        return { url, content: '', links: [] };
      }

      // Fetch page content (concurrency capped by p-limit)
      const response = await this.limit(() =>
        axios.get(url, {
          timeout: this.config.timeout,
          headers: {
            'User-Agent': 'MCPCrawler/1.0'
          }
        })
      );

      const $ = cheerio.load(response.data);
      const content = $('body').text().replace(/\s+/g, ' ').trim();

      // Extract links if configured
      const links = this.config.crawlLinks
        ? $('a')
            .map((_i: number, el: any) => $(el).attr('href'))
            .get()
            .filter((link: string | undefined): link is string => !!link)
            .map((link: string) => new URL(link, url).toString())
        : [];

      // Delay between requests
      await new Promise(resolve => setTimeout(resolve, this.config.requestDelay));

      return { url, content, links };
    } catch (error) {
      console.error(`Error crawling ${url}:`, error);
      return { url, content: '', links: [] };
    }
  }

  private setupResourceHandlers() {
    // No resources are exposed; the list handlers return empty collections.
    this.server.setRequestHandler(ListResourcesRequestSchema, async () => ({
      resources: []
    }));

    this.server.setRequestHandler(ListResourceTemplatesRequestSchema, async () => ({
      resourceTemplates: []
    }));

    this.server.setRequestHandler(ReadResourceRequestSchema, async (request) => {
      throw new McpError(ErrorCode.MethodNotFound, 'Resources not implemented');
    });
  }

  private setupToolHandlers() {
    // Advertise the single `crawl` tool and its input schema.
    this.server.setRequestHandler(ListToolsRequestSchema, async () => ({
      tools: [
        {
          name: 'crawl',
          description: 'Crawl a web page and extract content',
          inputSchema: {
            type: 'object',
            properties: {
              url: {
                type: 'string',
                format: 'uri',
                description: 'URL to crawl'
              },
              depth: {
                type: 'number',
                description: 'Maximum crawl depth',
                minimum: 0,
                maximum: this.config.maxDepth
              }
            },
            required: ['url']
          }
        }
      ]
    }));

    this.server.setRequestHandler(CallToolRequestSchema, async (request) => {
      if (request.params.name !== 'crawl') {
        throw new McpError(ErrorCode.MethodNotFound, `Unknown tool: ${request.params.name}`);
      }

      const { url, depth = 0 } = request.params.arguments as { url: string; depth?: number };
      if (typeof url !== 'string') {
        throw new McpError(ErrorCode.InvalidParams, 'Invalid URL parameter');
      }

      try {
        const result = await this.crawlPage(url, depth);
        return {
          content: [{
            type: 'text',
            text: JSON.stringify({
              url: result.url,
              content: result.content,
              links: result.links
            }, null, 2)
          }]
        };
      } catch (error) {
        return {
          content: [{
            type: 'text',
            text: `Crawl failed: ${error instanceof Error ? error.message : 'Unknown error'}`
          }],
          isError: true
        };
      }
    });
  }

  async run() {
    const transport = new StdioServerTransport();
    await this.server.connect(transport);
    console.error('Web Crawler MCP server running on stdio');
  }
}

const server = new WebCrawlerServer();
server.run().catch(console.error);
```