# Directory Structure
```
├── README.md
└── web-crawler-backup.ts
```
# Files
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
````markdown
# Web Crawler MCP Server Deployment Guide

## Prerequisites
- Node.js (v18+)
- npm (v9+)

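
You can confirm the installed versions with:

```bash
node --version   # should report v18 or newer
npm --version    # should report v9 or newer
```
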
## Installation
1. Clone the repository:
```bash
git clone https://github.com/jitsmaster/web-crawler-mcp.git
cd web-crawler-mcp
```

2. Install dependencies:
```bash
npm install
```

3. Build the project:
```bash
npm run build
```

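
If the build succeeds, the compiled entry point referenced by the MCP configuration below should exist (assuming the TypeScript build emits to `build/`):

```bash
ls build/index.js
```
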
## Configuration
Create a `.env` file with the following environment variables:

```env
CRAWL_LINKS=false
MAX_DEPTH=3
REQUEST_DELAY=1000
TIMEOUT=5000
MAX_CONCURRENT=5
```

Any variable left unset falls back to the default listed in the Configuration Options table below.
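
Note that `web-crawler-backup.ts` reads these values straight from `process.env` and does not itself load a `.env` file; if the built entry point behaves the same way, make sure the variables are exported in the environment that launches the server, for example:

```bash
export CRAWL_LINKS=false
export MAX_DEPTH=3
export REQUEST_DELAY=1000
export TIMEOUT=5000
export MAX_CONCURRENT=5
```
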
## Running the Server
Start the MCP server:
```bash
npm start
```

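
On a successful start, the server logs a readiness message to stderr (see `run()` in `web-crawler-backup.ts`):

```
Web Crawler MCP server running on stdio
```
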
## MCP Configuration
Add the following to your MCP settings file, replacing `/path/to/web-crawler` with the absolute path to your clone (`build/index.js` is produced by `npm run build`):

```json
{
  "mcpServers": {
    "web-crawler": {
      "command": "node",
      "args": ["/path/to/web-crawler/build/index.js"],
      "env": {
        "CRAWL_LINKS": "false",
        "MAX_DEPTH": "3",
        "REQUEST_DELAY": "1000",
        "TIMEOUT": "5000",
        "MAX_CONCURRENT": "5"
      }
    }
  }
}
```

## Usage
The server exposes a single `crawl` tool over MCP. Example tool arguments:

```json
{
  "url": "https://example.com",
  "depth": 1
}
```

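
A successful call returns a single text content item whose `text` field is the crawl result serialized as JSON (see the `CallToolRequestSchema` handler in `web-crawler-backup.ts`). The values below are purely illustrative:

```json
{
  "url": "https://example.com",
  "content": "Example Domain This domain is for use in illustrative examples in documents.",
  "links": []
}
```
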
## Configuration Options
| Environment Variable | Default | Description |
|----------------------|---------|-------------|
| CRAWL_LINKS | false | Whether to extract links from crawled pages |
| MAX_DEPTH | 3 | Maximum crawl depth |
| REQUEST_DELAY | 1000 | Delay between requests (ms) |
| TIMEOUT | 5000 | Request timeout (ms) |
| MAX_CONCURRENT | 5 | Maximum concurrent requests |

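
For a one-off run you can also override individual options inline, leaving the rest at their defaults:

```bash
CRAWL_LINKS=true MAX_DEPTH=2 npm start
```
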
````
--------------------------------------------------------------------------------
/web-crawler-backup.ts:
--------------------------------------------------------------------------------
```typescript
#!/usr/bin/env node
import { Server } from '@modelcontextprotocol/sdk/server/index.js';
import { StdioServerTransport } from '@modelcontextprotocol/sdk/server/stdio.js';
import {
  CallToolRequestSchema,
  ErrorCode,
  ListResourcesRequestSchema,
  ListResourceTemplatesRequestSchema,
  ListToolsRequestSchema,
  McpError,
  ReadResourceRequestSchema,
} from '@modelcontextprotocol/sdk/types.js';
import axios from 'axios';
import * as cheerio from 'cheerio';
import RobotsParserModule from 'robots-parser';
// robots-parser is a CommonJS package; this cast is an ESM/CJS interop
// workaround so the default-exported factory can be reached via `.default`.
const RobotsParser = RobotsParserModule as unknown as {
  default: (robotsUrl: string, content: string) => any;
};
import pLimit from 'p-limit';
type Limit = ReturnType<typeof pLimit>;

interface CrawlerConfig {
  crawlLinks: boolean;
  maxDepth: number;
  requestDelay: number;
  timeout: number;
  maxConcurrent: number;
}

interface CrawlResult {
  url: string;
  content: string;
  links: string[];
}

class WebCrawlerServer {
  private server: Server;
  private config: CrawlerConfig;
  private limit: Limit;
  // Visited URLs persist for the lifetime of the server process, so a
  // repeat crawl of the same URL returns an empty result.
  private visitedUrls: Set<string> = new Set();

  constructor() {
    // Read configuration from environment variables, falling back to the
    // defaults documented in the README.
    this.config = {
      crawlLinks: process.env.CRAWL_LINKS === 'true',
      maxDepth: parseInt(process.env.MAX_DEPTH || '3', 10),
      requestDelay: parseInt(process.env.REQUEST_DELAY || '1000', 10),
      timeout: parseInt(process.env.TIMEOUT || '5000', 10),
      maxConcurrent: parseInt(process.env.MAX_CONCURRENT || '5', 10)
    };

    this.limit = pLimit(this.config.maxConcurrent);

    this.server = new Server(
      {
        name: 'web-crawler',
        version: '0.1.0',
      },
      {
        capabilities: {
          resources: {},
          tools: {},
        },
      }
    );

    this.setupResourceHandlers();
    this.setupToolHandlers();

    this.server.onerror = (error) => console.error('[MCP Error]', error);
    // Shut down cleanly on Ctrl-C.
    process.on('SIGINT', async () => {
      await this.server.close();
      process.exit(0);
    });
  }

  // Fetch and parse robots.txt for the page's origin; returns null when it
  // cannot be retrieved, in which case the crawl proceeds unrestricted.
  private async fetchRobotsTxt(url: string): Promise<any | null> {
    try {
      const robotsUrl = new URL('/robots.txt', url).toString();
      const response = await axios.get(robotsUrl, { timeout: this.config.timeout });
      return RobotsParser.default(robotsUrl, response.data);
    } catch (error) {
      return null;
    }
  }

  private async crawlPage(url: string, depth = 0): Promise<CrawlResult> {
    // Skip pages beyond the configured depth or already visited in this process.
    if (depth > this.config.maxDepth || this.visitedUrls.has(url)) {
      return { url, content: '', links: [] };
    }

    this.visitedUrls.add(url);

    try {
      // Check robots.txt
      const robots = await this.fetchRobotsTxt(url);
      if (robots && !robots.isAllowed(url, 'MCPCrawler')) {
        return { url, content: '', links: [] };
      }

      // Fetch page content (concurrency capped by p-limit)
      const response = await this.limit(() =>
        axios.get(url, {
          timeout: this.config.timeout,
          headers: {
            'User-Agent': 'MCPCrawler/1.0'
          }
        })
      );

      const $ = cheerio.load(response.data);
      const content = $('body').text().replace(/\s+/g, ' ').trim();

      // Extract links if configured
      const links = this.config.crawlLinks
        ? $('a')
            .map((_i: number, el: any) => $(el).attr('href'))
            .get()
            .filter((link: string | undefined): link is string => !!link)
            .map((link: string) => new URL(link, url).toString())
        : [];

      // Delay between requests
      await new Promise(resolve => setTimeout(resolve, this.config.requestDelay));

      return { url, content, links };
    } catch (error) {
      console.error(`Error crawling ${url}:`, error);
      return { url, content: '', links: [] };
    }
  }

  private setupResourceHandlers() {
    // No resources are exposed; the list handlers return empty collections.
    this.server.setRequestHandler(ListResourcesRequestSchema, async () => ({
      resources: []
    }));

    this.server.setRequestHandler(ListResourceTemplatesRequestSchema, async () => ({
      resourceTemplates: []
    }));

    this.server.setRequestHandler(ReadResourceRequestSchema, async (request) => {
      throw new McpError(ErrorCode.MethodNotFound, 'Resources not implemented');
    });
  }

  private setupToolHandlers() {
    // Advertise the single `crawl` tool and its input schema.
    this.server.setRequestHandler(ListToolsRequestSchema, async () => ({
      tools: [
        {
          name: 'crawl',
          description: 'Crawl a web page and extract content',
          inputSchema: {
            type: 'object',
            properties: {
              url: {
                type: 'string',
                format: 'uri',
                description: 'URL to crawl'
              },
              depth: {
                type: 'number',
                description: 'Maximum crawl depth',
                minimum: 0,
                maximum: this.config.maxDepth
              }
            },
            required: ['url']
          }
        }
      ]
    }));

    this.server.setRequestHandler(CallToolRequestSchema, async (request) => {
      if (request.params.name !== 'crawl') {
        throw new McpError(ErrorCode.MethodNotFound, `Unknown tool: ${request.params.name}`);
      }

      const { url, depth = 0 } = request.params.arguments as { url: string; depth?: number };
      if (typeof url !== 'string') {
        throw new McpError(ErrorCode.InvalidParams, 'Invalid URL parameter');
      }

      try {
        const result = await this.crawlPage(url, depth);
        return {
          content: [{
            type: 'text',
            text: JSON.stringify({
              url: result.url,
              content: result.content,
              links: result.links
            }, null, 2)
          }]
        };
      } catch (error) {
        return {
          content: [{
            type: 'text',
            text: `Crawl failed: ${error instanceof Error ? error.message : 'Unknown error'}`
          }],
          isError: true
        };
      }
    });
  }

  async run() {
    const transport = new StdioServerTransport();
    await this.server.connect(transport);
    console.error('Web Crawler MCP server running on stdio');
  }
}

const server = new WebCrawlerServer();
server.run().catch(console.error);
```