# Directory Structure
```
├── README.md
└── web-crawler-backup.ts
```
# Files
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
````markdown
# Web Crawler MCP Server Deployment Guide
## Prerequisites
- Node.js (v18+)
- npm (v9+)
## Installation
1. Clone the repository:
```bash
git clone https://github.com/jitsmaster/web-crawler-mcp.git
cd web-crawler-mcp
```
2. Install dependencies:
```bash
npm install
```
3. Build the project:
```bash
npm run build
```
## Configuration
Create a `.env` file with the following environment variables:
```env
CRAWL_LINKS=false
MAX_DEPTH=3
REQUEST_DELAY=1000
TIMEOUT=5000
MAX_CONCURRENT=5
```
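For reference, `web-crawler-backup.ts` builds its runtime configuration from these variables roughly as follows (a minimal sketch; defaults apply when a variable is unset):
```typescript
// How the crawler config is derived from the environment
// (mirrors the constructor in web-crawler-backup.ts).
const config = {
  crawlLinks: process.env.CRAWL_LINKS === 'true',
  maxDepth: parseInt(process.env.MAX_DEPTH || '3', 10),
  requestDelay: parseInt(process.env.REQUEST_DELAY || '1000', 10),
  timeout: parseInt(process.env.TIMEOUT || '5000', 10),
  maxConcurrent: parseInt(process.env.MAX_CONCURRENT || '5', 10),
};
```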
## Running the Server
Start the MCP server:
```bash
npm start
```
## MCP Configuration
Add the following to your MCP settings file:
```json
{
  "mcpServers": {
    "web-crawler": {
      "command": "node",
      "args": ["/path/to/web-crawler/build/index.js"],
      "env": {
        "CRAWL_LINKS": "false",
        "MAX_DEPTH": "3",
        "REQUEST_DELAY": "1000",
        "TIMEOUT": "5000",
        "MAX_CONCURRENT": "5"
      }
    }
  }
}
```
## Usage
The server exposes a single `crawl` tool over MCP. Example tool arguments:
```json
{
  "url": "https://example.com",
  "depth": 1
}
```
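The crawl result comes back as a single text content item containing JSON with the page URL, the whitespace-normalized body text, and any extracted links. An illustrative response payload (values are examples only):
```json
{
  "url": "https://example.com",
  "content": "Example Domain This domain is for use in illustrative examples in documents.",
  "links": []
}
```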
## Configuration Options
| Environment Variable | Default | Description |
|----------------------|---------|-------------|
| CRAWL_LINKS | false | Whether to extract links from crawled pages |
| MAX_DEPTH | 3 | Maximum crawl depth |
| REQUEST_DELAY | 1000 | Delay between requests (ms) |
| TIMEOUT | 5000 | Request timeout (ms) |
| MAX_CONCURRENT | 5 | Maximum concurrent requests |
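
When `CRAWL_LINKS` is enabled, anchor `href`s found on the fetched page are resolved against the page URL and returned in `links`. A minimal sketch of that step, mirroring `web-crawler-backup.ts` (the `extractLinks` helper name is illustrative):
```typescript
import * as cheerio from 'cheerio';

// Resolve every <a href> on a page to an absolute URL.
function extractLinks(html: string, pageUrl: string): string[] {
  const $ = cheerio.load(html);
  return $('a')
    .map((_i, el) => $(el).attr('href'))
    .get()
    .filter((href): href is string => !!href)
    .map((href) => new URL(href, pageUrl).toString());
}
```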
````
--------------------------------------------------------------------------------
/web-crawler-backup.ts:
--------------------------------------------------------------------------------
```typescript
#!/usr/bin/env node
import { Server } from '@modelcontextprotocol/sdk/server/index.js';
import { StdioServerTransport } from '@modelcontextprotocol/sdk/server/stdio.js';
import {
  CallToolRequestSchema,
  ErrorCode,
  ListResourcesRequestSchema,
  ListResourceTemplatesRequestSchema,
  ListToolsRequestSchema,
  McpError,
  ReadResourceRequestSchema,
} from '@modelcontextprotocol/sdk/types.js';
import axios from 'axios';
import * as cheerio from 'cheerio';
import RobotsParserModule from 'robots-parser';
import pLimit from 'p-limit';

// robots-parser is a CommonJS package; unwrap its default export so it can be
// called as a function regardless of module-interop settings.
const RobotsParser = RobotsParserModule as unknown as {
  default: (robotsUrl: string, content: string) => any;
};

type Limit = ReturnType<typeof pLimit>;
interface CrawlerConfig {
  crawlLinks: boolean;
  maxDepth: number;
  requestDelay: number;
  timeout: number;
  maxConcurrent: number;
}

interface CrawlResult {
  url: string;
  content: string;
  links: string[];
}
class WebCrawlerServer {
  private server: Server;
  private config: CrawlerConfig;
  private limit: Limit;
  private visitedUrls: Set<string> = new Set();

  constructor() {
    // Crawler behaviour is driven entirely by environment variables; defaults
    // apply when a variable is unset.
    this.config = {
      crawlLinks: process.env.CRAWL_LINKS === 'true',
      maxDepth: parseInt(process.env.MAX_DEPTH || '3', 10),
      requestDelay: parseInt(process.env.REQUEST_DELAY || '1000', 10),
      timeout: parseInt(process.env.TIMEOUT || '5000', 10),
      maxConcurrent: parseInt(process.env.MAX_CONCURRENT || '5', 10)
    };
    this.limit = pLimit(this.config.maxConcurrent);

    this.server = new Server(
      {
        name: 'web-crawler',
        version: '0.1.0',
      },
      {
        capabilities: {
          resources: {},
          tools: {},
        },
      }
    );

    this.setupResourceHandlers();
    this.setupToolHandlers();

    this.server.onerror = (error) => console.error('[MCP Error]', error);
    process.on('SIGINT', async () => {
      await this.server.close();
      process.exit(0);
    });
  }
  // Fetch and parse robots.txt for the target site; returns null if it cannot
  // be retrieved, in which case crawling proceeds.
  private async fetchRobotsTxt(url: string): Promise<any | null> {
    try {
      const robotsUrl = new URL('/robots.txt', url).toString();
      const response = await axios.get(robotsUrl, { timeout: this.config.timeout });
      return RobotsParser.default(robotsUrl, response.data);
    } catch (error) {
      return null;
    }
  }
  private async crawlPage(url: string, depth = 0): Promise<CrawlResult> {
    if (depth > this.config.maxDepth || this.visitedUrls.has(url)) {
      return { url, content: '', links: [] };
    }
    this.visitedUrls.add(url);

    try {
      // Check robots.txt
      const robots = await this.fetchRobotsTxt(url);
      if (robots && !robots.isAllowed(url, 'MCPCrawler')) {
        return { url, content: '', links: [] };
      }

      // Fetch page content
      const response = await this.limit(() =>
        axios.get(url, {
          timeout: this.config.timeout,
          headers: {
            'User-Agent': 'MCPCrawler/1.0'
          }
        })
      );

      const $ = cheerio.load(response.data);
      const content = $('body').text().replace(/\s+/g, ' ').trim();

      // Extract links if configured
      const links = this.config.crawlLinks
        ? $('a')
            .map((_i: number, el: any) => $(el).attr('href'))
            .get()
            .filter((link: string | undefined): link is string => !!link)
            .map((link: string) => new URL(link, url).toString())
        : [];

      // Delay between requests
      await new Promise(resolve => setTimeout(resolve, this.config.requestDelay));

      return { url, content, links };
    } catch (error) {
      console.error(`Error crawling ${url}:`, error);
      return { url, content: '', links: [] };
    }
  }
  private setupResourceHandlers() {
    // This server exposes no resources; only the `crawl` tool is provided.
    this.server.setRequestHandler(ListResourcesRequestSchema, async () => ({
      resources: []
    }));
    this.server.setRequestHandler(ListResourceTemplatesRequestSchema, async () => ({
      resourceTemplates: []
    }));
    this.server.setRequestHandler(ReadResourceRequestSchema, async () => {
      throw new McpError(ErrorCode.MethodNotFound, 'Resources not implemented');
    });
  }
  private setupToolHandlers() {
    // Advertise the single `crawl` tool and its input schema.
    this.server.setRequestHandler(ListToolsRequestSchema, async () => ({
      tools: [
        {
          name: 'crawl',
          description: 'Crawl a web page and extract content',
          inputSchema: {
            type: 'object',
            properties: {
              url: {
                type: 'string',
                format: 'uri',
                description: 'URL to crawl'
              },
              depth: {
                type: 'number',
                description: 'Maximum crawl depth',
                minimum: 0,
                maximum: this.config.maxDepth
              }
            },
            required: ['url']
          }
        }
      ]
    }));

    // Handle `crawl` invocations: validate the arguments, crawl the page, and
    // return the result as a JSON text block.
    this.server.setRequestHandler(CallToolRequestSchema, async (request) => {
      if (request.params.name !== 'crawl') {
        throw new McpError(ErrorCode.MethodNotFound, `Unknown tool: ${request.params.name}`);
      }

      const { url, depth = 0 } = request.params.arguments as { url: string; depth?: number };
      if (typeof url !== 'string') {
        throw new McpError(ErrorCode.InvalidParams, 'Invalid URL parameter');
      }

      try {
        const result = await this.crawlPage(url, depth);
        return {
          content: [{
            type: 'text',
            text: JSON.stringify({
              url: result.url,
              content: result.content,
              links: result.links
            }, null, 2)
          }]
        };
      } catch (error) {
        return {
          content: [{
            type: 'text',
            text: `Crawl failed: ${error instanceof Error ? error.message : 'Unknown error'}`
          }],
          isError: true
        };
      }
    });
  }
  async run() {
    const transport = new StdioServerTransport();
    await this.server.connect(transport);
    console.error('Web Crawler MCP server running on stdio');
  }
}

const server = new WebCrawlerServer();
server.run().catch(console.error);
```