# Directory Structure

```
├── README.md
└── web-crawler-backup.ts
```

# Files

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------

```markdown
# Web Crawler MCP Server Deployment Guide

## Prerequisites
- Node.js (v18+)
- npm (v9+)

## Installation
1. Clone the repository:
   ```bash
   git clone https://github.com/jitsmaster/web-crawler-mcp.git
   cd web-crawler-mcp
   ```

2. Install dependencies:
   ```bash
   npm install
   ```

3. Build the project:
   ```bash
   npm run build
   ```

## Configuration
Create a `.env` file with the following environment variables:

```env
CRAWL_LINKS=false
MAX_DEPTH=3
REQUEST_DELAY=1000
TIMEOUT=5000
MAX_CONCURRENT=5
```

## Running the Server
Start the MCP server:
```bash
npm start
```

## MCP Configuration
Add the following to your MCP settings file:

```json
{
  "mcpServers": {
    "web-crawler": {
      "command": "node",
      "args": ["/path/to/web-crawler/build/index.js"],
      "env": {
        "CRAWL_LINKS": "false",
        "MAX_DEPTH": "3",
        "REQUEST_DELAY": "1000",
        "TIMEOUT": "5000",
        "MAX_CONCURRENT": "5"
      }
    }
  }
}
```

## Usage
The server exposes a `crawl` tool that can be invoked through MCP. Example tool arguments:

```json
{
  "url": "https://example.com",
  "depth": 1
}
```
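
The result comes back as a single text content block whose `text` field is the crawled page serialized as JSON (`url`, `content`, `links`). For a quick local check outside an MCP host, a small test script along the following lines should work; this is only a sketch, the build path, env values, and client name are placeholders, and `Client` / `StdioClientTransport` are the client classes from the MCP TypeScript SDK:

```typescript
// Minimal local test sketch (not part of this repo): spawns the built server over stdio
// and calls the `crawl` tool using the MCP TypeScript SDK client.
import { Client } from '@modelcontextprotocol/sdk/client/index.js';
import { StdioClientTransport } from '@modelcontextprotocol/sdk/client/stdio.js';

async function main() {
  const transport = new StdioClientTransport({
    command: 'node',
    args: ['build/index.js'], // assumed build output path from `npm run build`
    env: { CRAWL_LINKS: 'true', MAX_DEPTH: '2' },
  });

  const client = new Client({ name: 'crawl-test', version: '0.1.0' }, { capabilities: {} });
  await client.connect(transport);

  // The server responds with one text content block containing JSON
  // of the form { url, content, links }.
  const result = await client.callTool({
    name: 'crawl',
    arguments: { url: 'https://example.com', depth: 1 },
  });
  console.log(result);

  await client.close();
}

main().catch(console.error);
```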

## Configuration Options
| Environment Variable | Default | Description |
|----------------------|---------|-------------|
| CRAWL_LINKS          | false   | Whether to extract links from crawled pages |
| MAX_DEPTH            | 3       | Maximum crawl depth |
| REQUEST_DELAY        | 1000    | Delay between requests (ms) |
| TIMEOUT              | 5000    | Request timeout (ms) |
| MAX_CONCURRENT       | 5       | Maximum concurrent requests |

```

--------------------------------------------------------------------------------
/web-crawler-backup.ts:
--------------------------------------------------------------------------------

```typescript
#!/usr/bin/env node
import { Server } from '@modelcontextprotocol/sdk/server/index.js';
import { StdioServerTransport } from '@modelcontextprotocol/sdk/server/stdio.js';
import {
  CallToolRequestSchema,
  ErrorCode,
  ListResourcesRequestSchema,
  ListResourceTemplatesRequestSchema,
  ListToolsRequestSchema,
  McpError,
  ReadResourceRequestSchema,
} from '@modelcontextprotocol/sdk/types.js';
import axios from 'axios';
import * as cheerio from 'cheerio';
import RobotsParserModule from 'robots-parser';
const RobotsParser = RobotsParserModule as unknown as {
  default: (robotsUrl: string, content: string) => any;
};
import pLimit from 'p-limit';
type Limit = ReturnType<typeof pLimit>;

interface CrawlerConfig {
  crawlLinks: boolean;
  maxDepth: number;
  requestDelay: number;
  timeout: number;
  maxConcurrent: number;
}

interface CrawlResult {
  url: string;
  content: string;
  links: string[];
}

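// MCP server exposing a single 'crawl' tool over stdio; configuration comes from environment variables.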
class WebCrawlerServer {
  private server: Server;
  private config: CrawlerConfig;
  private limit: Limit;
  private visitedUrls: Set<string> = new Set();

  constructor() {
    this.config = {
      crawlLinks: process.env.CRAWL_LINKS === 'true',
      maxDepth: parseInt(process.env.MAX_DEPTH || '3', 10),
      requestDelay: parseInt(process.env.REQUEST_DELAY || '1000', 10),
      timeout: parseInt(process.env.TIMEOUT || '5000', 10),
      maxConcurrent: parseInt(process.env.MAX_CONCURRENT || '5', 10)
    };

    this.limit = pLimit(this.config.maxConcurrent);

    this.server = new Server(
      {
        name: 'web-crawler',
        version: '0.1.0',
      },
      {
        capabilities: {
          resources: {},
          tools: {},
        },
      }
    );

    this.setupResourceHandlers();
    this.setupToolHandlers();
    
    this.server.onerror = (error) => console.error('[MCP Error]', error);
    process.on('SIGINT', async () => {
      await this.server.close();
      process.exit(0);
    });
  }

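  // Fetch and parse robots.txt for the target URL's origin; returns null if it cannot be retrieved.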
  private async fetchRobotsTxt(url: string): Promise<any | null> {
    try {
      const robotsUrl = new URL('/robots.txt', url).toString();
      const response = await axios.get(robotsUrl, { timeout: this.config.timeout });
      return RobotsParser.default(robotsUrl, response.data);
    } catch (error) {
      return null;
    }
  }

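  // Crawl a single page: respects robots.txt, the configured depth limit, and the
  // visited-URL set, then returns the page text plus any extracted links.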
  private async crawlPage(url: string, depth = 0): Promise<CrawlResult> {
    if (depth > this.config.maxDepth || this.visitedUrls.has(url)) {
      return { url, content: '', links: [] };
    }

    this.visitedUrls.add(url);

    try {
      // Check robots.txt
      const robots = await this.fetchRobotsTxt(url);
      if (robots && !robots.isAllowed(url, 'MCPCrawler')) {
        return { url, content: '', links: [] };
      }

      // Fetch page content
      const response = await this.limit(() => 
        axios.get(url, { 
          timeout: this.config.timeout,
          headers: {
            'User-Agent': 'MCPCrawler/1.0'
          }
        })
      );

      const $ = cheerio.load(response.data);
      const content = $('body').text().replace(/\s+/g, ' ').trim();
      
      // Extract links if configured, skipping hrefs that do not resolve to valid absolute URLs
      const links = this.config.crawlLinks ?
        $('a')
          .map((_i: number, el: any) => $(el).attr('href'))
          .get()
          .filter((link: string | undefined): link is string => !!link)
          .flatMap((link: string) => {
            try {
              return [new URL(link, url).toString()];
            } catch {
              return [];
            }
          })
        : [];

      // Delay between requests
      await new Promise(resolve => setTimeout(resolve, this.config.requestDelay));

      return { url, content, links };
    } catch (error) {
      console.error(`Error crawling ${url}:`, error);
      return { url, content: '', links: [] };
    }
  }

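  // No resources are exposed: the list handlers return empty arrays and reads are rejected.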
  private setupResourceHandlers() {
    this.server.setRequestHandler(ListResourcesRequestSchema, async () => ({
      resources: []
    }));

    this.server.setRequestHandler(ListResourceTemplatesRequestSchema, async () => ({
      resourceTemplates: []
    }));

    this.server.setRequestHandler(ReadResourceRequestSchema, async (request) => {
      throw new McpError(ErrorCode.MethodNotFound, 'Resources not implemented');
    });
  }

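  // Advertise the single 'crawl' tool and handle incoming tool calls.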
  private setupToolHandlers() {
    this.server.setRequestHandler(ListToolsRequestSchema, async () => ({
      tools: [
        {
          name: 'crawl',
          description: 'Crawl a web page and extract content',
          inputSchema: {
            type: 'object',
            properties: {
              url: {
                type: 'string',
                format: 'uri',
                description: 'URL to crawl'
              },
              depth: {
                type: 'number',
                description: 'Maximum crawl depth',
                minimum: 0,
                maximum: this.config.maxDepth
              }
            },
            required: ['url']
          }
        }
      ]
    }));

    this.server.setRequestHandler(CallToolRequestSchema, async (request) => {
      if (request.params.name !== 'crawl') {
        throw new McpError(ErrorCode.MethodNotFound, `Unknown tool: ${request.params.name}`);
      }

      const { url, depth = 0 } = request.params.arguments as { url: string; depth?: number };
      if (typeof url !== 'string') {
        throw new McpError(ErrorCode.InvalidParams, 'Invalid URL parameter');
      }

      try {
        const result = await this.crawlPage(url, depth);
        return {
          content: [{
            type: 'text',
            text: JSON.stringify({
              url: result.url,
              content: result.content,
              links: result.links
            }, null, 2)
          }]
        };
      } catch (error) {
        return {
          content: [{
            type: 'text',
            text: `Crawl failed: ${error instanceof Error ? error.message : 'Unknown error'}`
          }],
          isError: true
        };
      }
    });
  }

  async run() {
    const transport = new StdioServerTransport();
    await this.server.connect(transport);
    console.error('Web Crawler MCP server running on stdio');
  }
}

const server = new WebCrawlerServer();
server.run().catch(console.error);

```