This is page 1 of 3. Use http://codebase.md/omgwtfwow/mcp-crawl4ai-ts?page={x} to view the full context. # Directory Structure ``` ├── .env.example ├── .github │ ├── CI.md │ ├── copilot-instructions.md │ └── workflows │ └── ci.yml ├── .gitignore ├── .prettierignore ├── .prettierrc.json ├── CHANGELOG.md ├── eslint.config.mjs ├── jest.config.cjs ├── jest.setup.cjs ├── LICENSE ├── package-lock.json ├── package.json ├── README.md ├── src │ ├── __tests__ │ │ ├── crawl.test.ts │ │ ├── crawl4ai-service.network.test.ts │ │ ├── crawl4ai-service.test.ts │ │ ├── handlers │ │ │ ├── crawl-handlers.test.ts │ │ │ ├── parameter-combinations.test.ts │ │ │ ├── screenshot-saving.test.ts │ │ │ ├── session-handlers.test.ts │ │ │ └── utility-handlers.test.ts │ │ ├── index.cli.test.ts │ │ ├── index.npx.test.ts │ │ ├── index.server.test.ts │ │ ├── index.test.ts │ │ ├── integration │ │ │ ├── batch-crawl.integration.test.ts │ │ │ ├── capture-screenshot.integration.test.ts │ │ │ ├── crawl-advanced.integration.test.ts │ │ │ ├── crawl-handlers.integration.test.ts │ │ │ ├── crawl-recursive.integration.test.ts │ │ │ ├── crawl.integration.test.ts │ │ │ ├── execute-js.integration.test.ts │ │ │ ├── extract-links.integration.test.ts │ │ │ ├── extract-with-llm.integration.test.ts │ │ │ ├── generate-pdf.integration.test.ts │ │ │ ├── get-html.integration.test.ts │ │ │ ├── get-markdown.integration.test.ts │ │ │ ├── parse-sitemap.integration.test.ts │ │ │ ├── session-management.integration.test.ts │ │ │ ├── smart-crawl.integration.test.ts │ │ │ └── test-utils.ts │ │ ├── request-handler.test.ts │ │ ├── schemas │ │ │ └── validation-edge-cases.test.ts │ │ ├── types │ │ │ └── mocks.ts │ │ └── utils │ │ └── javascript-validation.test.ts │ ├── crawl4ai-service.ts │ ├── handlers │ │ ├── base-handler.ts │ │ ├── content-handlers.ts │ │ ├── crawl-handlers.ts │ │ ├── session-handlers.ts │ │ └── utility-handlers.ts │ ├── index.ts │ ├── schemas │ │ ├── helpers.ts │ │ └── validation-schemas.ts │ ├── server.ts │ └── types.ts ├── tsconfig.build.json └── tsconfig.json ``` # Files -------------------------------------------------------------------------------- /.prettierignore: -------------------------------------------------------------------------------- ``` dist node_modules *.md *.json .env .env.* coverage .nyc_output ``` -------------------------------------------------------------------------------- /.prettierrc.json: -------------------------------------------------------------------------------- ```json { "semi": true, "trailingComma": "all", "singleQuote": true, "printWidth": 120, "tabWidth": 2, "useTabs": false, "bracketSpacing": true, "arrowParens": "always", "endOfLine": "lf" } ``` -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- ``` # Dependencies node_modules/ npm-debug.log* yarn-debug.log* yarn-error.log* # Build output dist/ build/ *.js *.js.map *.d.ts *.d.ts.map # Environment .env .env.local .env.*.local # IDE .vscode/ .idea/ *.swp *.swo *~ # OS .DS_Store Thumbs.db # Logs logs/ *.log # Testing coverage/ .nyc_output/ src/__tests__/mock-responses.json # Temporary files tmp/ temp/ add-to-claude.sh ``` -------------------------------------------------------------------------------- /.env.example: -------------------------------------------------------------------------------- ``` # Required: URL of your Crawl4AI server CRAWL4AI_BASE_URL=http://localhost:11235 # Optional: API key for authentication (if your 
server requires it)
CRAWL4AI_API_KEY=

# Optional: Custom server name and version
SERVER_NAME=crawl4ai-mcp
SERVER_VERSION=0.7.4

# Optional: For LLM extraction tests
LLM_PROVIDER=openai/gpt-4o-mini
LLM_API_TOKEN=your-llm-api-key
LLM_BASE_URL=https://api.openai.com/v1 # If using custom endpoint
```

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------

```markdown
# MCP Server for Crawl4AI

> **Note:** Tested with Crawl4AI version 0.7.4

[npm version](https://www.npmjs.com/package/mcp-crawl4ai-ts) [License: MIT](https://opensource.org/licenses/MIT) [Node.js 18+](https://nodejs.org/) [Coverage](https://omgwtfwow.github.io/mcp-crawl4ai-ts/coverage/)

TypeScript implementation of an MCP server for Crawl4AI. Provides tools for web crawling, content extraction, and browser automation.

## Table of Contents

- [Prerequisites](#prerequisites)
- [Quick Start](#quick-start)
- [Configuration](#configuration)
- [Client-Specific Instructions](#client-specific-instructions)
- [Available Tools](#available-tools)
  - [1. get_markdown](#1-get_markdown---extract-content-as-markdown-with-filtering)
  - [2. capture_screenshot](#2-capture_screenshot---capture-webpage-screenshot)
  - [3. generate_pdf](#3-generate_pdf---convert-webpage-to-pdf)
  - [4. execute_js](#4-execute_js---execute-javascript-and-get-return-values)
  - [5. batch_crawl](#5-batch_crawl---crawl-multiple-urls-concurrently)
  - [6. smart_crawl](#6-smart_crawl---auto-detect-and-handle-different-content-types)
  - [7. get_html](#7-get_html---get-sanitized-html-for-analysis)
  - [8. extract_links](#8-extract_links---extract-and-categorize-page-links)
  - [9. crawl_recursive](#9-crawl_recursive---deep-crawl-website-following-links)
  - [10. parse_sitemap](#10-parse_sitemap---extract-urls-from-xml-sitemaps)
  - [11. crawl](#11-crawl---advanced-web-crawling-with-full-configuration)
  - [12. manage_session](#12-manage_session---unified-session-management)
  - [13. extract_with_llm](#13-extract_with_llm---extract-structured-data-using-ai)
- [Advanced Configuration](#advanced-configuration)
- [Development](#development)
- [License](#license)

## Prerequisites

- Node.js 18+ and npm
- A running Crawl4AI server

## Quick Start

### 1. Start the Crawl4AI server (for example, local docker)

```bash
docker run -d -p 11235:11235 --name crawl4ai --shm-size=1g unclecode/crawl4ai:0.7.4
```

### 2. Add to your MCP client

This MCP server works with any MCP-compatible client (Claude Desktop, Claude Code, Cursor, LMStudio, etc.).
#### Using npx (Recommended) ```json { "mcpServers": { "crawl4ai": { "command": "npx", "args": ["mcp-crawl4ai-ts"], "env": { "CRAWL4AI_BASE_URL": "http://localhost:11235" } } } } ``` #### Using local installation ```json { "mcpServers": { "crawl4ai": { "command": "node", "args": ["/path/to/mcp-crawl4ai-ts/dist/index.js"], "env": { "CRAWL4AI_BASE_URL": "http://localhost:11235" } } } } ``` #### With all optional variables ```json { "mcpServers": { "crawl4ai": { "command": "npx", "args": ["mcp-crawl4ai-ts"], "env": { "CRAWL4AI_BASE_URL": "http://localhost:11235", "CRAWL4AI_API_KEY": "your-api-key", "SERVER_NAME": "custom-name", "SERVER_VERSION": "1.0.0" } } } } ``` ## Configuration ### Environment Variables ```env # Required CRAWL4AI_BASE_URL=http://localhost:11235 # Optional - Server Configuration CRAWL4AI_API_KEY= # If your server requires auth SERVER_NAME=crawl4ai-mcp # Custom name for the MCP server SERVER_VERSION=1.0.0 # Custom version ``` ## Client-Specific Instructions ### Claude Desktop Add to `~/Library/Application Support/Claude/claude_desktop_config.json` ### Claude Code ```bash claude mcp add crawl4ai -e CRAWL4AI_BASE_URL=http://localhost:11235 -- npx mcp-crawl4ai-ts ``` ### Other MCP Clients Consult your client's documentation for MCP server configuration. The key details: - **Command**: `npx mcp-crawl4ai-ts` or `node /path/to/dist/index.js` - **Required env**: `CRAWL4AI_BASE_URL` - **Optional env**: `CRAWL4AI_API_KEY`, `SERVER_NAME`, `SERVER_VERSION` ## Available Tools ### 1. `get_markdown` - Extract content as markdown with filtering ```typescript { url: string, // Required: URL to extract markdown from filter?: 'raw'|'fit'|'bm25'|'llm', // Filter type (default: 'fit') query?: string, // Query for bm25/llm filters cache?: string // Cache-bust parameter (default: '0') } ``` Extracts content as markdown with various filtering options. Use 'bm25' or 'llm' filters with a query for specific content extraction. ### 2. `capture_screenshot` - Capture webpage screenshot ```typescript { url: string, // Required: URL to capture screenshot_wait_for?: number // Seconds to wait before screenshot (default: 2) } ``` Returns base64-encoded PNG. Note: This is stateless - for screenshots after JS execution, use `crawl` with `screenshot: true`. ### 3. `generate_pdf` - Convert webpage to PDF ```typescript { url: string // Required: URL to convert to PDF } ``` Returns base64-encoded PDF. Stateless tool - for PDFs after JS execution, use `crawl` with `pdf: true`. ### 4. `execute_js` - Execute JavaScript and get return values ```typescript { url: string, // Required: URL to load scripts: string | string[] // Required: JavaScript to execute } ``` Executes JavaScript and returns results. Each script can use 'return' to get values back. Stateless - for persistent JS execution use `crawl` with `js_code`. ### 5. `batch_crawl` - Crawl multiple URLs concurrently ```typescript { urls: string[], // Required: List of URLs to crawl max_concurrent?: number, // Parallel request limit (default: 5) remove_images?: boolean, // Remove images from output (default: false) bypass_cache?: boolean, // Bypass cache for all URLs (default: false) configs?: Array<{ // Optional: Per-URL configurations (v3.0.0+) url: string, [key: string]: any // Any crawl parameters for this specific URL }> } ``` Efficiently crawls multiple URLs in parallel. Each URL gets a fresh browser instance. With `configs` array, you can specify different parameters for each URL. ### 6. 
`smart_crawl` - Auto-detect and handle different content types ```typescript { url: string, // Required: URL to crawl max_depth?: number, // Maximum depth for recursive crawling (default: 2) follow_links?: boolean, // Follow links in content (default: true) bypass_cache?: boolean // Bypass cache (default: false) } ``` Intelligently detects content type (HTML/sitemap/RSS) and processes accordingly. ### 7. `get_html` - Get sanitized HTML for analysis ```typescript { url: string // Required: URL to extract HTML from } ``` Returns preprocessed HTML optimized for structure analysis. Use for building schemas or analyzing patterns. ### 8. `extract_links` - Extract and categorize page links ```typescript { url: string, // Required: URL to extract links from categorize?: boolean // Group by type (default: true) } ``` Extracts all links and groups them by type: internal, external, social media, documents, images. ### 9. `crawl_recursive` - Deep crawl website following links ```typescript { url: string, // Required: Starting URL max_depth?: number, // Maximum depth to crawl (default: 3) max_pages?: number, // Maximum pages to crawl (default: 50) include_pattern?: string, // Regex pattern for URLs to include exclude_pattern?: string // Regex pattern for URLs to exclude } ``` Crawls a website following internal links up to specified depth. Returns content from all discovered pages. ### 10. `parse_sitemap` - Extract URLs from XML sitemaps ```typescript { url: string, // Required: Sitemap URL (e.g., /sitemap.xml) filter_pattern?: string // Optional: Regex pattern to filter URLs } ``` Extracts all URLs from XML sitemaps. Supports regex filtering for specific URL patterns. ### 11. `crawl` - Advanced web crawling with full configuration ```typescript { url: string, // URL to crawl // Browser Configuration browser_type?: 'chromium'|'firefox'|'webkit'|'undetected', // Browser engine (undetected = stealth mode) viewport_width?: number, // Browser width (default: 1080) viewport_height?: number, // Browser height (default: 600) user_agent?: string, // Custom user agent proxy_server?: string | { // Proxy URL (string or object format) server: string, username?: string, password?: string }, proxy_username?: string, // Proxy auth (if using string format) proxy_password?: string, // Proxy password (if using string format) cookies?: Array<{name, value, domain}>, // Pre-set cookies headers?: Record<string,string>, // Custom headers // Crawler Configuration word_count_threshold?: number, // Min words per block (default: 200) excluded_tags?: string[], // HTML tags to exclude remove_overlay_elements?: boolean, // Remove popups/modals js_code?: string | string[], // JavaScript to execute wait_for?: string, // Wait condition (selector or JS) wait_for_timeout?: number, // Wait timeout (default: 30000) delay_before_scroll?: number, // Pre-scroll delay scroll_delay?: number, // Between-scroll delay process_iframes?: boolean, // Include iframe content exclude_external_links?: boolean, // Remove external links screenshot?: boolean, // Capture screenshot pdf?: boolean, // Generate PDF session_id?: string, // Reuse browser session (only works with crawl tool) cache_mode?: 'ENABLED'|'BYPASS'|'DISABLED', // Cache control // New in v3.0.0 (Crawl4AI 0.7.3/0.7.4) css_selector?: string, // CSS selector to filter content delay_before_return_html?: number, // Delay in seconds before returning HTML include_links?: boolean, // Include extracted links in response resolve_absolute_urls?: boolean, // Convert relative URLs to absolute // LLM 
Extraction (REST API only supports 'llm' type) extraction_type?: 'llm', // Only 'llm' extraction is supported via REST API extraction_schema?: object, // Schema for structured extraction extraction_instruction?: string, // Natural language extraction prompt extraction_strategy?: { // Advanced extraction configuration provider?: string, api_key?: string, model?: string, [key: string]: any }, table_extraction_strategy?: { // Table extraction configuration enable_chunking?: boolean, thresholds?: object, [key: string]: any }, markdown_generator_options?: { // Markdown generation options include_links?: boolean, preserve_formatting?: boolean, [key: string]: any }, timeout?: number, // Overall timeout (default: 60000) verbose?: boolean // Detailed logging } ``` ### 12. `manage_session` - Unified session management ```typescript { action: 'create' | 'clear' | 'list', // Required: Action to perform session_id?: string, // For 'create' and 'clear' actions initial_url?: string, // For 'create' action: URL to load browser_type?: 'chromium' | 'firefox' | 'webkit' | 'undetected' // For 'create' action } ``` Unified tool for managing browser sessions. Supports three actions: - **create**: Start a persistent browser session - **clear**: Remove a session from local tracking - **list**: Show all active sessions Examples: ```typescript // Create a new session { action: 'create', session_id: 'my-session', initial_url: 'https://example.com' } // Clear a session { action: 'clear', session_id: 'my-session' } // List all sessions { action: 'list' } ``` ### 13. `extract_with_llm` - Extract structured data using AI ```typescript { url: string, // URL to extract data from query: string // Natural language extraction instructions } ``` Uses AI to extract structured data from webpages. Returns results immediately without any polling or job management. This is the recommended way to extract specific information since CSS/XPath extraction is not supported via the REST API. ## Advanced Configuration For detailed information about all available configuration options, extraction strategies, and advanced features, please refer to the official Crawl4AI documentation: - [Crawl4AI Documentation](https://docs.crawl4ai.com/) - [Crawl4AI GitHub Repository](https://github.com/unclecode/crawl4ai) ## Changelog See [CHANGELOG.md](CHANGELOG.md) for detailed version history and recent updates. ## Development ### Setup ```bash # 1. Start the Crawl4AI server docker run -d -p 11235:11235 --name crawl4ai --shm-size=1g unclecode/crawl4ai:latest # 2. Install MCP server git clone https://github.com/omgwtfwow/mcp-crawl4ai-ts.git cd mcp-crawl4ai-ts npm install cp .env.example .env # 3. Development commands npm run dev # Development mode npm test # Run tests npm run lint # Check code quality npm run build # Production build # 4. Add to your MCP client (See "Using local installation") ``` ### Running Integration Tests Integration tests require a running Crawl4AI server. 
Configure your environment:

```bash
# Required for integration tests
export CRAWL4AI_BASE_URL=http://localhost:11235
export CRAWL4AI_API_KEY=your-api-key # If authentication is required

# Optional: For LLM extraction tests
export LLM_PROVIDER=openai/gpt-4o-mini
export LLM_API_TOKEN=your-llm-api-key
export LLM_BASE_URL=https://api.openai.com/v1 # If using custom endpoint

# Run integration tests (ALWAYS use the npm script; don't call `jest` directly)
npm run test:integration

# Run a single integration test file
npm run test:integration -- src/__tests__/integration/extract-links.integration.test.ts
```

> IMPORTANT: Do NOT run `npx jest` directly for integration tests. The npm script injects `NODE_OPTIONS=--experimental-vm-modules`, which is required for ESM + ts-jest. Running Jest directly will produce `SyntaxError: Cannot use import statement outside a module` and hang.

Integration tests cover:

- Dynamic content and JavaScript execution
- Session management and cookies
- Content extraction (LLM-based only)
- Media handling (screenshots, PDFs)
- Performance and caching
- Content filtering
- Bot detection avoidance
- Error handling

### Integration Test Checklist

1. Docker container healthy:

```bash
docker ps --filter name=crawl4ai --format '{{.Names}} {{.Status}}'
curl -sf http://localhost:11235/health || echo "Health check failed"
```

2. Env vars loaded (either exported or in `.env`): `CRAWL4AI_BASE_URL` (required), optional: `CRAWL4AI_API_KEY`, `LLM_PROVIDER`, `LLM_API_TOKEN`, `LLM_BASE_URL`.
3. Use `npm run test:integration` (never raw `jest`).
4. To target one file, add it after `--` (see example above).
5. Expect a total runtime of ~2–3 minutes; a longer run or an immediate hang usually means a missing `NODE_OPTIONS` flag or the wrong Jest version.

### Troubleshooting

| Symptom | Likely Cause | Fix |
|---------|--------------|-----|
| `SyntaxError: Cannot use import statement outside a module` | Ran `jest` directly without script flags | Re-run with `npm run test:integration` |
| Hangs on first test (RUNS ...) | Missing experimental VM modules flag | Use npm script / ensure `NODE_OPTIONS=--experimental-vm-modules` |
| Network timeouts | Crawl4AI container not healthy / DNS blocked | Restart container: `docker restart <name>` |
| LLM tests skipped | Missing `LLM_PROVIDER` or `LLM_API_TOKEN` | Export required LLM vars |
| New Jest major upgrade breaks tests | Version mismatch with `ts-jest` | Keep Jest 29.x unless `ts-jest` is upgraded accordingly |

### Version Compatibility Note

Current stack: `jest@29.7.0` + `ts-jest@29.4.0` + ESM (`"type": "module"`). Updating Jest to 30+ requires upgrading `ts-jest` and revisiting `jest.config.cjs`. Keep versions aligned to avoid parse errors.
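### Manual Smoke Check (optional)

For a quick end-to-end check outside the test suite, you can drive the server with the MCP SDK client, mirroring the integration-test harness in `src/__tests__/integration/test-utils.ts`. This is a minimal sketch, not a shipped script; the file name and tool arguments are illustrative:

```typescript
// smoke.ts (hypothetical helper script, not part of the package)
import { Client } from '@modelcontextprotocol/sdk/client/index.js';
import { StdioClientTransport } from '@modelcontextprotocol/sdk/client/stdio.js';

// Launch the published server over stdio, pointing it at a local Crawl4AI instance
const transport = new StdioClientTransport({
  command: 'npx',
  args: ['mcp-crawl4ai-ts'],
  env: { ...process.env, CRAWL4AI_BASE_URL: 'http://localhost:11235' },
});

const client = new Client({ name: 'smoke-client', version: '1.0.0' }, { capabilities: {} });
await client.connect(transport);

// Call one stateless tool and print the content it returns
const result = await client.callTool({
  name: 'get_markdown',
  arguments: { url: 'https://example.com', filter: 'fit' },
});
console.log(JSON.stringify(result, null, 2));

await client.close();
```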
## License MIT ``` -------------------------------------------------------------------------------- /tsconfig.build.json: -------------------------------------------------------------------------------- ```json { "extends": "./tsconfig.json", "exclude": [ "node_modules", "dist", "src/**/*.test.ts", "src/__tests__/**/*" ] } ``` -------------------------------------------------------------------------------- /tsconfig.json: -------------------------------------------------------------------------------- ```json { "compilerOptions": { "target": "ES2022", "module": "NodeNext", "moduleResolution": "NodeNext", "lib": ["ES2022"], "outDir": "./dist", "rootDir": "./src", "strict": true, "esModuleInterop": true, "skipLibCheck": true, "forceConsistentCasingInFileNames": true, "resolveJsonModule": true, "declaration": true, "declarationMap": true, "sourceMap": true, "isolatedModules": true }, "include": ["src/**/*"], "exclude": ["node_modules", "dist"] } ``` -------------------------------------------------------------------------------- /jest.setup.cjs: -------------------------------------------------------------------------------- ``` // Load dotenv for integration tests const dotenv = require('dotenv'); const path = require('path'); // The npm script sets an env var to identify integration tests const isIntegrationTest = process.env.JEST_TEST_TYPE === 'integration'; if (isIntegrationTest) { // For integration tests, load from .env file dotenv.config({ path: path.resolve(__dirname, '.env') }); // For integration tests, we MUST have proper environment variables // No fallback to localhost - tests should fail if not configured } else { // For unit tests, always use localhost process.env.CRAWL4AI_BASE_URL = 'http://localhost:11235'; process.env.CRAWL4AI_API_KEY = 'test-api-key'; } ``` -------------------------------------------------------------------------------- /jest.config.cjs: -------------------------------------------------------------------------------- ``` /** @type {import('jest').Config} */ module.exports = { preset: 'ts-jest/presets/default-esm', testEnvironment: 'node', roots: ['<rootDir>/src'], testMatch: ['**/__tests__/**/*.test.ts'], setupFiles: ['<rootDir>/jest.setup.cjs'], collectCoverageFrom: [ 'src/**/*.ts', '!src/**/__tests__/**', '!src/**/*.test.ts', '!src/**/types/**', ], coverageDirectory: 'coverage', coverageReporters: ['text', 'lcov', 'html', 'json'], moduleNameMapper: { '^(\\.{1,2}/.*)\\.js$': '$1', }, transform: { '^.+\\.tsx?$': [ 'ts-jest', { useESM: true, }, ], }, extensionsToTreatAsEsm: ['.ts'], clearMocks: true, // Limit parallelization for integration tests to avoid overwhelming the server ...(process.env.NODE_ENV === 'test' && process.argv.some(arg => arg.includes('integration')) ? 
{ maxWorkers: 1 } : {}),
};
```

--------------------------------------------------------------------------------
/src/__tests__/types/mocks.ts:
--------------------------------------------------------------------------------

```typescript
/* eslint-env jest */
import type { AxiosResponse } from 'axios';

/**
 * Mock axios instance for testing HTTP client behavior
 */
export interface MockAxiosInstance {
  post: jest.Mock<Promise<AxiosResponse>>;
  get: jest.Mock<Promise<AxiosResponse>>;
  head: jest.Mock<Promise<AxiosResponse>>;
  put?: jest.Mock<Promise<AxiosResponse>>;
  delete?: jest.Mock<Promise<AxiosResponse>>;
  patch?: jest.Mock<Promise<AxiosResponse>>;
}

/**
 * Mock function type that returns a promise with content array
 */
type MockFunction = jest.Mock<Promise<{ content: TestContent }>>;

/**
 * Mock server interface for MCP server testing
 */
export interface MockMCPServer {
  listTools: MockFunction;
  callTool: MockFunction;
  listResources?: MockFunction;
  readResource?: MockFunction;
  listPrompts?: MockFunction;
  getPrompt?: MockFunction;
}

/**
 * Type for test content arrays used in MCP responses
 */
export type TestContent = Array<{
  type: string;
  text?: string;
  resource?: {
    uri: string;
    mimeType: string;
    blob?: string;
  };
}>;

/**
 * Generic test response type
 */
export interface TestResponse<T = unknown> {
  content: TestContent;
  data?: T;
  error?: string;
}
```

--------------------------------------------------------------------------------
/src/index.ts:
--------------------------------------------------------------------------------

```typescript
#!/usr/bin/env node
import { Crawl4AIServer } from './server.js';

// Try to load dotenv only in development
// In production (via npx), env vars come from the MCP client
try {
  // Only try to load dotenv if CRAWL4AI_BASE_URL is not set
  if (!process.env.CRAWL4AI_BASE_URL) {
    const dotenv = await import('dotenv');
    dotenv.config();
  }
} catch {
  // dotenv is not available in production, which is expected
}

const CRAWL4AI_BASE_URL = process.env.CRAWL4AI_BASE_URL;
const CRAWL4AI_API_KEY = process.env.CRAWL4AI_API_KEY || '';
const SERVER_NAME = process.env.SERVER_NAME || 'crawl4ai-mcp';
const SERVER_VERSION = process.env.SERVER_VERSION || '1.0.0';

if (!CRAWL4AI_BASE_URL) {
  console.error('Error: CRAWL4AI_BASE_URL environment variable is required');
  console.error('Please set it to your Crawl4AI server URL (e.g., http://localhost:8080)');
  process.exit(1);
}

// Always start the server when this script is executed
// This script is meant to be run as an MCP server
const server = new Crawl4AIServer(CRAWL4AI_BASE_URL, CRAWL4AI_API_KEY, SERVER_NAME, SERVER_VERSION);
server.start().catch((err) => {
  console.error('Server failed to start:', err);
  process.exit(1);
});
```

--------------------------------------------------------------------------------
/.github/CI.md:
--------------------------------------------------------------------------------

```markdown
# GitHub Actions CI/CD

This project uses GitHub Actions for continuous integration.

## Workflows

### CI (`ci.yml`)

Runs on every push to main and on pull requests:
- Linting (ESLint)
- Code formatting check (Prettier)
- Build (TypeScript compilation)
- Unit tests (with nock mocks)
- Test coverage report

Tests run on Node.js 18.x, 20.x, and 22.x.

## Mock Maintenance

The unit tests use [nock](https://github.com/nock/nock) for HTTP mocking.
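For example, a unit test can stub a Crawl4AI endpoint and assert on our client's behavior. The sketch below is illustrative only; the endpoint path and response body are assumptions, not the exact fixtures used in the suite:

```typescript
import nock from 'nock';

// Intercept one POST to the local Crawl4AI server; path and payload are illustrative.
const scope = nock('http://localhost:11235')
  .post('/md')
  .reply(200, { success: true, markdown: '# Example' });

// ...exercise the code under test here, then assert the interceptor was consumed:
scope.done();
```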
This provides: - Fast test execution (~1 second) - Predictable test results - No external dependencies during CI **How to update mocks:** Option 1 - Generate mock code from real API: ```bash # This will call the real API and generate nock mock code CRAWL4AI_API_KEY=your-key npm run generate-mocks ``` Option 2 - View API responses as JSON: ```bash # This will save responses to mock-responses.json for inspection CRAWL4AI_API_KEY=your-key npm run view-mocks ``` Option 3 - Manual update: 1. Run integration tests to see current API behavior: `npm run test:integration` 2. Update the mock responses in `src/__tests__/crawl4ai-service.test.ts` 3. Ensure unit tests pass: `npm run test:unit` The mocks are intentionally simple and focus on testing our code's behavior, not the API's exact responses. ## Running Tests Locally ```bash # Run all tests npm test # Run only unit tests (fast, with mocks) npm run test:unit # Run only integration tests (slow, real API) npm run test:integration # Run with coverage npm run test:coverage ``` ``` -------------------------------------------------------------------------------- /src/handlers/base-handler.ts: -------------------------------------------------------------------------------- ```typescript import { Crawl4AIService } from '../crawl4ai-service.js'; import { AxiosInstance } from 'axios'; // Error handling types export interface ErrorWithResponse { response?: { data?: | { detail?: string; } | string | unknown; }; message?: string; } export interface SessionInfo { id: string; created_at: Date; last_used: Date; initial_url?: string; metadata?: Record<string, unknown>; } export abstract class BaseHandler { protected service: Crawl4AIService; protected axiosClient: AxiosInstance; protected sessions: Map<string, SessionInfo>; constructor(service: Crawl4AIService, axiosClient: AxiosInstance, sessions: Map<string, SessionInfo>) { this.service = service; this.axiosClient = axiosClient; this.sessions = sessions; } protected formatError(error: unknown, operation: string): Error { const errorWithResponse = error as ErrorWithResponse; let errorMessage = ''; const data = errorWithResponse.response?.data; if (typeof data === 'object' && data && 'detail' in data) { errorMessage = (data as { detail: string }).detail; } else if (data) { // If data is an object, stringify it errorMessage = typeof data === 'object' ? 
JSON.stringify(data) : String(data);
    } else if (error instanceof Error) {
      errorMessage = error.message;
    } else {
      errorMessage = String(error);
    }

    return new Error(`Failed to ${operation}: ${errorMessage}`);
  }
}
```

--------------------------------------------------------------------------------
/eslint.config.mjs:
--------------------------------------------------------------------------------

```
import eslint from '@eslint/js';
import tseslint from '@typescript-eslint/eslint-plugin';
import tsparser from '@typescript-eslint/parser';
import prettier from 'eslint-config-prettier';
import prettierPlugin from 'eslint-plugin-prettier';

export default [
  eslint.configs.recommended,
  prettier,
  {
    files: ['src/**/*.ts'],
    languageOptions: {
      parser: tsparser,
      parserOptions: {
        project: './tsconfig.json',
        ecmaVersion: 'latest',
        sourceType: 'module',
      },
      globals: {
        console: 'readonly',
        process: 'readonly',
        Buffer: 'readonly',
        __dirname: 'readonly',
        __filename: 'readonly',
        setTimeout: 'readonly',
        clearTimeout: 'readonly',
        setInterval: 'readonly',
        clearInterval: 'readonly',
        URL: 'readonly',
      },
    },
    plugins: {
      '@typescript-eslint': tseslint,
      prettier: prettierPlugin,
    },
    rules: {
      ...tseslint.configs.recommended.rules,
      '@typescript-eslint/explicit-function-return-type': 'off',
      '@typescript-eslint/explicit-module-boundary-types': 'off',
      '@typescript-eslint/no-explicit-any': 'warn',
      '@typescript-eslint/no-unused-vars': [
        'error',
        {
          argsIgnorePattern: '^_',
          varsIgnorePattern: '^_',
        },
      ],
      '@typescript-eslint/no-misused-promises': [
        'error',
        {
          checksVoidReturn: false,
        },
      ],
      'prettier/prettier': 'error',
    },
  },
  {
    files: ['src/**/*.test.ts', 'src/**/*.integration.test.ts', 'src/**/test-utils.ts', 'src/__tests__/types/*.ts'],
    languageOptions: {
      globals: {
        describe: 'readonly',
        it: 'readonly',
        expect: 'readonly',
        beforeEach: 'readonly',
        afterEach: 'readonly',
        beforeAll: 'readonly',
        afterAll: 'readonly',
        jest: 'readonly',
      },
    },
  },
  {
    ignores: ['dist/**', 'node_modules/**', '*.js', '*.mjs', '*.cjs', 'coverage/**'],
  },
];
```

--------------------------------------------------------------------------------
/src/schemas/helpers.ts:
--------------------------------------------------------------------------------

```typescript
import { z } from 'zod';

// Helper to validate JavaScript code
export const validateJavaScriptCode = (code: string): boolean => {
  // Check for common HTML entities that shouldn't be in JS
  if (/&quot;|&amp;|&lt;|&gt;|&#\d+;|&\w+;/.test(code)) {
    return false;
  }

  // Basic check to ensure it's not HTML
  if (/<(!DOCTYPE|html|body|head|script|style)\b/i.test(code)) {
    return false;
  }

  // Check for literal \n, \t, \r outside of strings (common LLM mistake)
  // This is tricky - we'll check if the code has these patterns in a way that suggests
  // they're meant to be actual newlines/tabs rather than escape sequences in strings

  // Look for patterns like: ;\n or }\n or )\n which suggest literal newlines
  if (/[;})]\s*\\n|\\n\s*[{(/]/.test(code)) {
    return false;
  }

  // Check for obvious cases of literal \n between statements
  if (/[;})]\s*\\n\s*\w/.test(code)) {
    return false;
  }

  return true;
};

// Helper to create schema that rejects session_id
export const createStatelessSchema = <T extends z.ZodObject<z.ZodRawShape>>(schema: T, toolName: string) => {
  // Tool-specific guidance for common scenarios
  const toolGuidance: Record<string, string> = {
    capture_screenshot: 'To capture screenshots with sessions, use crawl(session_id, screenshot: true)',
    generate_pdf: 'To generate PDFs with sessions, use crawl(session_id, pdf: true)',
    execute_js: 'To run
JavaScript with sessions, use crawl(session_id, js_code: [...])', get_html: 'To get HTML with sessions, use crawl(session_id)', extract_with_llm: 'To extract data with sessions, first use crawl(session_id) then extract from the response', }; const message = `${toolName} does not support session_id. This tool is stateless - each call creates a new browser. ${ toolGuidance[toolName] || 'For persistent operations, use crawl with session_id.' }`; return z .object({ session_id: z.never({ message }).optional(), }) .passthrough() .and(schema) .transform((data) => { const { session_id, ...rest } = data; if (session_id !== undefined) { throw new Error(message); } return rest as z.infer<T>; }); }; ``` -------------------------------------------------------------------------------- /.github/workflows/ci.yml: -------------------------------------------------------------------------------- ```yaml name: CI on: push: branches: [ main ] pull_request: branches: [ main ] permissions: contents: write pages: write id-token: write jobs: test: runs-on: ubuntu-latest strategy: matrix: node-version: [18.x, 20.x, 22.x] steps: - uses: actions/checkout@v4 - name: Use Node.js ${{ matrix.node-version }} uses: actions/setup-node@v4 with: node-version: ${{ matrix.node-version }} cache: 'npm' - name: Install dependencies run: npm ci - name: Run linter run: npm run lint - name: Check formatting run: npm run format:check - name: Build run: npm run build - name: Run unit tests run: npm run test:unit - name: Generate coverage report if: matrix.node-version == '18.x' run: npm run test:coverage -- --testPathIgnorePatterns=integration --testPathIgnorePatterns=examples - name: Upload coverage reports if: matrix.node-version == '18.x' uses: actions/upload-artifact@v4 with: name: coverage-report path: coverage/ - name: Update coverage gist if: matrix.node-version == '18.x' env: GIST_SECRET: ${{ secrets.GIST_SECRET }} run: | # Extract coverage percentage from lcov.info COVERAGE=$(awk -F: '/^SF:/{files++} /^LF:/{lines+=$2} /^LH:/{hits+=$2} END {printf "%.0f", (hits/lines)*100}' coverage/lcov.info) # Determine color based on coverage if [ $COVERAGE -ge 90 ]; then COLOR="brightgreen" elif [ $COVERAGE -ge 70 ]; then COLOR="green" elif [ $COVERAGE -ge 50 ]; then COLOR="yellow" elif [ $COVERAGE -ge 30 ]; then COLOR="orange" else COLOR="red"; fi # Update gist echo "{\"schemaVersion\":1,\"label\":\"coverage\",\"message\":\"${COVERAGE}%\",\"color\":\"${COLOR}\"}" > coverage.json gh auth login --with-token <<< "$GIST_SECRET" gh gist edit e2abffb0deb25afa2bf9185f440dae81 coverage.json - name: Deploy coverage to GitHub Pages if: matrix.node-version == '18.x' && github.ref == 'refs/heads/main' uses: peaceiris/actions-gh-pages@v4 with: github_token: ${{ secrets.GITHUB_TOKEN }} publish_dir: ./coverage/lcov-report destination_dir: coverage ``` -------------------------------------------------------------------------------- /package.json: -------------------------------------------------------------------------------- ```json { "name": "mcp-crawl4ai-ts", "version": "3.0.2", "description": "TypeScript MCP server for Crawl4AI - web crawling and content extraction", "main": "dist/index.js", "bin": { "mcp-crawl4ai-ts": "dist/index.js" }, "type": "module", "engines": { "node": ">=18.0.0" }, "scripts": { "build": "tsc -p tsconfig.build.json", "start": "node dist/index.js", "dev": "tsx src/index.ts", "test": "NODE_OPTIONS=--experimental-vm-modules jest", "test:watch": "NODE_OPTIONS=--experimental-vm-modules jest --watch", "test:coverage": 
"NODE_OPTIONS=--experimental-vm-modules jest --coverage", "test:unit": "NODE_OPTIONS=--experimental-vm-modules jest --testPathIgnorePatterns=integration --testPathIgnorePatterns=examples", "test:integration": "JEST_TEST_TYPE=integration NODE_OPTIONS=--experimental-vm-modules jest src/__tests__/integration", "test:ci": "NODE_OPTIONS=--experimental-vm-modules jest --coverage --maxWorkers=2", "lint": "eslint src --ext .ts", "lint:fix": "eslint src --ext .ts --fix", "format": "prettier --write \"src/**/*.ts\"", "format:check": "prettier --check \"src/**/*.ts\"", "check": "npm run lint && npm run format:check && npm run build" }, "keywords": [ "mcp", "crawl4ai", "web-scraping", "markdown", "pdf", "screenshot" ], "author": "Juan González Cano", "license": "MIT", "repository": { "type": "git", "url": "git+https://github.com/omgwtfwow/mcp-crawl4ai-ts.git" }, "bugs": { "url": "https://github.com/omgwtfwow/mcp-crawl4ai-ts/issues" }, "homepage": "https://github.com/omgwtfwow/mcp-crawl4ai-ts#readme", "files": [ "dist/**/*", "README.md", "LICENSE" ], "dependencies": { "@modelcontextprotocol/sdk": "^1.0.4", "axios": "^1.7.9", "dotenv": "^16.4.7", "zod": "^3.25.76" }, "devDependencies": { "@eslint/js": "^9.32.0", "@jest/globals": "^29.7.0", "@types/jest": "^29.5.12", "@types/nock": "^10.0.3", "@types/node": "^22.10.6", "@typescript-eslint/eslint-plugin": "^8.38.0", "@typescript-eslint/parser": "^8.38.0", "diff": "^8.0.2", "eslint": "^9.32.0", "eslint-config-prettier": "^10.1.8", "eslint-plugin-prettier": "^5.5.3", "jest": "^29.7.0", "nock": "^14.0.8", "prettier": "^3.6.2", "ts-jest": "^29.4.0", "tsx": "^4.19.2", "typescript": "^5.7.3" } } ``` -------------------------------------------------------------------------------- /src/__tests__/handlers/session-handlers.test.ts: -------------------------------------------------------------------------------- ```typescript /* eslint-env jest */ import { jest } from '@jest/globals'; import { AxiosError } from 'axios'; import type { SessionHandlers as SessionHandlersType } from '../../handlers/session-handlers.js'; // Mock axios before importing SessionHandlers const mockPost = jest.fn(); const mockAxiosClient = { post: mockPost, }; // Mock the service const mockService = {} as unknown; // Import after setting up mocks const { SessionHandlers } = await import('../../handlers/session-handlers.js'); describe('SessionHandlers', () => { let handler: SessionHandlersType; let sessions: Map<string, unknown>; beforeEach(() => { jest.clearAllMocks(); sessions = new Map(); handler = new SessionHandlers(mockService, mockAxiosClient as unknown, sessions); }); describe('createSession', () => { it('should handle initial crawl failure gracefully', async () => { // Mock failed crawl mockPost.mockRejectedValue( new AxiosError('Request failed with status code 500', 'ERR_BAD_RESPONSE', undefined, undefined, { status: 500, statusText: 'Internal Server Error', data: 'Internal Server Error', headers: {}, config: {} as unknown, } as unknown), ); const options = { initial_url: 'https://this-domain-definitely-does-not-exist-12345.com', browser_type: 'chromium' as const, }; // Create session with initial_url that will fail const result = await handler.createSession(options); // Session should still be created expect(result.content[0].type).toBe('text'); expect(result.content[0].text).toContain('Session created successfully'); expect(result.content[0].text).toContain( 'Pre-warmed with: https://this-domain-definitely-does-not-exist-12345.com', ); expect(result.session_id).toBeDefined(); 
expect(result.browser_type).toBe('chromium'); // Verify crawl was attempted expect(mockPost).toHaveBeenCalledWith( '/crawl', { urls: ['https://this-domain-definitely-does-not-exist-12345.com'], browser_config: { headless: true, browser_type: 'chromium', }, crawler_config: { session_id: expect.stringMatching(/^session-/), cache_mode: 'BYPASS', }, }, { timeout: 30000, }, ); // Verify session was stored locally expect(sessions.size).toBe(1); const session = sessions.get(result.session_id); expect(session).toBeDefined(); expect(session.initial_url).toBe('https://this-domain-definitely-does-not-exist-12345.com'); }); it('should not attempt crawl when no initial_url provided', async () => { const result = await handler.createSession({}); // Session should be created without crawl expect(result.content[0].text).toContain('Session created successfully'); expect(result.content[0].text).toContain('Ready for use'); expect(result.content[0].text).not.toContain('Pre-warmed'); // Verify no crawl was attempted expect(mockPost).not.toHaveBeenCalled(); }); }); }); ``` -------------------------------------------------------------------------------- /src/__tests__/integration/generate-pdf.integration.test.ts: -------------------------------------------------------------------------------- ```typescript /* eslint-env jest */ import { Client } from '@modelcontextprotocol/sdk/client/index.js'; import { createTestClient, cleanupTestClient, TEST_TIMEOUTS } from './test-utils.js'; interface ToolResult { content: Array<{ type: string; text?: string; resource?: { uri: string; mimeType?: string; blob?: string; }; }>; } describe('generate_pdf Integration Tests', () => { let client: Client; beforeAll(async () => { client = await createTestClient(); }, TEST_TIMEOUTS.medium); afterAll(async () => { if (client) { await cleanupTestClient(client); } }); describe('PDF generation', () => { it( 'should generate PDF from URL', async () => { const result = await client.callTool({ name: 'generate_pdf', arguments: { url: 'https://httpbin.org/html', }, }); expect(result).toBeDefined(); const content = (result as ToolResult).content; expect(content).toHaveLength(2); // First item should be the PDF as embedded resource expect(content[0].type).toBe('resource'); expect(content[0].resource).toBeDefined(); expect(content[0].resource?.mimeType).toBe('application/pdf'); expect(content[0].resource?.blob).toBeTruthy(); expect(content[0].resource?.blob?.length).toBeGreaterThan(1000); // Should be a substantial base64 string expect(content[0].resource?.uri).toContain('data:application/pdf'); // Second item should be text description expect(content[1].type).toBe('text'); expect(content[1].text).toContain('PDF generated for: https://httpbin.org/html'); }, TEST_TIMEOUTS.long, ); it( 'should reject session_id parameter', async () => { const result = await client.callTool({ name: 'generate_pdf', arguments: { url: 'https://httpbin.org/html', session_id: 'test-session', }, }); const content = (result as ToolResult).content; expect(content).toHaveLength(1); expect(content[0].type).toBe('text'); expect(content[0].text).toContain('session_id'); expect(content[0].text).toContain('does not support'); expect(content[0].text).toContain('stateless'); }, TEST_TIMEOUTS.short, ); it( 'should handle invalid URLs gracefully', async () => { const result = await client.callTool({ name: 'generate_pdf', arguments: { url: 'not-a-valid-url', }, }); const content = (result as ToolResult).content; expect(content).toHaveLength(1); expect(content[0].type).toBe('text'); 
expect(content[0].text).toContain('Error'); expect(content[0].text?.toLowerCase()).toContain('invalid'); }, TEST_TIMEOUTS.short, ); it( 'should handle non-existent domains', async () => { const result = await client.callTool({ name: 'generate_pdf', arguments: { url: 'https://this-domain-definitely-does-not-exist-123456789.com', }, }); const content = (result as ToolResult).content; expect(content).toHaveLength(1); expect(content[0].type).toBe('text'); expect(content[0].text).toContain('Error'); }, TEST_TIMEOUTS.short, ); }); }); ``` -------------------------------------------------------------------------------- /src/__tests__/integration/session-management.integration.test.ts: -------------------------------------------------------------------------------- ```typescript import { createTestClient, cleanupTestClient, TEST_TIMEOUTS } from './test-utils.js'; import { Client } from '@modelcontextprotocol/sdk/client/index.js'; interface ToolResult { content: Array<{ type: string; text?: string; }>; session_id?: string; browser_type?: string; initial_url?: string; created_at?: string; } describe('Session Management Integration Tests', () => { let client: Client; const createdSessions: string[] = []; beforeAll(async () => { client = await createTestClient(); }, TEST_TIMEOUTS.medium); afterEach(async () => { // Clean up any sessions created during tests for (const sessionId of createdSessions) { try { await client.callTool({ name: 'manage_session', arguments: { action: 'clear', session_id: sessionId }, }); } catch (e) { // Ignore errors during cleanup console.debug('Cleanup error:', e); } } createdSessions.length = 0; }); afterAll(async () => { if (client) { await cleanupTestClient(client); } }); describe('manage_session', () => { it( 'should create session with auto-generated ID using manage_session', async () => { const result = await client.callTool({ name: 'manage_session', arguments: { action: 'create' }, }); expect(result).toBeDefined(); const typedResult = result as ToolResult; expect(typedResult.content).toBeDefined(); expect(Array.isArray(typedResult.content)).toBe(true); const textContent = typedResult.content.find((c) => c.type === 'text'); expect(textContent).toBeDefined(); expect(textContent?.text).toContain('Session created successfully'); // Check returned parameters expect(typedResult.session_id).toBeDefined(); expect(typedResult.session_id).toMatch(/^session-/); expect(typedResult.browser_type).toBe('chromium'); // Track for cleanup createdSessions.push(typedResult.session_id!); }, TEST_TIMEOUTS.short, ); it( 'should clear session using manage_session', async () => { // First create a session const createResult = await client.callTool({ name: 'manage_session', arguments: { action: 'create', session_id: 'test-to-clear', }, }); const typedCreateResult = createResult as ToolResult; createdSessions.push(typedCreateResult.session_id!); // Then clear it const clearResult = await client.callTool({ name: 'manage_session', arguments: { action: 'clear', session_id: 'test-to-clear', }, }); const typedClearResult = clearResult as ToolResult; expect(typedClearResult.content[0].text).toContain('Session cleared successfully'); }, TEST_TIMEOUTS.short, ); it( 'should list sessions using manage_session', async () => { // Create a session first const createResult = await client.callTool({ name: 'manage_session', arguments: { action: 'create', session_id: 'test-list-session', }, }); const typedCreateResult = createResult as ToolResult; createdSessions.push(typedCreateResult.session_id!); // List 
sessions const listResult = await client.callTool({ name: 'manage_session', arguments: { action: 'list' }, }); const typedListResult = listResult as ToolResult; expect(typedListResult.content[0].text).toContain('Active sessions'); expect(typedListResult.content[0].text).toContain('test-list-session'); }, TEST_TIMEOUTS.short, ); }); }); ``` -------------------------------------------------------------------------------- /src/__tests__/integration/get-html.integration.test.ts: -------------------------------------------------------------------------------- ```typescript /* eslint-env jest */ import { Client } from '@modelcontextprotocol/sdk/client/index.js'; import { createTestClient, cleanupTestClient, TEST_TIMEOUTS } from './test-utils.js'; interface ToolResult { content: Array<{ type: string; text?: string; }>; } describe('get_html Integration Tests', () => { let client: Client; beforeAll(async () => { client = await createTestClient(); }, TEST_TIMEOUTS.medium); afterAll(async () => { if (client) { await cleanupTestClient(client); } }); describe('HTML extraction', () => { it( 'should extract HTML from URL', async () => { const result = await client.callTool({ name: 'get_html', arguments: { url: 'https://httpbin.org/html', }, }); expect(result).toBeDefined(); const content = (result as ToolResult).content; expect(content).toHaveLength(1); expect(content[0].type).toBe('text'); // Should contain processed HTML const html = content[0].text || ''; expect(html).toBeTruthy(); // The HTML endpoint returns sanitized/processed HTML // It might be truncated with "..." expect(html.length).toBeGreaterThan(0); }, TEST_TIMEOUTS.medium, ); it( 'should reject session_id parameter', async () => { const result = await client.callTool({ name: 'get_html', arguments: { url: 'https://example.com', session_id: 'test-session', }, }); const content = (result as ToolResult).content; expect(content).toHaveLength(1); expect(content[0].type).toBe('text'); expect(content[0].text).toContain('session_id'); expect(content[0].text).toContain('does not support'); expect(content[0].text).toContain('stateless'); }, TEST_TIMEOUTS.short, ); it( 'should handle invalid URLs gracefully', async () => { const result = await client.callTool({ name: 'get_html', arguments: { url: 'not-a-valid-url', }, }); const content = (result as ToolResult).content; expect(content).toHaveLength(1); expect(content[0].type).toBe('text'); expect(content[0].text).toContain('Error'); expect(content[0].text?.toLowerCase()).toContain('invalid'); }, TEST_TIMEOUTS.short, ); it( 'should handle non-existent domains', async () => { const result = await client.callTool({ name: 'get_html', arguments: { url: 'https://this-domain-definitely-does-not-exist-123456789.com', }, }); const content = (result as ToolResult).content; expect(content).toHaveLength(1); expect(content[0].type).toBe('text'); // According to spec, returns success: true with empty HTML for invalid URLs const html = content[0].text || ''; // Could be empty or contain an error message expect(typeof html).toBe('string'); }, TEST_TIMEOUTS.short, ); it( 'should ignore extra parameters', async () => { const result = await client.callTool({ name: 'get_html', arguments: { url: 'https://example.com', wait_for: '.some-selector', // Should be ignored bypass_cache: true, // Should be ignored }, }); const content = (result as ToolResult).content; expect(content).toHaveLength(1); expect(content[0].type).toBe('text'); // Should still work, ignoring extra params const html = content[0].text || ''; 
expect(html.length).toBeGreaterThan(0); }, TEST_TIMEOUTS.long, ); }); }); ``` -------------------------------------------------------------------------------- /src/__tests__/integration/capture-screenshot.integration.test.ts: -------------------------------------------------------------------------------- ```typescript /* eslint-env jest */ import { Client } from '@modelcontextprotocol/sdk/client/index.js'; import { createTestClient, cleanupTestClient, TEST_TIMEOUTS } from './test-utils.js'; interface ToolResult { content: Array<{ type: string; text?: string; data?: string; mimeType?: string; }>; } describe('capture_screenshot Integration Tests', () => { let client: Client; beforeAll(async () => { client = await createTestClient(); }, TEST_TIMEOUTS.medium); afterAll(async () => { if (client) { await cleanupTestClient(client); } }); describe('Screenshot capture', () => { it( 'should capture screenshot with default wait time', async () => { const result = await client.callTool({ name: 'capture_screenshot', arguments: { url: 'https://httpbin.org/html', }, }); expect(result).toBeDefined(); const content = (result as ToolResult).content; expect(content).toHaveLength(2); // First item should be the image expect(content[0].type).toBe('image'); expect(content[0].mimeType).toBe('image/png'); expect(content[0].data).toBeTruthy(); expect(content[0].data?.length).toBeGreaterThan(1000); // Should be a substantial base64 string // Second item should be text description expect(content[1].type).toBe('text'); expect(content[1].text).toContain('Screenshot captured for: https://httpbin.org/html'); }, TEST_TIMEOUTS.short, ); it( 'should capture screenshot with custom wait time', async () => { const result = await client.callTool({ name: 'capture_screenshot', arguments: { url: 'https://httpbin.org/html', screenshot_wait_for: 0.5, // Reduced from 3 seconds }, }); expect(result).toBeDefined(); const content = (result as ToolResult).content; expect(content).toHaveLength(2); // First item should be the image expect(content[0].type).toBe('image'); expect(content[0].mimeType).toBe('image/png'); expect(content[0].data).toBeTruthy(); // Second item should be text description expect(content[1].type).toBe('text'); expect(content[1].text).toContain('Screenshot captured for: https://httpbin.org/html'); }, TEST_TIMEOUTS.medium, ); it( 'should reject session_id parameter', async () => { const result = await client.callTool({ name: 'capture_screenshot', arguments: { url: 'https://example.com', session_id: 'test-session', }, }); const content = (result as ToolResult).content; expect(content).toHaveLength(1); expect(content[0].type).toBe('text'); expect(content[0].text).toContain('session_id'); expect(content[0].text).toContain('does not support'); expect(content[0].text).toContain('stateless'); }, TEST_TIMEOUTS.short, ); it( 'should handle invalid URLs gracefully', async () => { const result = await client.callTool({ name: 'capture_screenshot', arguments: { url: 'not-a-valid-url', }, }); const content = (result as ToolResult).content; expect(content).toHaveLength(1); expect(content[0].type).toBe('text'); expect(content[0].text).toContain('Error'); expect(content[0].text?.toLowerCase()).toContain('invalid'); }, TEST_TIMEOUTS.short, ); it( 'should handle non-existent domains', async () => { const result = await client.callTool({ name: 'capture_screenshot', arguments: { url: 'https://this-domain-definitely-does-not-exist-123456789.com', }, }); const content = (result as ToolResult).content; expect(content).toHaveLength(1); 
expect(content[0].type).toBe('text'); expect(content[0].text).toContain('Error'); }, TEST_TIMEOUTS.short, ); }); }); ``` -------------------------------------------------------------------------------- /src/__tests__/integration/test-utils.ts: -------------------------------------------------------------------------------- ```typescript /* eslint-env jest */ import { Client } from '@modelcontextprotocol/sdk/client/index.js'; import { StdioClientTransport } from '@modelcontextprotocol/sdk/client/stdio.js'; import dotenv from 'dotenv'; // Load environment variables dotenv.config(); export interface IntegrationTestConfig { baseUrl: string; apiKey: string; llmProvider?: string; llmApiToken?: string; llmBaseUrl?: string; } export function getTestConfig(): IntegrationTestConfig { const config: IntegrationTestConfig = { baseUrl: process.env.CRAWL4AI_BASE_URL || '', apiKey: process.env.CRAWL4AI_API_KEY || '', llmProvider: process.env.LLM_PROVIDER, llmApiToken: process.env.LLM_API_TOKEN, llmBaseUrl: process.env.LLM_BASE_URL, }; if (!config.baseUrl) { throw new Error( 'CRAWL4AI_BASE_URL is required for integration tests. Please set it in .env file or environment variable.', ); } return config; } export function hasLLMConfig(): boolean { const config = getTestConfig(); return !!(config.llmProvider && config.llmApiToken); } export async function createTestClient(): Promise<Client> { const transport = new StdioClientTransport({ command: 'tsx', args: ['src/index.ts'], env: { ...process.env, NODE_ENV: 'test', }, cwd: process.cwd(), // Ensure the child process runs in the correct directory }); const client = new Client( { name: 'integration-test-client', version: '1.0.0', }, { capabilities: {}, }, ); await client.connect(transport); return client; } export async function cleanupTestClient(client: Client): Promise<void> { await client.close(); } // Test data generators export function generateSessionId(): string { return `test-session-${Date.now()}-${Math.random().toString(36).substring(2, 9)}`; } export function generateTestUrl(type: 'simple' | 'dynamic' | 'infinite-scroll' | 'auth' = 'simple'): string { const urls = { simple: 'https://example.com', dynamic: 'https://github.com', 'infinite-scroll': 'https://twitter.com', auth: 'https://github.com/login', }; return urls[type]; } // Test result types export interface TestContentItem { type: string; text?: string; data?: string; mimeType?: string; } export interface TestResult { content: TestContentItem[]; } export interface ToolResult { content: TestContentItem[]; isError?: boolean; } // Assertion helpers export async function expectSuccessfulCrawl(result: unknown): Promise<void> { expect(result).toBeDefined(); // Type guard to check if result has content property const typedResult = result as { content?: unknown }; expect(typedResult.content).toBeDefined(); expect(typedResult.content).toBeInstanceOf(Array); const contentArray = typedResult.content as TestContentItem[]; expect(contentArray.length).toBeGreaterThan(0); const textContent = contentArray.find((c) => c.type === 'text'); expect(textContent).toBeDefined(); expect(textContent?.text).toBeTruthy(); } export async function expectScreenshot(result: unknown): Promise<void> { const typedResult = result as { content?: TestContentItem[] }; expect(typedResult.content).toBeDefined(); const imageContent = typedResult.content?.find((c) => c.type === 'image'); expect(imageContent).toBeDefined(); expect(imageContent?.data).toBeTruthy(); expect(imageContent?.mimeType).toBe('image/png'); } export async function 
expectExtractedData(result: unknown, expectedKeys: string[]): Promise<void> {
  const typedResult = result as { content?: TestContentItem[] };
  expect(typedResult.content).toBeDefined();

  const textContent = typedResult.content?.find((c) => c.type === 'text');
  expect(textContent).toBeDefined();

  // Check if extracted data contains expected keys
  for (const key of expectedKeys) {
    expect(textContent?.text).toContain(key);
  }
}

// Delay helper for tests
export function delay(ms: number): Promise<void> {
  return new Promise((resolve) => setTimeout(resolve, ms));
}

// Rate limiter for integration tests
let lastRequestTime = 0;
export async function rateLimit(minDelayMs: number = 500): Promise<void> {
  const now = Date.now();
  const timeSinceLastRequest = now - lastRequestTime;
  if (timeSinceLastRequest < minDelayMs) {
    await delay(minDelayMs - timeSinceLastRequest);
  }
  lastRequestTime = Date.now();
}

// Skip test if condition is not met
export function skipIf(condition: boolean, message: string) {
  if (condition) {
    console.log(`⚠️ Skipping test: ${message}`);
    return true;
  }
  return false;
}

// Test timeout helper
export const TEST_TIMEOUTS = {
  short: 30000, // 30 seconds
  medium: 60000, // 1 minute
  long: 120000, // 2 minutes
  extraLong: 180000, // 3 minutes
};
```

--------------------------------------------------------------------------------
/.github/copilot-instructions.md:
--------------------------------------------------------------------------------

```markdown
# Copilot Instructions: `mcp-crawl4ai-ts`

Concise, project-specific guidance for AI coding agents. Optimize for correctness, safety, and existing test expectations.

## Architecture & Flow

- Entrypoint `src/index.ts`: loads dotenv only if `CRAWL4AI_BASE_URL` unset; fails fast if missing. Passes env + version into `Crawl4AIServer`.
- `src/server.ts`: registers MCP tools, keeps a `Map<string, SessionInfo>` for persistent browser sessions, and uses `validateAndExecute` (Zod parse + invariant error message format). Do NOT alter error text pattern: `Invalid parameters for <tool>: ...` (tests & LLM reliability depend on it).
- Service layer `src/crawl4ai-service.ts`: pure HTTP wrapper around Crawl4AI endpoints; centralizes axios timeout & error translation (preserve wording like `Request timed out`, `Request failed with status <code>:` — tests rely on these substrings).
- Handlers (`src/handlers/*.ts`): orchestration & response shaping (text content arrays). No direct business logic inside server class beyond wiring.
- Validation schemas (`src/schemas/validation-schemas.ts` + helpers): all tool inputs defined here. Use `createStatelessSchema` for stateless tools; session/persistent tools have discriminated unions.

## Tool Model

- Stateless tools (e.g. `get_markdown`, `capture_screenshot`, `execute_js`) spin up a fresh browser each call.
- Session-based operations use `manage_session` (create/list/clear) + `crawl` for persistent state, allowing chained JS + screenshot/pdf in ONE call. Never try to chain separate stateless calls to reflect JS mutations.
- Output always returned as base64/text blocks; do not add file system side-effects unless explicitly using a save path param already supported (screenshots: optional local save dir).

## JS & Input Validation Nuances

- JS code schema rejects: HTML entities (e.g. `&quot;`), literal `\n` tokens outside strings, embedded HTML tags. Reuse `JsCodeSchema`—do not duplicate logic.
- For `get_markdown`: if filter is `bm25` or `llm`, `query` becomes required (enforced via `.refine`). Keep this logic centralized.
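For orientation, the conditional-requirement pattern looks roughly like this (a sketch, not the exact schema in `src/schemas/validation-schemas.ts`):

```typescript
import { z } from 'zod';

// Sketch: bm25/llm filters require a query (field names mirror the get_markdown tool).
const GetMarkdownSchema = z
  .object({
    url: z.string().url(),
    filter: z.enum(['raw', 'fit', 'bm25', 'llm']).optional(),
    query: z.string().optional(),
  })
  .refine((data) => (data.filter === 'bm25' || data.filter === 'llm' ? !!data.query : true), {
    message: 'query is required when filter is bm25 or llm',
  });
```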
## Sessions - `SessionInfo` tracks `created_at` & `last_used`. Update `last_used` whenever a session-based action runs. Don't leak sessions: `clear` must delete map entry. ## Error Handling Pattern - Handlers wrap service calls; on failure use `this.formatError(error, '<operation>')` (see `BaseHandler`). Preserve format: `Failed to <operation>: <detail>`. - Zod validation errors: keep exact join pattern of `path: message` segments. ## Adding / Modifying a Tool (Checklist) 1. Define or extend schema in `validation-schemas.ts` (prefer composing existing small schemas; wrap with `createStatelessSchema` if ephemeral). 2. Add service method if it maps to a new Crawl4AI endpoint (pure HTTP + validation of URL / JS content; reuse existing validators). 3. Implement handler method (assemble request body, post-process response to `content: [{ type: 'text', text }]`). 4. Register in `setupHandlers()` list (tool description should mirror README style & clarify stateless vs session). 5. Write tests: unit (schema + handler success/failure), integration (happy path with mocked or real endpoint). Place under matching folder in `src/__tests__/`. 6. Update README tool table if user-facing, and CHANGELOG + version bump. ## Commands & Workflows - Install: `npm install` - Build: `npm run build` (tsconfig.build.json) - Dev (watch): `npm run dev` - Tests: `npm run test` | unit only: `npm run test:unit` | integration: `npm run test:integration` | coverage: `npm run test:coverage` - Lint/Format: `npm run lint`, `npm run lint:fix`, `npm run format:check` - Pre-flight composite: `npm run check` ### Testing Invariants - NEVER invoke `jest` directly for integration tests; rely on `npm run test:integration` (injects `NODE_OPTIONS=--experimental-vm-modules` + `JEST_TEST_TYPE=integration`). - Unit tests auto-set `CRAWL4AI_BASE_URL` in `jest.setup.cjs`; integration tests require real env vars (`CRAWL4AI_BASE_URL`, optional `CRAWL4AI_API_KEY`, LLM vars) via `.env` or exported. - To run a single integration file: `npm run test:integration -- path/to/file.test.ts`. - Jest pinned at 29.x with `ts-jest@29`; do not upgrade one without the other. - Symptom mapping: import syntax error or hang at first test => you bypassed the npm script. ## Conventions & Invariants - No `any`; prefer `unknown` + narrowing. - Keep responses minimal & textual; do not introduce new top-level fields in tool results without updating all tests. - Timeout remains 120s in axios clients—changing requires test updates. - Commit style: conventional commits; no emojis, AI signoffs, or verbose bodies. ## References - README (tools & examples), CLAUDE.md (contrib rules), CHANGELOG (release notes), coverage report for quality gates. If something is ambiguous, inspect existing handlers first and mirror the closest established pattern before inventing a new one. 
``` -------------------------------------------------------------------------------- /src/__tests__/integration/extract-with-llm.integration.test.ts: -------------------------------------------------------------------------------- ```typescript /* eslint-env jest */ import { Client } from '@modelcontextprotocol/sdk/client/index.js'; import { createTestClient, cleanupTestClient, TEST_TIMEOUTS } from './test-utils.js'; interface ToolResult { content: Array<{ type: string; text?: string; }>; } describe('extract_with_llm Integration Tests', () => { let client: Client; beforeAll(async () => { client = await createTestClient(); }, TEST_TIMEOUTS.medium); afterAll(async () => { if (client) { await cleanupTestClient(client); } }); describe('LLM extraction', () => { it( 'should extract information about a webpage', async () => { const result = await client.callTool({ name: 'extract_with_llm', arguments: { url: 'https://httpbin.org/html', query: 'What is the main topic of this page?', }, }); expect(result).toBeTruthy(); const typedResult = result as ToolResult; expect(typedResult.content).toBeDefined(); expect(typedResult.content.length).toBeGreaterThan(0); const textContent = (result as ToolResult).content.find((c) => c.type === 'text'); expect(textContent?.text).toBeTruthy(); // Should return a meaningful response (LLM responses are non-deterministic) expect(textContent?.text?.length || 0).toBeGreaterThan(10); }, TEST_TIMEOUTS.long, ); it( 'should answer specific questions about content', async () => { const result = await client.callTool({ name: 'extract_with_llm', arguments: { url: 'https://httpbin.org/json', query: 'What is the slideshow title?', }, }); expect(result).toBeTruthy(); expect(result.content).toBeDefined(); const textContent = (result as ToolResult).content.find((c) => c.type === 'text'); expect(textContent?.text).toBeTruthy(); // Should provide an answer about the content expect(textContent?.text?.length || 0).toBeGreaterThan(5); }, TEST_TIMEOUTS.long, ); it( 'should handle complex queries', async () => { const result = await client.callTool({ name: 'extract_with_llm', arguments: { url: 'https://httpbin.org/html', query: 'List all the links found on this page', }, }); expect(result).toBeTruthy(); const textContent = (result as ToolResult).content.find((c) => c.type === 'text'); expect(textContent?.text).toBeTruthy(); // Should provide a response about links (content may vary) expect(textContent?.text?.length || 0).toBeGreaterThan(10); }, TEST_TIMEOUTS.long, ); }); describe('Error handling', () => { it( 'should handle server without API key configured', async () => { // Note: This test may pass if the server has OPENAI_API_KEY configured // It's here to document the expected behavior const result = await client.callTool({ name: 'extract_with_llm', arguments: { url: 'https://httpbin.org/status/200', query: 'What is on this page?', }, }); const typedResult = result as ToolResult; // If it succeeds, we have API key configured if (typedResult.content && typedResult.content.length > 0) { expect(result).toBeTruthy(); } // If it fails, we should get a proper error message else if (typedResult.content[0]?.text?.includes('LLM provider')) { expect(typedResult.content[0].text).toContain('LLM provider'); } }, TEST_TIMEOUTS.medium, ); it( 'should handle invalid URLs', async () => { const result = await client.callTool({ name: 'extract_with_llm', arguments: { url: 'not-a-url', query: 'What is this?', }, }); expect(result).toBeDefined(); const content = (result as ToolResult).content; const 
textContent = content.find((c) => c.type === 'text'); expect(textContent).toBeDefined(); expect(textContent?.text).toContain('Error'); expect(textContent?.text?.toLowerCase()).toContain('invalid'); }, TEST_TIMEOUTS.short, ); it( 'should handle empty query gracefully', async () => { const result = await client.callTool({ name: 'extract_with_llm', arguments: { url: 'https://example.com', query: '', }, }); expect(result).toBeDefined(); const content = (result as ToolResult).content; const textContent = content.find((c) => c.type === 'text'); expect(textContent).toBeDefined(); expect(textContent?.text).toContain('Error'); }, TEST_TIMEOUTS.short, ); }); }); ``` -------------------------------------------------------------------------------- /src/__tests__/integration/extract-links.integration.test.ts: -------------------------------------------------------------------------------- ```typescript /* eslint-env jest */ import { Client } from '@modelcontextprotocol/sdk/client/index.js'; import { createTestClient, cleanupTestClient, TEST_TIMEOUTS } from './test-utils.js'; interface ToolResult { content: Array<{ type: string; text?: string; }>; } describe('extract_links Integration Tests', () => { let client: Client; beforeAll(async () => { client = await createTestClient(); }, TEST_TIMEOUTS.medium); afterAll(async () => { if (client) { await cleanupTestClient(client); } }); describe('Basic functionality', () => { it( 'should extract links with categorization (default)', async () => { const result = await client.callTool({ name: 'extract_links', arguments: { url: 'https://webscraper.io/test-sites', }, }); expect(result).toBeDefined(); const content = (result as ToolResult).content; expect(content).toBeDefined(); expect(Array.isArray(content)).toBe(true); expect(content.length).toBeGreaterThan(0); const textContent = content.find((c) => c.type === 'text'); expect(textContent).toBeDefined(); expect(textContent?.text).toContain('Link analysis for https://webscraper.io/test-sites'); // Should show categorized output expect(textContent?.text).toMatch(/internal \(\d+\)/); expect(textContent?.text).toMatch(/external \(\d+\)/); }, TEST_TIMEOUTS.medium, ); it( 'should extract links without categorization', async () => { const result = await client.callTool({ name: 'extract_links', arguments: { url: 'https://webscraper.io/test-sites', categorize: false, }, }); expect(result).toBeDefined(); const content = (result as ToolResult).content; expect(content).toBeDefined(); expect(Array.isArray(content)).toBe(true); expect(content.length).toBeGreaterThan(0); const textContent = content.find((c) => c.type === 'text'); expect(textContent).toBeDefined(); expect(textContent?.text).toContain('All links from https://webscraper.io/test-sites'); // Should NOT show categorized output expect(textContent?.text).not.toMatch(/internal \(\d+\)/); expect(textContent?.text).not.toMatch(/external \(\d+\)/); }, TEST_TIMEOUTS.medium, ); it( 'should handle sites with no links', async () => { // Test with a simple status page const result = await client.callTool({ name: 'extract_links', arguments: { url: 'https://httpstat.us/200', }, }); expect(result).toBeDefined(); const content = (result as ToolResult).content; expect(content).toBeDefined(); const textContent = content.find((c) => c.type === 'text'); expect(textContent).toBeDefined(); }, TEST_TIMEOUTS.medium, ); it( 'should detect JSON endpoints', async () => { const result = await client.callTool({ name: 'extract_links', arguments: { url: 'https://httpbin.org/json', }, }); 
expect(result).toBeDefined(); const content = (result as ToolResult).content; expect(content).toBeDefined(); const textContent = content.find((c) => c.type === 'text'); expect(textContent).toBeDefined(); // Should show link analysis (even if empty) expect(textContent?.text).toContain('Link analysis for https://httpbin.org/json'); }, TEST_TIMEOUTS.medium, ); }); describe('Error handling', () => { it( 'should handle invalid URLs', async () => { const result = await client.callTool({ name: 'extract_links', arguments: { url: 'not-a-url', }, }); expect(result).toBeDefined(); const content = (result as ToolResult).content; expect(content).toBeDefined(); const textContent = content.find((c) => c.type === 'text'); expect(textContent).toBeDefined(); expect(textContent?.text).toContain('Error'); expect(textContent?.text?.toLowerCase()).toContain('invalid'); }, TEST_TIMEOUTS.short, ); it( 'should handle non-existent domains', async () => { const result = await client.callTool({ name: 'extract_links', arguments: { url: 'https://this-domain-definitely-does-not-exist-12345.com', }, }); expect(result).toBeDefined(); const content = (result as ToolResult).content; expect(content).toBeDefined(); const textContent = content.find((c) => c.type === 'text'); expect(textContent).toBeDefined(); expect(textContent?.text).toContain('Error'); // Could be various error messages: connection error, DNS error, etc. expect(textContent?.text?.toLowerCase()).toMatch(/error|failed/); }, TEST_TIMEOUTS.medium, ); }); }); ``` -------------------------------------------------------------------------------- /src/__tests__/integration/smart-crawl.integration.test.ts: -------------------------------------------------------------------------------- ```typescript /* eslint-env jest */ import { Client } from '@modelcontextprotocol/sdk/client/index.js'; import { createTestClient, cleanupTestClient, TEST_TIMEOUTS } from './test-utils.js'; interface ToolResult { content: Array<{ type: string; text?: string; }>; } describe('smart_crawl Integration Tests', () => { let client: Client; beforeAll(async () => { client = await createTestClient(); }, TEST_TIMEOUTS.medium); afterAll(async () => { if (client) { await cleanupTestClient(client); } }); describe('Smart crawling', () => { it( 'should auto-detect HTML content', async () => { const result = await client.callTool({ name: 'smart_crawl', arguments: { url: 'https://httpbin.org/html', }, }); expect(result).toBeDefined(); const content = (result as ToolResult).content; expect(content.length).toBeGreaterThanOrEqual(1); expect(content[0].type).toBe('text'); const text = content[0].text || ''; expect(text).toContain('Smart crawl detected content type:'); expect(text).toContain('html'); }, TEST_TIMEOUTS.medium, ); it( 'should handle sitemap URLs', async () => { const result = await client.callTool({ name: 'smart_crawl', arguments: { url: 'https://httpbingo.org/xml', max_depth: 1, }, }); const content = (result as ToolResult).content; expect(content.length).toBeGreaterThanOrEqual(1); expect(content[0].type).toBe('text'); const text = content[0].text || ''; expect(text).toContain('Smart crawl detected content type:'); expect(text.toLowerCase()).toMatch(/xml|sitemap/); }, TEST_TIMEOUTS.medium, ); it( 'should handle follow_links parameter', async () => { const result = await client.callTool({ name: 'smart_crawl', arguments: { url: 'https://httpbingo.org/xml', follow_links: true, max_depth: 1, }, }); const content = (result as ToolResult).content; expect(content.length).toBeGreaterThanOrEqual(1); 
expect(content[0].type).toBe('text'); const text = content[0].text || ''; expect(text).toContain('Smart crawl detected content type:'); }, TEST_TIMEOUTS.long, ); it( 'should detect JSON content', async () => { const result = await client.callTool({ name: 'smart_crawl', arguments: { url: 'https://httpbin.org/json', }, }); const content = (result as ToolResult).content; expect(content.length).toBeGreaterThanOrEqual(1); expect(content[0].type).toBe('text'); const text = content[0].text || ''; expect(text).toContain('Smart crawl detected content type:'); }, TEST_TIMEOUTS.medium, ); it( 'should bypass cache when requested', async () => { const result = await client.callTool({ name: 'smart_crawl', arguments: { url: 'https://httpbin.org/html', bypass_cache: true, }, }); const content = (result as ToolResult).content; expect(content.length).toBeGreaterThanOrEqual(1); expect(content[0].type).toBe('text'); const text = content[0].text || ''; expect(text).toContain('Smart crawl detected content type:'); }, TEST_TIMEOUTS.medium, ); it( 'should handle invalid URLs gracefully', async () => { const result = await client.callTool({ name: 'smart_crawl', arguments: { url: 'not-a-valid-url', }, }); const content = (result as ToolResult).content; expect(content.length).toBeGreaterThanOrEqual(1); expect(content[0].text).toContain('Error'); }, TEST_TIMEOUTS.short, ); it( 'should handle non-existent domains', async () => { const result = await client.callTool({ name: 'smart_crawl', arguments: { url: 'https://this-domain-definitely-does-not-exist-123456789.com', }, }); const content = (result as ToolResult).content; expect(content.length).toBeGreaterThanOrEqual(1); expect(content[0].type).toBe('text'); const text = content[0].text || ''; // Non-existent domains cause 500 errors expect(text).toContain('Error'); }, TEST_TIMEOUTS.short, ); it( 'should reject session_id parameter', async () => { const result = await client.callTool({ name: 'smart_crawl', arguments: { url: 'https://httpbin.org/html', session_id: 'test-session', }, }); const content = (result as ToolResult).content; expect(content.length).toBeGreaterThanOrEqual(1); expect(content[0].type).toBe('text'); expect(content[0].text).toContain('session_id'); expect(content[0].text).toContain('does not support'); expect(content[0].text).toContain('stateless'); }, TEST_TIMEOUTS.short, ); }); }); ``` -------------------------------------------------------------------------------- /src/handlers/session-handlers.ts: -------------------------------------------------------------------------------- ```typescript import { BaseHandler } from './base-handler.js'; export class SessionHandlers extends BaseHandler { async manageSession(options: { action: 'create' | 'clear' | 'list'; session_id?: string; initial_url?: string; browser_type?: string; }) { switch (options.action) { case 'create': return this.createSession({ session_id: options.session_id, initial_url: options.initial_url, browser_type: options.browser_type, }); case 'clear': if (!options.session_id) { throw new Error('session_id is required for clear action'); } return this.clearSession({ session_id: options.session_id }); case 'list': return this.listSessions(); default: // This should never happen due to TypeScript types, but handle it for runtime safety throw new Error(`Invalid action: ${(options as { action: string }).action}`); } } private async createSession(options: { session_id?: string; initial_url?: string; browser_type?: string }) { try { // Generate session ID if not provided const sessionId = 
options.session_id || `session-${Date.now()}-${Math.random().toString(36).substring(2, 11)}`; // Store session info locally this.sessions.set(sessionId, { id: sessionId, created_at: new Date(), last_used: new Date(), initial_url: options.initial_url, metadata: { browser_type: options.browser_type || 'chromium', }, }); // If initial_url provided, make first crawl to establish session if (options.initial_url) { try { await this.axiosClient.post( '/crawl', { urls: [options.initial_url], browser_config: { headless: true, browser_type: options.browser_type || 'chromium', }, crawler_config: { session_id: sessionId, cache_mode: 'BYPASS', }, }, { timeout: 30000, // 30 second timeout for initial crawl }, ); // Update last_used const session = this.sessions.get(sessionId); if (session) { session.last_used = new Date(); } } catch (error) { // Session created but initial crawl failed - still return success console.error(`Initial crawl failed for session ${sessionId}:`, error); } } return { content: [ { type: 'text', text: `Session created successfully:\nSession ID: ${sessionId}\nBrowser: ${options.browser_type || 'chromium'}\n${options.initial_url ? `Pre-warmed with: ${options.initial_url}` : 'Ready for use'}\n\nUse this session_id with the crawl tool to maintain state across requests.`, }, ], // Include all session parameters for easier programmatic access session_id: sessionId, browser_type: options.browser_type || 'chromium', initial_url: options.initial_url, created_at: this.sessions.get(sessionId)?.created_at.toISOString(), }; } catch (error) { throw this.formatError(error, 'create session'); } } private async clearSession(options: { session_id: string }) { try { // Remove from local store const deleted = this.sessions.delete(options.session_id); // Note: The actual browser session in Crawl4AI will be cleaned up // automatically after inactivity or when the server restarts return { content: [ { type: 'text', text: deleted ? 
`Session cleared successfully: ${options.session_id}` : `Session not found: ${options.session_id}`, }, ], }; } catch (error) { throw this.formatError(error, 'clear session'); } } private async listSessions() { try { // Return locally stored sessions const sessions = Array.from(this.sessions.entries()).map(([id, info]) => { const ageMinutes = Math.floor((Date.now() - info.created_at.getTime()) / 60000); const lastUsedMinutes = Math.floor((Date.now() - info.last_used.getTime()) / 60000); return { session_id: id, created_at: info.created_at.toISOString(), last_used: info.last_used.toISOString(), age_minutes: ageMinutes, last_used_minutes_ago: lastUsedMinutes, initial_url: info.initial_url, browser_type: info.metadata?.browser_type || 'chromium', }; }); if (sessions.length === 0) { return { content: [ { type: 'text', text: 'No active sessions found.', }, ], }; } const sessionList = sessions .map( (session) => `- ${session.session_id} (${session.browser_type}, created ${session.age_minutes}m ago, last used ${session.last_used_minutes_ago}m ago)`, ) .join('\n'); return { content: [ { type: 'text', text: `Active sessions (${sessions.length}):\n${sessionList}`, }, ], }; } catch (error) { throw this.formatError(error, 'list sessions'); } } } ``` -------------------------------------------------------------------------------- /src/__tests__/utils/javascript-validation.test.ts: -------------------------------------------------------------------------------- ```typescript /* eslint-env jest */ import { describe, it, expect } from '@jest/globals'; import { validateJavaScriptCode } from '../../schemas/helpers.js'; describe('JavaScript Code Validation', () => { describe('Valid JavaScript', () => { it('should accept simple JavaScript code', () => { expect(validateJavaScriptCode('console.log("Hello world")')).toBe(true); expect(validateJavaScriptCode('return document.title')).toBe(true); expect(validateJavaScriptCode('const x = 5; return x * 2;')).toBe(true); }); it('should accept JavaScript with real newlines', () => { expect(validateJavaScriptCode('console.log("Hello");\nconsole.log("World");')).toBe(true); expect(validateJavaScriptCode('function test() {\n return true;\n}')).toBe(true); }); it('should accept JavaScript with escape sequences in strings', () => { expect(validateJavaScriptCode('console.log("Line 1\\nLine 2")')).toBe(true); expect(validateJavaScriptCode('const msg = "Tab\\there\\tand\\tthere"')).toBe(true); expect(validateJavaScriptCode('return "Quote: \\"Hello\\""')).toBe(true); }); it('should accept complex JavaScript patterns', () => { const complexCode = ` const elements = document.querySelectorAll('.item'); elements.forEach((el, i) => { el.textContent = \`Item \${i + 1}\`; }); return elements.length; `; expect(validateJavaScriptCode(complexCode)).toBe(true); }); it('should accept JavaScript with regex patterns', () => { expect(validateJavaScriptCode('return /test\\d+/.test(str)')).toBe(true); expect(validateJavaScriptCode('const pattern = /\\w+@\\w+\\.\\w+/')).toBe(true); }); }); describe('Invalid JavaScript - HTML Entities', () => { it('should reject code with HTML entities', () => { expect(validateJavaScriptCode('console.log("Hello")')).toBe(false); expect(validateJavaScriptCode('const x = && true')).toBe(false); expect(validateJavaScriptCode('if (x < 5) return')).toBe(false); expect(validateJavaScriptCode('if (x > 5) return')).toBe(false); }); it('should reject code with numeric HTML entities', () => { expect(validateJavaScriptCode('const char = A')).toBe(false); // Note: hex 
entities like A are not caught by the current regex }); it('should reject code with named HTML entities', () => { expect(validateJavaScriptCode('const copy = ©')).toBe(false); expect(validateJavaScriptCode('const nbsp = ')).toBe(false); }); }); describe('Invalid JavaScript - HTML Tags', () => { it('should reject HTML markup', () => { expect(validateJavaScriptCode('<!DOCTYPE html>')).toBe(false); expect(validateJavaScriptCode('<html><body>test</body></html>')).toBe(false); expect(validateJavaScriptCode('<script>alert("test")</script>')).toBe(false); expect(validateJavaScriptCode('<style>body { color: red; }</style>')).toBe(false); }); it('should reject mixed HTML and JavaScript', () => { expect(validateJavaScriptCode('<head>\nconst x = 5;\n</head>')).toBe(false); expect(validateJavaScriptCode('console.log("test");\n<body>')).toBe(false); }); }); describe('Invalid JavaScript - Literal Escape Sequences', () => { it('should reject literal \\n outside of strings', () => { expect(validateJavaScriptCode('console.log("Hello");\\nconsole.log("World");')).toBe(false); expect(validateJavaScriptCode('const x = 5;\\nreturn x;')).toBe(false); expect(validateJavaScriptCode('if (true) {\\n return;\\n}')).toBe(false); }); it('should reject literal \\n in various positions', () => { expect(validateJavaScriptCode('}\\nfunction')).toBe(false); expect(validateJavaScriptCode(');\\nconst')).toBe(false); expect(validateJavaScriptCode('\\n{')).toBe(false); expect(validateJavaScriptCode('\\n(')).toBe(false); }); it('should reject literal \\n between statements', () => { expect(validateJavaScriptCode('const x = 5;\\nconst y = 10;')).toBe(false); expect(validateJavaScriptCode('doSomething();\\ndoAnother();')).toBe(false); }); }); describe('Edge Cases', () => { it('should handle empty strings', () => { expect(validateJavaScriptCode('')).toBe(true); }); it('should handle whitespace-only strings', () => { expect(validateJavaScriptCode(' ')).toBe(true); expect(validateJavaScriptCode('\n\n\n')).toBe(true); expect(validateJavaScriptCode('\t\t')).toBe(true); }); it('should handle single-line comments', () => { expect(validateJavaScriptCode('// This is a comment')).toBe(true); expect(validateJavaScriptCode('return 5; // Comment here')).toBe(true); }); it('should handle multi-line comments', () => { expect(validateJavaScriptCode('/* Multi\nline\ncomment */')).toBe(true); expect(validateJavaScriptCode('/* Comment */ return 5;')).toBe(true); }); it('should reject HTML tags even in what looks like strings', () => { // The current validation is quite strict and rejects HTML tags even if they appear to be in strings // This is by design to prevent malformed JavaScript that contains actual HTML expect(validateJavaScriptCode('const html = "<div>Hello</div>"')).toBe(true); // <div> is ok expect(validateJavaScriptCode("return '<style>body{}</style>'")).toBe(false); // <style> is rejected }); }); }); ``` -------------------------------------------------------------------------------- /src/__tests__/handlers/utility-handlers.test.ts: -------------------------------------------------------------------------------- ```typescript /* eslint-env jest */ import { jest } from '@jest/globals'; import type { UtilityHandlers } from '../../handlers/utility-handlers.js'; import type { Crawl4AIService } from '../../crawl4ai-service.js'; // Mock the service const mockCrawl = jest.fn(); const mockService = { crawl: mockCrawl, } as unknown as Crawl4AIService; // Mock axios client const mockPost = jest.fn(); const mockAxiosClient = { post: mockPost, } 
as unknown; // Import after setting up mocks const { UtilityHandlers: UtilityHandlersClass } = await import('../../handlers/utility-handlers.js'); describe('UtilityHandlers', () => { let handler: UtilityHandlers; let sessions: Map<string, unknown>; beforeEach(() => { jest.clearAllMocks(); sessions = new Map(); handler = new UtilityHandlersClass(mockService, mockAxiosClient, sessions); }); describe('extractLinks', () => { it('should manually extract links from markdown when API returns empty links', async () => { // Mock crawl response with empty links but markdown containing href attributes mockPost.mockResolvedValue({ data: { results: [ { success: true, links: { internal: [], external: [], }, markdown: { raw_markdown: ` # Test Page Here are some links: <a href="https://example.com/page1">Internal Link</a> <a href="https://external.com/page">External Link</a> <a href="/relative/path">Relative Link</a> <a href='https://example.com/page2'>Another Internal</a> `, }, }, ], }, }); const result = await handler.extractLinks({ url: 'https://example.com', categorize: true, }); // Should have manually extracted and categorized links expect(result.content[0].type).toBe('text'); expect(result.content[0].text).toContain('Link analysis for https://example.com'); expect(result.content[0].text).toContain('internal (3)'); expect(result.content[0].text).toContain('https://example.com/page1'); expect(result.content[0].text).toContain('https://example.com/page2'); expect(result.content[0].text).toContain('https://example.com/relative/path'); expect(result.content[0].text).toContain('external (1)'); expect(result.content[0].text).toContain('https://external.com/page'); }); it('should handle manual extraction without categorization', async () => { // Mock crawl response with empty links mockPost.mockResolvedValue({ data: { results: [ { success: true, links: { internal: [], external: [], }, markdown: { raw_markdown: `<a href="https://example.com/page1">Link 1</a> <a href="https://external.com/page">Link 2</a>`, }, }, ], }, }); const result = await handler.extractLinks({ url: 'https://example.com', categorize: false, }); // Should show all links without categorization expect(result.content[0].text).toContain('All links from https://example.com'); expect(result.content[0].text).toContain('https://example.com/page1'); expect(result.content[0].text).toContain('https://external.com/page'); expect(result.content[0].text).not.toContain('Internal links:'); }); it('should handle malformed URLs during manual extraction', async () => { // Mock crawl response with a malformed URL in href mockPost.mockResolvedValue({ data: { results: [ { success: true, links: { internal: [], external: [], }, markdown: { raw_markdown: `<a href="javascript:void(0)">JS Link</a> <a href="https://example.com/valid">Valid Link</a> <a href="not-a-url">Invalid URL</a>`, }, }, ], }, }); const result = await handler.extractLinks({ url: 'https://example.com', categorize: true, }); // Should handle invalid URLs gracefully expect(result.content[0].text).toContain('https://example.com/valid'); // Invalid URLs should be treated as relative links expect(result.content[0].text).toContain('not-a-url'); expect(result.content[0].text).toContain('javascript:void(0)'); }); it('should return empty results when no links found', async () => { // Mock crawl response with no links mockPost.mockResolvedValue({ data: { results: [ { success: true, links: { internal: [], external: [], }, markdown: { raw_markdown: 'Just plain text without any links', }, }, ], }, }); const 
result = await handler.extractLinks({ url: 'https://example.com', categorize: true, }); // Should show empty categories expect(result.content[0].text).toContain('Link analysis for https://example.com'); expect(result.content[0].text).toContain('internal (0)'); expect(result.content[0].text).toContain('external (0)'); }); }); }); ``` -------------------------------------------------------------------------------- /src/__tests__/index.cli.test.ts: -------------------------------------------------------------------------------- ```typescript // import { jest } from '@jest/globals'; import { spawn } from 'child_process'; import * as path from 'path'; import * as url from 'url'; const __dirname = url.fileURLToPath(new URL('.', import.meta.url)); describe('CLI Entry Point', () => { const cliPath = path.join(__dirname, '..', '..', 'src', 'index.ts'); // Helper to run CLI with given env vars const runCLI = ( env: Record<string, string> = {}, ): Promise<{ code: number | null; stdout: string; stderr: string }> => { return new Promise((resolve) => { const child = spawn('tsx', [cliPath], { env: { ...process.env, ...env }, stdio: 'pipe', }); let stdout = ''; let stderr = ''; child.stdout.on('data', (data) => { stdout += data.toString(); }); child.stderr.on('data', (data) => { stderr += data.toString(); }); child.on('close', (code) => { resolve({ code, stdout, stderr }); }); // Kill after 2 seconds to prevent hanging setTimeout(() => { child.kill(); }, 2000); }); }; describe('Environment Variable Validation', () => { it('should exit with code 1 when CRAWL4AI_BASE_URL is missing', async () => { const { code, stderr } = await runCLI({ CRAWL4AI_BASE_URL: '', }); expect(code).toBe(1); expect(stderr).toContain('Error: CRAWL4AI_BASE_URL environment variable is required'); expect(stderr).toContain('Please set it to your Crawl4AI server URL'); }); it('should start successfully with valid CRAWL4AI_BASE_URL', async () => { const { code, stderr } = await runCLI({ CRAWL4AI_BASE_URL: 'http://localhost:11235', CRAWL4AI_API_KEY: 'test-key', }); // Process should be killed by timeout, not exit with error expect(code).not.toBe(1); // MCP servers output to stderr expect(stderr).toContain('crawl4ai-mcp'); }); it('should use default values for optional env vars', async () => { const { stderr } = await runCLI({ CRAWL4AI_BASE_URL: 'http://localhost:11235', // No API_KEY, SERVER_NAME, or SERVER_VERSION }); expect(stderr).toContain('crawl4ai-mcp'); // default server name expect(stderr).toContain('1.0.0'); // default version }); it('should use custom SERVER_NAME and SERVER_VERSION when provided', async () => { const { stderr } = await runCLI({ CRAWL4AI_BASE_URL: 'http://localhost:11235', SERVER_NAME: 'custom-server', SERVER_VERSION: '2.0.0', }); expect(stderr).toContain('custom-server'); expect(stderr).toContain('2.0.0'); }); }); describe('Signal Handling', () => { it('should handle SIGTERM gracefully', async () => { const child = spawn('tsx', [cliPath], { env: { ...process.env, CRAWL4AI_BASE_URL: 'http://localhost:11235', }, stdio: 'pipe', }); // Wait for startup await new Promise((resolve) => setTimeout(resolve, 500)); // Send SIGTERM child.kill('SIGTERM'); const code = await new Promise<number | null>((resolve, reject) => { const timeout = setTimeout(() => { child.kill('SIGKILL'); reject(new Error('Process did not exit in time')); }, 5000); child.on('close', (exitCode) => { clearTimeout(timeout); resolve(exitCode); }); }); // Should exit with signal code expect(code).toBe(143); // 128 + 15 (SIGTERM) // Ensure cleanup 
child.kill(); }, 10000); it('should handle SIGINT gracefully', async () => { const child = spawn('tsx', [cliPath], { env: { ...process.env, CRAWL4AI_BASE_URL: 'http://localhost:11235', }, stdio: 'pipe', }); // Wait for startup await new Promise((resolve) => setTimeout(resolve, 500)); // Send SIGINT (Ctrl+C) child.kill('SIGINT'); const code = await new Promise<number | null>((resolve, reject) => { const timeout = setTimeout(() => { child.kill('SIGKILL'); reject(new Error('Process did not exit in time')); }, 5000); child.on('close', (exitCode) => { clearTimeout(timeout); resolve(exitCode); }); }); // Should exit with signal code expect(code).toBe(130); // 128 + 2 (SIGINT) // Ensure cleanup child.kill(); }, 10000); }); describe('Error Handling', () => { it('should handle server startup errors', async () => { // This will be tricky to test without mocking, but we can at least // verify the process starts and attempts to connect const { code, stdout, stderr } = await runCLI({ CRAWL4AI_BASE_URL: 'http://invalid-host-that-does-not-exist:99999', }); // Should not exit with code 1 (that's for missing env vars) expect(code).not.toBe(1); // But might log connection errors const output = stdout + stderr; expect(output).toBeTruthy(); }); }); describe('dotenv Loading', () => { it('should load .env file if present', async () => { // Create a temporary .env file const fs = await import('fs/promises'); const envPath = path.join(__dirname, '..', '..', '.env.test'); await fs.writeFile(envPath, 'TEST_ENV_VAR=loaded_from_file\n'); try { const { stderr } = await runCLI({ CRAWL4AI_BASE_URL: 'http://localhost:11235', NODE_ENV: 'test', DOTENV_CONFIG_PATH: envPath, }); // Verify the server starts (dotenv loaded successfully) expect(stderr).toContain('crawl4ai-mcp'); } finally { // Clean up await fs.unlink(envPath).catch(() => {}); } }); }); }); ``` -------------------------------------------------------------------------------- /src/__tests__/integration/crawl-recursive.integration.test.ts: -------------------------------------------------------------------------------- ```typescript /* eslint-env jest */ import { Client } from '@modelcontextprotocol/sdk/client/index.js'; import { createTestClient, cleanupTestClient, TEST_TIMEOUTS } from './test-utils.js'; interface ToolResult { content: Array<{ type: string; text?: string; }>; } describe('crawl_recursive Integration Tests', () => { let client: Client; beforeAll(async () => { client = await createTestClient(); }, TEST_TIMEOUTS.medium); afterAll(async () => { if (client) { await cleanupTestClient(client); } }); describe('Basic functionality', () => { it( 'should crawl a site recursively with default settings', async () => { const result = await client.callTool({ name: 'crawl_recursive', arguments: { url: 'https://httpbin.org/links/5/0', }, }); expect(result).toBeDefined(); const content = (result as ToolResult).content; expect(content).toBeDefined(); expect(Array.isArray(content)).toBe(true); expect(content.length).toBeGreaterThan(0); const textContent = content.find((c) => c.type === 'text'); expect(textContent).toBeDefined(); expect(textContent?.text).toContain('Recursive crawl completed'); expect(textContent?.text).toContain('Pages crawled:'); expect(textContent?.text).toContain('Max depth reached:'); expect(textContent?.text).toContain('Only internal links'); // Should have found multiple pages since httpbin.org/links/5/0 has internal links expect(textContent?.text).toMatch(/Pages crawled: [2-9]|[1-9][0-9]/); }, TEST_TIMEOUTS.long, ); it( 'should respect 
max_depth parameter', async () => { const result = await client.callTool({ name: 'crawl_recursive', arguments: { url: 'https://httpbin.org/links/10/0', max_depth: 1, max_pages: 5, }, }); expect(result).toBeDefined(); const content = (result as ToolResult).content; const textContent = content.find((c) => c.type === 'text'); expect(textContent).toBeDefined(); expect(textContent?.text).toContain('Max depth reached: '); expect(textContent?.text).toMatch(/Max depth reached: [0-1] \(limit: 1\)/); // With max_depth=1, should find some pages but not go too deep expect(textContent?.text).toMatch(/Pages crawled: [1-5]/); }, TEST_TIMEOUTS.long, ); it( 'should apply include pattern filter', async () => { const result = await client.callTool({ name: 'crawl_recursive', arguments: { url: 'https://httpbin.org/links/10/0', max_depth: 1, max_pages: 5, include_pattern: '.*/links/[0-9]+/[0-4]$', // Only include links ending with 0-4 }, }); expect(result).toBeDefined(); const content = (result as ToolResult).content; const textContent = content.find((c) => c.type === 'text'); expect(textContent).toBeDefined(); // Check that we have some results expect(textContent?.text).toContain('Pages crawled:'); // If we crawled pages, they should match our pattern if (textContent?.text && textContent.text.includes('Pages found:')) { const pagesSection = textContent.text.split('Pages found:')[1]; if (pagesSection && pagesSection.trim()) { // All URLs should end with /0, /1, /2, /3, or /4 expect(pagesSection).toMatch(/\/[0-4]\b/); // Should NOT have URLs ending with /5, /6, /7, /8, /9 expect(pagesSection).not.toMatch(/\/[5-9]\b/); } } }, TEST_TIMEOUTS.long, ); it( 'should apply exclude pattern filter', async () => { const result = await client.callTool({ name: 'crawl_recursive', arguments: { url: 'https://example.com', max_depth: 2, max_pages: 10, exclude_pattern: '.*\\.(pdf|zip|exe)$', }, }); expect(result).toBeDefined(); const content = (result as ToolResult).content; const textContent = content.find((c) => c.type === 'text'); expect(textContent).toBeDefined(); // Should not have crawled any PDF, ZIP, or EXE files expect(textContent?.text).not.toMatch(/\.(pdf|zip|exe)/i); }, TEST_TIMEOUTS.long, ); }); describe('Error handling', () => { it( 'should handle invalid URLs', async () => { const result = await client.callTool({ name: 'crawl_recursive', arguments: { url: 'not-a-url', }, }); expect(result).toBeDefined(); const content = (result as ToolResult).content; expect(content).toBeDefined(); const textContent = content.find((c) => c.type === 'text'); expect(textContent).toBeDefined(); expect(textContent?.text).toContain('Error'); expect(textContent?.text?.toLowerCase()).toContain('invalid'); }, TEST_TIMEOUTS.short, ); it( 'should handle sites with internal links', async () => { const result = await client.callTool({ name: 'crawl_recursive', arguments: { url: 'https://httpbin.org/links/5/0', max_depth: 2, max_pages: 10, }, }); expect(result).toBeDefined(); const content = (result as ToolResult).content; const textContent = content.find((c) => c.type === 'text'); expect(textContent).toBeDefined(); expect(textContent?.text).toContain('Pages crawled:'); // Should crawl multiple pages since httpbin.org/links/5/0 has 5 internal links expect(textContent?.text).toMatch(/Pages crawled: [2-9]|1[0-9]/); expect(textContent?.text).toContain('Internal links found:'); }, TEST_TIMEOUTS.medium, ); }); }); ``` -------------------------------------------------------------------------------- /src/handlers/content-handlers.ts: 
-------------------------------------------------------------------------------- ```typescript import { BaseHandler } from './base-handler.js'; import { MarkdownEndpointOptions, MarkdownEndpointResponse, ScreenshotEndpointOptions, ScreenshotEndpointResponse, PDFEndpointOptions, PDFEndpointResponse, HTMLEndpointOptions, HTMLEndpointResponse, FilterType, } from '../types.js'; import * as fs from 'fs/promises'; import * as path from 'path'; import * as os from 'os'; export class ContentHandlers extends BaseHandler { async getMarkdown( options: Omit<MarkdownEndpointOptions, 'f' | 'q' | 'c'> & { filter?: string; query?: string; cache?: string }, ) { try { // Map from schema property names to API parameter names const result: MarkdownEndpointResponse = await this.service.getMarkdown({ url: options.url, f: options.filter as FilterType | undefined, // Schema provides 'filter', API expects 'f' q: options.query, // Schema provides 'query', API expects 'q' c: options.cache, // Schema provides 'cache', API expects 'c' }); // Format the response let formattedText = `URL: ${result.url}\nFilter: ${result.filter}`; if (result.query) { formattedText += `\nQuery: ${result.query}`; } formattedText += `\nCache: ${result.cache}\n\nMarkdown:\n${result.markdown || 'No content found.'}`; return { content: [ { type: 'text', text: formattedText, }, ], }; } catch (error) { throw this.formatError(error, 'get markdown'); } } async captureScreenshot(options: ScreenshotEndpointOptions) { try { const result: ScreenshotEndpointResponse = await this.service.captureScreenshot(options); // Response has { success: true, screenshot: "base64string" } if (!result.success || !result.screenshot) { throw new Error('Screenshot capture failed - no screenshot data in response'); } let savedFilePath: string | undefined; // Save to local directory if requested if (options.save_to_directory) { try { // Resolve home directory path let resolvedPath = options.save_to_directory; if (resolvedPath.startsWith('~')) { const homedir = os.homedir(); resolvedPath = path.join(homedir, resolvedPath.slice(1)); } // Check if user provided a file path instead of directory if (resolvedPath.endsWith('.png') || resolvedPath.endsWith('.jpg')) { console.warn( `Warning: save_to_directory should be a directory path, not a file path. Using parent directory.`, ); resolvedPath = path.dirname(resolvedPath); } // Ensure directory exists await fs.mkdir(resolvedPath, { recursive: true }); // Generate filename from URL and timestamp const url = new URL(options.url); const hostname = url.hostname.replace(/[^a-z0-9]/gi, '-'); const timestamp = new Date().toISOString().replace(/[:.]/g, '-').slice(0, -5); const filename = `${hostname}-${timestamp}.png`; savedFilePath = path.join(resolvedPath, filename); // Convert base64 to buffer and save const buffer = Buffer.from(result.screenshot, 'base64'); await fs.writeFile(savedFilePath, buffer); } catch (saveError) { // Log error but don't fail the operation console.error('Failed to save screenshot locally:', saveError); } } const textContent = savedFilePath ? 
`Screenshot captured for: ${options.url}\nSaved to: ${savedFilePath}` : `Screenshot captured for: ${options.url}`; // If saved locally and screenshot is large (>800KB), don't return the base64 data const screenshotSize = Buffer.from(result.screenshot, 'base64').length; const shouldReturnImage = !savedFilePath || screenshotSize < 800 * 1024; // 800KB threshold const content = []; if (shouldReturnImage) { content.push({ type: 'image', data: result.screenshot, mimeType: 'image/png', }); } content.push({ type: 'text', text: shouldReturnImage ? textContent : `${textContent}\n\nNote: Screenshot data not returned due to size (${Math.round(screenshotSize / 1024)}KB). View the saved file instead.`, }); return { content }; } catch (error) { throw this.formatError(error, 'capture screenshot'); } } async generatePDF(options: PDFEndpointOptions) { try { const result: PDFEndpointResponse = await this.service.generatePDF(options); // Response has { success: true, pdf: "base64string" } if (!result.success || !result.pdf) { throw new Error('PDF generation failed - no PDF data in response'); } return { content: [ { type: 'resource', resource: { uri: `data:application/pdf;name=${encodeURIComponent(new URL(String(options.url)).hostname)}.pdf;base64,${result.pdf}`, mimeType: 'application/pdf', blob: result.pdf, }, }, { type: 'text', text: `PDF generated for: ${options.url}`, }, ], }; } catch (error) { throw this.formatError(error, 'generate PDF'); } } async getHTML(options: HTMLEndpointOptions) { try { const result: HTMLEndpointResponse = await this.service.getHTML(options); // Response has { html: string, url: string, success: true } return { content: [ { type: 'text', text: result.html || '', }, ], }; } catch (error) { throw this.formatError(error, 'get HTML'); } } async extractWithLLM(options: { url: string; query: string }) { try { const result = await this.service.extractWithLLM(options); return { content: [ { type: 'text', text: result.answer, }, ], }; } catch (error) { throw this.formatError(error, 'extract with LLM'); } } } ``` -------------------------------------------------------------------------------- /src/__tests__/integration/get-markdown.integration.test.ts: -------------------------------------------------------------------------------- ```typescript /* eslint-env jest */ import { Client } from '@modelcontextprotocol/sdk/client/index.js'; import { createTestClient, cleanupTestClient, TEST_TIMEOUTS } from './test-utils.js'; interface ToolResult { content: Array<{ type: string; text?: string; }>; } describe('get_markdown Integration Tests', () => { let client: Client; beforeAll(async () => { client = await createTestClient(); }, TEST_TIMEOUTS.medium); afterAll(async () => { if (client) { await cleanupTestClient(client); } }); describe('Markdown extraction', () => { it( 'should extract markdown with default fit filter', async () => { const result = await client.callTool({ name: 'get_markdown', arguments: { url: 'https://httpbin.org/html', }, }); expect(result).toBeDefined(); const content = (result as ToolResult).content; expect(content).toHaveLength(1); expect(content[0].type).toBe('text'); const text = content[0].text || ''; expect(text).toContain('URL: https://httpbin.org/html'); expect(text).toContain('Filter: fit'); expect(text).toContain('Markdown:'); }, TEST_TIMEOUTS.medium, ); it( 'should extract markdown with raw filter', async () => { const result = await client.callTool({ name: 'get_markdown', arguments: { url: 'https://httpbin.org/html', filter: 'raw', }, }); const content = (result 
as ToolResult).content; expect(content).toHaveLength(1); expect(content[0].type).toBe('text'); const text = content[0].text || ''; expect(text).toContain('Filter: raw'); }, TEST_TIMEOUTS.medium, ); it( 'should extract markdown with bm25 filter and query', async () => { const result = await client.callTool({ name: 'get_markdown', arguments: { url: 'https://httpbin.org/html', filter: 'bm25', query: 'Herman Melville', }, }); const content = (result as ToolResult).content; expect(content).toHaveLength(1); expect(content[0].type).toBe('text'); const text = content[0].text || ''; expect(text).toContain('Filter: bm25'); expect(text).toContain('Query: Herman Melville'); }, TEST_TIMEOUTS.medium, ); it( 'should extract markdown with llm filter and query', async () => { const result = await client.callTool({ name: 'get_markdown', arguments: { url: 'https://httpbin.org/html', filter: 'llm', query: 'What is this page about?', }, }); const content = (result as ToolResult).content; expect(content).toHaveLength(1); expect(content[0].type).toBe('text'); const text = content[0].text || ''; expect(text).toContain('Filter: llm'); expect(text).toContain('Query: What is this page about?'); }, TEST_TIMEOUTS.medium, ); it( 'should use cache parameter', async () => { const result = await client.callTool({ name: 'get_markdown', arguments: { url: 'https://httpbin.org/html', cache: '1', }, }); const content = (result as ToolResult).content; expect(content).toHaveLength(1); expect(content[0].type).toBe('text'); const text = content[0].text || ''; expect(text).toContain('Cache: 1'); }, TEST_TIMEOUTS.medium, ); it( 'should reject session_id parameter', async () => { const result = await client.callTool({ name: 'get_markdown', arguments: { url: 'https://httpbin.org/html', session_id: 'test-session', }, }); const content = (result as ToolResult).content; expect(content).toHaveLength(1); expect(content[0].type).toBe('text'); expect(content[0].text).toContain('session_id'); expect(content[0].text).toContain('does not support'); expect(content[0].text).toContain('stateless'); }, TEST_TIMEOUTS.short, ); it( 'should handle invalid URLs gracefully', async () => { const result = await client.callTool({ name: 'get_markdown', arguments: { url: 'not-a-valid-url', }, }); const content = (result as ToolResult).content; expect(content).toHaveLength(1); expect(content[0].type).toBe('text'); expect(content[0].text).toContain('Error'); expect(content[0].text?.toLowerCase()).toContain('invalid'); }, TEST_TIMEOUTS.short, ); it( 'should handle non-existent domains', async () => { const result = await client.callTool({ name: 'get_markdown', arguments: { url: 'https://this-domain-definitely-does-not-exist-123456789.com', }, }); const content = (result as ToolResult).content; expect(content).toHaveLength(1); expect(content[0].type).toBe('text'); // According to the pattern from other tests, might return success with empty content const text = content[0].text || ''; expect(typeof text).toBe('string'); }, TEST_TIMEOUTS.short, ); it( 'should ignore extra parameters', async () => { const result = await client.callTool({ name: 'get_markdown', arguments: { url: 'https://httpbin.org/html', filter: 'fit', // These should be ignored remove_images: true, bypass_cache: true, screenshot: true, }, }); const content = (result as ToolResult).content; expect(content).toHaveLength(1); expect(content[0].type).toBe('text'); // Should still work, ignoring extra params const text = content[0].text || ''; expect(text).toContain('Filter: fit'); }, 
TEST_TIMEOUTS.medium, ); }); }); ``` -------------------------------------------------------------------------------- /src/__tests__/integration/execute-js.integration.test.ts: -------------------------------------------------------------------------------- ```typescript /* eslint-env jest */ import { Client } from '@modelcontextprotocol/sdk/client/index.js'; import { createTestClient, cleanupTestClient, TEST_TIMEOUTS } from './test-utils.js'; interface ToolResult { content: Array<{ type: string; text?: string; }>; } describe('execute_js Integration Tests', () => { let client: Client; beforeAll(async () => { client = await createTestClient(); }, TEST_TIMEOUTS.medium); afterAll(async () => { if (client) { await cleanupTestClient(client); } }); describe('JavaScript execution', () => { it( 'should execute JavaScript and return results', async () => { const result = await client.callTool({ name: 'execute_js', arguments: { url: 'https://httpbin.org/html', scripts: ['return document.title', 'return document.querySelectorAll("h1").length'], }, }); expect(result).toBeDefined(); const content = (result as ToolResult).content; expect(content).toHaveLength(1); expect(content[0].type).toBe('text'); // Should contain JavaScript execution results expect(content[0].text).toContain('JavaScript executed on: https://httpbin.org/html'); expect(content[0].text).toContain('Results:'); expect(content[0].text).toContain('Script: return document.title'); expect(content[0].text).toMatch(/Returned: .*/); // Title may be empty or no return value expect(content[0].text).toContain('Script: return document.querySelectorAll("h1").length'); expect(content[0].text).toContain('Returned: 1'); // Should have 1 h1 element }, TEST_TIMEOUTS.medium, ); it( 'should execute single script as string', async () => { console.log('Starting execute_js test...'); const result = await client.callTool({ name: 'execute_js', arguments: { url: 'https://httpbin.org/html', scripts: 'return window.location.href', }, }); console.log('Got result:', result); expect(result).toBeDefined(); const content = (result as ToolResult).content; expect(content).toHaveLength(1); expect(content[0].text).toContain('JavaScript executed on: https://httpbin.org/html'); expect(content[0].text).toContain('Script: return window.location.href'); expect(content[0].text).toContain('Returned: "https://httpbin.org/html'); }, TEST_TIMEOUTS.long, // Increase timeout to 120s ); it( 'should reject session_id parameter', async () => { const result = await client.callTool({ name: 'execute_js', arguments: { url: 'https://httpbin.org/html', scripts: 'return true', session_id: 'test-session', }, }); const content = (result as ToolResult).content; expect(content).toHaveLength(1); expect(content[0].type).toBe('text'); expect(content[0].text).toContain('session_id'); expect(content[0].text).toContain('does not support'); expect(content[0].text).toContain('stateless'); }, TEST_TIMEOUTS.short, ); it( 'should reject invalid JavaScript with HTML entities', async () => { const result = await client.callTool({ name: 'execute_js', arguments: { url: 'https://httpbin.org/html', scripts: 'return "test"', }, }); const content = (result as ToolResult).content; expect(content).toHaveLength(1); expect(content[0].text).toContain('Error'); expect(content[0].text).toContain('Invalid JavaScript'); expect(content[0].text).toContain('HTML entities'); }, TEST_TIMEOUTS.short, ); it( 'should accept JavaScript with newlines in strings', async () => { const result = await client.callTool({ name: 'execute_js', 
arguments: { url: 'https://httpbin.org/html', scripts: 'const text = "line1\\nline2"; return text', }, }); const content = (result as ToolResult).content; expect(content).toHaveLength(1); expect(content[0].text).toContain('JavaScript executed on: https://httpbin.org/html'); expect(content[0].text).toContain('Returned: "line1\\nline2"'); }, TEST_TIMEOUTS.medium, // Increase from short to medium ); it( 'should handle JavaScript execution errors', async () => { const result = await client.callTool({ name: 'execute_js', arguments: { url: 'https://httpbin.org/html', scripts: [ 'return "This works"', 'throw new Error("This is a test error")', 'nonExistentVariable.someMethod()', ], }, }); const content = (result as ToolResult).content; expect(content).toHaveLength(1); expect(content[0].text).toContain('JavaScript executed on: https://httpbin.org/html'); // First script should succeed expect(content[0].text).toContain('Script: return "This works"'); expect(content[0].text).toContain('Returned: "This works"'); // Second script should show error expect(content[0].text).toContain('Script: throw new Error("This is a test error")'); expect(content[0].text).toContain('Returned: Error: Error: This is a test error'); // Third script should show reference error expect(content[0].text).toContain('Script: nonExistentVariable.someMethod()'); expect(content[0].text).toContain('Returned: Error: ReferenceError: nonExistentVariable is not defined'); }, TEST_TIMEOUTS.medium, ); it( 'should handle invalid URLs gracefully', async () => { const result = await client.callTool({ name: 'execute_js', arguments: { url: 'not-a-valid-url', scripts: 'return true', }, }); const content = (result as ToolResult).content; expect(content).toHaveLength(1); expect(content[0].text).toContain('Error'); expect(content[0].text?.toLowerCase()).toContain('invalid'); }, TEST_TIMEOUTS.short, ); }); }); ``` -------------------------------------------------------------------------------- /src/__tests__/integration/batch-crawl.integration.test.ts: -------------------------------------------------------------------------------- ```typescript /* eslint-env jest */ import { Client } from '@modelcontextprotocol/sdk/client/index.js'; import { createTestClient, cleanupTestClient, TEST_TIMEOUTS } from './test-utils.js'; interface ToolResult { content: Array<{ type: string; text?: string; }>; } describe('batch_crawl Integration Tests', () => { let client: Client; beforeAll(async () => { client = await createTestClient(); }, TEST_TIMEOUTS.medium); afterAll(async () => { if (client) { await cleanupTestClient(client); } }); describe('Batch crawling', () => { it( 'should crawl multiple URLs', async () => { const result = await client.callTool({ name: 'batch_crawl', arguments: { urls: ['https://httpbingo.org/html', 'https://httpbingo.org/json'], }, }); expect(result).toBeDefined(); const content = (result as ToolResult).content; expect(content).toHaveLength(1); expect(content[0].type).toBe('text'); const text = content[0].text || ''; expect(text).toContain('Batch crawl completed'); expect(text).toContain('Processed 2 URLs'); expect(text).toContain('https://httpbingo.org/html: Success'); expect(text).toContain('https://httpbingo.org/json: Success'); }, TEST_TIMEOUTS.medium, ); it( 'should handle max_concurrent parameter', async () => { const result = await client.callTool({ name: 'batch_crawl', arguments: { urls: ['https://httpbingo.org/html', 'https://httpbingo.org/xml', 'https://httpbingo.org/json'], max_concurrent: 1, }, }); const content = (result as 
ToolResult).content; expect(content).toHaveLength(1); expect(content[0].type).toBe('text'); const text = content[0].text || ''; expect(text).toContain('Processed 3 URLs'); expect(text).toContain(': Success'); }, TEST_TIMEOUTS.long, ); it( 'should remove images when requested', async () => { const result = await client.callTool({ name: 'batch_crawl', arguments: { urls: ['https://httpbingo.org/html'], remove_images: true, }, }); const content = (result as ToolResult).content; expect(content).toHaveLength(1); expect(content[0].type).toBe('text'); const text = content[0].text || ''; expect(text).toContain('Batch crawl completed'); expect(text).toContain('https://httpbingo.org/html: Success'); }, TEST_TIMEOUTS.medium, ); it( 'should bypass cache when requested', async () => { const result = await client.callTool({ name: 'batch_crawl', arguments: { urls: ['https://httpbingo.org/html'], bypass_cache: true, }, }); const content = (result as ToolResult).content; expect(content).toHaveLength(1); expect(content[0].type).toBe('text'); const text = content[0].text || ''; expect(text).toContain('Batch crawl completed'); expect(text).toContain('https://httpbingo.org/html: Success'); }, TEST_TIMEOUTS.medium, ); it( 'should handle mixed content types', async () => { const result = await client.callTool({ name: 'batch_crawl', arguments: { urls: ['https://httpbin.org/html', 'https://httpbin.org/json', 'https://httpbin.org/xml'], }, }); const content = (result as ToolResult).content; expect(content).toHaveLength(1); expect(content[0].type).toBe('text'); const text = content[0].text || ''; expect(text).toContain('Processed 3 URLs'); expect(text).toContain('https://httpbin.org/html: Success'); expect(text).toContain('https://httpbin.org/json: Success'); expect(text).toContain('https://httpbin.org/xml: Success'); }, TEST_TIMEOUTS.medium, ); it( 'should handle empty URL list', async () => { const result = await client.callTool({ name: 'batch_crawl', arguments: { urls: [], }, }); const content = (result as ToolResult).content; expect(content).toHaveLength(1); expect(content[0].text).toContain('Error'); // Just check that it's an error about invalid parameters expect(content[0].text?.toLowerCase()).toMatch(/error|invalid|failed/); }, TEST_TIMEOUTS.short, ); it( 'should reject session_id parameter', async () => { const result = await client.callTool({ name: 'batch_crawl', arguments: { urls: ['https://httpbingo.org/html'], session_id: 'test-session', }, }); const content = (result as ToolResult).content; expect(content).toHaveLength(1); expect(content[0].type).toBe('text'); expect(content[0].text).toContain('session_id'); expect(content[0].text).toContain('does not support'); expect(content[0].text).toContain('stateless'); }, TEST_TIMEOUTS.short, ); it( 'should handle per-URL configs array', async () => { const result = await client.callTool({ name: 'batch_crawl', arguments: { urls: ['https://httpbingo.org/html', 'https://httpbingo.org/json'], configs: [ { url: 'https://httpbingo.org/html', browser_config: { browser_type: 'chromium' }, crawler_config: { word_count_threshold: 10 }, }, { url: 'https://httpbingo.org/json', browser_config: { browser_type: 'firefox' }, crawler_config: { word_count_threshold: 20 }, }, ], max_concurrent: 2, }, }); const content = (result as ToolResult).content; expect(content).toHaveLength(1); expect(content[0].type).toBe('text'); const text = content[0].text || ''; expect(text).toContain('Batch crawl completed'); expect(text).toContain('Processed 2 URLs'); // Both should succeed regardless 
of different configs expect(text).toContain('https://httpbingo.org/html: Success'); expect(text).toContain('https://httpbingo.org/json: Success'); }, TEST_TIMEOUTS.medium, ); }); }); ``` -------------------------------------------------------------------------------- /src/__tests__/integration/parse-sitemap.integration.test.ts: -------------------------------------------------------------------------------- ```typescript /* eslint-env jest */ import { Client } from '@modelcontextprotocol/sdk/client/index.js'; import { createTestClient, cleanupTestClient, TEST_TIMEOUTS } from './test-utils.js'; interface ToolResult { content: Array<{ type: string; text?: string; }>; } describe('parse_sitemap Integration Tests', () => { let client: Client; beforeAll(async () => { client = await createTestClient(); }, TEST_TIMEOUTS.medium); afterAll(async () => { if (client) { await cleanupTestClient(client); } }); describe('Basic functionality', () => { it( 'should parse nodejs.org sitemap successfully', async () => { const result = await client.callTool({ name: 'parse_sitemap', arguments: { url: 'https://nodejs.org/sitemap.xml', }, }); expect(result).toBeDefined(); const content = (result as ToolResult).content; expect(content).toBeDefined(); expect(Array.isArray(content)).toBe(true); expect(content.length).toBeGreaterThan(0); const textContent = content.find((c) => c.type === 'text'); expect(textContent).toBeDefined(); expect(textContent?.text).toContain('Sitemap parsed successfully'); expect(textContent?.text).toContain('Total URLs found:'); expect(textContent?.text).toContain('https://nodejs.org'); // Should find many URLs in the nodejs sitemap expect(textContent?.text).toMatch(/Total URLs found: [1-9][0-9]+/); }, TEST_TIMEOUTS.medium, ); it( 'should filter URLs with regex pattern', async () => { const result = await client.callTool({ name: 'parse_sitemap', arguments: { url: 'https://nodejs.org/sitemap.xml', filter_pattern: '.*/learn/.*', // Only URLs containing /learn/ }, }); expect(result).toBeDefined(); const content = (result as ToolResult).content; const textContent = content.find((c) => c.type === 'text'); expect(textContent).toBeDefined(); // Check that filtering worked expect(textContent?.text).toContain('Filtered URLs:'); // All URLs in the result should contain /learn/ const urlsSection = textContent?.text?.split('URLs:\n')[1]; if (urlsSection) { const urls = urlsSection.split('\n').filter((url) => url.trim()); urls.forEach((url) => { if (url && !url.includes('... 
and')) { expect(url).toContain('/learn/'); } }); } }, TEST_TIMEOUTS.medium, ); it( 'should handle empty sitemaps', async () => { // Using a URL that returns valid XML but not a sitemap const result = await client.callTool({ name: 'parse_sitemap', arguments: { url: 'https://www.w3schools.com/xml/note.xml', }, }); expect(result).toBeDefined(); const content = (result as ToolResult).content; const textContent = content.find((c) => c.type === 'text'); expect(textContent).toBeDefined(); expect(textContent?.text).toContain('Total URLs found: 0'); }, TEST_TIMEOUTS.medium, ); it( 'should handle large sitemaps with truncation', async () => { const result = await client.callTool({ name: 'parse_sitemap', arguments: { url: 'https://nodejs.org/sitemap.xml', filter_pattern: '.*', // Match all to test truncation }, }); expect(result).toBeDefined(); const content = (result as ToolResult).content; const textContent = content.find((c) => c.type === 'text'); expect(textContent).toBeDefined(); // Should show max 100 URLs and indicate there are more if (textContent?.text && textContent.text.includes('... and')) { expect(textContent.text).toMatch(/\.\.\. and \d+ more/); } }, TEST_TIMEOUTS.medium, ); }); describe('Error handling', () => { it( 'should handle invalid URLs', async () => { const result = await client.callTool({ name: 'parse_sitemap', arguments: { url: 'not-a-url', }, }); expect(result).toBeDefined(); const content = (result as ToolResult).content; expect(content).toBeDefined(); const textContent = content.find((c) => c.type === 'text'); expect(textContent).toBeDefined(); expect(textContent?.text).toContain('Error'); expect(textContent?.text?.toLowerCase()).toContain('invalid'); }, TEST_TIMEOUTS.short, ); it( 'should handle non-existent URLs', async () => { const result = await client.callTool({ name: 'parse_sitemap', arguments: { url: 'https://this-domain-definitely-does-not-exist-12345.com/sitemap.xml', }, }); expect(result).toBeDefined(); const content = (result as ToolResult).content; const textContent = content.find((c) => c.type === 'text'); expect(textContent).toBeDefined(); expect(textContent?.text).toContain('Error'); }, TEST_TIMEOUTS.medium, ); it( 'should handle non-XML content', async () => { const result = await client.callTool({ name: 'parse_sitemap', arguments: { url: 'https://example.com', // HTML page, not XML }, }); expect(result).toBeDefined(); const content = (result as ToolResult).content; const textContent = content.find((c) => c.type === 'text'); expect(textContent).toBeDefined(); // Should still parse but likely find 0 URLs since it's not a sitemap expect(textContent?.text).toContain('Total URLs found:'); }, TEST_TIMEOUTS.medium, ); it( 'should handle invalid regex patterns', async () => { const result = await client.callTool({ name: 'parse_sitemap', arguments: { url: 'https://nodejs.org/sitemap.xml', filter_pattern: '[invalid(regex', // Invalid regex }, }); expect(result).toBeDefined(); const content = (result as ToolResult).content; const textContent = content.find((c) => c.type === 'text'); expect(textContent).toBeDefined(); expect(textContent?.text).toContain('Error'); expect(textContent?.text?.toLowerCase()).toMatch(/failed|error|invalid/); }, TEST_TIMEOUTS.medium, ); }); }); ``` -------------------------------------------------------------------------------- /src/__tests__/integration/crawl-handlers.integration.test.ts: -------------------------------------------------------------------------------- ```typescript /* eslint-env jest */ import { Client } from 
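/*
 * Editor's sketch (hypothetical, inferred from this file's imports): test-utils
 * is assumed to hand back a stdio-connected MCP client, so a suite's lifecycle
 * reduces to:
 *
 *   const client = await createTestClient();
 *   try {
 *     const res = await client.callTool({ name: 'get_html', arguments: { url: 'https://example.com' } });
 *   } finally {
 *     await cleanupTestClient(client);
 *   }
 */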
'@modelcontextprotocol/sdk/client/index.js'; import { createTestClient, cleanupTestClient, TEST_TIMEOUTS } from './test-utils.js'; interface ToolResult { content: Array<{ type: string; text?: string; }>; } describe('Crawl Handlers Integration Tests', () => { let client: Client; beforeAll(async () => { client = await createTestClient(); }, TEST_TIMEOUTS.medium); afterAll(async () => { if (client) { await cleanupTestClient(client); } }); describe('batch_crawl error handling', () => { it( 'should handle batch crawl with invalid URLs', async () => { const result = await client.callTool({ name: 'batch_crawl', arguments: { urls: ['not-a-valid-url', 'https://this-domain-does-not-exist-12345.com'], max_concurrent: 2, }, }); expect(result).toBeDefined(); const content = (result as ToolResult).content; expect(content[0].type).toBe('text'); // Zod validation will catch the invalid URL format expect(content[0].text).toContain('Invalid parameters'); }, TEST_TIMEOUTS.medium, ); }); describe('smart_crawl edge cases', () => { it( 'should detect XML content type for XML URLs', async () => { const result = await client.callTool({ name: 'smart_crawl', arguments: { url: 'https://httpbin.org/xml', bypass_cache: true, }, }); expect(result).toBeDefined(); const content = (result as ToolResult).content; expect(content[0].text).toContain('Smart crawl detected content type:'); // Should detect as XML based on content-type header expect(content[0].text?.toLowerCase()).toMatch(/xml|json/); // httpbin.org/xml actually returns JSON }, TEST_TIMEOUTS.medium, ); it( 'should handle follow_links with sitemap URLs', async () => { // Note: Most sites don't have accessible sitemaps, so this tests the logic const result = await client.callTool({ name: 'smart_crawl', arguments: { url: 'https://example.com/sitemap.xml', follow_links: true, max_depth: 2, bypass_cache: true, }, }); expect(result).toBeDefined(); const content = (result as ToolResult).content; expect(content[0].text).toContain('Smart crawl detected content type:'); }, TEST_TIMEOUTS.long, // Increase timeout for sitemap processing ); }); describe('crawl_recursive edge cases', () => { it( 'should respect max_depth limit of 0', async () => { const result = await client.callTool({ name: 'crawl_recursive', arguments: { url: 'https://httpbin.org/links/5/0', max_depth: 0, // Should only crawl the initial page }, }); expect(result).toBeDefined(); const content = (result as ToolResult).content; // The test might show 0 pages if the URL fails, or 1 page if it succeeds expect(content[0].text).toMatch(/Pages crawled: [01]/); // If pages were crawled, check for max depth message if (content[0].text?.includes('Pages crawled: 1')) { expect(content[0].text).toContain('Max depth reached: 0'); } }, TEST_TIMEOUTS.medium, ); it( 'should handle sites with no internal links', async () => { const result = await client.callTool({ name: 'crawl_recursive', arguments: { url: 'https://httpbin.org/json', // JSON endpoint has no links max_depth: 2, }, }); expect(result).toBeDefined(); const content = (result as ToolResult).content; expect(content[0].text).toContain('Pages crawled: 1'); expect(content[0].text).toContain('Internal links found: 0'); }, TEST_TIMEOUTS.medium, ); }); describe('parse_sitemap error handling', () => { it( 'should handle non-existent sitemap URLs', async () => { const result = await client.callTool({ name: 'parse_sitemap', arguments: { url: 'https://this-domain-does-not-exist-12345.com/sitemap.xml', }, }); expect(result).toBeDefined(); const content = (result as 
ToolResult).content; expect(content[0].text).toContain('Error'); expect(content[0].text?.toLowerCase()).toMatch(/failed|error|not found/); }, TEST_TIMEOUTS.medium, ); }); describe('crawl method edge cases', () => { it( 'should handle crawl with all image and filtering parameters', async () => { const result = await client.callTool({ name: 'crawl', arguments: { url: 'https://example.com', word_count_threshold: 50, image_description_min_word_threshold: 10, image_score_threshold: 0.5, exclude_social_media_links: true, cache_mode: 'BYPASS', }, }); expect(result).toBeDefined(); const content = (result as ToolResult).content; expect(content[0].type).toBe('text'); // Should successfully crawl with these parameters expect(content[0].text).not.toContain('Error'); }, TEST_TIMEOUTS.medium, ); it( 'should handle js_code as null with validation error', async () => { const result = await client.callTool({ name: 'crawl', arguments: { url: 'https://example.com', js_code: null as unknown as string, // Intentionally pass null }, }); expect(result).toBeDefined(); const content = (result as ToolResult).content; expect(content[0].text).toContain('Invalid parameters for crawl'); expect(content[0].text).toContain('js_code'); }, TEST_TIMEOUTS.short, ); it( 'should work with session_id parameter using manage_session', async () => { // First create a session using manage_session const sessionResult = await client.callTool({ name: 'manage_session', arguments: { action: 'create', session_id: 'test-crawl-session-new', }, }); expect(sessionResult).toBeDefined(); // Then use it for crawling const crawlResult = await client.callTool({ name: 'crawl', arguments: { url: 'https://example.com', session_id: 'test-crawl-session-new', }, }); expect(crawlResult).toBeDefined(); const content = (crawlResult as ToolResult).content; expect(content[0].type).toBe('text'); expect(content[0].text).not.toContain('Error'); // Clean up using manage_session await client.callTool({ name: 'manage_session', arguments: { action: 'clear', session_id: 'test-crawl-session-new', }, }); }, TEST_TIMEOUTS.medium, ); }); }); ``` -------------------------------------------------------------------------------- /src/__tests__/integration/crawl-advanced.integration.test.ts: -------------------------------------------------------------------------------- ```typescript /* eslint-env jest */ import { Client } from '@modelcontextprotocol/sdk/client/index.js'; import { createTestClient, cleanupTestClient, expectSuccessfulCrawl, TEST_TIMEOUTS } from './test-utils.js'; interface ToolResult { content: Array<{ type: string; text?: string; data?: string; mimeType?: string; }>; } describe('crawl Advanced Features Integration Tests', () => { let client: Client; beforeAll(async () => { client = await createTestClient(); }, TEST_TIMEOUTS.medium); afterAll(async () => { if (client) { await cleanupTestClient(client); } }); describe('Media and Content Extraction', () => { it( 'should extract images with scoring', async () => { const result = await client.callTool({ name: 'crawl', arguments: { url: 'https://example.com', image_score_threshold: 3, exclude_external_images: false, cache_mode: 'BYPASS', }, }); await expectSuccessfulCrawl(result); const textContent = (result as ToolResult).content.find((c) => c.type === 'text'); expect(textContent?.text).toBeTruthy(); // Should have extracted content expect(textContent?.text).toContain('Example Domain'); }, TEST_TIMEOUTS.medium, ); it( 'should capture MHTML', async () => { const result = await client.callTool({ name: 'crawl', 
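/*
 * Illustrative aside (not from the source): the crawl tool exposes
 * CrawlerConfig fields as flat arguments, so media capture combines with
 * cache control in a single call, e.g.:
 *
 *   await client.callTool({
 *     name: 'crawl',
 *     arguments: { url: 'https://example.com', screenshot: true, pdf: true, cache_mode: 'BYPASS' },
 *   });
 */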
arguments: { url: 'https://example.com', capture_mhtml: true, cache_mode: 'BYPASS', }, }); await expectSuccessfulCrawl(result); const textContent = (result as ToolResult).content.find((c) => c.type === 'text'); expect(textContent?.text).toBeTruthy(); // MHTML should be captured but not in the text output expect(textContent?.text).toContain('Example Domain'); }, TEST_TIMEOUTS.long, ); it( 'should extract tables from Wikipedia', async () => { const result = await client.callTool({ name: 'crawl', arguments: { url: 'https://en.wikipedia.org/wiki/List_of_countries_by_population_(United_Nations)', word_count_threshold: 10, cache_mode: 'BYPASS', }, }); await expectSuccessfulCrawl(result); const textContent = (result as ToolResult).content.find((c) => c.type === 'text'); expect(textContent?.text).toBeTruthy(); // Should contain country data expect(textContent?.text).toMatch(/China|India|United States/); }, TEST_TIMEOUTS.long, ); }); describe('Link and Content Filtering', () => { it( 'should exclude social media links', async () => { const result = await client.callTool({ name: 'crawl', arguments: { url: 'https://www.bbc.com/news', exclude_social_media_links: true, exclude_domains: ['twitter.com', 'facebook.com', 'instagram.com'], cache_mode: 'BYPASS', word_count_threshold: 50, }, }); await expectSuccessfulCrawl(result); const textContent = (result as ToolResult).content.find((c) => c.type === 'text'); expect(textContent?.text).toBeTruthy(); // Should have news content but no social media references in extracted links expect(textContent?.text).toContain('BBC'); }, TEST_TIMEOUTS.long, ); it( 'should remove excluded selectors', async () => { const result = await client.callTool({ name: 'crawl', arguments: { url: 'https://httpbin.org/html', excluded_selector: 'div:first-child', cache_mode: 'BYPASS', }, }); await expectSuccessfulCrawl(result); }, TEST_TIMEOUTS.medium, ); }); describe('Page Navigation Options', () => { it( 'should wait for images to load', async () => { const result = await client.callTool({ name: 'crawl', arguments: { url: 'https://httpbin.org/image/png', wait_for_images: true, wait_until: 'load', page_timeout: 30000, cache_mode: 'BYPASS', }, }); await expectSuccessfulCrawl(result); }, TEST_TIMEOUTS.medium, ); it( 'should scan full page', async () => { const result = await client.callTool({ name: 'crawl', arguments: { url: 'https://httpbin.org/html', scan_full_page: true, delay_before_scroll: 0.5, scroll_delay: 0.2, cache_mode: 'BYPASS', }, }); await expectSuccessfulCrawl(result); }, TEST_TIMEOUTS.medium, ); }); describe('Stealth and Bot Detection', () => { it( 'should use magic mode', async () => { const result = await client.callTool({ name: 'crawl', arguments: { url: 'https://httpbin.org/headers', magic: true, simulate_user: true, override_navigator: true, cache_mode: 'BYPASS', }, }); await expectSuccessfulCrawl(result); }, TEST_TIMEOUTS.long, ); }); describe('Extraction Strategies (0.7.3/0.7.4)', () => { it( 'should accept extraction_strategy parameter', async () => { const result = await client.callTool({ name: 'crawl', arguments: { url: 'https://httpbin.org/html', extraction_strategy: { type: 'custom', provider: 'openai', api_key: 'test-key', model: 'gpt-4', }, cache_mode: 'BYPASS', }, }); // The parameter should be accepted even if not fully processed await expectSuccessfulCrawl(result); }, TEST_TIMEOUTS.short, ); it( 'should accept table_extraction_strategy parameter', async () => { const result = await client.callTool({ name: 'crawl', arguments: { url: 
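/*
 * Editor's note: extraction_strategy and table_extraction_strategy are
 * passthrough objects (see src/types.ts), so arbitrary keys are forwarded to
 * the Crawl4AI server untouched. A hypothetical strategy object (field names
 * invented for illustration only):
 *
 *   const strategy: ExtractionStrategy = { type: 'llm', provider: 'openai/gpt-4o-mini' };
 */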
'https://httpbin.org/html', table_extraction_strategy: { enable_chunking: true, thresholds: { min_rows: 5, max_columns: 20, }, }, cache_mode: 'BYPASS', }, }); await expectSuccessfulCrawl(result); }, TEST_TIMEOUTS.short, ); it( 'should accept markdown_generator_options parameter', async () => { const result = await client.callTool({ name: 'crawl', arguments: { url: 'https://httpbin.org/html', markdown_generator_options: { include_links: true, preserve_formatting: true, }, cache_mode: 'BYPASS', }, }); await expectSuccessfulCrawl(result); }, TEST_TIMEOUTS.short, ); }); describe('Virtual Scroll', () => { it( 'should handle virtual scroll configuration', async () => { const result = await client.callTool({ name: 'crawl', arguments: { url: 'https://httpbin.org/html', virtual_scroll_config: { container_selector: 'body', scroll_count: 3, scroll_by: 'container_height', wait_after_scroll: 0.5, }, cache_mode: 'BYPASS', }, }); await expectSuccessfulCrawl(result); }, TEST_TIMEOUTS.medium, ); }); }); ``` -------------------------------------------------------------------------------- /src/__tests__/index.npx.test.ts: -------------------------------------------------------------------------------- ```typescript import { spawn } from 'child_process'; import * as path from 'path'; import * as url from 'url'; import * as fs from 'fs/promises'; const __dirname = url.fileURLToPath(new URL('.', import.meta.url)); describe('NPX Execution Tests', () => { // These tests ensure the package works when installed and run via npx // This prevents issues like the one in v2.6.11 where the server wouldn't start describe('Simulated NPX execution', () => { it('should start server when run from dist/index.js directly', async () => { // This simulates how npx runs the built package const distIndexPath = path.join(__dirname, '..', '..', 'dist', 'index.js'); // Check if dist/index.js exists (it should after build) try { await fs.access(distIndexPath); } catch { console.warn('Skipping test - dist/index.js not found. 
Run "npm run build" first.'); return; } const child = spawn('node', [distIndexPath], { env: { ...process.env, CRAWL4AI_BASE_URL: 'http://localhost:11235', CRAWL4AI_API_KEY: 'test-key', // Don't load .env file to simulate production NODE_ENV: 'production', }, stdio: 'pipe', }); let stderr = ''; child.stderr.on('data', (data) => { stderr += data.toString(); }); // Wait for server to start await new Promise<void>((resolve) => { const timeout = setTimeout(() => { child.kill(); resolve(); }, 2000); child.stderr.on('data', (data) => { const output = data.toString(); if (output.includes('started')) { clearTimeout(timeout); child.kill(); resolve(); } }); }); // Server should have started successfully expect(stderr).toContain('crawl4ai-mcp'); expect(stderr).toContain('started'); }); it('should start server without dotenv when env vars are provided', async () => { // This tests that we don't require dotenv in production const distIndexPath = path.join(__dirname, '..', '..', 'dist', 'index.js'); try { await fs.access(distIndexPath); } catch { console.warn('Skipping test - dist/index.js not found.'); return; } // Temporarily rename node_modules/dotenv to simulate it not being available const dotenvPath = path.join(__dirname, '..', '..', 'node_modules', 'dotenv'); const dotenvBackupPath = path.join(__dirname, '..', '..', 'node_modules', 'dotenv.backup'); let dotenvRenamed = false; try { // Only rename if dotenv exists try { await fs.access(dotenvPath); await fs.rename(dotenvPath, dotenvBackupPath); dotenvRenamed = true; } catch { // dotenv doesn't exist, which is fine for this test } const child = spawn('node', [distIndexPath], { env: { CRAWL4AI_BASE_URL: 'http://localhost:11235', CRAWL4AI_API_KEY: 'test-key', PATH: process.env.PATH, }, stdio: 'pipe', }); let stderr = ''; child.stderr.on('data', (data) => { stderr += data.toString(); }); // Wait for server to start await new Promise<void>((resolve) => { setTimeout(() => { child.kill(); resolve(); }, 2000); }); // Server should still start even without dotenv expect(stderr).toContain('crawl4ai-mcp'); expect(stderr).toContain('started'); } finally { // Restore dotenv if we renamed it if (dotenvRenamed) { await fs.rename(dotenvBackupPath, dotenvPath); } } }); it('should handle MCP protocol initialization', async () => { // This simulates the full MCP handshake that Claude Desktop does const distIndexPath = path.join(__dirname, '..', '..', 'dist', 'index.js'); try { await fs.access(distIndexPath); } catch { console.warn('Skipping test - dist/index.js not found.'); return; } const child = spawn('node', [distIndexPath], { env: { ...process.env, CRAWL4AI_BASE_URL: 'http://localhost:11235', CRAWL4AI_API_KEY: 'test-key', }, stdio: 'pipe', }); let stdout = ''; let stderr = ''; child.stdout.on('data', (data) => { stdout += data.toString(); }); child.stderr.on('data', (data) => { stderr += data.toString(); }); // Wait for server to start await new Promise((resolve) => setTimeout(resolve, 500)); // Send MCP initialization request (like Claude Desktop does) const initRequest = JSON.stringify({ jsonrpc: '2.0', method: 'initialize', params: { protocolVersion: '2025-06-18', capabilities: {}, clientInfo: { name: 'test-client', version: '1.0.0', }, }, id: 1, }) + '\n'; child.stdin.write(initRequest); // Wait for response await new Promise((resolve) => setTimeout(resolve, 1000)); // Parse the response const response = stdout.trim().split('\n').pop(); let parsed; try { parsed = JSON.parse(response || '{}'); } catch { // Response might not be valid JSON yet parsed = {}; } 
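/*
 * Illustrative note (editor's addition): a successful MCP initialize reply is
 * a JSON-RPC 2.0 response echoing the request id; roughly (exact payload
 * hedged, field names per the MCP spec):
 *
 *   { "jsonrpc": "2.0", "id": 1,
 *     "result": { "protocolVersion": "2025-06-18", "capabilities": {}, "serverInfo": { "name": "crawl4ai-mcp", "version": "..." } } }
 */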
child.kill(); // Should have received an initialization response expect(stderr).toContain('started'); expect(parsed.id).toBe(1); expect(parsed.result).toBeDefined(); }); it('should fail gracefully when CRAWL4AI_BASE_URL is missing', async () => { const distIndexPath = path.join(__dirname, '..', '..', 'dist', 'index.js'); try { await fs.access(distIndexPath); } catch { console.warn('Skipping test - dist/index.js not found.'); return; } const child = spawn('node', [distIndexPath], { env: { // Explicitly set to empty string to prevent dotenv from loading CRAWL4AI_BASE_URL: '', PATH: process.env.PATH, }, stdio: 'pipe', }); let stderr = ''; child.stderr.on('data', (data) => { stderr += data.toString(); }); const exitCode = await new Promise<number | null>((resolve, reject) => { // Add timeout to prevent hanging const timeout = setTimeout(() => { child.kill('SIGTERM'); reject(new Error('Process timeout')); }, 10000); // 10 second timeout child.on('exit', (code) => { clearTimeout(timeout); resolve(code); }); child.on('error', (err) => { clearTimeout(timeout); reject(err); }); }); // Should exit with error code expect(exitCode).toBe(1); expect(stderr).toContain('CRAWL4AI_BASE_URL environment variable is required'); // Ensure cleanup child.kill(); }, 15000); // 15 second test timeout }); describe('NPX-specific edge cases', () => { it('should work with different Node.js execution paths', async () => { // NPX might use different paths for node execution const distIndexPath = path.join(__dirname, '..', '..', 'dist', 'index.js'); try { await fs.access(distIndexPath); } catch { console.warn('Skipping test - dist/index.js not found.'); return; } // Test with different argv[1] values that npx might use const testPaths = [ distIndexPath, '/tmp/npx-12345/node_modules/.bin/mcp-crawl4ai-ts', path.join(process.env.HOME || '', '.npm/_npx/12345/node_modules/mcp-crawl4ai-ts/dist/index.js'), ]; for (const testPath of testPaths) { const child = spawn('node', [distIndexPath], { env: { ...process.env, CRAWL4AI_BASE_URL: 'http://localhost:11235', // Simulate different execution contexts npm_execpath: testPath, }, stdio: 'pipe', }); let started = false; child.stderr.on('data', (data) => { if (data.toString().includes('started')) { started = true; } }); // Give it time to start await new Promise((resolve) => setTimeout(resolve, 500)); child.kill(); expect(started).toBe(true); } }); }); }); ``` -------------------------------------------------------------------------------- /src/types.ts: -------------------------------------------------------------------------------- ```typescript export interface CrawlOptions { remove_images?: boolean; bypass_cache?: boolean; filter_mode?: 'blacklist' | 'whitelist'; filter_list?: string[]; screenshot?: boolean; wait_for?: string; timeout?: number; } export interface JSExecuteOptions { js_code: string | string[]; // Only url and js_code (scripts) are supported by /execute_js endpoint } export interface JSExecuteEndpointOptions { url: string; scripts: string | string[]; // Only url and scripts are supported by /execute_js endpoint } export interface JSExecuteEndpointResponse { success: boolean; js_execution_result: { success: boolean; results: unknown[]; }; markdown?: string | CrawlMarkdownResult; } export interface ScreenshotEndpointOptions { url: string; screenshot_wait_for?: number; save_to_directory?: string; // output_path is omitted to get base64 response } export interface ScreenshotEndpointResponse { success: boolean; screenshot: string; // base64 encoded image } export interface 
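/*
 * Editor's aside: each REST endpoint below gets an Options/Response pair. A
 * hedged usage sketch with the PDF pair, assuming a Crawl4AIService instance
 * `service` and node:fs/promises imported as `fs`:
 *
 *   const res: PDFEndpointResponse = await service.generatePDF({ url: 'https://example.com' });
 *   await fs.writeFile('page.pdf', Buffer.from(res.pdf, 'base64'));
 */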
PDFEndpointOptions { url: string; // Only url is supported by /pdf endpoint } export interface PDFEndpointResponse { success: boolean; pdf: string; // base64 encoded PDF } export interface HTMLEndpointOptions { url: string; // Only url is supported by /html endpoint } export interface HTMLEndpointResponse { html: string; url: string; success: boolean; } export type FilterType = 'raw' | 'fit' | 'bm25' | 'llm'; export interface MarkdownEndpointOptions { url: string; f?: FilterType; // Filter type: raw, fit (default), bm25, llm q?: string; // Query string for bm25/llm filters c?: string; // Cache-bust parameter } export interface MarkdownEndpointResponse { url: string; filter: string; query: string | null; cache: string; markdown: string; success: boolean; } export interface LLMEndpointOptions { url: string; query: string; } export interface LLMEndpointResponse { answer: string; } export interface BatchCrawlOptions extends CrawlOptions { urls: string[]; max_concurrent?: number; // New: Support per-URL configs array (0.7.3/0.7.4) configs?: Array<{ url: string; browser_config?: BrowserConfig; crawler_config?: CrawlerConfig; extraction_strategy?: ExtractionStrategy; table_extraction_strategy?: TableExtractionStrategy; markdown_generator_options?: MarkdownGeneratorOptions; matcher?: string | ((url: string) => boolean); }>; } // Browser configuration options export interface BrowserConfig { browser_type?: 'chromium' | 'firefox' | 'webkit' | 'undetected'; headless?: boolean; viewport_width?: number; viewport_height?: number; user_agent?: string; // Unified proxy config - accepts string or object format (new in 0.7.3/0.7.4) proxy?: | string | { server: string; username?: string; password?: string; }; // Legacy field kept for backward compatibility proxy_config?: { server: string; username?: string; password?: string; }; cookies?: Array<{ name: string; value: string; domain: string; path?: string; }>; headers?: Record<string, string>; extra_args?: string[]; } // Virtual scroll configuration for sites like Twitter/Instagram export interface VirtualScrollConfig { container_selector: string; scroll_count?: number; scroll_by?: string | number; wait_after_scroll?: number; } // Crawler configuration options export interface CrawlerConfig { // Content filtering word_count_threshold?: number; excluded_tags?: string[]; excluded_selector?: string; remove_overlay_elements?: boolean; only_text?: boolean; remove_forms?: boolean; keep_data_attributes?: boolean; // JavaScript execution js_code?: string | string[]; js_only?: boolean; wait_for?: string; wait_for_timeout?: number; // Page navigation & timing wait_until?: 'domcontentloaded' | 'networkidle' | 'load'; page_timeout?: number; wait_for_images?: boolean; ignore_body_visibility?: boolean; // Dynamic content handling delay_before_scroll?: number; scroll_delay?: number; scan_full_page?: boolean; virtual_scroll_config?: VirtualScrollConfig; // Content processing process_iframes?: boolean; exclude_external_links?: boolean; // Media handling screenshot?: boolean; screenshot_wait_for?: number; pdf?: boolean; capture_mhtml?: boolean; image_description_min_word_threshold?: number; image_score_threshold?: number; exclude_external_images?: boolean; // Link filtering exclude_social_media_links?: boolean; exclude_domains?: string[]; // Page interaction simulate_user?: boolean; override_navigator?: boolean; magic?: boolean; // Session management session_id?: string; // Cache control cache_mode?: 'ENABLED' | 'BYPASS' | 'DISABLED'; // Performance options timeout?: number; 
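/*
 * Illustrative example (editor's addition): a typical CrawlerConfig for a
 * scroll-heavy page, using only fields declared in this interface:
 *
 *   const config: CrawlerConfig = {
 *     wait_until: 'networkidle',
 *     scan_full_page: true,
 *     virtual_scroll_config: { container_selector: '#feed', scroll_count: 5 },
 *     cache_mode: 'BYPASS',
 *   };
 */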
verbose?: boolean; // Debug log_console?: boolean; // New parameters from 0.7.3/0.7.4 delay_before_return_html?: number; // Delay in ms before capturing final HTML css_selector?: string; // CSS selector to extract specific elements include_links?: boolean; // Whether to include links in the response resolve_absolute_urls?: boolean; // Convert relative URLs to absolute ones } // Extraction strategy passthrough objects (new in 0.7.3/0.7.4) export interface ExtractionStrategy { [key: string]: unknown; } export interface TableExtractionStrategy { enable_chunking?: boolean; thresholds?: Record<string, unknown>; [key: string]: unknown; } export interface MarkdownGeneratorOptions { include_links?: boolean; [key: string]: unknown; } // Advanced crawl configuration combining browser and crawler configs export interface AdvancedCrawlConfig { url?: string; urls?: string[]; browser_config?: BrowserConfig; crawler_config?: CrawlerConfig; priority?: number; extraction_strategy?: ExtractionStrategy; table_extraction_strategy?: TableExtractionStrategy; markdown_generator_options?: MarkdownGeneratorOptions; } // Session management types (used internally by MCP server) export interface SessionInfo { id: string; created_at: Date; last_used: Date; initial_url?: string; metadata?: Record<string, unknown>; } // Crawl endpoint types export interface CrawlEndpointOptions { urls: string[]; browser_config?: BrowserConfig; crawler_config?: CrawlerConfig; } export interface CrawlMarkdownResult { raw_markdown: string; markdown_with_citations: string; references_markdown: string; fit_markdown: string; fit_html: string; } export interface CrawlMediaResult { images: Array<{ src?: string | null; data?: string; alt?: string | null; desc?: string; score?: number; type?: string; group_id?: number; format?: string | null; width?: number | null; }>; videos: Array<{ src?: string | null; data?: string; alt?: string | null; desc?: string; score?: number; type?: string; group_id?: number; format?: string | null; width?: number | null; }>; audios: Array<{ src?: string | null; data?: string; alt?: string | null; desc?: string; score?: number; type?: string; group_id?: number; format?: string | null; width?: number | null; }>; } interface LinkItem { href: string; text: string; title: string; base_domain?: string | null; head_data?: Record<string, unknown> | null; head_extraction_status?: string | null; head_extraction_error?: string | null; intrinsic_score?: number; contextual_score?: number | null; total_score?: number | null; } export interface CrawlLinksResult { internal: LinkItem[]; external: LinkItem[]; } export interface CrawlResultItem { url: string; html: string; cleaned_html: string; fit_html: string; success: boolean; error_message?: string; status_code: number; response_headers: Record<string, unknown>; redirected_url?: string; session_id: string | null; metadata: Record<string, unknown>; links: CrawlLinksResult; media: CrawlMediaResult; markdown: CrawlMarkdownResult; tables: unknown[]; extracted_content: unknown | null; screenshot: string | null; // base64 PNG when screenshot: true pdf: string | null; // base64 PDF when pdf: true mhtml: string | null; js_execution_result: { success: boolean; results: unknown[]; } | null; downloaded_files: unknown | null; network_requests: unknown | null; console_messages: unknown | null; ssl_certificate: unknown | null; dispatch_result: unknown | null; } export interface CrawlEndpointResponse { success: boolean; results: CrawlResultItem[]; server_processing_time_s: number; 
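/*
 * Editor's sketch: consuming a /crawl response shaped like the
 * CrawlEndpointResponse declared here, assuming `resp` holds one:
 *
 *   for (const item of resp.results) {
 *     if (item.success) console.log(item.url, item.markdown.raw_markdown.length);
 *     else console.warn(item.url, item.error_message);
 *   }
 */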
  server_memory_delta_mb: number;
  server_peak_memory_mb: number;
}
```

--------------------------------------------------------------------------------
/src/schemas/validation-schemas.ts:
--------------------------------------------------------------------------------

```typescript
import { z } from 'zod';
import { validateJavaScriptCode, createStatelessSchema } from './helpers.js';

export const JsCodeSchema = z
  .union([
    z.string().refine(validateJavaScriptCode, {
      message:
        'Invalid JavaScript: Contains HTML entities (&quot;), literal \\n outside strings, or HTML tags. Use proper JS syntax with real quotes and newlines.',
    }),
    z.array(
      z.string().refine(validateJavaScriptCode, {
        message:
          'Invalid JavaScript: Contains HTML entities (&quot;), literal \\n outside strings, or HTML tags. Use proper JS syntax with real quotes and newlines.',
      }),
    ),
  ])
  .describe('JavaScript code as string or array of strings');

export const VirtualScrollConfigSchema = z.object({
  container_selector: z.string(),
  scroll_count: z.number().optional(),
  scroll_by: z.union([z.string(), z.number()]).optional(),
  wait_after_scroll: z.number().optional(),
});

const GetMarkdownBaseSchema = z.object({
  url: z.string().url(),
  filter: z.enum(['raw', 'fit', 'bm25', 'llm']).optional().default('fit'),
  query: z.string().optional(),
  cache: z.string().optional().default('0'),
});

export const GetMarkdownSchema = createStatelessSchema(GetMarkdownBaseSchema, 'get_markdown').refine(
  (data) => {
    // If filter is bm25 or llm, query is required
    if ((data.filter === 'bm25' || data.filter === 'llm') && !data.query) {
      return false;
    }
    return true;
  },
  {
    message: 'Query parameter is required when using bm25 or llm filter',
    path: ['query'],
  },
);

export const ExecuteJsSchema = createStatelessSchema(
  z.object({
    url: z.string().url(),
    scripts: JsCodeSchema,
  }),
  'execute_js',
);

export const GetHtmlSchema = createStatelessSchema(
  z.object({
    url: z.string().url(),
  }),
  'get_html',
);

export const CaptureScreenshotSchema = createStatelessSchema(
  z.object({
    url: z.string().url(),
    screenshot_wait_for: z.number().optional(),
    save_to_directory: z.string().optional().describe('Local directory to save screenshot file'),
    // output_path not exposed as MCP needs base64 data
  }),
  'capture_screenshot',
);

export const GeneratePdfSchema = createStatelessSchema(
  z.object({
    url: z.string().url(),
    // Only url is supported - output_path not exposed as MCP needs base64 data
  }),
  'generate_pdf',
);

export const ExtractWithLlmSchema = createStatelessSchema(
  z.object({
    url: z.string().url(),
    query: z.string(),
  }),
  'extract_with_llm',
);

export const BatchCrawlSchema = createStatelessSchema(
  z.object({
    urls: z.array(z.string().url()),
    max_concurrent: z.number().optional(),
    remove_images: z.boolean().optional(),
    bypass_cache: z.boolean().optional(),
    // New: Support per-URL configs array (0.7.3/0.7.4)
    configs: z
      .array(
        z.object({
          url: z.string().url(),
          browser_config: z.record(z.unknown()).optional(),
          crawler_config: z.record(z.unknown()).optional(),
          extraction_strategy: z.record(z.unknown()).optional(),
          table_extraction_strategy: z.record(z.unknown()).optional(),
          markdown_generator_options: z.record(z.unknown()).optional(),
          matcher: z.union([z.string(), z.function()]).optional(),
        }),
      )
      .optional(),
  }),
  'batch_crawl',
);

export const SmartCrawlSchema = createStatelessSchema(
  z.object({
    url: z.string().url(),
    max_depth: z.number().optional(),
    follow_links: z.boolean().optional(),
    bypass_cache: z.boolean().optional(),
  }),
  'smart_crawl',
);

export const ExtractLinksSchema =
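/*
 * Illustrative usage (not part of the original file): handlers are assumed to
 * validate raw tool arguments with these schemas before dispatch, e.g.:
 *
 *   const parsed = GetMarkdownSchema.safeParse(args);
 *   if (!parsed.success) throw new Error(parsed.error.issues[0].message);
 */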
createStatelessSchema( z.object({ url: z.string().url(), categorize: z.boolean().optional().default(true), }), 'extract_links', ); export const CrawlRecursiveSchema = createStatelessSchema( z.object({ url: z.string().url(), max_depth: z.number().optional(), max_pages: z.number().optional(), include_pattern: z.string().optional(), exclude_pattern: z.string().optional(), }), 'crawl_recursive', ); export const ParseSitemapSchema = createStatelessSchema( z.object({ url: z.string().url(), filter_pattern: z.string().optional(), }), 'parse_sitemap', ); // Unified session management schema export const ManageSessionSchema = z.discriminatedUnion('action', [ z.object({ action: z.literal('create'), session_id: z.string().optional(), initial_url: z.string().url().optional(), browser_type: z.enum(['chromium', 'firefox', 'webkit']).optional(), }), z.object({ action: z.literal('clear'), session_id: z.string(), }), z.object({ action: z.literal('list'), }), ]); export const CrawlSchema = z .object({ url: z.string().url(), // Browser configuration browser_type: z.enum(['chromium', 'firefox', 'webkit']).optional(), viewport_width: z.number().optional(), viewport_height: z.number().optional(), user_agent: z.string().optional(), proxy_server: z.string().optional(), proxy_username: z.string().optional(), proxy_password: z.string().optional(), cookies: z .array( z.object({ name: z.string(), value: z.string(), domain: z.string(), path: z.string().optional(), }), ) .optional(), headers: z.record(z.string()).optional(), extra_args: z.array(z.string()).optional(), // Content filtering word_count_threshold: z.number().optional(), excluded_tags: z.array(z.string()).optional(), excluded_selector: z.string().optional(), remove_overlay_elements: z.boolean().optional(), only_text: z.boolean().optional(), remove_forms: z.boolean().optional(), keep_data_attributes: z.boolean().optional(), // JavaScript execution js_code: JsCodeSchema.optional(), js_only: z.boolean().optional(), wait_for: z.string().optional(), wait_for_timeout: z.number().optional(), // Page navigation & timing wait_until: z.enum(['domcontentloaded', 'networkidle', 'load']).optional(), page_timeout: z.number().optional(), wait_for_images: z.boolean().optional(), ignore_body_visibility: z.boolean().optional(), // Dynamic content delay_before_scroll: z.number().optional(), scroll_delay: z.number().optional(), scan_full_page: z.boolean().optional(), virtual_scroll_config: VirtualScrollConfigSchema.optional(), // Content processing process_iframes: z.boolean().optional(), exclude_external_links: z.boolean().optional(), // Media handling screenshot: z.boolean().optional(), screenshot_wait_for: z.number().optional(), screenshot_directory: z .string() .optional() .describe('Local directory to save screenshot file when screenshot=true'), pdf: z.boolean().optional(), capture_mhtml: z.boolean().optional(), image_description_min_word_threshold: z.number().optional(), image_score_threshold: z.number().optional(), exclude_external_images: z.boolean().optional(), // Link filtering exclude_social_media_links: z.boolean().optional(), exclude_domains: z.array(z.string()).optional(), // Page interaction simulate_user: z.boolean().optional(), override_navigator: z.boolean().optional(), magic: z.boolean().optional(), // Session and cache session_id: z.string().optional(), cache_mode: z.enum(['ENABLED', 'BYPASS', 'DISABLED']).optional(), // Performance options timeout: z.number().optional(), verbose: z.boolean().optional(), // Debug log_console: z.boolean().optional(), // New 
parameters from 0.7.3/0.7.4 delay_before_return_html: z.number().optional(), css_selector: z.string().optional(), include_links: z.boolean().optional(), resolve_absolute_urls: z.boolean().optional(), }) .refine( (data) => { // js_only is for subsequent calls in same session, not first call // Using it incorrectly causes server errors if (data.js_only && !data.session_id) { return false; } return true; }, { message: "Error: js_only requires session_id (it's for continuing existing sessions).\n" + 'For first call with js_code, use: {js_code: [...], screenshot: true}\n' + 'For multi-step: First {js_code: [...], session_id: "x"}, then {js_only: true, session_id: "x"}', }, ) .refine( (data) => { // Empty js_code array is not allowed if (Array.isArray(data.js_code) && data.js_code.length === 0) { return false; } return true; }, { message: 'Error: js_code array cannot be empty. Either provide JavaScript code to execute or remove the js_code parameter entirely.', }, ); // Re-export types we need export type { z }; ``` -------------------------------------------------------------------------------- /src/__tests__/schemas/validation-edge-cases.test.ts: -------------------------------------------------------------------------------- ```typescript // import { jest } from '@jest/globals'; import { validateJavaScriptCode } from '../../schemas/helpers.js'; import { JsCodeSchema, CrawlSchema } from '../../schemas/validation-schemas.js'; describe('JavaScript Validation Edge Cases', () => { describe('validateJavaScriptCode', () => { describe('Valid JavaScript that might look suspicious', () => { it('should accept strings containing HTML-like syntax in string literals', () => { const validCases = [ `const html = '<div class="test">Hello</div>';`, `const template = \`<button onclick="alert('test')">Click</button>\`;`, `const regex = /<div[^>]*>/g;`, `const arrow = () => { return '<span>Arrow</span>'; }`, `const className = 'container';`, ]; validCases.forEach((code) => { expect(validateJavaScriptCode(code)).toBe(true); }); }); it('should accept legitimate escape sequences', () => { const validCases = [ `const str = "Line 1\\nLine 2";`, // Real newline escape `const tab = "Col1\\tCol2";`, `const quote = "He said \\"Hello\\"";`, `const unicode = "\\u0048\\u0065\\u006C\\u006C\\u006F";`, `const template = \`Multi line string\`;`, // Real newlines in template literals ]; validCases.forEach((code) => { expect(validateJavaScriptCode(code)).toBe(true); }); }); it('should accept complex but valid JavaScript patterns', () => { const validCases = [ // Nested template literals `const nested = \`Outer \${inner ? 
\`Inner: \${value}\` : 'None'}\`;`,
          // Regular expressions that might look like HTML
          `const htmlTag = /<([a-z]+)([^>]*)>/gi;`,
          // JSON strings without HTML entities
          `const json = '{"name": "Test", "value": "Some data"}';`,
          // Function with HTML in comments
          `function render() {
            // This creates div content
            return document.createElement('div');
          }`,
          // Complex string concatenation
          `const result = '<div' + ' class="' + className + '">' + content + '</div>';`,
        ];

        validCases.forEach((code) => {
          expect(validateJavaScriptCode(code)).toBe(true);
        });
      });

      it('should accept Unicode and special characters', () => {
        const validCases = [
          `const emoji = "Hello 👋 World 🌍";`,
          `const chinese = "你好世界";`,
          `const arabic = "مرحبا بالعالم";`,
          `const special = "©2024 Company™";`,
          `const math = "∑(n=1 to ∞) = π²/6";`,
        ];

        validCases.forEach((code) => {
          expect(validateJavaScriptCode(code)).toBe(true);
        });
      });
    });

    describe('Invalid JavaScript that should be rejected', () => {
      it('should reject HTML entities outside string literals', () => {
        const invalidCases = [
          `const value = &quot;test&quot;;`, // HTML entities as code
          `const text = &amp;&amp; true;`,
          `if (a &lt; b) { }`,
          `const escaped = &nbsp;`,
          `return &#39;hello&#39;;`,
        ];

        invalidCases.forEach((code) => {
          expect(validateJavaScriptCode(code)).toBe(false);
        });
      });

      it('should reject literal backslash-n outside strings', () => {
        const invalidCases = [
          `const text = "Hello";\\nconst world = "World";`, // Literal \n between statements
          `console.log("test");\\nconsole.log("more");`,
          `return value;\\nreturn other;`,
        ];

        invalidCases.forEach((code) => {
          expect(validateJavaScriptCode(code)).toBe(false);
        });
      });

      it('should reject HTML tags outside string literals', () => {
        const invalidCases = [
          `<script>alert('test')</script>`,
          `<!DOCTYPE html>`,
          `<html><body>test</body></html>`,
          `<style>body { color: red; }</style>`,
        ];

        invalidCases.forEach((code) => {
          expect(validateJavaScriptCode(code)).toBe(false);
        });
      });
    });

    describe('Edge cases and boundaries', () => {
      it('should handle empty and whitespace-only input', () => {
        expect(validateJavaScriptCode('')).toBe(true);
        expect(validateJavaScriptCode(' ')).toBe(true);
        expect(validateJavaScriptCode('\n\n\n')).toBe(true);
        expect(validateJavaScriptCode('\t')).toBe(true);
      });

      it('should handle very long valid strings', () => {
        const longString = 'const x = "' + 'a'.repeat(10000) + '";';
        expect(validateJavaScriptCode(longString)).toBe(true);
      });

      it('should handle nested quotes correctly', () => {
        const validCases = [
          `const x = "She said \\"Hello\\" to me";`,
          `const y = 'It\\'s a nice day';`,
          `const z = \`Template with "quotes" and 'apostrophes'\`;`,
        ];

        validCases.forEach((code) => {
          expect(validateJavaScriptCode(code)).toBe(true);
        });
      });

      it('should handle multiline strings correctly', () => {
        const multiline = `
          const longText = \`
            This is a multiline
            template literal
            with multiple lines
          \`;`;
        expect(validateJavaScriptCode(multiline)).toBe(true);
      });
    });
  });

  describe('Schema Validation Edge Cases', () => {
    describe('JsCodeSchema', () => {
      it('should accept both string and array of strings', () => {
        expect(() => JsCodeSchema.parse('return 1;')).not.toThrow();
        expect(() => JsCodeSchema.parse(['return 1;', 'return 2;'])).not.toThrow();
      });

      it('should reject invalid JavaScript in arrays', () => {
        expect(() => JsCodeSchema.parse(['valid();', '&quot;invalid&quot;'])).toThrow();
      });

      it('should handle empty arrays', () => {
        expect(() => JsCodeSchema.parse([])).not.toThrow();
      });
    });

    describe('CrawlSchema edge cases', () => {
      it('should handle all optional parameters', () => {
        const
minimal = { url: 'https://example.com' }; expect(() => CrawlSchema.parse(minimal)).not.toThrow(); }); it('should validate js_only requires session_id', () => { const invalid = { url: 'https://example.com', js_only: true, // Missing session_id }; expect(() => CrawlSchema.parse(invalid)).toThrow(); }); it('should reject empty js_code array', () => { const invalid = { url: 'https://example.com', js_code: [], }; expect(() => CrawlSchema.parse(invalid)).toThrow(); }); it('should accept all valid cache modes', () => { const validModes = ['ENABLED', 'BYPASS', 'DISABLED']; validModes.forEach((mode) => { const config = { url: 'https://example.com', cache_mode: mode }; expect(() => CrawlSchema.parse(config)).not.toThrow(); }); }); it('should validate viewport dimensions', () => { const validViewport = { url: 'https://example.com', viewport_width: 1920, viewport_height: 1080, }; expect(() => CrawlSchema.parse(validViewport)).not.toThrow(); }); it('should validate complex configurations', () => { const complex = { url: 'https://example.com', browser_type: 'chromium', viewport_width: 1280, viewport_height: 720, user_agent: 'Custom User Agent', headers: { 'X-Custom': 'value' }, cookies: [{ name: 'session', value: '123', domain: '.example.com' }], js_code: ['document.querySelector("button").click()'], wait_for: '#loaded', screenshot: true, pdf: true, session_id: 'test-session', cache_mode: 'BYPASS', }; expect(() => CrawlSchema.parse(complex)).not.toThrow(); }); }); }); describe('Property-based testing for regex patterns', () => { // Generate random valid JavaScript-like strings const generateValidJS = () => { const templates = [ () => `const x = ${Math.random()};`, () => `function test() { return "${Math.random()}"; }`, () => `if (${Math.random() > 0.5}) { console.log("test"); }`, () => `const arr = [${Math.random()}, ${Math.random()}];`, () => `// Comment with ${Math.random()}`, ]; return templates[Math.floor(Math.random() * templates.length)](); }; it('should consistently validate generated valid JavaScript', () => { for (let i = 0; i < 100; i++) { const code = generateValidJS(); expect(validateJavaScriptCode(code)).toBe(true); } }); // Test boundary conditions with special characters const specialChars = ['<', '>', '&', '"', "'", '\\', '\n', '\r', '\t']; it('should handle special characters in string contexts correctly', () => { specialChars.forEach((char) => { const inString = `const x = "${char}";`; const inTemplate = `const y = \`${char}\`;`; // These should be valid (special chars inside strings) expect(validateJavaScriptCode(inString)).toBe(true); expect(validateJavaScriptCode(inTemplate)).toBe(true); }); }); }); }); ``` -------------------------------------------------------------------------------- /src/handlers/utility-handlers.ts: -------------------------------------------------------------------------------- ```typescript import { BaseHandler } from './base-handler.js'; import { JSExecuteEndpointOptions, JSExecuteEndpointResponse, CrawlResultItem } from '../types.js'; export class UtilityHandlers extends BaseHandler { async executeJS(options: JSExecuteEndpointOptions) { try { // Check if scripts is provided if (!options.scripts || options.scripts === null) { throw new Error( 'scripts is required. Please provide JavaScript code to execute. 
Use "return" statements to get values back.', ); } const result: JSExecuteEndpointResponse = await this.service.executeJS(options); // Extract JavaScript execution results const jsResults = result.js_execution_result?.results || []; // Ensure scripts is always an array for mapping const scripts = Array.isArray(options.scripts) ? options.scripts : [options.scripts]; // Format results for display let formattedResults = ''; if (jsResults.length > 0) { formattedResults = jsResults .map((res: unknown, idx: number) => { const script = scripts[idx] || 'Script ' + (idx + 1); // Handle the actual return value or success/error status let resultStr = ''; if (res && typeof res === 'object' && 'success' in res) { // This is a status object (e.g., from null return or execution without return) const statusObj = res as { success: unknown; error?: unknown }; resultStr = statusObj.success ? 'Executed successfully (no return value)' : `Error: ${statusObj.error || 'Unknown error'}`; } else { // This is an actual return value resultStr = JSON.stringify(res, null, 2); } return `Script: ${script}\nReturned: ${resultStr}`; }) .join('\n\n'); } else { formattedResults = 'No results returned'; } // Handle markdown content - can be string or object let markdownContent = ''; if (result.markdown) { if (typeof result.markdown === 'string') { markdownContent = result.markdown; } else if (typeof result.markdown === 'object' && result.markdown.raw_markdown) { // Use raw_markdown from the object structure markdownContent = result.markdown.raw_markdown; } } return { content: [ { type: 'text', text: `JavaScript executed on: ${options.url}\n\nResults:\n${formattedResults}${markdownContent ? `\n\nPage Content After Execution:\n${markdownContent}` : ''}`, }, ], }; } catch (error) { throw this.formatError(error, 'execute JavaScript'); } } async extractLinks(options: { url: string; categorize?: boolean }) { try { // Use crawl endpoint instead of md to get full link data const response = await this.axiosClient.post('/crawl', { urls: [options.url], crawler_config: { cache_mode: 'bypass', }, }); const results = response.data.results || [response.data]; const result: CrawlResultItem = results[0] || {}; // Variables for manually extracted links let manuallyExtractedInternal: string[] = []; let manuallyExtractedExternal: string[] = []; let hasManuallyExtractedLinks = false; // Check if the response is likely JSON or non-HTML content if (!result.links || (result.links.internal.length === 0 && result.links.external.length === 0)) { // Try to detect if this might be a JSON endpoint const markdownContent = result.markdown?.raw_markdown || result.markdown?.fit_markdown || ''; const htmlContent = result.html || ''; // Check for JSON indicators if ( // Check URL pattern options.url.includes('/api/') || options.url.includes('/api.') || // Check content type (often shown in markdown conversion) markdownContent.includes('application/json') || // Check for JSON structure patterns (markdownContent.startsWith('{') && markdownContent.endsWith('}')) || (markdownContent.startsWith('[') && markdownContent.endsWith(']')) || // Check HTML for JSON indicators htmlContent.includes('application/json') || // Common JSON patterns markdownContent.includes('"links"') || markdownContent.includes('"url"') || markdownContent.includes('"data"') ) { return { content: [ { type: 'text', text: `Note: ${options.url} appears to return JSON data rather than HTML. The extract_links tool is designed for HTML pages with <a> tags. 
To extract URLs from JSON, you would need to parse the JSON structure directly.`, }, ], }; } // If no links found but it's HTML, let's check the markdown content for href patterns if (markdownContent && markdownContent.includes('href=')) { // Extract links manually from markdown if server didn't provide them const hrefPattern = /href=["']([^"']+)["']/g; const foundLinks: string[] = []; let match; while ((match = hrefPattern.exec(markdownContent)) !== null) { foundLinks.push(match[1]); } if (foundLinks.length > 0) { hasManuallyExtractedLinks = true; // Categorize found links const currentDomain = new URL(options.url).hostname; foundLinks.forEach((link) => { try { const linkUrl = new URL(link, options.url); if (linkUrl.hostname === currentDomain) { manuallyExtractedInternal.push(linkUrl.href); } else { manuallyExtractedExternal.push(linkUrl.href); } } catch { // Relative link manuallyExtractedInternal.push(link); } }); } } } // Handle both cases: API-provided links and manually extracted links let internalUrls: string[] = []; let externalUrls: string[] = []; if (result.links && (result.links.internal.length > 0 || result.links.external.length > 0)) { // Use API-provided links internalUrls = result.links.internal.map((link) => (typeof link === 'string' ? link : link.href)); externalUrls = result.links.external.map((link) => (typeof link === 'string' ? link : link.href)); } else if (hasManuallyExtractedLinks) { // Use manually extracted links internalUrls = manuallyExtractedInternal; externalUrls = manuallyExtractedExternal; } const allUrls = [...internalUrls, ...externalUrls]; if (!options.categorize) { return { content: [ { type: 'text', text: `All links from ${options.url}:\n${allUrls.join('\n')}`, }, ], }; } // Categorize links const categorized: Record<string, string[]> = { internal: [], external: [], social: [], documents: [], images: [], scripts: [], }; // Further categorize links const socialDomains = ['facebook.com', 'twitter.com', 'linkedin.com', 'instagram.com', 'youtube.com']; const docExtensions = ['.pdf', '.doc', '.docx', '.xls', '.xlsx', '.ppt', '.pptx']; const imageExtensions = ['.jpg', '.jpeg', '.png', '.gif', '.svg', '.webp']; const scriptExtensions = ['.js', '.css']; // Categorize internal URLs internalUrls.forEach((href: string) => { if (docExtensions.some((ext) => href.toLowerCase().endsWith(ext))) { categorized.documents.push(href); } else if (imageExtensions.some((ext) => href.toLowerCase().endsWith(ext))) { categorized.images.push(href); } else if (scriptExtensions.some((ext) => href.toLowerCase().endsWith(ext))) { categorized.scripts.push(href); } else { categorized.internal.push(href); } }); // Categorize external URLs externalUrls.forEach((href: string) => { if (socialDomains.some((domain) => href.includes(domain))) { categorized.social.push(href); } else if (docExtensions.some((ext) => href.toLowerCase().endsWith(ext))) { categorized.documents.push(href); } else if (imageExtensions.some((ext) => href.toLowerCase().endsWith(ext))) { categorized.images.push(href); } else if (scriptExtensions.some((ext) => href.toLowerCase().endsWith(ext))) { categorized.scripts.push(href); } else { categorized.external.push(href); } }); // Return based on categorize option (defaults to true) if (options.categorize) { return { content: [ { type: 'text', text: `Link analysis for ${options.url}:\n\n${Object.entries(categorized) .map( ([category, links]: [string, string[]]) => `${category} (${links.length}):\n${links.slice(0, 10).join('\n')}${links.length > 10 ? '\n...' 
: ''}`, ) .join('\n\n')}`, }, ], }; } else { // Return simple list without categorization const allLinks = [...internalUrls, ...externalUrls]; return { content: [ { type: 'text', text: `All links from ${options.url} (${allLinks.length} total):\n\n${allLinks.slice(0, 50).join('\n')}${allLinks.length > 50 ? '\n...' : ''}`, }, ], }; } } catch (error) { throw this.formatError(error, 'extract links'); } } } ``` -------------------------------------------------------------------------------- /src/crawl4ai-service.ts: -------------------------------------------------------------------------------- ```typescript import axios, { AxiosInstance, AxiosError } from 'axios'; import { BatchCrawlOptions, AdvancedCrawlConfig, CrawlEndpointOptions, CrawlEndpointResponse, JSExecuteEndpointOptions, JSExecuteEndpointResponse, ScreenshotEndpointOptions, ScreenshotEndpointResponse, PDFEndpointOptions, PDFEndpointResponse, HTMLEndpointOptions, HTMLEndpointResponse, MarkdownEndpointOptions, MarkdownEndpointResponse, LLMEndpointOptions, LLMEndpointResponse, } from './types.js'; // Helper to validate JavaScript code const validateJavaScriptCode = (code: string): boolean => { // Check for common HTML entities that shouldn't be in JS if (/"|&|<|>|&#\d+;|&\w+;/.test(code)) { return false; } // Basic check to ensure it's not HTML if (/<(!DOCTYPE|html|body|head|script|style)\b/i.test(code)) { return false; } // Check for literal \n, \t, \r outside of strings (common LLM mistake) // Look for patterns like: ;\n or }\n or )\n which suggest literal newlines if (/[;})]\s*\\n|\\n\s*[{(/]/.test(code)) { return false; } // Check for obvious cases of literal \n between statements if (/[;})]\s*\\n\s*\w/.test(code)) { return false; } return true; }; // Helper to validate URL format const validateURL = (url: string): boolean => { try { new URL(url); return true; } catch { return false; } }; // Helper to handle axios errors consistently const handleAxiosError = (error: unknown): never => { if (axios.isAxiosError(error)) { const axiosError = error as AxiosError; // Handle timeout errors if (axiosError.code === 'ECONNABORTED') { throw new Error('Request timed out'); } if (axiosError.code === 'ETIMEDOUT') { throw new Error('Request timeout'); } // Handle network errors if (axiosError.code === 'ENOTFOUND') { throw new Error(`DNS resolution failed: ${axiosError.message}`); } if (axiosError.code === 'ECONNREFUSED') { throw new Error(`Connection refused: ${axiosError.message}`); } if (axiosError.code === 'ECONNRESET') { throw new Error(`Connection reset: ${axiosError.message}`); } if (axiosError.code === 'ENETUNREACH') { throw new Error(`Network unreachable: ${axiosError.message}`); } // Handle HTTP errors if (axiosError.response) { const status = axiosError.response.status; const data = axiosError.response.data as any; // eslint-disable-line @typescript-eslint/no-explicit-any const message = data?.error || data?.detail || data?.message || axiosError.message; throw new Error(`Request failed with status ${status}: ${message}`); } // Handle request errors (e.g., invalid URL) if (axiosError.request) { throw new Error(`Request failed: ${axiosError.message}`); } } // Re-throw unknown errors throw error; }; export class Crawl4AIService { private axiosClient: AxiosInstance; constructor(baseURL: string, apiKey: string) { this.axiosClient = axios.create({ baseURL, headers: { 'X-API-Key': apiKey, 'Content-Type': 'application/json', }, timeout: 120000, }); } async getMarkdown(options: MarkdownEndpointOptions): Promise<MarkdownEndpointResponse> { // 
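The categorization pass in `extractLinks` reduces to a short, ordered rule set: social domains first (external links only), then document, image, and script extensions, and finally a plain internal/external split. Below is a minimal standalone sketch of those rules; `categorizeLink` and `LinkCategory` are hypothetical names introduced for illustration and are not part of this package:

```typescript
// Hypothetical sketch of the categorization rules used by extractLinks above.
// categorizeLink is illustrative only; the handler applies these checks inline.
const SOCIAL_DOMAINS = ['facebook.com', 'twitter.com', 'linkedin.com', 'instagram.com', 'youtube.com'];
const DOC_EXTENSIONS = ['.pdf', '.doc', '.docx', '.xls', '.xlsx', '.ppt', '.pptx'];
const IMAGE_EXTENSIONS = ['.jpg', '.jpeg', '.png', '.gif', '.svg', '.webp'];
const SCRIPT_EXTENSIONS = ['.js', '.css'];

type LinkCategory = 'internal' | 'external' | 'social' | 'documents' | 'images' | 'scripts';

function categorizeLink(href: string, isInternal: boolean): LinkCategory {
  const lower = href.toLowerCase();
  // Social is checked first, and only for external links
  if (!isInternal && SOCIAL_DOMAINS.some((domain) => href.includes(domain))) return 'social';
  if (DOC_EXTENSIONS.some((ext) => lower.endsWith(ext))) return 'documents';
  if (IMAGE_EXTENSIONS.some((ext) => lower.endsWith(ext))) return 'images';
  if (SCRIPT_EXTENSIONS.some((ext) => lower.endsWith(ext))) return 'scripts';
  return isInternal ? 'internal' : 'external';
}

// Usage:
console.log(categorizeLink('https://example.com/report.pdf', true)); // 'documents'
console.log(categorizeLink('https://twitter.com/someone', false)); // 'social'
```

The handler itself runs the same checks inline over `internalUrls` and `externalUrls` rather than through a shared helper.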
--------------------------------------------------------------------------------
/src/crawl4ai-service.ts:
--------------------------------------------------------------------------------

```typescript
import axios, { AxiosInstance, AxiosError } from 'axios';
import {
  BatchCrawlOptions,
  AdvancedCrawlConfig,
  CrawlEndpointOptions,
  CrawlEndpointResponse,
  JSExecuteEndpointOptions,
  JSExecuteEndpointResponse,
  ScreenshotEndpointOptions,
  ScreenshotEndpointResponse,
  PDFEndpointOptions,
  PDFEndpointResponse,
  HTMLEndpointOptions,
  HTMLEndpointResponse,
  MarkdownEndpointOptions,
  MarkdownEndpointResponse,
  LLMEndpointOptions,
  LLMEndpointResponse,
} from './types.js';

// Helper to validate JavaScript code
const validateJavaScriptCode = (code: string): boolean => {
  // Check for common HTML entities that shouldn't be in JS
  if (/&quot;|&amp;|&lt;|&gt;|&#\d+;|&\w+;/.test(code)) {
    return false;
  }

  // Basic check to ensure it's not HTML
  if (/<(!DOCTYPE|html|body|head|script|style)\b/i.test(code)) {
    return false;
  }

  // Check for literal \n, \t, \r outside of strings (common LLM mistake)
  // Look for patterns like: ;\n or }\n or )\n which suggest literal newlines
  if (/[;})]\s*\\n|\\n\s*[{(/]/.test(code)) {
    return false;
  }

  // Check for obvious cases of literal \n between statements
  if (/[;})]\s*\\n\s*\w/.test(code)) {
    return false;
  }

  return true;
};

// Helper to validate URL format
const validateURL = (url: string): boolean => {
  try {
    new URL(url);
    return true;
  } catch {
    return false;
  }
};
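// Illustrative examples (hypothetical inputs, shown here for orientation) of
// what validateJavaScriptCode accepts and rejects:
//
//   validateJavaScriptCode('return document.title;');       // true  - plain JavaScript
//   validateJavaScriptCode('console.log(&quot;hi&quot;);'); // false - HTML entities present
//   validateJavaScriptCode('<script>alert(1)</script>');    // false - looks like HTML
//   validateJavaScriptCode('foo();\\nbar();');              // false - literal \n between statements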
// Helper to handle axios errors consistently
const handleAxiosError = (error: unknown): never => {
  if (axios.isAxiosError(error)) {
    const axiosError = error as AxiosError;

    // Handle timeout errors
    if (axiosError.code === 'ECONNABORTED') {
      throw new Error('Request timed out');
    }
    if (axiosError.code === 'ETIMEDOUT') {
      throw new Error('Request timeout');
    }

    // Handle network errors
    if (axiosError.code === 'ENOTFOUND') {
      throw new Error(`DNS resolution failed: ${axiosError.message}`);
    }
    if (axiosError.code === 'ECONNREFUSED') {
      throw new Error(`Connection refused: ${axiosError.message}`);
    }
    if (axiosError.code === 'ECONNRESET') {
      throw new Error(`Connection reset: ${axiosError.message}`);
    }
    if (axiosError.code === 'ENETUNREACH') {
      throw new Error(`Network unreachable: ${axiosError.message}`);
    }

    // Handle HTTP errors
    if (axiosError.response) {
      const status = axiosError.response.status;
      const data = axiosError.response.data as any; // eslint-disable-line @typescript-eslint/no-explicit-any
      const message = data?.error || data?.detail || data?.message || axiosError.message;
      throw new Error(`Request failed with status ${status}: ${message}`);
    }

    // Handle request errors (e.g., invalid URL)
    if (axiosError.request) {
      throw new Error(`Request failed: ${axiosError.message}`);
    }
  }

  // Re-throw unknown errors
  throw error;
};

export class Crawl4AIService {
  private axiosClient: AxiosInstance;

  constructor(baseURL: string, apiKey: string) {
    this.axiosClient = axios.create({
      baseURL,
      headers: {
        'X-API-Key': apiKey,
        'Content-Type': 'application/json',
      },
      timeout: 120000,
    });
  }

  async getMarkdown(options: MarkdownEndpointOptions): Promise<MarkdownEndpointResponse> {
    // Validate URL
    if (!validateURL(options.url)) {
      throw new Error('Invalid URL format');
    }

    try {
      const response = await this.axiosClient.post('/md', {
        url: options.url,
        f: options.f,
        q: options.q,
        c: options.c,
      });
      return response.data;
    } catch (error) {
      return handleAxiosError(error);
    }
  }

  async captureScreenshot(options: ScreenshotEndpointOptions): Promise<ScreenshotEndpointResponse> {
    // Validate URL
    if (!validateURL(options.url)) {
      throw new Error('Invalid URL format');
    }

    try {
      const response = await this.axiosClient.post('/screenshot', {
        url: options.url,
        screenshot_wait_for: options.screenshot_wait_for,
        // output_path is omitted to get base64 response
      });
      return response.data;
    } catch (error) {
      return handleAxiosError(error);
    }
  }

  async generatePDF(options: PDFEndpointOptions): Promise<PDFEndpointResponse> {
    // Validate URL
    if (!validateURL(options.url)) {
      throw new Error('Invalid URL format');
    }

    try {
      const response = await this.axiosClient.post('/pdf', {
        url: options.url,
        // output_path is omitted to get base64 response
      });
      return response.data;
    } catch (error) {
      return handleAxiosError(error);
    }
  }

  async executeJS(options: JSExecuteEndpointOptions): Promise<JSExecuteEndpointResponse> {
    // Validate URL
    if (!validateURL(options.url)) {
      throw new Error('Invalid URL format');
    }

    // Ensure scripts is always an array
    const scripts = Array.isArray(options.scripts) ? options.scripts : [options.scripts];

    // Validate each script
    for (const script of scripts) {
      if (!validateJavaScriptCode(script)) {
        throw new Error(
          'Invalid JavaScript: Contains HTML entities (&quot;), literal \\n outside strings, or HTML tags. Use proper JS syntax with real quotes and newlines.',
        );
      }
    }

    try {
      const response = await this.axiosClient.post('/execute_js', {
        url: options.url,
        scripts: scripts, // Always send as array
        // Only url and scripts are supported by the endpoint
      });
      return response.data;
    } catch (error) {
      return handleAxiosError(error);
    }
  }

  async batchCrawl(options: BatchCrawlOptions) {
    // Validate URLs
    if (!options.urls || options.urls.length === 0) {
      throw new Error('URLs array cannot be empty');
    }

    // Build crawler config if needed
    const crawler_config: Record<string, unknown> = {};

    // Handle remove_images by using exclude_tags
    if (options.remove_images) {
      crawler_config.exclude_tags = ['img', 'picture', 'svg'];
    }

    if (options.bypass_cache) {
      crawler_config.cache_mode = 'BYPASS';
    }

    try {
      const response = await this.axiosClient.post('/crawl', {
        urls: options.urls,
        max_concurrent: options.max_concurrent,
        crawler_config: Object.keys(crawler_config).length > 0 ? crawler_config : undefined,
      });
      return response.data;
    } catch (error) {
      return handleAxiosError(error);
    }
  }

  async getHTML(options: HTMLEndpointOptions): Promise<HTMLEndpointResponse> {
    // Validate URL
    if (!validateURL(options.url)) {
      throw new Error('Invalid URL format');
    }

    try {
      const response = await this.axiosClient.post('/html', {
        url: options.url,
        // Only url is supported by the endpoint
      });
      return response.data;
    } catch (error) {
      return handleAxiosError(error);
    }
  }

  async parseSitemap(url: string) {
    try {
      // Use axios directly without baseURL for fetching external URLs
      const response = await axios.get(url);
      return response.data;
    } catch (error) {
      return handleAxiosError(error);
    }
  }

  async detectContentType(url: string): Promise<string> {
    try {
      // Use axios directly without baseURL for external URLs
      const response = await axios.head(url);
      return response.headers['content-type'] || '';
    } catch {
      return '';
    }
  }

  async crawl(options: AdvancedCrawlConfig): Promise<CrawlEndpointResponse> {
    // Validate JS code if present
    if (options.crawler_config?.js_code) {
      const scripts = Array.isArray(options.crawler_config.js_code)
        ? options.crawler_config.js_code
        : [options.crawler_config.js_code];

      for (const script of scripts) {
        if (!validateJavaScriptCode(script)) {
          throw new Error(
            'Invalid JavaScript: Contains HTML entities (&quot;), literal \\n outside strings, or HTML tags. Use proper JS syntax with real quotes and newlines.',
          );
        }
      }
    }

    // Server only accepts urls array, not url string
    const urls = options.url ? [options.url] : options.urls || [];

    const requestBody: CrawlEndpointOptions & {
      extraction_strategy?: unknown;
      table_extraction_strategy?: unknown;
      markdown_generator_options?: unknown;
    } = {
      urls,
      browser_config: options.browser_config,
      crawler_config: options.crawler_config || {}, // Always include crawler_config, even if empty
    };

    // Add extraction strategy passthrough fields if present
    if (options.extraction_strategy) {
      requestBody.extraction_strategy = options.extraction_strategy;
    }
    if (options.table_extraction_strategy) {
      requestBody.table_extraction_strategy = options.table_extraction_strategy;
    }
    if (options.markdown_generator_options) {
      requestBody.markdown_generator_options = options.markdown_generator_options;
    }

    try {
      const response = await this.axiosClient.post('/crawl', requestBody);
      return response.data;
    } catch (error) {
      return handleAxiosError(error);
    }
  }

  async extractWithLLM(options: LLMEndpointOptions): Promise<LLMEndpointResponse> {
    // Validate URL
    if (!validateURL(options.url)) {
      throw new Error('Invalid URL format');
    }

    try {
      const encodedUrl = encodeURIComponent(options.url);
      const encodedQuery = encodeURIComponent(options.query);
      const response = await this.axiosClient.get(`/llm/${encodedUrl}?q=${encodedQuery}`);
      return response.data;
    } catch (error) {
      // Special handling for LLM-specific errors
      if (axios.isAxiosError(error)) {
        const axiosError = error as AxiosError;
        if (axiosError.code === 'ECONNABORTED' || axiosError.response?.status === 504) {
          throw new Error('LLM extraction timed out. Try a simpler query or different URL.');
        }
        if (axiosError.response?.status === 401) {
          throw new Error(
            'LLM extraction failed: No LLM provider configured on server. Please ensure the server has an API key set.',
          );
        }
      }
      return handleAxiosError(error);
    }
  }
}
```
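For orientation, here is a hedged usage sketch of `Crawl4AIService`. The base URL matches the Docker quick start, but the API key handling and `demo` wrapper are placeholders; in this repo the class is normally consumed through the MCP tool handlers rather than called directly:

```typescript
// Hypothetical wiring; the URL and environment variable below are placeholders.
import { Crawl4AIService } from './crawl4ai-service.js';

const service = new Crawl4AIService('http://localhost:11235', process.env.CRAWL4AI_API_KEY ?? '');

async function demo(): Promise<void> {
  // POST /md - returns markdown for a single page.
  const md = await service.getMarkdown({ url: 'https://example.com' });
  console.log(md);

  // POST /execute_js - scripts are validated locally before being sent,
  // so HTML-mangled input fails fast with a descriptive Error.
  const js = await service.executeJS({
    url: 'https://example.com',
    scripts: 'return document.title;',
  });
  console.log(js.js_execution_result?.results);
}

// Network failures surface as the typed messages produced by handleAxiosError,
// e.g. "Connection refused: ..." or "Request timed out".
demo().catch((err) => console.error(err instanceof Error ? err.message : err));
```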