This is page 3 of 4. Use http://codebase.md/omgwtfwow/mcp-crawl4ai-ts?lines=true&page={x} to view the full context. # Directory Structure ``` ├── .env.example ├── .github │ ├── CI.md │ ├── copilot-instructions.md │ └── workflows │ └── ci.yml ├── .gitignore ├── .prettierignore ├── .prettierrc.json ├── CHANGELOG.md ├── eslint.config.mjs ├── jest.config.cjs ├── jest.setup.cjs ├── LICENSE ├── package-lock.json ├── package.json ├── README.md ├── src │ ├── __tests__ │ │ ├── crawl.test.ts │ │ ├── crawl4ai-service.network.test.ts │ │ ├── crawl4ai-service.test.ts │ │ ├── handlers │ │ │ ├── crawl-handlers.test.ts │ │ │ ├── parameter-combinations.test.ts │ │ │ ├── screenshot-saving.test.ts │ │ │ ├── session-handlers.test.ts │ │ │ └── utility-handlers.test.ts │ │ ├── index.cli.test.ts │ │ ├── index.npx.test.ts │ │ ├── index.server.test.ts │ │ ├── index.test.ts │ │ ├── integration │ │ │ ├── batch-crawl.integration.test.ts │ │ │ ├── capture-screenshot.integration.test.ts │ │ │ ├── crawl-advanced.integration.test.ts │ │ │ ├── crawl-handlers.integration.test.ts │ │ │ ├── crawl-recursive.integration.test.ts │ │ │ ├── crawl.integration.test.ts │ │ │ ├── execute-js.integration.test.ts │ │ │ ├── extract-links.integration.test.ts │ │ │ ├── extract-with-llm.integration.test.ts │ │ │ ├── generate-pdf.integration.test.ts │ │ │ ├── get-html.integration.test.ts │ │ │ ├── get-markdown.integration.test.ts │ │ │ ├── parse-sitemap.integration.test.ts │ │ │ ├── session-management.integration.test.ts │ │ │ ├── smart-crawl.integration.test.ts │ │ │ └── test-utils.ts │ │ ├── request-handler.test.ts │ │ ├── schemas │ │ │ └── validation-edge-cases.test.ts │ │ ├── types │ │ │ └── mocks.ts │ │ └── utils │ │ └── javascript-validation.test.ts │ ├── crawl4ai-service.ts │ ├── handlers │ │ ├── base-handler.ts │ │ ├── content-handlers.ts │ │ ├── crawl-handlers.ts │ │ ├── session-handlers.ts │ │ └── utility-handlers.ts │ ├── index.ts │ ├── schemas │ │ ├── helpers.ts │ │ └── validation-schemas.ts │ ├── server.ts │ └── types.ts ├── tsconfig.build.json └── tsconfig.json ``` # Files -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- ```markdown 1 | # Changelog 2 | 3 | ## Version 3.0.2 (2025-09-01) 4 | 5 | ### Bug Fixes 6 | - Fixed manage_session tool schema compatibility with Claude/Anthropic tools 7 | - Removed oneOf/allOf/anyOf from top-level schema 8 | - Simplified to plain object schema with enum constraints 9 | - Maintains all functionality while improving MCP client compatibility 10 | 11 | ## Version 3.0.1 (2025-08-30) 12 | 13 | ### Documentation 14 | - Updated README.md to accurately document all new parameters from v3.0.0 15 | - Added documentation for batch_crawl configs array parameter 16 | - Clarified proxy object format support 17 | - Documented all new crawler parameters from Crawl4AI 0.7.3/0.7.4 18 | 19 | ## Version 3.0.0 (2025-08-30) 20 | 21 | ### Features 22 | - Added full support for Crawl4AI 0.7.3/0.7.4 features: 23 | - **'undetected' browser type** - Stealth browser option for anti-bot detection 24 | - **New crawler parameters**: 25 | - `delay_before_return_html` - Delay before returning HTML content 26 | - `css_selector` - Filter content by CSS selector 27 | - `include_links` - Include extracted links in response 28 | - `resolve_absolute_urls` - Convert relative URLs to absolute 29 | - **Extraction strategies** - Support for LLM extraction, table extraction, and markdown generation options 30 | 
- **Multi-config batch crawling** - Per-URL configurations in batch_crawl 31 | - **Unified proxy format** - Support both string and object proxy configurations 32 | - **Memory metrics display** - Show server memory usage when available 33 | 34 | ### Improvements 35 | - Enhanced error formatting for better debugging 36 | - Better handling of object error responses from API 37 | - Fixed batch_crawl to include required `urls` field when using configs array 38 | 39 | ### Testing 40 | - Added comprehensive integration tests for all new features 41 | - Fixed TypeScript errors in test files 42 | - All 306 unit tests passing 43 | - All 150 integration tests passing 44 | 45 | ### Backward Compatibility 46 | - Fully backward compatible with older Crawl4AI servers (before 0.7.4) 47 | - All new features are optional and gracefully degrade 48 | 49 | ## Version 2.9.0 (2025-08-29) 50 | 51 | ### Breaking Changes 52 | - Consolidated session management into single `manage_session` tool 53 | - Replaces `create_session`, `clear_session`, and `list_sessions` tools 54 | - Uses discriminated union with `action` parameter: 'create', 'clear', or 'list' 55 | - Reduces tool count from 15 to 13 56 | 57 | ### Removed 58 | - Removed `create_session` tool (use `manage_session` with `action: 'create'`) 59 | - Removed `clear_session` tool (use `manage_session` with `action: 'clear'`) 60 | - Removed `list_sessions` tool (use `manage_session` with `action: 'list'`) 61 | 62 | ### Improvements 63 | - Simplified API surface for better LLM interaction 64 | - Improved type safety with discriminated unions 65 | - Reduced code duplication in session management 66 | 67 | ### Testing 68 | - Updated all tests to use new `manage_session` tool 69 | - Maintained 100% test coverage 70 | 71 | ## Version 2.7.1 (2025-08-30) 72 | 73 | ### Bug Fixes 74 | - Fixed lint/formatting issues in test files 75 | - Cleaned up trailing whitespace 76 | 77 | ## Version 2.7.0 (2025-08-30) 78 | 79 | ### Compatibility Updates 80 | - Verified full compatibility with Crawl4AI version 0.7.4 81 | - All 15 MCP tools tested and working 82 | - 100% integration test pass rate (148 tests) 83 | - Supports new v0.7.3/0.7.4 features including: 84 | - Undetected browser support with stealth mode 85 | - Multi-URL configuration system 86 | - Enhanced table extraction 87 | - Memory optimization improvements 88 | 89 | ### Bug Fixes 90 | - Fixed unit test timeout issues in NPX and CLI tests 91 | - Added proper process cleanup and timeouts 92 | - Fixed edge case where dotenv was loading during tests 93 | - Ensured all spawned child processes are properly terminated 94 | 95 | ### Testing 96 | - Comprehensive testing against Crawl4AI v0.7.4 Docker image 97 | - All integration tests pass with LLM features enabled 98 | - Unit test suite: 308 tests passing 99 | - Integration test suite: 148 tests passing 100 | 101 | ## Version 2.6.12 (2025-08-05) 102 | 103 | ### Bug Fixes 104 | - Fixed server startup issue when running via npx 105 | - Removed complex module detection logic that was preventing server startup 106 | - Server now always starts when the script is executed (as intended for MCP servers) 107 | - Simplified dotenv loading to only attempt in development when env vars aren't set 108 | 109 | ## Version 2.6.11 (2025-08-05) 110 | 111 | ### Bug Fixes 112 | - Fixed environment variable handling when running via npx 113 | - Only loads .env file if CRAWL4AI_BASE_URL is not already set 114 | - Prevents issues when env vars are passed via CLI/MCP configuration 115 | - Ensures package 
works correctly with Claude Desktop and other MCP clients 116 | 117 | ## Version 2.6.10 (2025-08-05) 118 | 119 | ### Bug Fixes 120 | - Fixed unit tests to use correct localhost URL from jest.setup.cjs 121 | - Fixed network error handling tests to not specify request body in nock mocks 122 | - Unit tests always use http://localhost:11235 as configured 123 | - Integration tests get URL from .env file 124 | 125 | ### Code Quality 126 | - Replaced all 'any' type warnings with proper type assertions in tests 127 | - All tests passing with zero lint warnings 128 | 129 | ## Version 2.6.9 (2025-08-05) 130 | 131 | ### Testing Improvements 132 | - Improved crawl4ai-service.ts test coverage from 76% to 84% 133 | - Added comprehensive network error handling tests 134 | - Added URL validation tests for all service methods 135 | - Added tests for optional parameter handling 136 | - Added JavaScript validation edge case tests 137 | 138 | ### Code Quality 139 | - All tests pass with zero lint errors 140 | - Maintained 100% function coverage for service layer 141 | 142 | ## Version 2.6.8 (2025-08-05) 143 | 144 | ### Code Cleanup 145 | - Removed unused mock generation system 146 | - Cleaned up package.json scripts 147 | - Simplified development workflow 148 | 149 | ### Chores 150 | - Verified alignment between unit tests, integration tests, and implementation 151 | - Confirmed all tests properly mock API interactions 152 | 153 | ## Version 2.6.7 (2025-08-05) 154 | 155 | ### Bug Fixes 156 | - Fixed integration tests to use production Crawl4AI server from environment variables 157 | - Fixed child process environment variable loading in test utilities 158 | - Added support for both string and object markdown responses from Crawl4AI API 159 | - Fixed timeout issues in MHTML capture and HTML extraction tests 160 | - Replaced unreliable test URLs (httpbin.org) with stable alternatives 161 | - Added 30-second timeout to session creation to prevent socket hang-ups 162 | 163 | ### Testing Improvements 164 | - Integration tests now run sequentially (maxWorkers: 1) to avoid rate limiting 165 | - Added proper working directory configuration for child processes 166 | - Fixed all integration tests to pass with production API 167 | - Maintained test coverage at 92.25% with all tests passing 168 | 169 | ## Version 2.6.6 (2025-08-05) 170 | 171 | ### Testing 172 | - Improved test coverage from 88.8% to 93.19% 173 | - Added comprehensive CLI entry point tests for signal handling, environment variables, and dotenv loading 174 | - Added network failure tests for axios timeout and HTTP error scenarios 175 | - Added input validation edge case tests for JavaScript code validation 176 | - Added parameter combination tests for optional parameters and edge cases 177 | - Improved branch coverage from 80.76% to 86.12% 178 | - Improved function coverage from 96.41% to 98.92% 179 | 180 | ## Version 2.6.5 (2025-08-05) 181 | 182 | ### Features 183 | - Enhanced screenshot handling for better compatibility 184 | - Added home directory (`~`) path resolution support 185 | - Large screenshots (>800KB) are now saved locally without being returned inline to avoid MCP's 1MB response limit 186 | - Clear indication when screenshots are too large to display inline 187 | 188 | ### Bug Fixes 189 | - Improved screenshot directory handling 190 | - Better parameter descriptions clarifying that only directory paths should be provided 191 | - Added automatic handling when file paths are mistakenly provided instead of directories 192 | - Warning messages 
when incorrect path format is detected 193 | - Ensures compatibility with various LLM usage patterns 194 | 195 | ## Version 2.6.4 (2025-08-04) 196 | 197 | ### Features 198 | - Added local screenshot storage support 199 | - capture_screenshot: New save_to_directory parameter saves screenshots locally while returning as MCP resource 200 | - crawl: New screenshot_directory parameter saves screenshots when screenshot=true 201 | - Automatic filename generation using URL hostname and timestamp 202 | - Creates directories if they don't exist 203 | - Graceful error handling - failures don't interrupt the crawl operation 204 | - Added comprehensive unit tests for file saving functionality 205 | 206 | ## Version 2.6.3 (2025-08-04) 207 | 208 | ### Enhancements 209 | - Improved tool descriptions for better LLM understanding and workflow clarity 210 | - Added [STATELESS], [SUPPORTS SESSIONS], [SESSION MANAGEMENT] indicators 211 | - Enhanced get_html description to emphasize selector discovery for automation 212 | - Added inspect-first workflow patterns to crawl tool description 213 | - Emphasized element verification in js_code parameter description 214 | - Added typical workflow guidance to create_session 215 | - Improved cross-references between related tools 216 | - Removed problematic one-shot form pattern that assumed element existence 217 | 218 | ### Bug Fixes 219 | - Fixed crawl_recursive max_depth behavior 220 | - max_depth: 0 now correctly crawls only the initial page 221 | - Previously, max_depth: 0 would crawl pages at depth 0 and depth 1 222 | 223 | ## Version 2.6.2 (2025-08-04) 224 | 225 | ### Refactoring 226 | - Consolidated error handling in server.ts with validateAndExecute helper 227 | - Reduced ~90 lines of duplicate code 228 | - Preserved exact error message format for LLM compatibility 229 | - Improved maintainability while keeping behavior identical 230 | - Server.ts coverage improved from ~90% to 98.66% 231 | 232 | ## Version 2.6.1 (2025-08-04) 233 | 234 | ### Testing 235 | - Improved crawl-handlers test coverage from 87% to 97% 236 | - Added comprehensive unit tests for all crawl handler methods 237 | - Test error handling for batchCrawl, smartCrawl, crawlRecursive, parseSitemap 238 | - Cover edge cases including XML detection, URL validation, depth limits 239 | - Added integration tests for real API behavior validation 240 | - Test all crawl parameters including word_count_threshold, image thresholds, exclude_social_media_links 241 | - Properly handle MCP error formatting vs direct handler throws 242 | 243 | ## Version 2.6.0 (2025-08-04) 244 | 245 | ### Testing 246 | - Added comprehensive test coverage for error handling paths 247 | - Session creation with failed initial crawl 248 | - JavaScript execution error handling with accurate API response formats 249 | - Extract links manual extraction fallback when API returns empty links 250 | - Improved coverage from 87.23% to 89.71% lines 251 | - Added integration tests for crawl error handling 252 | - Invalid URL validation 253 | - Non-existent domain handling 254 | - Added unit tests for utility handlers 255 | - Manual link extraction from markdown 256 | - Malformed URL handling 257 | - Empty results scenarios 258 | 259 | ### Improvements 260 | - Better error resilience in session creation when initial crawl fails 261 | - More accurate test mocks based on real API responses 262 | 263 | ## Version 2.5.0 (2025-08-04) 264 | 265 | ### Refactoring 266 | - Removed backward compatibility exports from index.ts 267 | - Updated test imports 
to use direct module paths 268 | - Cleaned up index.ts to focus solely on CLI entry point 269 | 270 | ### Testing 271 | - Updated jest.setup.cjs to load .env for integration tests 272 | - Unit tests continue using localhost:11235 273 | - Integration tests now use values from .env file 274 | 275 | ## Version 2.4.0 (2025-08-04) 276 | 277 | ### Features 278 | - Replaced Codecov with GitHub Actions-based coverage badge 279 | - Coverage badge now uses GitHub Gist for storage 280 | - No external dependencies for coverage tracking 281 | - Badge updates automatically with each CI run 282 | - Coverage reports published to GitHub Pages 283 | - Interactive HTML coverage report available at https://omgwtfwow.github.io/mcp-crawl4ai-ts/coverage/ 284 | 285 | ### Bug Fixes 286 | - Fixed smart_crawl implementation to remove unsupported 'strategy' parameter 287 | - Fixed coverage extraction in CI to use lcov.info format 288 | - Added proper URL encoding for Shields.io endpoint badge 289 | 290 | ### CI/CD Improvements 291 | - Added GitHub Pages deployment for coverage reports 292 | - Added write permissions for GitHub Actions to create gh-pages branch 293 | - Removed Codecov integration completely 294 | 295 | ### Maintenance 296 | - Removed .codecov.yml configuration file 297 | - Removed CODECOV_TOKEN from repository secrets 298 | - Updated README.md with new coverage badge 299 | 300 | ## Version 2.3.0 (2025-08-03) 301 | 302 | ### Refactoring 303 | - Split large 2,366-line index.ts file into modular structure 304 | - Created handlers/ directory with operation-specific handlers 305 | - Created schemas/ directory for validation schemas 306 | - Reduced file sizes to under 1,000 lines each (most under 300) 307 | - Maintained backward compatibility with all exports 308 | - Improved code organization and maintainability 309 | 310 | ### Testing 311 | - Updated tests to work with new modular structure 312 | - Maintained test coverage at 87.23% (exceeds 86% requirement) 313 | - All 165 unit tests passing 314 | 315 | ## Version 2.2.0 (2025-08-03) 316 | 317 | ### Features 318 | - Added comprehensive test coverage infrastructure 319 | - Set up Jest code coverage with Istanbul 320 | - Added test:coverage and test:ci npm scripts 321 | - Configured coverage thresholds (80% for all metrics) 322 | - Added coverage badge to README 323 | - Achieved 86.51% line coverage, 82.21% statement coverage 324 | 325 | ### Testing Improvements 326 | - Added comprehensive unit tests for all tool handlers in index.ts 327 | - Tests for success cases, error handling, and edge cases 328 | - Tests for MCP protocol request handling 329 | - Tests for parameter validation with Zod schemas 330 | - Added unit tests for JavaScript validation function 331 | - Added tests for private methods: parseSitemap and detectContentType 332 | - Fixed integration test reliability issues: 333 | - Replaced example.com with httpbin.org in execute-js tests 334 | - Fixed test expectations for JavaScript execution results 335 | - Fixed MCP request handler test setup 336 | 337 | ### Bug Fixes 338 | - Fixed parse_sitemap implementation to use axios.get directly instead of non-existent service method 339 | - Fixed TypeScript 'any' warnings in test files (eliminated 90+ warnings) 340 | - Fixed linting errors and formatting issues across the test suite 341 | - Fixed test URL in batch-crawl test (httpbingo.org → httpbin.org) 342 | 343 | ### CI/CD Improvements 344 | - Updated GitHub Actions workflow to include coverage reporting 345 | - Added Node.js 22.x to the test matrix 
346 | - Fixed all failing CI tests 347 | 348 | ## Version 2.1.2 (2025-08-03) 349 | 350 | ### Documentation 351 | - Updated Node.js requirement from 16+ to 18+ to reflect actual testing and support 352 | - Node.js 16 reached End-of-Life in September 2023 353 | - CI only tests on Node.js 18.x and 20.x 354 | - Added `engines` field to package.json to enforce Node.js 18+ requirement 355 | 356 | ## Version 2.1.1 (2025-08-03) 357 | 358 | ### Bug Fixes 359 | - Fixed GitHub homepage README display issue by renaming .github/README.md to CI.md 360 | - GitHub was showing the CI documentation instead of the main project README 361 | 362 | ## Version 2.1.0 (2025-08-03) 363 | 364 | ### Bug Fixes 365 | - Fixed `smart_crawl` bug where markdown object was incorrectly printed as `[object Object]` 366 | - Now correctly accesses `result.markdown.raw_markdown` for content display 367 | - Fixed integration test timeout issues: 368 | - Replaced example.com with httpbin.org/html in tests to avoid "domcontentloaded" timeout issues 369 | - Fixed httpbin.org URLs by adding proper path suffixes (e.g., /links/5/0) 370 | - Limited Jest parallelization for integration tests to prevent server overload 371 | - Fixed parameter mapping in `get_markdown` tool - now correctly maps schema properties (`filter`, `query`, `cache`) to API parameters (`f`, `q`, `c`) 372 | - Fixed `smart_crawl` schema to use `follow_links` parameter instead of `remove_images` 373 | - Fixed `extract_links` schema mismatch - corrected schema to use `categorize` parameter as defined in tool 374 | - Fixed `extract_links` implementation to properly handle link objects returned by API 375 | - Fixed `crawl_recursive` schema mismatch - corrected schema to use `include_pattern` and `exclude_pattern` instead of `filter_pattern` and `bypass_cache` 376 | - Fixed `crawl_recursive` implementation to use `/crawl` endpoint instead of `/md` for proper link extraction 377 | - Fixed `crawl_recursive` type issues and improved link handling for recursive crawling 378 | - Fixed `parse_sitemap` implementation to fetch sitemaps directly instead of through Crawl4AI server API 379 | - Fixed `create_session` schema to make `session_id` optional as documented 380 | - Enhanced `create_session` response to include all session parameters for programmatic access 381 | - Implemented proper handling for non-functional server parameters: 382 | - `batch_crawl`: `remove_images` now uses `exclude_tags` in crawler_config to actually remove images 383 | - `smart_crawl`: `follow_links` now crawls URLs found in sitemaps/RSS feeds (max 10 URLs) 384 | - Fixed `crawl` and `generate_pdf` tools PDF response to use proper MCP SDK embedded resource format with blob field 385 | 386 | ### Improvements 387 | - Added comprehensive integration tests for `batch_crawl` tool (7 tests) 388 | - Added comprehensive integration tests for `smart_crawl` tool (8 tests) 389 | - Fixed all ESLint formatting issues across the codebase 390 | - Enhanced error handling for empty URL arrays in batch_crawl 391 | - Improved test reliability by replacing problematic test URLs 392 | - Updated tool descriptions to accurately reflect actual behavior 393 | - Added proper TypeScript types for getMarkdown function 394 | - Enhanced test coverage for batch_crawl parameter handling 395 | - Added comprehensive unit and integration tests for `extract_links` tool 396 | - Improved JSON endpoint detection in `extract_links` tool 397 | - Better error handling for `extract_links` with graceful error messages 398 | - Added comprehensive 
integration tests for `crawl_recursive` tool 399 | - Improved `crawl_recursive` output format to clearly show depth levels and internal link counts 400 | - Enhanced error handling in `crawl_recursive` to continue crawling even if individual pages fail 401 | - Added comprehensive integration tests for `parse_sitemap` tool with various test cases 402 | - Added comprehensive integration tests for session management tools (`create_session`, `clear_session`, `list_sessions`) 403 | - Enhanced integration tests for `extract_with_llm` tool to handle non-deterministic LLM responses 404 | - Installed nock library for future HTTP mocking in unit tests 405 | - Fixed TypeScript lint warnings by replacing `any` types with proper types: 406 | - Changed error handling to use proper type assertions 407 | - Updated `unknown[]` for JavaScript execution results 408 | - Used `Record<string, unknown>` for generic objects 409 | - Created `LinkItem` interface for better type safety 410 | - Fixed all production code `any` types 411 | - Removed unused legacy `CrawlResult` interface 412 | - Consolidated unit tests to use nock for HTTP mocking: 413 | - Removed redundant Jest mock test file 414 | - Removed unused mocks directory 415 | - Renamed test file for clarity 416 | - Improved unit test performance from 92s to ~1s by removing timeout tests 417 | - Cleaned up test organization and removed test README 418 | - Added GitHub Actions CI workflow: 419 | - Automatic testing on push to main and pull requests 420 | - Tests run on Node.js 18.x and 20.x 421 | - Includes linting, formatting checks, and build verification 422 | - Added mock helper scripts: 423 | - `npm run generate-mocks`: Generate nock mock code from real API 424 | - `npm run view-mocks`: View and save API responses for reference 425 | - Both scripts help maintain accurate test mocks 426 | 427 | ## Version 2.0.1 (2025-08-02) 428 | Update README 429 | 430 | ## Version 2.0.0 (2025-08-02) 431 | 432 | ### Breaking Changes 433 | - Renamed `crawl_with_config` tool to `crawl` 434 | 435 | ### New Features 436 | - Added comprehensive response types for all endpoints (PDF, screenshot, HTML, markdown) 437 | - Enhanced parameter validation with clearer error messages 438 | - Improved documentation for JavaScript execution patterns 439 | - Added selector strategy guidance for form interaction 440 | - Better distinction between `wait_for` and `wait_until` usage 441 | 442 | ### Bug Fixes 443 | - Fixed server 500 errors by always including `crawler_config` in requests 444 | - Updated media and links types to match actual server responses 445 | - Corrected validation for `js_only` parameter usage 446 | 447 | ### Documentation 448 | - Added troubleshooting section with common issues and solutions 449 | - Included practical examples for form filling and multi-step navigation 450 | - Enhanced tool descriptions with clear warnings and recommendations 451 | - Added selector strategy guide for working with dynamic content 452 | 453 | ### Technical Improvements 454 | - Updated all TypeScript types based on actual server responses 455 | - Improved error handling and user-friendly messages 456 | - Enhanced Zod validation schemas with helpful refinements 457 | - Added comprehensive integration tests for new features 458 | 459 | ### Known Issues 460 | - `js_only: true` causes server serialization errors - use `screenshot: true` as workaround 461 | - Using `wait_for` with elements that already exist can cause timeouts - use `wait_until` instead 462 | 463 | ## Version 1.0.2 464 | - 
Initial stable release with full MCP implementation 465 | - Support for all Crawl4AI endpoints 466 | - Basic session management 467 | - Integration with MCP clients ``` -------------------------------------------------------------------------------- /src/handlers/crawl-handlers.ts: -------------------------------------------------------------------------------- ```typescript 1 | import { BaseHandler } from './base-handler.js'; 2 | import { 3 | BatchCrawlOptions, 4 | CrawlResultItem, 5 | AdvancedCrawlConfig, 6 | CrawlEndpointResponse, 7 | ExtractionStrategy, 8 | TableExtractionStrategy, 9 | MarkdownGeneratorOptions, 10 | } from '../types.js'; 11 | import * as fs from 'fs/promises'; 12 | import * as path from 'path'; 13 | import * as os from 'os'; 14 | 15 | export class CrawlHandlers extends BaseHandler { 16 | async batchCrawl(options: BatchCrawlOptions) { 17 | try { 18 | let response; 19 | 20 | // Check if we have per-URL configs (new in 0.7.3/0.7.4) 21 | if (options.configs && options.configs.length > 0) { 22 | // Use the new configs array format 23 | // Extract URLs from configs for the urls field 24 | const urls = options.configs.map((config) => config.url); 25 | const requestBody = { 26 | urls: urls, 27 | configs: options.configs, 28 | max_concurrent: options.max_concurrent, 29 | }; 30 | response = await this.axiosClient.post('/crawl', requestBody); 31 | } else { 32 | // Use the legacy format with single crawler_config 33 | // Build crawler config if needed 34 | const crawler_config: Record<string, unknown> = {}; 35 | 36 | // Handle remove_images by using exclude_tags 37 | if (options.remove_images) { 38 | crawler_config.exclude_tags = ['img', 'picture', 'svg']; 39 | } 40 | 41 | if (options.bypass_cache) { 42 | crawler_config.cache_mode = 'BYPASS'; 43 | } 44 | 45 | response = await this.axiosClient.post('/crawl', { 46 | urls: options.urls, 47 | max_concurrent: options.max_concurrent, 48 | crawler_config: Object.keys(crawler_config).length > 0 ? crawler_config : undefined, 49 | }); 50 | } 51 | 52 | const results = response.data.results || []; 53 | 54 | // Add memory metrics if available 55 | let metricsText = ''; 56 | const responseData = response.data as CrawlEndpointResponse; 57 | if (responseData.server_memory_delta_mb !== undefined || responseData.server_peak_memory_mb !== undefined) { 58 | const memoryInfo = []; 59 | if (responseData.server_processing_time_s !== undefined) { 60 | memoryInfo.push(`Processing time: ${responseData.server_processing_time_s.toFixed(2)}s`); 61 | } 62 | if (responseData.server_memory_delta_mb !== undefined) { 63 | memoryInfo.push(`Memory delta: ${responseData.server_memory_delta_mb.toFixed(1)}MB`); 64 | } 65 | if (responseData.server_peak_memory_mb !== undefined) { 66 | memoryInfo.push(`Peak memory: ${responseData.server_peak_memory_mb.toFixed(1)}MB`); 67 | } 68 | if (memoryInfo.length > 0) { 69 | metricsText = `\n\nServer metrics: ${memoryInfo.join(', ')}`; 70 | } 71 | } 72 | 73 | return { 74 | content: [ 75 | { 76 | type: 'text', 77 | text: `Batch crawl completed. Processed ${results.length} URLs:\n\n${results 78 | .map( 79 | (r: CrawlResultItem, i: number) => `${i + 1}. ${options.urls[i]}: ${r.success ? 
'Success' : 'Failed'}`, 80 | ) 81 | .join('\n')}${metricsText}`, 82 | }, 83 | ], 84 | }; 85 | } catch (error) { 86 | throw this.formatError(error, 'batch crawl'); 87 | } 88 | } 89 | 90 | async smartCrawl(options: { url: string; max_depth?: number; follow_links?: boolean; bypass_cache?: boolean }) { 91 | try { 92 | // First, try to detect the content type from URL or HEAD request 93 | let contentType = ''; 94 | try { 95 | const headResponse = await this.axiosClient.head(options.url); 96 | contentType = headResponse.headers['content-type'] || ''; 97 | } catch { 98 | // If HEAD request fails, continue anyway - we'll detect from the crawl response 99 | console.debug('HEAD request failed, will detect content type from response'); 100 | } 101 | 102 | let detectedType = 'html'; 103 | if (options.url.includes('sitemap') || options.url.endsWith('.xml')) { 104 | detectedType = 'sitemap'; 105 | } else if (options.url.includes('rss') || options.url.includes('feed')) { 106 | detectedType = 'rss'; 107 | } else if (contentType.includes('text/plain') || options.url.endsWith('.txt')) { 108 | detectedType = 'text'; 109 | } else if (contentType.includes('application/xml') || contentType.includes('text/xml')) { 110 | detectedType = 'xml'; 111 | } else if (contentType.includes('application/json')) { 112 | detectedType = 'json'; 113 | } 114 | 115 | // Crawl without the unsupported 'strategy' parameter 116 | const response = await this.axiosClient.post('/crawl', { 117 | urls: [options.url], 118 | crawler_config: { 119 | cache_mode: options.bypass_cache ? 'BYPASS' : 'ENABLED', 120 | }, 121 | browser_config: { 122 | headless: true, 123 | browser_type: 'chromium', 124 | }, 125 | }); 126 | 127 | const results = response.data.results || []; 128 | const result = results[0] || {}; 129 | 130 | // Handle follow_links for sitemaps and RSS feeds 131 | if (options.follow_links && (detectedType === 'sitemap' || detectedType === 'rss' || detectedType === 'xml')) { 132 | // Extract URLs from the content 133 | const urlPattern = /<loc>(.*?)<\/loc>|<link[^>]*>(.*?)<\/link>|href=["']([^"']+)["']/gi; 134 | const content = result.markdown || result.html || ''; 135 | const foundUrls: string[] = []; 136 | let match; 137 | 138 | while ((match = urlPattern.exec(content)) !== null) { 139 | const url = match[1] || match[2] || match[3]; 140 | if (url && url.startsWith('http')) { 141 | foundUrls.push(url); 142 | } 143 | } 144 | 145 | if (foundUrls.length > 0) { 146 | // Limit to first 10 URLs to avoid overwhelming the system 147 | const urlsToFollow = foundUrls.slice(0, Math.min(10, options.max_depth || 10)); 148 | 149 | // Crawl the found URLs 150 | await this.axiosClient.post('/crawl', { 151 | urls: urlsToFollow, 152 | max_concurrent: 3, 153 | bypass_cache: options.bypass_cache, 154 | }); 155 | 156 | return { 157 | content: [ 158 | { 159 | type: 'text', 160 | text: `Smart crawl detected content type: ${detectedType}\n\nMain content:\n${result.markdown?.raw_markdown || result.html || 'No content extracted'}\n\n---\nFollowed ${urlsToFollow.length} links:\n${urlsToFollow.map((url, i) => `${i + 1}. ${url}`).join('\n')}`, 161 | }, 162 | ...(result.metadata 163 | ? 
[ 164 | { 165 | type: 'text', 166 | text: `\n\n---\nMetadata:\n${JSON.stringify(result.metadata, null, 2)}`, 167 | }, 168 | ] 169 | : []), 170 | ], 171 | }; 172 | } 173 | } 174 | 175 | return { 176 | content: [ 177 | { 178 | type: 'text', 179 | text: `Smart crawl detected content type: ${detectedType}\n\n${result.markdown?.raw_markdown || result.html || 'No content extracted'}`, 180 | }, 181 | ...(result.metadata 182 | ? [ 183 | { 184 | type: 'text', 185 | text: `\n\n---\nMetadata:\n${JSON.stringify(result.metadata, null, 2)}`, 186 | }, 187 | ] 188 | : []), 189 | ], 190 | }; 191 | } catch (error) { 192 | throw this.formatError(error, 'smart crawl'); 193 | } 194 | } 195 | 196 | async crawlRecursive(options: { 197 | url: string; 198 | max_depth?: number; 199 | max_pages?: number; 200 | include_pattern?: string; 201 | exclude_pattern?: string; 202 | }) { 203 | try { 204 | const startUrl = new URL(options.url); 205 | const visited = new Set<string>(); 206 | const toVisit: Array<{ url: string; depth: number }> = [{ url: options.url, depth: 0 }]; 207 | const results: Array<{ url: string; content: string; internal_links_found: number; depth: number }> = []; 208 | let maxDepthReached = 0; 209 | 210 | const includeRegex = options.include_pattern ? new RegExp(options.include_pattern) : null; 211 | const excludeRegex = options.exclude_pattern ? new RegExp(options.exclude_pattern) : null; 212 | 213 | const maxDepth = options.max_depth !== undefined ? options.max_depth : 3; 214 | const maxPages = options.max_pages || 50; 215 | 216 | while (toVisit.length > 0 && results.length < maxPages) { 217 | const current = toVisit.shift(); 218 | if (!current || visited.has(current.url) || current.depth > maxDepth) { 219 | continue; 220 | } 221 | 222 | visited.add(current.url); 223 | 224 | try { 225 | // Check URL patterns 226 | if (excludeRegex && excludeRegex.test(current.url)) continue; 227 | if (includeRegex && !includeRegex.test(current.url)) continue; 228 | 229 | // Crawl the page using the crawl endpoint to get links 230 | const response = await this.axiosClient.post('/crawl', { 231 | urls: [current.url], 232 | crawler_config: { 233 | cache_mode: 'BYPASS', 234 | }, 235 | }); 236 | 237 | const crawlResults = response.data.results || [response.data]; 238 | const result: CrawlResultItem = crawlResults[0]; 239 | 240 | if (result && result.success) { 241 | const markdownContent = result.markdown?.fit_markdown || result.markdown?.raw_markdown || ''; 242 | const internalLinksCount = result.links?.internal?.length || 0; 243 | maxDepthReached = Math.max(maxDepthReached, current.depth); 244 | results.push({ 245 | url: current.url, 246 | content: markdownContent, 247 | internal_links_found: internalLinksCount, 248 | depth: current.depth, 249 | }); 250 | 251 | // Add internal links to crawl queue 252 | if (current.depth < maxDepth && result.links?.internal) { 253 | for (const linkObj of result.links.internal) { 254 | const linkUrl = linkObj.href || linkObj; 255 | try { 256 | const absoluteUrl = new URL(linkUrl, current.url).toString(); 257 | if (!visited.has(absoluteUrl) && new URL(absoluteUrl).hostname === startUrl.hostname) { 258 | toVisit.push({ url: absoluteUrl, depth: current.depth + 1 }); 259 | } 260 | } catch (e) { 261 | // Skip invalid URLs 262 | console.debug('Invalid URL:', e); 263 | } 264 | } 265 | } 266 | } 267 | } catch (error) { 268 | // Log but continue crawling other pages 269 | console.error(`Failed to crawl ${current.url}:`, error instanceof Error ? 
error.message : error); 270 | } 271 | } 272 | 273 | // Prepare the output text 274 | let outputText = `Recursive crawl completed:\n\nPages crawled: ${results.length}\nStarting URL: ${options.url}\n`; 275 | 276 | if (results.length > 0) { 277 | outputText += `Max depth reached: ${maxDepthReached} (limit: ${maxDepth})\n\nNote: Only internal links (same domain) are followed during recursive crawling.\n\nPages found:\n${results.map((r) => `- [Depth ${r.depth}] ${r.url}\n Content: ${r.content.length} chars\n Internal links found: ${r.internal_links_found}`).join('\n')}`; 278 | } else { 279 | outputText += `\nNo pages could be crawled. This might be due to:\n- The starting URL returned an error\n- No internal links were found\n- All discovered links were filtered out by include/exclude patterns`; 280 | } 281 | 282 | return { 283 | content: [ 284 | { 285 | type: 'text', 286 | text: outputText, 287 | }, 288 | ], 289 | }; 290 | } catch (error) { 291 | throw this.formatError(error, 'crawl recursively'); 292 | } 293 | } 294 | 295 | async parseSitemap(options: { url: string; filter_pattern?: string }) { 296 | try { 297 | // Fetch the sitemap directly (not through Crawl4AI server) 298 | const axios = (await import('axios')).default; 299 | const response = await axios.get(options.url, { 300 | timeout: 30000, 301 | headers: { 302 | 'User-Agent': 'Mozilla/5.0 (compatible; MCP-Crawl4AI/1.0)', 303 | }, 304 | }); 305 | const sitemapContent = response.data; 306 | 307 | // Parse XML content - simple regex approach for basic sitemaps 308 | const urlMatches = sitemapContent.match(/<loc>(.*?)<\/loc>/g) || []; 309 | const urls = urlMatches.map((match: string) => match.replace(/<\/?loc>/g, '')); 310 | 311 | // Apply filter if provided 312 | let filteredUrls = urls; 313 | if (options.filter_pattern) { 314 | const filterRegex = new RegExp(options.filter_pattern); 315 | filteredUrls = urls.filter((url: string) => filterRegex.test(url)); 316 | } 317 | 318 | return { 319 | content: [ 320 | { 321 | type: 'text', 322 | text: `Sitemap parsed successfully:\n\nTotal URLs found: ${urls.length}\nFiltered URLs: ${filteredUrls.length}\n\nURLs:\n${filteredUrls.slice(0, 100).join('\n')}${filteredUrls.length > 100 ? '\n... 
and ' + (filteredUrls.length - 100) + ' more' : ''}`, 323 | }, 324 | ], 325 | }; 326 | } catch (error) { 327 | throw this.formatError(error, 'parse sitemap'); 328 | } 329 | } 330 | 331 | async crawl(options: Record<string, unknown>) { 332 | try { 333 | // Ensure options is an object 334 | if (!options || typeof options !== 'object') { 335 | throw new Error('crawl requires options object with at least a url parameter'); 336 | } 337 | 338 | // Build browser_config 339 | const browser_config: Record<string, unknown> = { 340 | headless: true, // Always true as noted 341 | }; 342 | 343 | if (options.browser_type) browser_config.browser_type = options.browser_type; 344 | if (options.viewport_width) browser_config.viewport_width = options.viewport_width; 345 | if (options.viewport_height) browser_config.viewport_height = options.viewport_height; 346 | if (options.user_agent) browser_config.user_agent = options.user_agent; 347 | if (options.headers) browser_config.headers = options.headers; 348 | if (options.cookies) browser_config.cookies = options.cookies; 349 | 350 | // Handle proxy configuration - support both unified and legacy formats 351 | if (options.proxy) { 352 | // New unified format (0.7.3/0.7.4) 353 | browser_config.proxy = options.proxy; 354 | } else if (options.proxy_server) { 355 | // Legacy format for backward compatibility 356 | browser_config.proxy_config = { 357 | server: options.proxy_server, 358 | username: options.proxy_username, 359 | password: options.proxy_password, 360 | }; 361 | } 362 | 363 | // Build crawler_config 364 | const crawler_config: Record<string, unknown> = {}; 365 | 366 | // Content filtering 367 | if (options.word_count_threshold !== undefined) 368 | crawler_config.word_count_threshold = options.word_count_threshold; 369 | if (options.excluded_tags) crawler_config.excluded_tags = options.excluded_tags; 370 | if (options.remove_overlay_elements) crawler_config.remove_overlay_elements = options.remove_overlay_elements; 371 | 372 | // JavaScript execution 373 | if (options.js_code !== undefined && options.js_code !== null) { 374 | // If js_code is an array, join it with newlines for the server 375 | crawler_config.js_code = Array.isArray(options.js_code) ? options.js_code.join('\n') : options.js_code; 376 | } else if (options.js_code === null) { 377 | // If js_code is explicitly null, throw a helpful error 378 | throw new Error('js_code parameter is null. 
Please provide JavaScript code as a string or array of strings.'); 379 | } 380 | if (options.wait_for) crawler_config.wait_for = options.wait_for; 381 | if (options.wait_for_timeout) crawler_config.wait_for_timeout = options.wait_for_timeout; 382 | 383 | // Dynamic content 384 | if (options.delay_before_scroll) crawler_config.delay_before_scroll = options.delay_before_scroll; 385 | if (options.scroll_delay) crawler_config.scroll_delay = options.scroll_delay; 386 | 387 | // Content processing 388 | if (options.process_iframes) crawler_config.process_iframes = options.process_iframes; 389 | if (options.exclude_external_links) crawler_config.exclude_external_links = options.exclude_external_links; 390 | 391 | // Export options 392 | if (options.screenshot) crawler_config.screenshot = options.screenshot; 393 | if (options.pdf) crawler_config.pdf = options.pdf; 394 | 395 | // Session and cache 396 | if (options.session_id) { 397 | crawler_config.session_id = options.session_id; 398 | // Update session last_used time 399 | const session = this.sessions.get(String(options.session_id)); 400 | if (session) { 401 | session.last_used = new Date(); 402 | } 403 | } 404 | if (options.cache_mode) crawler_config.cache_mode = String(options.cache_mode).toLowerCase(); 405 | 406 | // Performance 407 | if (options.timeout) crawler_config.timeout = options.timeout; 408 | if (options.verbose) crawler_config.verbose = options.verbose; 409 | 410 | // Additional crawler parameters 411 | if (options.wait_until) crawler_config.wait_until = options.wait_until; 412 | if (options.page_timeout) crawler_config.page_timeout = options.page_timeout; 413 | if (options.wait_for_images) crawler_config.wait_for_images = options.wait_for_images; 414 | if (options.ignore_body_visibility) crawler_config.ignore_body_visibility = options.ignore_body_visibility; 415 | if (options.scan_full_page) crawler_config.scan_full_page = options.scan_full_page; 416 | if (options.remove_forms) crawler_config.remove_forms = options.remove_forms; 417 | if (options.keep_data_attributes) crawler_config.keep_data_attributes = options.keep_data_attributes; 418 | if (options.excluded_selector) crawler_config.excluded_selector = options.excluded_selector; 419 | if (options.only_text) crawler_config.only_text = options.only_text; 420 | 421 | // Media handling 422 | if (options.image_description_min_word_threshold !== undefined) 423 | crawler_config.image_description_min_word_threshold = options.image_description_min_word_threshold; 424 | if (options.image_score_threshold !== undefined) 425 | crawler_config.image_score_threshold = options.image_score_threshold; 426 | if (options.exclude_external_images) crawler_config.exclude_external_images = options.exclude_external_images; 427 | if (options.screenshot_wait_for !== undefined) crawler_config.screenshot_wait_for = options.screenshot_wait_for; 428 | 429 | // Link filtering 430 | if (options.exclude_social_media_links) 431 | crawler_config.exclude_social_media_links = options.exclude_social_media_links; 432 | if (options.exclude_domains) crawler_config.exclude_domains = options.exclude_domains; 433 | 434 | // Page interaction 435 | if (options.js_only) crawler_config.js_only = options.js_only; 436 | if (options.simulate_user) crawler_config.simulate_user = options.simulate_user; 437 | if (options.override_navigator) crawler_config.override_navigator = options.override_navigator; 438 | if (options.magic) crawler_config.magic = options.magic; 439 | 440 | // Virtual scroll 441 | if 
(options.virtual_scroll_config) crawler_config.virtual_scroll_config = options.virtual_scroll_config; 442 | 443 | // Cache control 444 | if (options.cache_mode) crawler_config.cache_mode = options.cache_mode; 445 | 446 | // Other 447 | if (options.log_console) crawler_config.log_console = options.log_console; 448 | if (options.capture_mhtml) crawler_config.capture_mhtml = options.capture_mhtml; 449 | 450 | // New parameters from 0.7.3/0.7.4 451 | if (options.delay_before_return_html) crawler_config.delay_before_return_html = options.delay_before_return_html; 452 | if (options.css_selector) crawler_config.css_selector = options.css_selector; 453 | if (options.include_links !== undefined) crawler_config.include_links = options.include_links; 454 | if (options.resolve_absolute_urls !== undefined) 455 | crawler_config.resolve_absolute_urls = options.resolve_absolute_urls; 456 | 457 | // Call service with proper configuration 458 | const crawlConfig: AdvancedCrawlConfig = { 459 | url: options.url ? String(options.url) : undefined, 460 | crawler_config, 461 | }; 462 | 463 | // Add extraction strategy passthrough objects if provided 464 | if (options.extraction_strategy) 465 | crawlConfig.extraction_strategy = options.extraction_strategy as ExtractionStrategy; 466 | if (options.table_extraction_strategy) 467 | crawlConfig.table_extraction_strategy = options.table_extraction_strategy as TableExtractionStrategy; 468 | if (options.markdown_generator_options) 469 | crawlConfig.markdown_generator_options = options.markdown_generator_options as MarkdownGeneratorOptions; 470 | 471 | // Only include browser_config if we're not using a session 472 | if (!options.session_id) { 473 | crawlConfig.browser_config = browser_config; 474 | } 475 | 476 | const response: CrawlEndpointResponse = await this.service.crawl(crawlConfig); 477 | 478 | // Validate response structure 479 | if (!response || !response.results || response.results.length === 0) { 480 | throw new Error('Invalid response from server: no results received'); 481 | } 482 | 483 | const result: CrawlResultItem = response.results[0]; 484 | 485 | // Build response content 486 | const content = []; 487 | 488 | // Main content - use markdown.raw_markdown as primary content 489 | let mainContent = 'No content extracted'; 490 | 491 | if (result.extracted_content) { 492 | // Handle extraction results which might be objects or strings 493 | if (typeof result.extracted_content === 'string') { 494 | mainContent = result.extracted_content; 495 | } else if (typeof result.extracted_content === 'object') { 496 | mainContent = JSON.stringify(result.extracted_content, null, 2); 497 | } 498 | } else if (result.markdown?.raw_markdown) { 499 | mainContent = result.markdown.raw_markdown; 500 | } else if (result.html) { 501 | mainContent = result.html; 502 | } else if (result.fit_html) { 503 | mainContent = result.fit_html; 504 | } 505 | 506 | content.push({ 507 | type: 'text', 508 | text: mainContent, 509 | }); 510 | 511 | // Screenshot if available 512 | if (result.screenshot) { 513 | // Save to local directory if requested 514 | let savedFilePath: string | undefined; 515 | if (options.screenshot_directory && typeof options.screenshot_directory === 'string') { 516 | try { 517 | // Resolve home directory path 518 | let screenshotDir = options.screenshot_directory; 519 | if (screenshotDir.startsWith('~')) { 520 | const homedir = os.homedir(); 521 | screenshotDir = path.join(homedir, screenshotDir.slice(1)); 522 | } 523 | 524 | // Check if user provided a file path instead 
of directory 525 | if (screenshotDir.endsWith('.png') || screenshotDir.endsWith('.jpg')) { 526 | console.warn( 527 | `Warning: screenshot_directory should be a directory path, not a file path. Using parent directory.`, 528 | ); 529 | screenshotDir = path.dirname(screenshotDir); 530 | } 531 | 532 | // Ensure directory exists 533 | await fs.mkdir(screenshotDir, { recursive: true }); 534 | 535 | // Generate filename from URL and timestamp 536 | const url = new URL(String(options.url)); 537 | const hostname = url.hostname.replace(/[^a-z0-9]/gi, '-'); 538 | const timestamp = new Date().toISOString().replace(/[:.]/g, '-').slice(0, -5); 539 | const filename = `${hostname}-${timestamp}.png`; 540 | 541 | savedFilePath = path.join(screenshotDir, filename); 542 | 543 | // Convert base64 to buffer and save 544 | const buffer = Buffer.from(result.screenshot, 'base64'); 545 | await fs.writeFile(savedFilePath, buffer); 546 | } catch (saveError) { 547 | // Log error but don't fail the operation 548 | console.error('Failed to save screenshot locally:', saveError); 549 | } 550 | } 551 | 552 | // If saved locally and screenshot is large (>800KB), don't return the base64 data 553 | const screenshotSize = Buffer.from(result.screenshot, 'base64').length; 554 | const shouldReturnImage = !savedFilePath || screenshotSize < 800 * 1024; // 800KB threshold 555 | 556 | if (shouldReturnImage) { 557 | content.push({ 558 | type: 'image', 559 | data: result.screenshot, 560 | mimeType: 'image/png', 561 | }); 562 | } 563 | 564 | if (savedFilePath) { 565 | const sizeInfo = !shouldReturnImage 566 | ? ` (${Math.round(screenshotSize / 1024)}KB - too large to display inline)` 567 | : ''; 568 | content.push({ 569 | type: 'text', 570 | text: `\n---\nScreenshot saved to: ${savedFilePath}${sizeInfo}`, 571 | }); 572 | } 573 | } 574 | 575 | // PDF if available 576 | if (result.pdf) { 577 | content.push({ 578 | type: 'resource', 579 | resource: { 580 | uri: `data:application/pdf;name=${encodeURIComponent(new URL(String(options.url)).hostname)}.pdf;base64,${result.pdf}`, 581 | mimeType: 'application/pdf', 582 | blob: result.pdf, 583 | }, 584 | }); 585 | } 586 | 587 | // Metadata 588 | if (result.metadata) { 589 | content.push({ 590 | type: 'text', 591 | text: `\n---\nMetadata: ${JSON.stringify(result.metadata, null, 2)}`, 592 | }); 593 | } 594 | 595 | // Links 596 | if (result.links && (result.links.internal.length > 0 || result.links.external.length > 0)) { 597 | content.push({ 598 | type: 'text', 599 | text: `\n---\nLinks: Internal: ${result.links.internal.length}, External: ${result.links.external.length}`, 600 | }); 601 | } 602 | 603 | // JS execution results if available 604 | if (result.js_execution_result && result.js_execution_result.results.length > 0) { 605 | const jsResults = result.js_execution_result.results 606 | .map((res: unknown, idx: number) => { 607 | return `Result ${idx + 1}: ${JSON.stringify(res, null, 2)}`; 608 | }) 609 | .join('\n'); 610 | content.push({ 611 | type: 'text', 612 | text: `\n---\nJavaScript Execution Results:\n${jsResults}`, 613 | }); 614 | } 615 | 616 | // Add memory metrics if available 617 | if (response.server_memory_delta_mb !== undefined || response.server_peak_memory_mb !== undefined) { 618 | const memoryInfo = []; 619 | if (response.server_processing_time_s !== undefined) { 620 | memoryInfo.push(`Processing time: ${response.server_processing_time_s.toFixed(2)}s`); 621 | } 622 | if (response.server_memory_delta_mb !== undefined) { 623 | memoryInfo.push(`Memory delta: 
${response.server_memory_delta_mb.toFixed(1)}MB`); 624 | } 625 | if (response.server_peak_memory_mb !== undefined) { 626 | memoryInfo.push(`Peak memory: ${response.server_peak_memory_mb.toFixed(1)}MB`); 627 | } 628 | if (memoryInfo.length > 0) { 629 | content.push({ 630 | type: 'text', 631 | text: `\n---\nServer metrics: ${memoryInfo.join(', ')}`, 632 | }); 633 | } 634 | } 635 | 636 | return { content }; 637 | } catch (error) { 638 | throw this.formatError(error, 'crawl'); 639 | } 640 | } 641 | } 642 | ``` -------------------------------------------------------------------------------- /src/__tests__/crawl4ai-service.test.ts: -------------------------------------------------------------------------------- ```typescript 1 | import nock from 'nock'; 2 | import { Crawl4AIService } from '../crawl4ai-service.js'; 3 | import type { 4 | MarkdownEndpointResponse, 5 | ScreenshotEndpointResponse, 6 | PDFEndpointResponse, 7 | HTMLEndpointResponse, 8 | CrawlEndpointResponse, 9 | } from '../types.js'; 10 | 11 | /** 12 | * Unit tests for Crawl4AIService using nock for HTTP mocking 13 | * 14 | * Mock Maintenance: 15 | * - These mocks are maintained manually based on the actual API responses 16 | * - When the API changes, update the mock responses to match 17 | * - Integration tests validate against the real API 18 | */ 19 | 20 | describe('Crawl4AIService', () => { 21 | let service: Crawl4AIService; 22 | // Unit tests always use localhost as configured in jest.setup.cjs 23 | const baseURL = 'http://localhost:11235'; 24 | const apiKey = 'test-api-key'; 25 | 26 | beforeEach(() => { 27 | service = new Crawl4AIService(baseURL, apiKey); 28 | // Clean all nock interceptors before each test 29 | nock.cleanAll(); 30 | }); 31 | 32 | afterEach(() => { 33 | // Clean up any remaining interceptors 34 | nock.cleanAll(); 35 | }); 36 | 37 | describe('getMarkdown', () => { 38 | it('should fetch markdown with default parameters', async () => { 39 | const mockResponse: MarkdownEndpointResponse = { 40 | url: 'https://example.com', 41 | filter: 'fit', 42 | query: null, 43 | cache: 'false', 44 | markdown: '# Example Page\n\nThis is example content.', 45 | success: true, 46 | }; 47 | 48 | // Mock the HTTP request 49 | nock(baseURL) 50 | .post('/md', { 51 | url: 'https://example.com', 52 | f: 'fit', 53 | q: undefined, 54 | c: undefined, 55 | }) 56 | .matchHeader('x-api-key', apiKey) 57 | .reply(200, mockResponse); 58 | 59 | const result = await service.getMarkdown({ 60 | url: 'https://example.com', 61 | f: 'fit', 62 | }); 63 | 64 | expect(result).toEqual(mockResponse); 65 | }); 66 | 67 | it('should fetch markdown with all parameters', async () => { 68 | const mockResponse: MarkdownEndpointResponse = { 69 | url: 'https://example.com', 70 | filter: 'bm25', 71 | query: 'test query', 72 | cache: 'true', 73 | markdown: '# Filtered Content\n\nMatching content for test query.', 74 | success: true, 75 | }; 76 | 77 | nock(baseURL) 78 | .post('/md', { 79 | url: 'https://example.com', 80 | f: 'bm25', 81 | q: 'test query', 82 | c: 'true', 83 | }) 84 | .matchHeader('x-api-key', apiKey) 85 | .reply(200, mockResponse); 86 | 87 | const result = await service.getMarkdown({ 88 | url: 'https://example.com', 89 | f: 'bm25', 90 | q: 'test query', 91 | c: 'true', 92 | }); 93 | 94 | expect(result).toEqual(mockResponse); 95 | }); 96 | 97 | it('should handle API errors', async () => { 98 | nock(baseURL).post('/md').matchHeader('x-api-key', apiKey).reply(500, { detail: 'Internal server error' }); 99 | 100 | await expect(service.getMarkdown({ url: 
'https://example.com' })).rejects.toThrow( 101 | 'Request failed with status 500: Internal server error', 102 | ); 103 | }); 104 | 105 | it('should validate URL format', async () => { 106 | await expect(service.getMarkdown({ url: 'invalid-url' })).rejects.toThrow('Invalid URL format'); 107 | }); 108 | 109 | it('should handle network errors', async () => { 110 | nock(baseURL).post('/md').matchHeader('x-api-key', apiKey).replyWithError('Network error'); 111 | 112 | await expect(service.getMarkdown({ url: 'https://example.com' })).rejects.toThrow('Network error'); 113 | }); 114 | }); 115 | 116 | describe('captureScreenshot', () => { 117 | it('should capture screenshot successfully', async () => { 118 | const mockResponse: ScreenshotEndpointResponse = { 119 | success: true, 120 | screenshot: 'base64-encoded-screenshot-data', 121 | }; 122 | 123 | nock(baseURL) 124 | .post('/screenshot', { 125 | url: 'https://example.com', 126 | screenshot_wait_for: 2, 127 | }) 128 | .matchHeader('x-api-key', apiKey) 129 | .reply(200, mockResponse); 130 | 131 | const result = await service.captureScreenshot({ 132 | url: 'https://example.com', 133 | screenshot_wait_for: 2, 134 | }); 135 | 136 | expect(result).toEqual(mockResponse); 137 | }); 138 | 139 | it('should validate URL format', async () => { 140 | await expect(service.captureScreenshot({ url: 'not-a-url' })).rejects.toThrow('Invalid URL format'); 141 | }); 142 | }); 143 | 144 | describe('generatePDF', () => { 145 | it('should generate PDF successfully', async () => { 146 | const mockResponse: PDFEndpointResponse = { 147 | success: true, 148 | pdf: 'base64-encoded-pdf-data', 149 | }; 150 | 151 | nock(baseURL) 152 | .post('/pdf', { 153 | url: 'https://example.com', 154 | }) 155 | .matchHeader('x-api-key', apiKey) 156 | .reply(200, mockResponse); 157 | 158 | const result = await service.generatePDF({ 159 | url: 'https://example.com', 160 | }); 161 | 162 | expect(result).toEqual(mockResponse); 163 | }); 164 | 165 | it('should validate URL format', async () => { 166 | await expect(service.generatePDF({ url: 'not a url' })).rejects.toThrow('Invalid URL format'); 167 | }); 168 | }); 169 | 170 | describe('getHTML', () => { 171 | it('should fetch HTML successfully', async () => { 172 | const mockResponse: HTMLEndpointResponse = { 173 | html: '<html><body><h1>Example</h1></body></html>', 174 | url: 'https://example.com', 175 | success: true, 176 | }; 177 | 178 | nock(baseURL) 179 | .post('/html', { 180 | url: 'https://example.com', 181 | }) 182 | .matchHeader('x-api-key', apiKey) 183 | .reply(200, mockResponse); 184 | 185 | const result = await service.getHTML({ 186 | url: 'https://example.com', 187 | }); 188 | 189 | expect(result).toEqual(mockResponse); 190 | }); 191 | 192 | it('should validate URL format', async () => { 193 | await expect(service.getHTML({ url: 'just text' })).rejects.toThrow('Invalid URL format'); 194 | }); 195 | }); 196 | 197 | describe('crawl', () => { 198 | it('should crawl with basic configuration', async () => { 199 | const mockResponse: CrawlEndpointResponse = { 200 | success: true, 201 | results: [ 202 | { 203 | url: 'https://example.com', 204 | html: '<html>...</html>', 205 | cleaned_html: '<html>...</html>', 206 | fit_html: '<html>...</html>', 207 | success: true, 208 | status_code: 200, 209 | response_headers: {}, 210 | session_id: null, 211 | metadata: {}, 212 | links: { internal: [], external: [] }, 213 | media: { images: [], videos: [], audios: [] }, 214 | markdown: { 215 | raw_markdown: '# Example', 216 | markdown_with_citations: 
'# Example [1]', 217 | references_markdown: '[1]: https://example.com', 218 | fit_markdown: '# Example', 219 | fit_html: '<h1>Example</h1>', 220 | }, 221 | tables: [], 222 | extracted_content: null, 223 | screenshot: null, 224 | pdf: null, 225 | mhtml: null, 226 | js_execution_result: null, 227 | downloaded_files: null, 228 | network_requests: null, 229 | console_messages: null, 230 | ssl_certificate: null, 231 | dispatch_result: null, 232 | }, 233 | ], 234 | server_processing_time_s: 1.5, 235 | server_memory_delta_mb: 10, 236 | server_peak_memory_mb: 100, 237 | }; 238 | 239 | nock(baseURL) 240 | .post('/crawl', { 241 | urls: ['https://example.com'], 242 | browser_config: { headless: true }, 243 | crawler_config: { cache_mode: 'ENABLED' }, 244 | }) 245 | .matchHeader('x-api-key', apiKey) 246 | .reply(200, mockResponse); 247 | 248 | const result = await service.crawl({ 249 | urls: ['https://example.com'], 250 | browser_config: { headless: true }, 251 | crawler_config: { cache_mode: 'ENABLED' }, 252 | }); 253 | 254 | expect(result).toEqual(mockResponse); 255 | }); 256 | 257 | it('should reject invalid JavaScript in crawler_config', async () => { 258 | await expect( 259 | service.crawl({ 260 | url: 'https://example.com', 261 | crawler_config: { 262 | js_code: 'console.log(&quot;test&quot;)', 263 | }, 264 | }), 265 | ).rejects.toThrow('Invalid JavaScript: Contains HTML entities'); 266 | }); 267 | 268 | it('should handle js_code as array with invalid script', async () => { 269 | await expect( 270 | service.crawl({ 271 | url: 'https://example.com', 272 | crawler_config: { 273 | js_code: ['valid code', '<script>alert("test")</script>'], 274 | }, 275 | }), 276 | ).rejects.toThrow('Invalid JavaScript: Contains HTML entities'); 277 | }); 278 | 279 | // Timeout testing is better suited for integration tests 280 | // where we can test against real API behavior 281 | }); 282 | 283 | describe('batchCrawl', () => { 284 | it('should batch crawl multiple URLs', async () => { 285 | const urls = ['https://example1.com', 'https://example2.com']; 286 | const mockResponse = { 287 | success: true, 288 | results: urls.map((url) => ({ 289 | url, 290 | success: true, 291 | markdown: { raw_markdown: `Content from ${url}` }, 292 | })), 293 | }; 294 | 295 | nock(baseURL) 296 | .post('/crawl', (body) => { 297 | return body.urls?.length === 2 && body.urls[0] === urls[0] && body.urls[1] === urls[1]; 298 | }) 299 | .matchHeader('x-api-key', apiKey) 300 | .reply(200, mockResponse); 301 | 302 | const result = await service.batchCrawl({ urls }); 303 | 304 | expect(result.success).toBe(true); 305 | expect(result.results).toHaveLength(2); 306 | }); 307 | 308 | it('should validate empty URLs array', async () => { 309 | await expect(service.batchCrawl({ urls: [] })).rejects.toThrow('URLs array cannot be empty'); 310 | }); 311 | }); 312 | 313 | describe('executeJS', () => { 314 | it('should execute JavaScript successfully', async () => { 315 | const mockResponse = { 316 | success: true, 317 | js_execution_result: { 318 | success: true, 319 | results: ['Example Title'], 320 | }, 321 | markdown: '# Example Page', 322 | }; 323 | 324 | nock(baseURL) 325 | .post('/execute_js', { 326 | url: 'https://example.com', 327 | scripts: ['return document.title'], 328 | }) 329 | .matchHeader('x-api-key', apiKey) 330 | .reply(200, mockResponse); 331 | 332 | const result = await service.executeJS({ 333 | url: 'https://example.com', 334 | scripts: 'return document.title', 335 | }); 336 | 337 | expect(result).toEqual(mockResponse); 338 | }); 339 | 340 |
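    // Note: as the mock above suggests, a single `scripts` string appears to be wrapped
    // into an array before being POSTed to /execute_js, so the request body effectively
    // becomes: { url: 'https://example.com', scripts: ['return document.title'] }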
it('should handle array of scripts', async () => { 341 | const scripts = ['return document.title', 'return window.location.href']; 342 | const mockResponse = { 343 | success: true, 344 | js_execution_result: { 345 | success: true, 346 | results: ['Example Title', 'https://example.com'], 347 | }, 348 | }; 349 | 350 | nock(baseURL) 351 | .post('/execute_js', { 352 | url: 'https://example.com', 353 | scripts: scripts, 354 | }) 355 | .matchHeader('x-api-key', apiKey) 356 | .reply(200, mockResponse); 357 | 358 | const result = await service.executeJS({ 359 | url: 'https://example.com', 360 | scripts, 361 | }); 362 | 363 | expect(result).toEqual(mockResponse); 364 | }); 365 | 366 | it('should reject scripts with HTML entities', async () => { 367 | await expect( 368 | service.executeJS({ 369 | url: 'https://httpbin.org/html', 370 | scripts: 'console.log(&quot;test&quot;)', 371 | }), 372 | ).rejects.toThrow('Invalid JavaScript: Contains HTML entities'); 373 | }); 374 | 375 | it('should reject scripts with HTML tags', async () => { 376 | await expect( 377 | service.executeJS({ 378 | url: 'https://httpbin.org/html', 379 | scripts: '<script>alert("test")</script>', 380 | }), 381 | ).rejects.toThrow('Invalid JavaScript: Contains HTML entities'); 382 | }); 383 | 384 | it('should reject scripts with literal \\n', async () => { 385 | await expect( 386 | service.executeJS({ 387 | url: 'https://httpbin.org/html', 388 | scripts: 'console.log("test");\\nconsole.log("test2");', 389 | }), 390 | ).rejects.toThrow('Invalid JavaScript: Contains HTML entities'); 391 | }); 392 | 393 | it('should reject array with invalid scripts', async () => { 394 | await expect( 395 | service.executeJS({ 396 | url: 'https://httpbin.org/html', 397 | scripts: ['valid script', 'console.log(&amp;&amp; true)'], 398 | }), 399 | ).rejects.toThrow('Invalid JavaScript: Contains HTML entities'); 400 | }); 401 | 402 | it('should validate URL format', async () => { 403 | await expect(service.executeJS({ url: '//no-protocol', scripts: 'return 1' })).rejects.toThrow( 404 | 'Invalid URL format', 405 | ); 406 | }); 407 | 408 | it('should reject scripts with escaped backslash-n pattern', async () => { 409 | // Test the specific pattern that line 40-41 checks for: })\\nword 410 | const scriptWithPattern = 'function test() {}\\nconsole.log("test")'; 411 | await expect( 412 | service.executeJS({ 413 | url: 'https://example.com', 414 | scripts: scriptWithPattern, 415 | }), 416 | ).rejects.toThrow('Invalid JavaScript: Contains HTML entities'); 417 | }); 418 | 419 | it('should allow valid JavaScript with actual newlines', async () => { 420 | const validScript = `function test() { 421 | console.log("This has real newlines"); 422 | return true; 423 | }`; 424 | 425 | const mockResponse = { 426 | success: true, 427 | js_execution_result: { results: [true] }, 428 | }; 429 | 430 | nock(baseURL).post('/execute_js').matchHeader('x-api-key', apiKey).reply(200, mockResponse); 431 | 432 | const result = await service.executeJS({ 433 | url: 'https://example.com', 434 | scripts: validScript, 435 | }); 436 | 437 | expect(result.success).toBe(true); 438 | }); 439 | }); 440 | 441 | describe('extractWithLLM', () => { 442 | it('should extract content with LLM', async () => { 443 | const mockResponse = { 444 | answer: 'The main topic of this page is JavaScript testing.', 445 | }; 446 | 447 | nock(baseURL) 448 | .get('/llm/https%3A%2F%2Fexample.com?q=What%20is%20the%20main%20topic%3F') 449 | .matchHeader('x-api-key', apiKey) 450 | .reply(200, mockResponse); 451 | 452 | const result = await
service.extractWithLLM({ 453 | url: 'https://example.com', 454 | query: 'What is the main topic?', 455 | }); 456 | 457 | expect(result).toEqual(mockResponse); 458 | }); 459 | 460 | // Timeout testing moved to integration tests 461 | 462 | it('should handle missing LLM provider', async () => { 463 | nock(baseURL) 464 | .get(/\/llm\/.*/) 465 | .matchHeader('x-api-key', apiKey) 466 | .reply(401, { detail: 'No LLM provider configured' }); 467 | 468 | await expect( 469 | service.extractWithLLM({ 470 | url: 'https://example.com', 471 | query: 'test', 472 | }), 473 | ).rejects.toThrow('No LLM provider configured'); 474 | }); 475 | }); 476 | 477 | describe('Browser Configuration', () => { 478 | it('should send cookies configuration correctly', async () => { 479 | const mockResponse: CrawlEndpointResponse = { 480 | success: true, 481 | results: [ 482 | { 483 | url: 'https://httpbin.org/cookies', 484 | html: '<html>...</html>', 485 | cleaned_html: '<html>...</html>', 486 | fit_html: '<html>...</html>', 487 | success: true, 488 | status_code: 200, 489 | response_headers: {}, 490 | session_id: null, 491 | metadata: {}, 492 | links: { internal: [], external: [] }, 493 | media: { images: [], videos: [], audios: [] }, 494 | markdown: { 495 | raw_markdown: '{"cookies": {"test": "value"}}', 496 | markdown_with_citations: '', 497 | references_markdown: '', 498 | fit_markdown: '{"cookies": {"test": "value"}}', 499 | fit_html: '', 500 | }, 501 | tables: [], 502 | extracted_content: null, 503 | screenshot: null, 504 | pdf: null, 505 | mhtml: null, 506 | js_execution_result: null, 507 | downloaded_files: null, 508 | network_requests: null, 509 | console_messages: null, 510 | ssl_certificate: null, 511 | dispatch_result: null, 512 | }, 513 | ], 514 | server_processing_time_s: 1.0, 515 | server_memory_delta_mb: 5, 516 | server_peak_memory_mb: 50, 517 | }; 518 | 519 | nock(baseURL) 520 | .post('/crawl', { 521 | urls: ['https://httpbin.org/cookies'], 522 | browser_config: { 523 | headless: true, 524 | cookies: [ 525 | { 526 | name: 'test', 527 | value: 'value', 528 | domain: '.httpbin.org', 529 | path: '/', 530 | }, 531 | ], 532 | }, 533 | crawler_config: {}, 534 | }) 535 | .matchHeader('x-api-key', apiKey) 536 | .reply(200, mockResponse); 537 | 538 | const result = await service.crawl({ 539 | urls: ['https://httpbin.org/cookies'], 540 | browser_config: { 541 | headless: true, 542 | cookies: [ 543 | { 544 | name: 'test', 545 | value: 'value', 546 | domain: '.httpbin.org', 547 | path: '/', 548 | }, 549 | ], 550 | }, 551 | crawler_config: {}, 552 | }); 553 | 554 | expect(result.success).toBe(true); 555 | expect(result.results[0].markdown?.raw_markdown).toContain('cookies'); 556 | }); 557 | 558 | it('should send headers configuration correctly', async () => { 559 | const mockResponse: CrawlEndpointResponse = { 560 | success: true, 561 | results: [ 562 | { 563 | url: 'https://httpbin.org/headers', 564 | html: '<html>...</html>', 565 | cleaned_html: '<html>...</html>', 566 | fit_html: '<html>...</html>', 567 | success: true, 568 | status_code: 200, 569 | response_headers: {}, 570 | session_id: null, 571 | metadata: {}, 572 | links: { internal: [], external: [] }, 573 | media: { images: [], videos: [], audios: [] }, 574 | markdown: { 575 | raw_markdown: '{"headers": {"X-Custom": "test-value"}}', 576 | markdown_with_citations: '', 577 | references_markdown: '', 578 | fit_markdown: '{"headers": {"X-Custom": "test-value"}}', 579 | fit_html: '', 580 | }, 581 | tables: [], 582 | extracted_content: null, 583 | screenshot: 
null, 584 | pdf: null, 585 | mhtml: null, 586 | js_execution_result: null, 587 | downloaded_files: null, 588 | network_requests: null, 589 | console_messages: null, 590 | ssl_certificate: null, 591 | dispatch_result: null, 592 | }, 593 | ], 594 | server_processing_time_s: 1.0, 595 | server_memory_delta_mb: 5, 596 | server_peak_memory_mb: 50, 597 | }; 598 | 599 | nock(baseURL) 600 | .post('/crawl', { 601 | urls: ['https://httpbin.org/headers'], 602 | browser_config: { 603 | headless: true, 604 | headers: { 605 | 'X-Custom': 'test-value', 606 | 'X-Request-ID': '12345', 607 | }, 608 | }, 609 | crawler_config: {}, 610 | }) 611 | .matchHeader('x-api-key', apiKey) 612 | .reply(200, mockResponse); 613 | 614 | const result = await service.crawl({ 615 | urls: ['https://httpbin.org/headers'], 616 | browser_config: { 617 | headless: true, 618 | headers: { 619 | 'X-Custom': 'test-value', 620 | 'X-Request-ID': '12345', 621 | }, 622 | }, 623 | crawler_config: {}, 624 | }); 625 | 626 | expect(result.success).toBe(true); 627 | expect(result.results[0].markdown?.raw_markdown).toContain('headers'); 628 | }); 629 | 630 | it('should send viewport configuration correctly', async () => { 631 | const mockResponse: CrawlEndpointResponse = { 632 | success: true, 633 | results: [ 634 | { 635 | url: 'https://example.com', 636 | html: '<html>...</html>', 637 | cleaned_html: '<html>...</html>', 638 | fit_html: '<html>...</html>', 639 | success: true, 640 | status_code: 200, 641 | response_headers: {}, 642 | session_id: null, 643 | metadata: {}, 644 | links: { internal: [], external: [] }, 645 | media: { images: [], videos: [], audios: [] }, 646 | markdown: { 647 | raw_markdown: 'Content', 648 | markdown_with_citations: '', 649 | references_markdown: '', 650 | fit_markdown: 'Content', 651 | fit_html: '', 652 | }, 653 | tables: [], 654 | extracted_content: null, 655 | screenshot: 'base64-screenshot-data', 656 | pdf: null, 657 | mhtml: null, 658 | js_execution_result: null, 659 | downloaded_files: null, 660 | network_requests: null, 661 | console_messages: null, 662 | ssl_certificate: null, 663 | dispatch_result: null, 664 | }, 665 | ], 666 | server_processing_time_s: 2.0, 667 | server_memory_delta_mb: 10, 668 | server_peak_memory_mb: 100, 669 | }; 670 | 671 | nock(baseURL) 672 | .post('/crawl', { 673 | urls: ['https://example.com'], 674 | browser_config: { 675 | headless: true, 676 | viewport_width: 375, 677 | viewport_height: 667, 678 | }, 679 | crawler_config: { 680 | screenshot: true, 681 | }, 682 | }) 683 | .matchHeader('x-api-key', apiKey) 684 | .reply(200, mockResponse); 685 | 686 | const result = await service.crawl({ 687 | urls: ['https://example.com'], 688 | browser_config: { 689 | headless: true, 690 | viewport_width: 375, 691 | viewport_height: 667, 692 | }, 693 | crawler_config: { 694 | screenshot: true, 695 | }, 696 | }); 697 | 698 | expect(result.success).toBe(true); 699 | expect(result.results[0].screenshot).toBeTruthy(); 700 | }); 701 | 702 | it('should send user agent configuration correctly', async () => { 703 | const mockResponse: CrawlEndpointResponse = { 704 | success: true, 705 | results: [ 706 | { 707 | url: 'https://httpbin.org/user-agent', 708 | html: '<html>...</html>', 709 | cleaned_html: '<html>...</html>', 710 | fit_html: '<html>...</html>', 711 | success: true, 712 | status_code: 200, 713 | response_headers: {}, 714 | session_id: null, 715 | metadata: {}, 716 | links: { internal: [], external: [] }, 717 | media: { images: [], videos: [], audios: [] }, 718 | markdown: { 719 | raw_markdown: 
'{"user-agent": "Custom-Bot/1.0"}', 720 | markdown_with_citations: '', 721 | references_markdown: '', 722 | fit_markdown: '{"user-agent": "Custom-Bot/1.0"}', 723 | fit_html: '', 724 | }, 725 | tables: [], 726 | extracted_content: null, 727 | screenshot: null, 728 | pdf: null, 729 | mhtml: null, 730 | js_execution_result: null, 731 | downloaded_files: null, 732 | network_requests: null, 733 | console_messages: null, 734 | ssl_certificate: null, 735 | dispatch_result: null, 736 | }, 737 | ], 738 | server_processing_time_s: 1.0, 739 | server_memory_delta_mb: 5, 740 | server_peak_memory_mb: 50, 741 | }; 742 | 743 | nock(baseURL) 744 | .post('/crawl', { 745 | urls: ['https://httpbin.org/user-agent'], 746 | browser_config: { 747 | headless: true, 748 | user_agent: 'Custom-Bot/1.0', 749 | }, 750 | crawler_config: {}, 751 | }) 752 | .matchHeader('x-api-key', apiKey) 753 | .reply(200, mockResponse); 754 | 755 | const result = await service.crawl({ 756 | urls: ['https://httpbin.org/user-agent'], 757 | browser_config: { 758 | headless: true, 759 | user_agent: 'Custom-Bot/1.0', 760 | }, 761 | crawler_config: {}, 762 | }); 763 | 764 | expect(result.success).toBe(true); 765 | expect(result.results[0].markdown?.raw_markdown).toContain('Custom-Bot/1.0'); 766 | }); 767 | 768 | it('should handle complex browser configuration', async () => { 769 | const mockResponse: CrawlEndpointResponse = { 770 | success: true, 771 | results: [ 772 | { 773 | url: 'https://httpbin.org/anything', 774 | html: '<html>...</html>', 775 | cleaned_html: '<html>...</html>', 776 | fit_html: '<html>...</html>', 777 | success: true, 778 | status_code: 200, 779 | response_headers: {}, 780 | session_id: null, 781 | metadata: {}, 782 | links: { internal: [], external: [] }, 783 | media: { images: [], videos: [], audios: [] }, 784 | markdown: { 785 | raw_markdown: 'Response with all configs', 786 | markdown_with_citations: '', 787 | references_markdown: '', 788 | fit_markdown: 'Response with all configs', 789 | fit_html: '', 790 | }, 791 | tables: [], 792 | extracted_content: null, 793 | screenshot: null, 794 | pdf: null, 795 | mhtml: null, 796 | js_execution_result: null, 797 | downloaded_files: null, 798 | network_requests: null, 799 | console_messages: null, 800 | ssl_certificate: null, 801 | dispatch_result: null, 802 | }, 803 | ], 804 | server_processing_time_s: 1.5, 805 | server_memory_delta_mb: 8, 806 | server_peak_memory_mb: 80, 807 | }; 808 | 809 | const complexConfig = { 810 | urls: ['https://httpbin.org/anything'], 811 | browser_config: { 812 | headless: true, 813 | viewport_width: 768, 814 | viewport_height: 1024, 815 | user_agent: 'Test-Bot/2.0', 816 | cookies: [ 817 | { 818 | name: 'session', 819 | value: 'abc123', 820 | domain: '.httpbin.org', 821 | path: '/', 822 | }, 823 | ], 824 | headers: { 825 | 'X-Test': 'value', 826 | }, 827 | }, 828 | crawler_config: { 829 | cache_mode: 'BYPASS' as const, 830 | }, 831 | }; 832 | 833 | nock(baseURL).post('/crawl', complexConfig).matchHeader('x-api-key', apiKey).reply(200, mockResponse); 834 | 835 | const result = await service.crawl(complexConfig); 836 | 837 | expect(result.success).toBe(true); 838 | expect(result.results).toHaveLength(1); 839 | }); 840 | }); 841 | 842 | describe('Crawler Configuration Advanced Parameters', () => { 843 | it('should send content filtering parameters correctly', async () => { 844 | const mockResponse: CrawlEndpointResponse = { 845 | success: true, 846 | results: [ 847 | { 848 | url: 'https://httpbin.org/forms/post', 849 | html: '<html>...</html>', 850 
| cleaned_html: '<html>...</html>', 851 | fit_html: '<html>...</html>', 852 | success: true, 853 | status_code: 200, 854 | response_headers: {}, 855 | session_id: null, 856 | metadata: {}, 857 | links: { internal: [], external: [] }, 858 | media: { images: [], videos: [], audios: [] }, 859 | markdown: { 860 | raw_markdown: 'Form content without forms', 861 | markdown_with_citations: '', 862 | references_markdown: '', 863 | fit_markdown: 'Form content without forms', 864 | fit_html: '', 865 | }, 866 | tables: [], 867 | extracted_content: null, 868 | screenshot: null, 869 | pdf: null, 870 | mhtml: null, 871 | js_execution_result: null, 872 | downloaded_files: null, 873 | network_requests: null, 874 | console_messages: null, 875 | ssl_certificate: null, 876 | dispatch_result: null, 877 | }, 878 | ], 879 | server_processing_time_s: 1.0, 880 | server_memory_delta_mb: 5, 881 | server_peak_memory_mb: 50, 882 | }; 883 | 884 | nock(baseURL) 885 | .post('/crawl', { 886 | urls: ['https://httpbin.org/forms/post'], 887 | browser_config: { 888 | headless: true, 889 | }, 890 | crawler_config: { 891 | remove_forms: true, 892 | keep_data_attributes: true, 893 | exclude_external_images: true, 894 | }, 895 | }) 896 | .matchHeader('x-api-key', apiKey) 897 | .reply(200, mockResponse); 898 | 899 | const result = await service.crawl({ 900 | urls: ['https://httpbin.org/forms/post'], 901 | browser_config: { 902 | headless: true, 903 | }, 904 | crawler_config: { 905 | remove_forms: true, 906 | keep_data_attributes: true, 907 | exclude_external_images: true, 908 | }, 909 | }); 910 | 911 | expect(result.success).toBe(true); 912 | }); 913 | 914 | it('should send js_only parameter correctly', async () => { 915 | const mockResponse: CrawlEndpointResponse = { 916 | success: true, 917 | results: [ 918 | { 919 | url: 'https://httpbin.org/html', 920 | html: '', 921 | cleaned_html: '', 922 | fit_html: '', 923 | success: true, 924 | status_code: 200, 925 | response_headers: {}, 926 | session_id: null, 927 | metadata: {}, 928 | links: { internal: [], external: [] }, 929 | media: { images: [], videos: [], audios: [] }, 930 | markdown: { 931 | raw_markdown: '', 932 | markdown_with_citations: '', 933 | references_markdown: '', 934 | fit_markdown: '', 935 | fit_html: '', 936 | }, 937 | tables: [], 938 | extracted_content: null, 939 | screenshot: null, 940 | pdf: null, 941 | mhtml: null, 942 | js_execution_result: { 943 | success: true, 944 | results: ['Page Title', '5'], 945 | }, 946 | downloaded_files: null, 947 | network_requests: null, 948 | console_messages: null, 949 | ssl_certificate: null, 950 | dispatch_result: null, 951 | }, 952 | ], 953 | server_processing_time_s: 1.0, 954 | server_memory_delta_mb: 5, 955 | server_peak_memory_mb: 50, 956 | }; 957 | 958 | nock(baseURL) 959 | .post('/crawl', { 960 | urls: ['https://httpbin.org/html'], 961 | browser_config: { 962 | headless: true, 963 | }, 964 | crawler_config: { 965 | js_code: ['return document.title', 'return document.querySelectorAll("p").length'], 966 | js_only: true, 967 | }, 968 | }) 969 | .matchHeader('x-api-key', apiKey) 970 | .reply(200, mockResponse); 971 | 972 | const result = await service.crawl({ 973 | urls: ['https://httpbin.org/html'], 974 | browser_config: { 975 | headless: true, 976 | }, 977 | crawler_config: { 978 | js_code: ['return document.title', 'return document.querySelectorAll("p").length'], 979 | js_only: true, 980 | }, 981 | }); 982 | 983 | expect(result.success).toBe(true); 984 | expect(result.results[0].js_execution_result).toBeDefined(); 985 | 
}); 986 | 987 | it('should send visibility and debug parameters correctly', async () => { 988 | const mockResponse: CrawlEndpointResponse = { 989 | success: true, 990 | results: [ 991 | { 992 | url: 'https://httpbin.org/html', 993 | html: '<html>...</html>', 994 | cleaned_html: '<html>...</html>', 995 | fit_html: '<html>...</html>', 996 | success: true, 997 | status_code: 200, 998 | response_headers: {}, 999 | session_id: null, 1000 | metadata: {}, 1001 | links: { internal: [], external: [] }, 1002 | media: { images: [], videos: [], audios: [] }, 1003 | markdown: { 1004 | raw_markdown: 'Content', 1005 | markdown_with_citations: '', 1006 | references_markdown: '', 1007 | fit_markdown: 'Content', 1008 | fit_html: '', 1009 | }, 1010 | tables: [], 1011 | extracted_content: null, 1012 | screenshot: null, 1013 | pdf: null, 1014 | mhtml: null, 1015 | js_execution_result: null, 1016 | downloaded_files: null, 1017 | network_requests: null, 1018 | console_messages: ['Test log message 1', 'Test warning', 'Test error'], 1019 | ssl_certificate: null, 1020 | dispatch_result: null, 1021 | }, 1022 | ], 1023 | server_processing_time_s: 1.5, 1024 | server_memory_delta_mb: 8, 1025 | server_peak_memory_mb: 80, 1026 | }; 1027 | 1028 | nock(baseURL) 1029 | .post('/crawl', { 1030 | urls: ['https://httpbin.org/html'], 1031 | browser_config: { 1032 | headless: true, 1033 | }, 1034 | crawler_config: { 1035 | ignore_body_visibility: true, 1036 | verbose: true, 1037 | log_console: true, 1038 | }, 1039 | }) 1040 | .matchHeader('x-api-key', apiKey) 1041 | .reply(200, mockResponse); 1042 | 1043 | const result = await service.crawl({ 1044 | urls: ['https://httpbin.org/html'], 1045 | browser_config: { 1046 | headless: true, 1047 | }, 1048 | crawler_config: { 1049 | ignore_body_visibility: true, 1050 | verbose: true, 1051 | log_console: true, 1052 | }, 1053 | }); 1054 | 1055 | expect(result.success).toBe(true); 1056 | expect(result.results[0].console_messages).toBeDefined(); 1057 | }); 1058 | }); 1059 | 1060 | describe('parseSitemap', () => { 1061 | it('should fetch and return sitemap content', async () => { 1062 | const mockSitemapXML = `<?xml version="1.0" encoding="UTF-8"?> 1063 | <urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9"> 1064 | <url><loc>https://example.com/page1</loc></url> 1065 | <url><loc>https://example.com/page2</loc></url> 1066 | </urlset>`; 1067 | 1068 | // parseSitemap now uses axios directly without baseURL 1069 | nock('https://example.com').get('/sitemap.xml').reply(200, mockSitemapXML); 1070 | 1071 | const response = await service.parseSitemap('https://example.com/sitemap.xml'); 1072 | expect(response).toBe(mockSitemapXML); 1073 | }); 1074 | 1075 | it('should handle sitemap fetch errors', async () => { 1076 | nock('https://example.com').get('/sitemap.xml').reply(404, 'Not Found'); 1077 | 1078 | await expect(service.parseSitemap('https://example.com/sitemap.xml')).rejects.toThrow(); 1079 | }); 1080 | }); 1081 | 1082 | describe('detectContentType', () => { 1083 | it('should return content type from HEAD request', async () => { 1084 | // detectContentType now uses axios directly without baseURL 1085 | nock('https://example.com').head('/document.pdf').reply(200, '', { 'content-type': 'application/pdf' }); 1086 | 1087 | const contentType = await service.detectContentType('https://example.com/document.pdf'); 1088 | expect(contentType).toBe('application/pdf'); 1089 | }); 1090 | 1091 | it('should return empty string when content-type header is missing', async () => { 1092 | 
nock('https://example.com').head('/file').reply(200, ''); 1093 | 1094 | const contentType = await service.detectContentType('https://example.com/file'); 1095 | expect(contentType).toBe(''); 1096 | }); 1097 | 1098 | it('should return empty string on HEAD request failure', async () => { 1099 | nock('https://example.com').head('/file').reply(404, 'Not Found'); 1100 | 1101 | const contentType = await service.detectContentType('https://example.com/file'); 1102 | expect(contentType).toBe(''); 1103 | }); 1104 | }); 1105 | 1106 | describe('Network Error Handling', () => { 1107 | it('should handle ECONNABORTED error', async () => { 1108 | const error = new Error('Connection aborted') as Error & { code?: string }; 1109 | error.code = 'ECONNABORTED'; 1110 | 1111 | nock(baseURL).post('/md').matchHeader('x-api-key', apiKey).replyWithError(error); 1112 | 1113 | await expect(service.getMarkdown({ url: 'https://example.com' })).rejects.toThrow('Request timed out'); 1114 | }); 1115 | 1116 | it('should handle ETIMEDOUT error', async () => { 1117 | const error = new Error('Socket timed out') as Error & { code?: string }; 1118 | error.code = 'ETIMEDOUT'; 1119 | 1120 | nock(baseURL).post('/md').matchHeader('x-api-key', apiKey).replyWithError(error); 1121 | 1122 | await expect(service.getMarkdown({ url: 'https://example.com' })).rejects.toThrow('Request timeout'); 1123 | }); 1124 | 1125 | it('should handle ENOTFOUND error', async () => { 1126 | const error = new Error('getaddrinfo ENOTFOUND') as Error & { code?: string }; 1127 | error.code = 'ENOTFOUND'; 1128 | 1129 | nock(baseURL).post('/md').matchHeader('x-api-key', apiKey).replyWithError(error); 1130 | 1131 | await expect(service.getMarkdown({ url: 'https://example.com' })).rejects.toThrow('DNS resolution failed'); 1132 | }); 1133 | 1134 | it('should handle ECONNREFUSED error', async () => { 1135 | const error = new Error('connect ECONNREFUSED') as Error & { code?: string }; 1136 | error.code = 'ECONNREFUSED'; 1137 | 1138 | nock(baseURL).post('/md').matchHeader('x-api-key', apiKey).replyWithError(error); 1139 | 1140 | await expect(service.getMarkdown({ url: 'https://example.com' })).rejects.toThrow('Connection refused'); 1141 | }); 1142 | 1143 | it('should handle ECONNRESET error', async () => { 1144 | const error = new Error('socket hang up') as Error & { code?: string }; 1145 | error.code = 'ECONNRESET'; 1146 | 1147 | nock(baseURL).post('/md').matchHeader('x-api-key', apiKey).replyWithError(error); 1148 | 1149 | await expect(service.getMarkdown({ url: 'https://example.com' })).rejects.toThrow('Connection reset'); 1150 | }); 1151 | 1152 | it('should handle ENETUNREACH error', async () => { 1153 | const error = new Error('Network is unreachable') as Error & { code?: string }; 1154 | error.code = 'ENETUNREACH'; 1155 | 1156 | nock(baseURL).post('/md').matchHeader('x-api-key', apiKey).replyWithError(error); 1157 | 1158 | await expect(service.getMarkdown({ url: 'https://example.com' })).rejects.toThrow('Network unreachable'); 1159 | }); 1160 | 1161 | it('should handle generic axios errors', async () => { 1162 | const error = new Error('Generic error') as Error & { isAxiosError?: boolean }; 1163 | error.isAxiosError = true; 1164 | 1165 | nock(baseURL).post('/md').matchHeader('x-api-key', apiKey).replyWithError(error); 1166 | 1167 | await expect(service.getMarkdown({ url: 'https://example.com' })).rejects.toThrow('Generic error'); 1168 | }); 1169 | }); 1170 | 1171 | describe('Optional Parameter Handling', () => { 1172 | it('should handle batchCrawl with 
remove_images option', async () => { 1173 | const urls = ['https://example.com']; 1174 | 1175 | nock(baseURL) 1176 | .post('/crawl', (body) => { 1177 | return body.crawler_config?.exclude_tags?.includes('img'); 1178 | }) 1179 | .matchHeader('x-api-key', apiKey) 1180 | .reply(200, { success: true, results: [] }); 1181 | 1182 | await service.batchCrawl({ urls, remove_images: true }); 1183 | }); 1184 | 1185 | it('should handle batchCrawl with bypass_cache option', async () => { 1186 | const urls = ['https://example.com']; 1187 | 1188 | nock(baseURL) 1189 | .post('/crawl', (body) => { 1190 | return body.crawler_config?.cache_mode === 'BYPASS'; 1191 | }) 1192 | .matchHeader('x-api-key', apiKey) 1193 | .reply(200, { success: true, results: [] }); 1194 | 1195 | await service.batchCrawl({ urls, bypass_cache: true }); 1196 | }); 1197 | 1198 | it('should test edge case JavaScript validation pattern', async () => { 1199 | // Test the specific pattern on line 40-41: })\\nword 1200 | const scriptWithEdgeCase = 'if (true) {}\\nwindow.alert("test")'; 1201 | await expect( 1202 | service.executeJS({ 1203 | url: 'https://example.com', 1204 | scripts: scriptWithEdgeCase, 1205 | }), 1206 | ).rejects.toThrow('Invalid JavaScript: Contains HTML entities'); 1207 | }); 1208 | 1209 | it('should include memory metrics in crawl response', async () => { 1210 | const mockResponse: CrawlEndpointResponse = { 1211 | success: true, 1212 | results: [ 1213 | { 1214 | url: 'https://example.com', 1215 | html: '<html>Test</html>', 1216 | cleaned_html: '<html>Test</html>', 1217 | fit_html: '<html>Test</html>', 1218 | success: true, 1219 | status_code: 200, 1220 | response_headers: {}, 1221 | session_id: null, 1222 | metadata: {}, 1223 | links: { internal: [], external: [] }, 1224 | media: { images: [], videos: [], audios: [] }, 1225 | markdown: { 1226 | raw_markdown: 'Test content', 1227 | markdown_with_citations: '', 1228 | references_markdown: '', 1229 | fit_markdown: 'Test content', 1230 | fit_html: '', 1231 | }, 1232 | tables: [], 1233 | extracted_content: null, 1234 | screenshot: null, 1235 | pdf: null, 1236 | mhtml: null, 1237 | js_execution_result: null, 1238 | downloaded_files: null, 1239 | network_requests: null, 1240 | console_messages: null, 1241 | ssl_certificate: null, 1242 | dispatch_result: null, 1243 | }, 1244 | ], 1245 | server_processing_time_s: 2.5, 1246 | server_memory_delta_mb: 15.3, 1247 | server_peak_memory_mb: 512.7, 1248 | }; 1249 | 1250 | nock(baseURL).post('/crawl').matchHeader('x-api-key', apiKey).reply(200, mockResponse); 1251 | 1252 | const result = await service.crawl({ url: 'https://example.com' }); 1253 | 1254 | expect(result.server_processing_time_s).toBe(2.5); 1255 | expect(result.server_memory_delta_mb).toBe(15.3); 1256 | expect(result.server_peak_memory_mb).toBe(512.7); 1257 | }); 1258 | }); 1259 | }); 1260 | ``` -------------------------------------------------------------------------------- /src/server.ts: -------------------------------------------------------------------------------- ```typescript 1 | import { Server } from '@modelcontextprotocol/sdk/server/index.js'; 2 | import { StdioServerTransport } from '@modelcontextprotocol/sdk/server/stdio.js'; 3 | import { CallToolRequestSchema, ListToolsRequestSchema } from '@modelcontextprotocol/sdk/types.js'; 4 | import axios, { AxiosInstance } from 'axios'; 5 | import { z } from 'zod'; 6 | import { Crawl4AIService } from './crawl4ai-service.js'; 7 | import { SessionInfo } from './handlers/base-handler.js'; 8 | import { ContentHandlers } from 
'./handlers/content-handlers.js'; 9 | import { SessionHandlers } from './handlers/session-handlers.js'; 10 | import { UtilityHandlers } from './handlers/utility-handlers.js'; 11 | import { CrawlHandlers } from './handlers/crawl-handlers.js'; 12 | import { BatchCrawlOptions } from './types.js'; 13 | // Define the tool call result type 14 | type ToolCallResult = { 15 | content: Array<{ 16 | type: string; 17 | text?: string; 18 | data?: string; 19 | mimeType?: string; 20 | }>; 21 | session_id?: string; 22 | browser_type?: string; 23 | }; 24 | import { 25 | GetMarkdownSchema, 26 | CaptureScreenshotSchema, 27 | GeneratePdfSchema, 28 | ExecuteJsSchema, 29 | BatchCrawlSchema, 30 | SmartCrawlSchema, 31 | GetHtmlSchema, 32 | ExtractLinksSchema, 33 | CrawlRecursiveSchema, 34 | ParseSitemapSchema, 35 | CrawlSchema, 36 | ManageSessionSchema, 37 | ExtractWithLlmSchema, 38 | } from './schemas/validation-schemas.js'; 39 | 40 | export class Crawl4AIServer { 41 | private server: Server; 42 | protected axiosClient: AxiosInstance; 43 | protected service: Crawl4AIService; 44 | private sessions: Map<string, SessionInfo> = new Map(); 45 | private serverName: string; 46 | private serverVersion: string; 47 | 48 | // Handler instances 49 | private contentHandlers: ContentHandlers; 50 | private sessionHandlers: SessionHandlers; 51 | private utilityHandlers: UtilityHandlers; 52 | private crawlHandlers: CrawlHandlers; 53 | 54 | constructor(baseUrl: string, apiKey: string, serverName: string = 'crawl4ai-mcp', serverVersion: string = '1.0.0') { 55 | this.serverName = serverName; 56 | this.serverVersion = serverVersion; 57 | this.server = new Server( 58 | { 59 | name: serverName, 60 | version: serverVersion, 61 | }, 62 | { 63 | capabilities: { 64 | tools: {}, 65 | }, 66 | }, 67 | ); 68 | 69 | // Initialize axios client with API key 70 | this.axiosClient = axios.create({ 71 | baseURL: baseUrl, 72 | headers: { 73 | 'X-API-Key': apiKey, 74 | 'Content-Type': 'application/json', 75 | }, 76 | timeout: 120000, // 2 minutes timeout 77 | }); 78 | 79 | // Initialize the service 80 | this.service = new Crawl4AIService(baseUrl, apiKey); 81 | 82 | // Initialize handlers 83 | this.contentHandlers = new ContentHandlers(this.service, this.axiosClient, this.sessions); 84 | this.sessionHandlers = new SessionHandlers(this.service, this.axiosClient, this.sessions); 85 | this.utilityHandlers = new UtilityHandlers(this.service, this.axiosClient, this.sessions); 86 | this.crawlHandlers = new CrawlHandlers(this.service, this.axiosClient, this.sessions); 87 | 88 | this.setupHandlers(); 89 | } 90 | 91 | /** 92 | * Helper method to validate arguments and execute handler with consistent error formatting 93 | * Preserves the exact error message format that LLMs rely on 94 | */ 95 | private async validateAndExecute<T>( 96 | toolName: string, 97 | args: unknown, 98 | schema: z.ZodSchema<T>, 99 | handler: (validatedArgs: T) => Promise<ToolCallResult>, 100 | ): Promise<ToolCallResult> { 101 | try { 102 | const validatedArgs = schema.parse(args); 103 | return await handler(validatedArgs); 104 | } catch (error) { 105 | if (error instanceof z.ZodError) { 106 | // EXACT same formatting as before - critical for LLM understanding 107 | const details = error.errors 108 | .map((e) => (e.path.length > 0 ? 
`${e.path.join('.')}: ${e.message}` : e.message)) 109 | .join(', '); 110 | throw new Error(`Invalid parameters for ${toolName}: ${details}`); 111 | } 112 | throw error; 113 | } 114 | } 115 | 116 | private setupHandlers() { 117 | // Handle list tools request 118 | this.server.setRequestHandler(ListToolsRequestSchema, async () => ({ 119 | tools: [ 120 | { 121 | name: 'get_markdown', 122 | description: 123 | '[STATELESS] Extract content as markdown with filtering options. Supports: raw (full content), fit (optimized, default), bm25 (keyword search), llm (AI-powered extraction). Use bm25/llm with query for specific content. Creates new browser each time. For persistence use create_session + crawl.', 124 | inputSchema: { 125 | type: 'object', 126 | properties: { 127 | url: { 128 | type: 'string', 129 | description: 'The URL to extract markdown from', 130 | }, 131 | filter: { 132 | type: 'string', 133 | enum: ['raw', 'fit', 'bm25', 'llm'], 134 | description: 'Filter type: raw (full), fit (optimized), bm25 (search), llm (AI extraction)', 135 | default: 'fit', 136 | }, 137 | query: { 138 | type: 'string', 139 | description: 'Query string for bm25/llm filters. Required when using bm25 or llm filter.', 140 | }, 141 | cache: { 142 | type: 'string', 143 | description: 'Cache-bust parameter (use different values to force fresh extraction)', 144 | default: '0', 145 | }, 146 | }, 147 | required: ['url'], 148 | }, 149 | }, 150 | { 151 | name: 'capture_screenshot', 152 | description: 153 | "[STATELESS] Capture webpage screenshot. Returns base64-encoded PNG data. Creates new browser each time. Optionally saves screenshot to local directory. IMPORTANT: Chained calls (execute_js then capture_screenshot) will NOT work - the screenshot won't see JS changes! For JS changes + screenshot use create_session + crawl(session_id, js_code, screenshot:true) in ONE call.", 154 | inputSchema: { 155 | type: 'object', 156 | properties: { 157 | url: { 158 | type: 'string', 159 | description: 'The URL to capture', 160 | }, 161 | screenshot_wait_for: { 162 | type: 'number', 163 | description: 'Seconds to wait before taking screenshot (allows page loading/animations)', 164 | default: 2, 165 | }, 166 | save_to_directory: { 167 | type: 'string', 168 | description: 169 | "Directory path to save screenshot (e.g., ~/Desktop, /tmp). Do NOT include filename - it will be auto-generated. Large screenshots (>800KB) won't be returned inline when saved.", 170 | }, 171 | }, 172 | required: ['url'], 173 | }, 174 | }, 175 | { 176 | name: 'generate_pdf', 177 | description: 178 | '[STATELESS] Convert webpage to PDF. Returns base64-encoded PDF data. Creates new browser each time. Cannot capture form fills or JS changes. For persistent PDFs use create_session + crawl(session_id, pdf:true).', 179 | inputSchema: { 180 | type: 'object', 181 | properties: { 182 | url: { 183 | type: 'string', 184 | description: 'The URL to convert to PDF', 185 | }, 186 | }, 187 | required: ['url'], 188 | }, 189 | }, 190 | { 191 | name: 'execute_js', 192 | description: 193 | '[STATELESS] Execute JavaScript and get return values + page content. Creates new browser each time. Use for: extracting data, triggering dynamic content, checking page state. Scripts with "return" statements return actual values (strings, numbers, objects, arrays). Note: null returns as {"success": true}. Returns values but page state is lost. 
For persistent JS execution, use crawl with session_id.', 194 | inputSchema: { 195 | type: 'object', 196 | properties: { 197 | url: { 198 | type: 'string', 199 | description: 'The URL to load', 200 | }, 201 | scripts: { 202 | type: ['string', 'array'], 203 | items: { type: 'string' }, 204 | description: 205 | 'JavaScript to execute. Use "return" to get values back! Each string runs separately. Returns appear in results array. Examples: "return document.title", "return document.querySelectorAll(\'a\').length", "return {url: location.href, links: [...document.links].map(a => a.href)}". Use proper JS syntax: real quotes, no HTML entities.', 206 | }, 207 | }, 208 | required: ['url', 'scripts'], 209 | }, 210 | }, 211 | { 212 | name: 'batch_crawl', 213 | description: 214 | '[STATELESS] Crawl multiple URLs concurrently for efficiency. Use when: processing URL lists, comparing multiple pages, or bulk data extraction. Faster than sequential crawling. Max 5 concurrent by default. Each URL gets a fresh browser. Cannot maintain state between URLs. For persistent operations use create_session + crawl.', 215 | inputSchema: { 216 | type: 'object', 217 | properties: { 218 | urls: { 219 | type: 'array', 220 | items: { type: 'string' }, 221 | description: 'List of URLs to crawl', 222 | }, 223 | max_concurrent: { 224 | type: 'number', 225 | description: 226 | 'Parallel request limit. Higher = faster but more resource intensive. Adjust based on server capacity and rate limits', 227 | default: 5, 228 | }, 229 | remove_images: { 230 | type: 'boolean', 231 | description: 'Remove images from output by excluding img, picture, and svg tags', 232 | default: false, 233 | }, 234 | bypass_cache: { 235 | type: 'boolean', 236 | description: 'Bypass cache for all URLs', 237 | default: false, 238 | }, 239 | }, 240 | required: ['urls'], 241 | }, 242 | }, 243 | { 244 | name: 'smart_crawl', 245 | description: 246 | '[STATELESS] Auto-detect and handle different content types (HTML, sitemap, RSS, text). Use when: URL type is unknown, crawling feeds/sitemaps, or want automatic format handling. Adapts strategy based on content. Creates new browser each time. For persistent operations use create_session + crawl.', 247 | inputSchema: { 248 | type: 'object', 249 | properties: { 250 | url: { 251 | type: 'string', 252 | description: 'The URL to crawl intelligently', 253 | }, 254 | max_depth: { 255 | type: 'number', 256 | description: 'Maximum crawl depth for sitemaps', 257 | default: 2, 258 | }, 259 | follow_links: { 260 | type: 'boolean', 261 | description: 'For sitemaps/RSS: crawl found URLs (max 10). For HTML: no effect', 262 | default: false, 263 | }, 264 | bypass_cache: { 265 | type: 'boolean', 266 | description: 'Force fresh crawl', 267 | default: false, 268 | }, 269 | }, 270 | required: ['url'], 271 | }, 272 | }, 273 | { 274 | name: 'get_html', 275 | description: 276 | '[STATELESS] Get sanitized/processed HTML for inspection and automation planning. Use when: finding form fields/selectors, analyzing page structure before automation, building schemas. Returns cleaned HTML showing element names, IDs, and classes - perfect for identifying selectors for subsequent crawl operations. Commonly used before crawl to find selectors for automation. 
Creates new browser each time.', 277 | inputSchema: { 278 | type: 'object', 279 | properties: { 280 | url: { 281 | type: 'string', 282 | description: 'The URL to extract HTML from', 283 | }, 284 | }, 285 | required: ['url'], 286 | }, 287 | }, 288 | { 289 | name: 'extract_links', 290 | description: 291 | '[STATELESS] Extract and categorize all page links. Use when: building sitemaps, analyzing site structure, finding broken links, or discovering resources. Groups by internal/external/social/documents. Creates new browser each time. For persistent operations use create_session + crawl.', 292 | inputSchema: { 293 | type: 'object', 294 | properties: { 295 | url: { 296 | type: 'string', 297 | description: 'The URL to extract links from', 298 | }, 299 | categorize: { 300 | type: 'boolean', 301 | description: 302 | 'Group links by type: internal (same domain), external, social media, documents (PDF/DOC), images. Helpful for link analysis', 303 | default: true, 304 | }, 305 | }, 306 | required: ['url'], 307 | }, 308 | }, 309 | { 310 | name: 'crawl_recursive', 311 | description: 312 | '[STATELESS] Deep crawl a website following internal links. Use when: mapping entire sites, finding all pages, building comprehensive indexes. Control with max_depth (default 3) and max_pages (default 50). Note: May need JS execution for dynamic sites. Each page gets a fresh browser. For persistent operations use create_session + crawl.', 313 | inputSchema: { 314 | type: 'object', 315 | properties: { 316 | url: { 317 | type: 'string', 318 | description: 'Starting URL to crawl from', 319 | }, 320 | max_depth: { 321 | type: 'number', 322 | description: 'Maximum depth to follow links', 323 | default: 3, 324 | }, 325 | max_pages: { 326 | type: 'number', 327 | description: 'Maximum number of pages to crawl', 328 | default: 50, 329 | }, 330 | include_pattern: { 331 | type: 'string', 332 | description: 333 | 'Regex to match URLs to crawl. Example: ".*\\/blog\\/.*" for blog posts only, ".*\\.html$" for HTML pages', 334 | }, 335 | exclude_pattern: { 336 | type: 'string', 337 | description: 338 | 'Regex to skip URLs. Example: ".*\\/(login|admin).*" to avoid auth pages, ".*\\.pdf$" to skip PDFs', 339 | }, 340 | }, 341 | required: ['url'], 342 | }, 343 | }, 344 | { 345 | name: 'parse_sitemap', 346 | description: 347 | '[STATELESS] Extract URLs from XML sitemaps. Use when: discovering all site pages, planning crawl strategies, or checking sitemap validity. Supports regex filtering. Try sitemap.xml or robots.txt first. 
Creates new browser each time.', 348 | inputSchema: { 349 | type: 'object', 350 | properties: { 351 | url: { 352 | type: 'string', 353 | description: 'URL of the sitemap (e.g., https://example.com/sitemap.xml)', 354 | }, 355 | filter_pattern: { 356 | type: 'string', 357 | description: 'Optional regex pattern to filter URLs', 358 | }, 359 | }, 360 | required: ['url'], 361 | }, 362 | }, 363 | { 364 | name: 'crawl', 365 | description: 366 | '[SUPPORTS SESSIONS] THE ONLY TOOL WITH BROWSER PERSISTENCE\n\n' + 367 | 'RECOMMENDED PATTERNS:\n' + 368 | '• Inspect first workflow:\n' + 369 | ' 1) get_html(url) → find selectors & verify elements exist\n' + 370 | ' 2) create_session() → "session-123"\n' + 371 | ' 3) crawl({url, session_id: "session-123", js_code: ["action 1"]})\n' + 372 | ' 4) crawl({url: "/page2", session_id: "session-123", js_code: ["action 2"]})\n\n' + 373 | '• Multi-step with state:\n' + 374 | ' 1) create_session() → "session-123"\n' + 375 | ' 2) crawl({url, session_id: "session-123"}) → inspect current state\n' + 376 | ' 3) crawl({url, session_id: "session-123", js_code: ["verified actions"]})\n\n' + 377 | 'WITH session_id: Maintains browser state (cookies, localStorage, page) across calls\n' + 378 | 'WITHOUT session_id: Creates fresh browser each time (like other tools)\n\n' + 379 | 'WHEN TO USE SESSIONS vs STATELESS:\n' + 380 | '• Need state between calls? → create_session + crawl\n' + 381 | '• Just extracting data? → Use stateless tools\n' + 382 | '• Filling forms? → Inspect first, then use sessions\n' + 383 | '• Taking screenshot after JS? → Must use crawl with session\n' + 384 | '• Unsure if elements exist? → Always use get_html first\n\n' + 385 | 'CRITICAL FOR js_code:\n' + 386 | 'RECOMMENDED: Always use screenshot: true when running js_code\n' + 387 | 'This avoids server serialization errors and gives visual confirmation', 388 | inputSchema: { 389 | type: 'object', 390 | properties: { 391 | url: { 392 | type: 'string', 393 | description: 'The URL to crawl', 394 | }, 395 | session_id: { 396 | type: 'string', 397 | description: 398 | 'ENABLES PERSISTENCE: Use SAME ID across all crawl calls to maintain browser state.\n' + 399 | '• First call with ID: Creates persistent browser\n' + 400 | '• Subsequent calls with SAME ID: Reuses browser with all state intact\n' + 401 | '• Different/no ID: Fresh browser (stateless)\n' + 402 | 'WARNING: ONLY works with crawl tool - other tools ignore this parameter', 403 | }, 404 | 405 | // === CORE CONFIGURATION === 406 | browser_type: { 407 | type: 'string', 408 | enum: ['chromium', 'firefox', 'webkit'], 409 | description: 410 | 'Browser engine for crawling. Chromium offers best compatibility, Firefox for specific use cases, WebKit for Safari-like behavior', 411 | default: 'chromium', 412 | }, 413 | viewport_width: { 414 | type: 'number', 415 | description: 'Browser window width in pixels. Affects responsive layouts and content visibility', 416 | default: 1080, 417 | }, 418 | viewport_height: { 419 | type: 'number', 420 | description: 'Browser window height in pixels. Impacts content loading and screenshot dimensions', 421 | default: 600, 422 | }, 423 | user_agent: { 424 | type: 'string', 425 | description: 426 | 'Custom browser identity. Use for: mobile sites (include "Mobile"), avoiding bot detection, or specific browser requirements. 
Example: "Mozilla/5.0 (iPhone...)"', 427 | }, 428 | proxy_server: { 429 | type: 'string', 430 | description: 'Proxy server URL (e.g., "http://proxy.example.com:8080")', 431 | }, 432 | proxy_username: { 433 | type: 'string', 434 | description: 'Proxy authentication username', 435 | }, 436 | proxy_password: { 437 | type: 'string', 438 | description: 'Proxy authentication password', 439 | }, 440 | cookies: { 441 | type: 'array', 442 | items: { 443 | type: 'object', 444 | properties: { 445 | name: { type: 'string', description: 'Cookie name' }, 446 | value: { type: 'string', description: 'Cookie value' }, 447 | domain: { type: 'string', description: 'Domain where cookie is valid' }, 448 | path: { type: 'string', description: 'URL path scope for cookie' }, 449 | }, 450 | required: ['name', 'value', 'domain'], 451 | }, 452 | description: 'Pre-set cookies for authentication or personalization', 453 | }, 454 | headers: { 455 | type: 'object', 456 | description: 'Custom HTTP headers for API keys, auth tokens, or specific server requirements', 457 | }, 458 | 459 | // === CONTENT PROCESSING === 460 | word_count_threshold: { 461 | type: 'number', 462 | description: 463 | 'Min words per text block. Filters out menus, footers, and short snippets. Lower = more content but more noise. Higher = only substantial paragraphs', 464 | default: 200, 465 | }, 466 | excluded_tags: { 467 | type: 'array', 468 | items: { type: 'string' }, 469 | description: 470 | 'HTML tags to remove completely. Common: ["nav", "footer", "aside", "script", "style"]. Cleans up content before extraction', 471 | }, 472 | remove_overlay_elements: { 473 | type: 'boolean', 474 | description: 'Automatically remove popups, modals, and overlays that obscure content', 475 | default: false, 476 | }, 477 | js_code: { 478 | type: ['string', 'array'], 479 | items: { type: 'string' }, 480 | description: 481 | 'JavaScript to execute. Each string runs separately. Use return to get values.\n\n' + 482 | 'IMPORTANT: Always verify elements exist before acting on them!\n' + 483 | 'Use get_html first to find correct selectors, then:\n' + 484 | 'GOOD: ["if (document.querySelector(\'input[name=\\"email\\"]\')) { ... }"]\n' + 485 | 'BAD: ["document.querySelector(\'input[name=\\"email\\"]\').value = \'...\'"]\n\n' + 486 | 'USAGE PATTERNS:\n' + 487 | '1. WITH screenshot/pdf: {js_code: [...], screenshot: true} ✓\n' + 488 | '2. MULTI-STEP: First {js_code: [...], session_id: "x"}, then {js_only: true, session_id: "x"}\n' + 489 | '3. AVOID: {js_code: [...], js_only: true} on first call ✗\n\n' + 490 | 'SELECTOR TIPS: Use get_html first to find:\n' + 491 | ' • name="..." (best for forms)\n' + 492 | ' • id="..." (if unique)\n' + 493 | ' • class="..." 
(careful, may repeat)\n\n' + 494 | 'FORM EXAMPLE WITH VERIFICATION: [\n' + 495 | ' "const emailInput = document.querySelector(\'input[name=\\"email\\"]\');",\n' + 496 | ' "if (emailInput) emailInput.value = \'[email protected]\';",\n' + 497 | ' "const submitBtn = document.querySelector(\'button[type=\\"submit\\"]\');",\n' + 498 | ' "if (submitBtn) submitBtn.click();"\n' + 499 | ']', 500 | }, 501 | js_only: { 502 | type: 'boolean', 503 | description: 504 | 'FOR SUBSEQUENT CALLS ONLY: Reuse existing session without navigation\n' + 505 | 'First call: Use js_code WITHOUT js_only (or with screenshot/pdf)\n' + 506 | 'Later calls: Use js_only=true to run more JS in same session\n' + 507 | 'ERROR: Using js_only=true on first call causes server errors', 508 | default: false, 509 | }, 510 | wait_for: { 511 | type: 'string', 512 | description: 513 | 'Wait for element that loads AFTER initial page load. Format: "css:.selector" or "js:() => condition"\n\n' + 514 | 'WHEN TO USE:\n' + 515 | ' • Dynamic content that loads after page (AJAX, lazy load)\n' + 516 | ' • Elements that appear after animations/transitions\n' + 517 | ' • Content loaded by JavaScript frameworks\n\n' + 518 | 'WHEN NOT TO USE:\n' + 519 | ' • Elements already in initial HTML (forms, static content)\n' + 520 | ' • Standard page elements (just use wait_until: "load")\n' + 521 | ' • Can cause timeouts/errors if element already exists!\n\n' + 522 | 'SELECTOR TIPS: Use get_html first to check if element exists\n' + 523 | 'Examples: "css:.ajax-content", "js:() => document.querySelector(\'.lazy-loaded\')"', 524 | }, 525 | wait_for_timeout: { 526 | type: 'number', 527 | description: 'Maximum milliseconds to wait for condition', 528 | default: 30000, 529 | }, 530 | delay_before_scroll: { 531 | type: 'number', 532 | description: 'Milliseconds to wait before scrolling. Allows initial content to render', 533 | default: 1000, 534 | }, 535 | scroll_delay: { 536 | type: 'number', 537 | description: 'Milliseconds between scroll steps for lazy-loaded content', 538 | default: 500, 539 | }, 540 | process_iframes: { 541 | type: 'boolean', 542 | description: 'Extract content from embedded iframes including videos and forms', 543 | default: false, 544 | }, 545 | exclude_external_links: { 546 | type: 'boolean', 547 | description: 'Remove links pointing to different domains for cleaner content', 548 | default: false, 549 | }, 550 | screenshot: { 551 | type: 'boolean', 552 | description: 'Capture full-page screenshot as base64 PNG', 553 | default: false, 554 | }, 555 | screenshot_directory: { 556 | type: 'string', 557 | description: 558 | "Directory path to save screenshot (e.g., ~/Desktop, /tmp). Do NOT include filename - it will be auto-generated. Large screenshots (>800KB) won't be returned inline when saved.", 559 | }, 560 | pdf: { 561 | type: 'boolean', 562 | description: 'Generate PDF as base64 preserving exact layout', 563 | default: false, 564 | }, 565 | cache_mode: { 566 | type: 'string', 567 | enum: ['ENABLED', 'BYPASS', 'DISABLED'], 568 | description: 569 | 'Cache strategy. ENABLED: Use cache if available. BYPASS: Fetch fresh (recommended). DISABLED: No cache', 570 | default: 'BYPASS', 571 | }, 572 | timeout: { 573 | type: 'number', 574 | description: 'Overall request timeout in milliseconds', 575 | default: 60000, 576 | }, 577 | verbose: { 578 | type: 'boolean', 579 | description: 580 | 'Enable server-side debug logging (not shown in output). Only for troubleshooting. 
Does not affect extraction results', 581 | default: false, 582 | }, 583 | 584 | // === DYNAMIC CONTENT HANDLING === 585 | wait_until: { 586 | type: 'string', 587 | enum: ['domcontentloaded', 'networkidle', 'load'], 588 | description: 589 | 'When to consider page loaded (use INSTEAD of wait_for for initial load):\n' + 590 | '• "domcontentloaded" (default): Fast, DOM ready, use for forms/static content\n' + 591 | '• "load": All resources loaded, use if you need images\n' + 592 | '• "networkidle": Wait for network quiet, use for heavy JS apps\n' + 593 | "WARNING: Don't use wait_for for elements in initial HTML!", 594 | default: 'domcontentloaded', 595 | }, 596 | page_timeout: { 597 | type: 'number', 598 | description: 'Page navigation timeout in milliseconds', 599 | default: 60000, 600 | }, 601 | wait_for_images: { 602 | type: 'boolean', 603 | description: 'Wait for all images to load before extraction', 604 | default: false, 605 | }, 606 | ignore_body_visibility: { 607 | type: 'boolean', 608 | description: 'Skip checking if body element is visible', 609 | default: true, 610 | }, 611 | scan_full_page: { 612 | type: 'boolean', 613 | description: 614 | 'Auto-scroll entire page to trigger lazy loading. WARNING: Can be slow on long pages. Avoid combining with wait_until:"networkidle" or CSS extraction on dynamic sites. Better to use virtual_scroll_config for infinite feeds', 615 | default: false, 616 | }, 617 | remove_forms: { 618 | type: 'boolean', 619 | description: 'Remove all form elements from extracted content', 620 | default: false, 621 | }, 622 | keep_data_attributes: { 623 | type: 'boolean', 624 | description: 'Preserve data-* attributes in cleaned HTML', 625 | default: false, 626 | }, 627 | excluded_selector: { 628 | type: 'string', 629 | description: 630 | 'CSS selector for elements to remove. Comma-separate multiple selectors.\n\n' + 631 | 'SELECTOR STRATEGY: Use get_html first to inspect page structure. 
Look for:\n' + 632 | ' • id attributes (e.g., #cookie-banner)\n' + 633 | ' • CSS classes (e.g., .advertisement, .popup)\n' + 634 | ' • data-* attributes (e.g., [data-type="ad"])\n' + 635 | ' • Element type + attributes (e.g., div[role="banner"])\n\n' + 636 | 'Examples: "#cookie-banner, .advertisement, .social-share"', 637 | }, 638 | only_text: { 639 | type: 'boolean', 640 | description: 'Extract only text content, no HTML structure', 641 | default: false, 642 | }, 643 | 644 | // === OUTPUT OPTIONS === 645 | image_description_min_word_threshold: { 646 | type: 'number', 647 | description: 'Minimum words for image alt text to be considered valid', 648 | default: 50, 649 | }, 650 | image_score_threshold: { 651 | type: 'number', 652 | description: 'Minimum relevance score for images (filters low-quality images)', 653 | default: 3, 654 | }, 655 | exclude_external_images: { 656 | type: 'boolean', 657 | description: 'Exclude images from external domains', 658 | default: false, 659 | }, 660 | screenshot_wait_for: { 661 | type: 'number', 662 | description: 'Extra wait time in seconds before taking screenshot', 663 | }, 664 | 665 | // === LINK & DOMAIN FILTERING === 666 | exclude_social_media_links: { 667 | type: 'boolean', 668 | description: 'Remove links to social media platforms', 669 | default: false, 670 | }, 671 | exclude_domains: { 672 | type: 'array', 673 | items: { type: 'string' }, 674 | description: 'List of domains to exclude from links (e.g., ["ads.com", "tracker.io"])', 675 | }, 676 | 677 | // === PERFORMANCE & ANTI-BOT === 678 | simulate_user: { 679 | type: 'boolean', 680 | description: 681 | 'Mimic human behavior with random mouse movements and delays. Helps bypass bot detection on protected sites. Slows crawling but improves success rate', 682 | default: false, 683 | }, 684 | override_navigator: { 685 | type: 'boolean', 686 | description: 'Override navigator properties for stealth', 687 | default: false, 688 | }, 689 | magic: { 690 | type: 'boolean', 691 | description: 692 | 'EXPERIMENTAL: Auto-handles popups, cookies, overlays.\n' + 693 | 'Use as LAST RESORT - can conflict with wait_for & CSS extraction\n' + 694 | 'Try first: remove_overlay_elements, excluded_selector\n' + 695 | 'Avoid with: CSS extraction, precise timing needs', 696 | default: false, 697 | }, 698 | 699 | // Virtual Scroll Configuration 700 | virtual_scroll_config: { 701 | type: 'object', 702 | description: 703 | 'For infinite scroll sites that REPLACE content (Twitter/Instagram feeds).\n' + 704 | 'USE when: Content disappears as you scroll (virtual scrolling)\n' + 705 | "DON'T USE when: Content appends (use scan_full_page instead)\n" + 706 | 'Example: {container_selector: "#timeline", scroll_count: 10, wait_after_scroll: 1}', 707 | properties: { 708 | container_selector: { 709 | type: 'string', 710 | description: 711 | 'CSS selector for the scrollable container.\n\n' + 712 | 'SELECTOR STRATEGY: Use get_html first to inspect page structure. Look for:\n' + 713 | ' • id attributes (e.g., #timeline)\n' + 714 | ' • role attributes (e.g., [role="feed"])\n' + 715 | ' • CSS classes (e.g., .feed, .timeline)\n' + 716 | ' • data-* attributes (e.g., [data-testid="primaryColumn"])\n\n' + 717 | 'Common: "#timeline" (Twitter), "[role=\'feed\']" (generic), ".feed" (Instagram)', 718 | }, 719 | scroll_count: { 720 | type: 'number', 721 | description: 722 | 'How many times to scroll. Each scroll loads new content batch. 
723 |                   default: 10,
724 |                 },
725 |                 scroll_by: {
726 |                   type: ['string', 'number'],
727 |                   description:
728 |                     'Distance per scroll. "container_height": one viewport, "page_height": full page, or pixels like 500',
729 |                   default: 'container_height',
730 |                 },
731 |                 wait_after_scroll: {
732 |                   type: 'number',
733 |                   description: 'Seconds to wait after each scroll',
734 |                   default: 0.5,
735 |                 },
736 |               },
737 |               required: ['container_selector'],
738 |             },
739 | 
740 |             // Other
741 |             log_console: {
742 |               type: 'boolean',
743 |               description: 'Capture browser console logs for debugging',
744 |               default: false,
745 |             },
746 |           },
747 |           required: ['url'],
748 |         },
749 |       },
750 |       {
751 |         name: 'manage_session',
752 |         description:
753 |           '[SESSION MANAGEMENT] Unified tool for managing browser sessions. Supports three actions:\n\n' +
754 |           '• CREATE: Start a persistent browser session that maintains state across calls\n' +
755 |           '• CLEAR: Remove a session from local tracking\n' +
756 |           '• LIST: Show all active sessions with age and usage info\n\n' +
757 |           'USAGE EXAMPLES:\n' +
758 |           '1. Create session: {action: "create", session_id: "my-session", initial_url: "https://example.com"}\n' +
759 |           '2. Clear session: {action: "clear", session_id: "my-session"}\n' +
760 |           '3. List sessions: {action: "list"}\n\n' +
761 |           'Browser sessions maintain ALL state (cookies, localStorage, page) across multiple crawl calls. Essential for: forms, login flows, multi-step processes, maintaining state across operations.',
762 |         inputSchema: {
763 |           // Anthropic/Claude tools require top-level schemas to be a plain object without oneOf/allOf/anyOf
764 |           type: 'object',
765 |           properties: {
766 |             action: {
767 |               type: 'string',
768 |               description: 'Action to perform: create, clear, or list',
769 |               enum: ['create', 'clear', 'list'],
770 |             },
771 |             session_id: {
772 |               type: 'string',
773 |               description:
774 |                 'Session identifier. Required for action="clear". Optional for create (auto-generated if omitted).',
775 |             },
776 |             initial_url: {
777 |               type: 'string',
778 |               description: 'URL to load when creating session (action="create").',
779 |             },
780 |             browser_type: {
781 |               type: 'string',
782 |               enum: ['chromium', 'firefox', 'webkit'],
783 |               description: 'Browser engine for the session (action="create").',
784 |               default: 'chromium',
785 |             },
786 |           },
787 |           required: ['action'],
788 |         },
789 |       },
790 |       {
791 |         name: 'extract_with_llm',
792 |         description:
793 |           '[STATELESS] Ask questions about webpage content using AI. Returns natural language answers. ' +
794 |           'Crawls fresh each time. For dynamic content or sessions, use crawl with session_id first.',
795 |         inputSchema: {
796 |           type: 'object',
797 |           properties: {
798 |             url: {
799 |               type: 'string',
800 |               description: 'The URL to extract data from',
801 |             },
802 |             query: {
803 |               type: 'string',
804 |               description:
805 |                 'Your question about the webpage content. Examples: "What is the main topic?", ' +
Examples: "What is the main topic?", ' + 806 | '"List all product prices", "Summarize the key points", "What contact information is available?"', 807 | }, 808 | }, 809 | required: ['url', 'query'], 810 | }, 811 | }, 812 | ], 813 | })); 814 | 815 | // Handle tool calls 816 | this.server.setRequestHandler(CallToolRequestSchema, async (request) => { 817 | const { name, arguments: args } = request.params; 818 | 819 | try { 820 | switch (name) { 821 | case 'get_markdown': 822 | return await this.validateAndExecute( 823 | 'get_markdown', 824 | args, 825 | GetMarkdownSchema as z.ZodSchema<z.infer<typeof GetMarkdownSchema>>, 826 | async (validatedArgs) => this.contentHandlers.getMarkdown(validatedArgs), 827 | ); 828 | 829 | case 'capture_screenshot': 830 | return await this.validateAndExecute( 831 | 'capture_screenshot', 832 | args, 833 | CaptureScreenshotSchema, 834 | async (validatedArgs) => this.contentHandlers.captureScreenshot(validatedArgs), 835 | ); 836 | 837 | case 'generate_pdf': 838 | return await this.validateAndExecute('generate_pdf', args, GeneratePdfSchema, async (validatedArgs) => 839 | this.contentHandlers.generatePDF(validatedArgs), 840 | ); 841 | 842 | case 'execute_js': 843 | return await this.validateAndExecute('execute_js', args, ExecuteJsSchema, async (validatedArgs) => 844 | this.utilityHandlers.executeJS(validatedArgs), 845 | ); 846 | 847 | case 'batch_crawl': 848 | return await this.validateAndExecute('batch_crawl', args, BatchCrawlSchema, async (validatedArgs) => 849 | this.crawlHandlers.batchCrawl(validatedArgs as BatchCrawlOptions), 850 | ); 851 | 852 | case 'smart_crawl': 853 | return await this.validateAndExecute('smart_crawl', args, SmartCrawlSchema, async (validatedArgs) => 854 | this.crawlHandlers.smartCrawl(validatedArgs), 855 | ); 856 | 857 | case 'get_html': 858 | return await this.validateAndExecute('get_html', args, GetHtmlSchema, async (validatedArgs) => 859 | this.contentHandlers.getHTML(validatedArgs), 860 | ); 861 | 862 | case 'extract_links': 863 | return await this.validateAndExecute( 864 | 'extract_links', 865 | args, 866 | ExtractLinksSchema as z.ZodSchema<z.infer<typeof ExtractLinksSchema>>, 867 | async (validatedArgs) => this.utilityHandlers.extractLinks(validatedArgs), 868 | ); 869 | 870 | case 'crawl_recursive': 871 | return await this.validateAndExecute('crawl_recursive', args, CrawlRecursiveSchema, async (validatedArgs) => 872 | this.crawlHandlers.crawlRecursive(validatedArgs), 873 | ); 874 | 875 | case 'parse_sitemap': 876 | return await this.validateAndExecute('parse_sitemap', args, ParseSitemapSchema, async (validatedArgs) => 877 | this.crawlHandlers.parseSitemap(validatedArgs), 878 | ); 879 | 880 | case 'crawl': 881 | return await this.validateAndExecute('crawl', args, CrawlSchema, async (validatedArgs) => 882 | this.crawlHandlers.crawl(validatedArgs), 883 | ); 884 | 885 | case 'manage_session': 886 | return await this.validateAndExecute('manage_session', args, ManageSessionSchema, async (validatedArgs) => 887 | this.sessionHandlers.manageSession(validatedArgs), 888 | ); 889 | 890 | case 'extract_with_llm': 891 | return await this.validateAndExecute( 892 | 'extract_with_llm', 893 | args, 894 | ExtractWithLlmSchema, 895 | async (validatedArgs) => this.contentHandlers.extractWithLLM(validatedArgs), 896 | ); 897 | 898 | default: 899 | throw new Error(`Unknown tool: ${name}`); 900 | } 901 | } catch (error) { 902 | return { 903 | content: [ 904 | { 905 | type: 'text', 906 | text: `Error: ${error instanceof Error ? 
907 |             },
908 |           ],
909 |         };
910 |       }
911 |     });
912 |   }
913 | 
914 |   // Expose handler methods for testing
915 |   protected async getMarkdown(options: Parameters<ContentHandlers['getMarkdown']>[0]) {
916 |     return this.contentHandlers.getMarkdown(options);
917 |   }
918 | 
919 |   protected async captureScreenshot(options: Parameters<ContentHandlers['captureScreenshot']>[0]) {
920 |     return this.contentHandlers.captureScreenshot(options);
921 |   }
922 | 
923 |   protected async generatePDF(options: Parameters<ContentHandlers['generatePDF']>[0]) {
924 |     return this.contentHandlers.generatePDF(options);
925 |   }
926 | 
927 |   protected async getHTML(options: Parameters<ContentHandlers['getHTML']>[0]) {
928 |     return this.contentHandlers.getHTML(options);
929 |   }
930 | 
931 |   protected async extractWithLLM(options: Parameters<ContentHandlers['extractWithLLM']>[0]) {
932 |     return this.contentHandlers.extractWithLLM(options);
933 |   }
934 | 
935 |   protected async executeJS(options: Parameters<UtilityHandlers['executeJS']>[0]) {
936 |     return this.utilityHandlers.executeJS(options);
937 |   }
938 | 
939 |   protected async extractLinks(options: Parameters<UtilityHandlers['extractLinks']>[0]) {
940 |     return this.utilityHandlers.extractLinks(options);
941 |   }
942 | 
943 |   protected async batchCrawl(options: Parameters<CrawlHandlers['batchCrawl']>[0]) {
944 |     return this.crawlHandlers.batchCrawl(options);
945 |   }
946 | 
947 |   protected async smartCrawl(options: Parameters<CrawlHandlers['smartCrawl']>[0]) {
948 |     return this.crawlHandlers.smartCrawl(options);
949 |   }
950 | 
951 |   protected async crawlRecursive(options: Parameters<CrawlHandlers['crawlRecursive']>[0]) {
952 |     return this.crawlHandlers.crawlRecursive(options);
953 |   }
954 | 
955 |   protected async parseSitemap(options: Parameters<CrawlHandlers['parseSitemap']>[0]) {
956 |     return this.crawlHandlers.parseSitemap(options);
957 |   }
958 | 
959 |   protected async crawl(options: Parameters<CrawlHandlers['crawl']>[0]) {
960 |     return this.crawlHandlers.crawl(options);
961 |   }
962 | 
963 |   // Setter for axiosClient to update all handlers (for testing)
964 |   set axiosClientForTesting(client: AxiosInstance) {
965 |     this.axiosClient = client;
966 |     // Re-initialize handlers with new client
967 |     this.contentHandlers = new ContentHandlers(this.service, client, this.sessions);
968 |     this.sessionHandlers = new SessionHandlers(this.service, client, this.sessions);
969 |     this.utilityHandlers = new UtilityHandlers(this.service, client, this.sessions);
970 |     this.crawlHandlers = new CrawlHandlers(this.service, client, this.sessions);
971 |   }
972 | 
973 |   /* istanbul ignore next */
974 |   async start() {
975 |     const transport = new StdioServerTransport();
976 |     await this.server.connect(transport);
977 |     console.error(`${this.serverName} v${this.serverVersion} started`);
978 |   }
979 | }
980 | 
```
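For orientation, below is a minimal sketch of argument payloads an MCP client might send to the tools registered above. The URLs, session ID, and selector values are illustrative placeholders, not values taken from this repository; each object simply mirrors the corresponding `inputSchema`, and the server validates it via `validateAndExecute` against the matching Zod schema (`ManageSessionSchema`, `CrawlSchema`, `ExtractWithLlmSchema`) before dispatching to a handler.

```typescript
// Illustrative tool-call arguments matching the inputSchema definitions above.
// All concrete values (URLs, session id, selectors) are placeholders.

// manage_session, action "create": start a persistent browser session.
const createSessionArgs = {
  action: 'create',
  session_id: 'demo-session',          // optional; auto-generated if omitted
  initial_url: 'https://example.com',  // loaded when the session is created
  browser_type: 'chromium',            // 'chromium' | 'firefox' | 'webkit'
};

// crawl: reuse the session and drive an infinite-scroll feed via virtual_scroll_config.
const crawlFeedArgs = {
  url: 'https://example.com/feed',
  session_id: 'demo-session',           // ties the crawl to the session created above
  wait_until: 'domcontentloaded',
  virtual_scroll_config: {
    container_selector: '[role="feed"]', // required; inspect the page with get_html first
    scroll_count: 10,
    scroll_by: 'container_height',
    wait_after_scroll: 0.5,
  },
};

// extract_with_llm: stateless question answering over a freshly crawled page.
const extractArgs = {
  url: 'https://example.com/pricing',
  query: 'List all product prices',
};
```

If any payload fails schema validation or a handler throws, the catch block above returns the error as a plain text content item (`Error: ...`) rather than crashing the server.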