This is page 2 of 4. Use http://codebase.md/omgwtfwow/mcp-crawl4ai-ts?lines=true&page={x} to view the full context.

# Directory Structure

```
├── .env.example
├── .github
│   ├── CI.md
│   ├── copilot-instructions.md
│   └── workflows
│       └── ci.yml
├── .gitignore
├── .prettierignore
├── .prettierrc.json
├── CHANGELOG.md
├── eslint.config.mjs
├── jest.config.cjs
├── jest.setup.cjs
├── LICENSE
├── package-lock.json
├── package.json
├── README.md
├── src
│   ├── __tests__
│   │   ├── crawl.test.ts
│   │   ├── crawl4ai-service.network.test.ts
│   │   ├── crawl4ai-service.test.ts
│   │   ├── handlers
│   │   │   ├── crawl-handlers.test.ts
│   │   │   ├── parameter-combinations.test.ts
│   │   │   ├── screenshot-saving.test.ts
│   │   │   ├── session-handlers.test.ts
│   │   │   └── utility-handlers.test.ts
│   │   ├── index.cli.test.ts
│   │   ├── index.npx.test.ts
│   │   ├── index.server.test.ts
│   │   ├── index.test.ts
│   │   ├── integration
│   │   │   ├── batch-crawl.integration.test.ts
│   │   │   ├── capture-screenshot.integration.test.ts
│   │   │   ├── crawl-advanced.integration.test.ts
│   │   │   ├── crawl-handlers.integration.test.ts
│   │   │   ├── crawl-recursive.integration.test.ts
│   │   │   ├── crawl.integration.test.ts
│   │   │   ├── execute-js.integration.test.ts
│   │   │   ├── extract-links.integration.test.ts
│   │   │   ├── extract-with-llm.integration.test.ts
│   │   │   ├── generate-pdf.integration.test.ts
│   │   │   ├── get-html.integration.test.ts
│   │   │   ├── get-markdown.integration.test.ts
│   │   │   ├── parse-sitemap.integration.test.ts
│   │   │   ├── session-management.integration.test.ts
│   │   │   ├── smart-crawl.integration.test.ts
│   │   │   └── test-utils.ts
│   │   ├── request-handler.test.ts
│   │   ├── schemas
│   │   │   └── validation-edge-cases.test.ts
│   │   ├── types
│   │   │   └── mocks.ts
│   │   └── utils
│   │       └── javascript-validation.test.ts
│   ├── crawl4ai-service.ts
│   ├── handlers
│   │   ├── base-handler.ts
│   │   ├── content-handlers.ts
│   │   ├── crawl-handlers.ts
│   │   ├── session-handlers.ts
│   │   └── utility-handlers.ts
│   ├── index.ts
│   ├── schemas
│   │   ├── helpers.ts
│   │   └── validation-schemas.ts
│   ├── server.ts
│   └── types.ts
├── tsconfig.build.json
└── tsconfig.json
```

# Files

--------------------------------------------------------------------------------
/src/__tests__/index.npx.test.ts:
--------------------------------------------------------------------------------

```typescript
  1 | import { spawn } from 'child_process';
  2 | import * as path from 'path';
  3 | import * as url from 'url';
  4 | import * as fs from 'fs/promises';
  5 |
  6 | const __dirname = url.fileURLToPath(new URL('.', import.meta.url));
  7 |
  8 | describe('NPX Execution Tests', () => {
  9 |   // These tests ensure the package works when installed and run via npx
 10 |   // This prevents issues like the one in v2.6.11 where the server wouldn't start
 11 |
 12 |   describe('Simulated NPX execution', () => {
 13 |     it('should start server when run from dist/index.js directly', async () => {
 14 |       // This simulates how npx runs the built package
 15 |       const distIndexPath = path.join(__dirname, '..', '..', 'dist', 'index.js');
 16 |
 17 |       // Check if dist/index.js exists (it should after build)
 18 |       try {
 19 |         await fs.access(distIndexPath);
 20 |       } catch {
 21 |         console.warn('Skipping test - dist/index.js not found. Run "npm run build" first.');
 22 |         return;
 23 |       }
 24 |
 25 |       const child = spawn('node', [distIndexPath], {
 26 |         env: {
 27 |           ...process.env,
 28 |           CRAWL4AI_BASE_URL: 'http://localhost:11235',
 29 |           CRAWL4AI_API_KEY: 'test-key',
 30 |           // Don't load .env file to simulate production
 31 |           NODE_ENV: 'production',
 32 |         },
 33 |         stdio: 'pipe',
 34 |       });
 35 |
 36 |       let stderr = '';
 37 |       child.stderr.on('data', (data) => {
 38 |         stderr += data.toString();
 39 |       });
 40 |
 41 |       // Wait for server to start
 42 |       await new Promise<void>((resolve) => {
 43 |         const timeout = setTimeout(() => {
 44 |           child.kill();
 45 |           resolve();
 46 |         }, 2000);
 47 |
 48 |         child.stderr.on('data', (data) => {
 49 |           const output = data.toString();
 50 |           if (output.includes('started')) {
 51 |             clearTimeout(timeout);
 52 |             child.kill();
 53 |             resolve();
 54 |           }
 55 |         });
 56 |       });
 57 |
 58 |       // Server should have started successfully
 59 |       expect(stderr).toContain('crawl4ai-mcp');
 60 |       expect(stderr).toContain('started');
 61 |     });
 62 |
 63 |     it('should start server without dotenv when env vars are provided', async () => {
 64 |       // This tests that we don't require dotenv in production
 65 |       const distIndexPath = path.join(__dirname, '..', '..', 'dist', 'index.js');
 66 |
 67 |       try {
 68 |         await fs.access(distIndexPath);
 69 |       } catch {
 70 |         console.warn('Skipping test - dist/index.js not found.');
 71 |         return;
 72 |       }
 73 |
 74 |       // Temporarily rename node_modules/dotenv to simulate it not being available
 75 |       const dotenvPath = path.join(__dirname, '..', '..', 'node_modules', 'dotenv');
 76 |       const dotenvBackupPath = path.join(__dirname, '..', '..', 'node_modules', 'dotenv.backup');
 77 |
 78 |       let dotenvRenamed = false;
 79 |       try {
 80 |         // Only rename if dotenv exists
 81 |         try {
 82 |           await fs.access(dotenvPath);
 83 |           await fs.rename(dotenvPath, dotenvBackupPath);
 84 |           dotenvRenamed = true;
 85 |         } catch {
 86 |           // dotenv doesn't exist, which is fine for this test
 87 |         }
 88 |
 89 |         const child = spawn('node', [distIndexPath], {
 90 |           env: {
 91 |             CRAWL4AI_BASE_URL: 'http://localhost:11235',
 92 |             CRAWL4AI_API_KEY: 'test-key',
 93 |             PATH: process.env.PATH,
 94 |           },
 95 |           stdio: 'pipe',
 96 |         });
 97 |
 98 |         let stderr = '';
 99 |         child.stderr.on('data', (data) => {
100 |           stderr += data.toString();
101 |         });
102 |
103 |         // Wait for server to start
104 |         await new Promise<void>((resolve) => {
105 |           setTimeout(() => {
106 |             child.kill();
107 |             resolve();
108 |           }, 2000);
109 |         });
110 |
111 |         // Server should still start even without dotenv
112 |         expect(stderr).toContain('crawl4ai-mcp');
113 |         expect(stderr).toContain('started');
114 |       } finally {
115 |         // Restore dotenv if we renamed it
116 |         if (dotenvRenamed) {
117 |           await fs.rename(dotenvBackupPath, dotenvPath);
118 |         }
119 |       }
120 |     });
121 |
122 |     it('should handle MCP protocol initialization', async () => {
123 |       // This simulates the full MCP handshake that Claude Desktop does
124 |       const distIndexPath = path.join(__dirname, '..', '..', 'dist', 'index.js');
125 |
126 |       try {
127 |         await fs.access(distIndexPath);
128 |       } catch {
129 |         console.warn('Skipping test - dist/index.js not found.');
130 |         return;
131 |       }
132 |
133 |       const child = spawn('node', [distIndexPath], {
134 |         env: {
135 |           ...process.env,
136 |           CRAWL4AI_BASE_URL: 'http://localhost:11235',
137 |           CRAWL4AI_API_KEY: 'test-key',
138 |         },
139 |         stdio: 'pipe',
140 |       });
141 |
142 |       let stdout = '';
143 |       let stderr = '';
144 |
145 |       child.stdout.on('data', (data) => {
146 |         stdout += data.toString();
147 |       });
148 |
149 |       child.stderr.on('data', (data) => {
150 |         stderr += data.toString();
151 |       });
152 |
153 |       // Wait for server to start
154 |       await new Promise((resolve) => setTimeout(resolve, 500));
155 |
156 |       // Send MCP initialization request (like Claude Desktop does)
157 |       const initRequest =
158 |         JSON.stringify({
159 |           jsonrpc: '2.0',
160 |           method: 'initialize',
161 |           params: {
162 |             protocolVersion: '2025-06-18',
163 |             capabilities: {},
164 |             clientInfo: {
165 |               name: 'test-client',
166 |               version: '1.0.0',
167 |             },
168 |           },
169 |           id: 1,
170 |         }) + '\n';
171 |
172 |       child.stdin.write(initRequest);
173 |
174 |       // Wait for response
175 |       await new Promise((resolve) => setTimeout(resolve, 1000));
176 |
177 |       // Parse the response
178 |       const response = stdout.trim().split('\n').pop();
179 |       let parsed;
180 |       try {
181 |         parsed = JSON.parse(response || '{}');
182 |       } catch {
183 |         // Response might not be valid JSON yet
184 |         parsed = {};
185 |       }
186 |
187 |       child.kill();
188 |
189 |       // Should have received an initialization response
190 |       expect(stderr).toContain('started');
191 |       expect(parsed.id).toBe(1);
192 |       expect(parsed.result).toBeDefined();
193 |     });
194 |
195 |     it('should fail gracefully when CRAWL4AI_BASE_URL is missing', async () => {
196 |       const distIndexPath = path.join(__dirname, '..', '..', 'dist', 'index.js');
197 |
198 |       try {
199 |         await fs.access(distIndexPath);
200 |       } catch {
201 |         console.warn('Skipping test - dist/index.js not found.');
202 |         return;
203 |       }
204 |
205 |       const child = spawn('node', [distIndexPath], {
206 |         env: {
207 |           // Explicitly set to empty string to prevent dotenv from loading
208 |           CRAWL4AI_BASE_URL: '',
209 |           PATH: process.env.PATH,
210 |         },
211 |         stdio: 'pipe',
212 |       });
213 |
214 |       let stderr = '';
215 |       child.stderr.on('data', (data) => {
216 |         stderr += data.toString();
217 |       });
218 |
219 |       const exitCode = await new Promise<number | null>((resolve, reject) => {
220 |         // Add timeout to prevent hanging
221 |         const timeout = setTimeout(() => {
222 |           child.kill('SIGTERM');
223 |           reject(new Error('Process timeout'));
224 |         }, 10000); // 10 second timeout
225 |
226 |         child.on('exit', (code) => {
227 |           clearTimeout(timeout);
228 |           resolve(code);
229 |         });
230 |
231 |         child.on('error', (err) => {
232 |           clearTimeout(timeout);
233 |           reject(err);
234 |         });
235 |       });
236 |
237 |       // Should exit with error code
238 |       expect(exitCode).toBe(1);
239 |       expect(stderr).toContain('CRAWL4AI_BASE_URL environment variable is required');
240 |
241 |       // Ensure cleanup
242 |       child.kill();
243 |     }, 15000); // 15 second test timeout
244 |   });
245 |
246 |   describe('NPX-specific edge cases', () => {
247 |     it('should work with different Node.js execution paths', async () => {
248 |       // NPX might use different paths for node execution
249 |       const distIndexPath = path.join(__dirname, '..', '..', 'dist', 'index.js');
250 |
251 |       try {
252 |         await fs.access(distIndexPath);
253 |       } catch {
254 |         console.warn('Skipping test - dist/index.js not found.');
255 |         return;
256 |       }
257 |
258 |       // Test with different argv[1] values that npx might use
259 |       const testPaths = [
260 |         distIndexPath,
261 |         '/tmp/npx-12345/node_modules/.bin/mcp-crawl4ai-ts',
262 |         path.join(process.env.HOME || '', '.npm/_npx/12345/node_modules/mcp-crawl4ai-ts/dist/index.js'),
263 |       ];
264 |
265 |       for (const testPath of testPaths) {
266 |         const child = spawn('node', [distIndexPath], {
267 |           env: {
268 |             ...process.env,
269 |             CRAWL4AI_BASE_URL: 'http://localhost:11235',
270 |             // Simulate different execution contexts
271 |             npm_execpath: testPath,
272 |           },
273 |           stdio: 'pipe',
274 |         });
275 |
276 |         let started = false;
277 |         child.stderr.on('data', (data) => {
278 |           if (data.toString().includes('started')) {
279 |             started = true;
280 |           }
281 |         });
282 |
283 |         // Give it time to start
284 |         await new Promise((resolve) => setTimeout(resolve, 500));
285 |         child.kill();
286 |
287 |         expect(started).toBe(true);
288 |       }
289 |     });
290 |   });
291 | });
292 |
```
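For orientation (an illustrative sketch, not a file in this repository): the handshake these tests simulate is plain newline-delimited JSON-RPC 2.0 over stdio. A minimal client, assuming the package has been built to `dist/index.js` and a Crawl4AI server URL is available:

```typescript
import { spawn } from 'child_process';

// Start the MCP server the same way npx would: plain `node dist/index.js`.
const child = spawn('node', ['dist/index.js'], {
  env: { ...process.env, CRAWL4AI_BASE_URL: 'http://localhost:11235' },
  stdio: 'pipe',
});

// MCP over stdio expects `initialize` as the first request.
child.stdin.write(
  JSON.stringify({
    jsonrpc: '2.0',
    method: 'initialize',
    params: {
      protocolVersion: '2025-06-18',
      capabilities: {},
      clientInfo: { name: 'example-client', version: '1.0.0' },
    },
    id: 1,
  }) + '\n',
);

// The response arrives on stdout as a single JSON line carrying the same id.
child.stdout.once('data', (data) => {
  console.log(JSON.parse(data.toString().trim().split('\n')[0]));
  child.kill();
});
```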
--------------------------------------------------------------------------------
/src/types.ts:
--------------------------------------------------------------------------------

```typescript
  1 | export interface CrawlOptions {
  2 |   remove_images?: boolean;
  3 |   bypass_cache?: boolean;
  4 |   filter_mode?: 'blacklist' | 'whitelist';
  5 |   filter_list?: string[];
  6 |   screenshot?: boolean;
  7 |   wait_for?: string;
  8 |   timeout?: number;
  9 | }
 10 |
 11 | export interface JSExecuteOptions {
 12 |   js_code: string | string[];
 13 |   // Only url and js_code (scripts) are supported by /execute_js endpoint
 14 | }
 15 |
 16 | export interface JSExecuteEndpointOptions {
 17 |   url: string;
 18 |   scripts: string | string[];
 19 |   // Only url and scripts are supported by /execute_js endpoint
 20 | }
 21 |
 22 | export interface JSExecuteEndpointResponse {
 23 |   success: boolean;
 24 |   js_execution_result: {
 25 |     success: boolean;
 26 |     results: unknown[];
 27 |   };
 28 |   markdown?: string | CrawlMarkdownResult;
 29 | }
 30 |
 31 | export interface ScreenshotEndpointOptions {
 32 |   url: string;
 33 |   screenshot_wait_for?: number;
 34 |   save_to_directory?: string;
 35 |   // output_path is omitted to get base64 response
 36 | }
 37 |
 38 | export interface ScreenshotEndpointResponse {
 39 |   success: boolean;
 40 |   screenshot: string; // base64 encoded image
 41 | }
 42 |
 43 | export interface PDFEndpointOptions {
 44 |   url: string;
 45 |   // Only url is supported by /pdf endpoint
 46 | }
 47 |
 48 | export interface PDFEndpointResponse {
 49 |   success: boolean;
 50 |   pdf: string; // base64 encoded PDF
 51 | }
 52 |
 53 | export interface HTMLEndpointOptions {
 54 |   url: string;
 55 |   // Only url is supported by /html endpoint
 56 | }
 57 |
 58 | export interface HTMLEndpointResponse {
 59 |   html: string;
 60 |   url: string;
 61 |   success: boolean;
 62 | }
 63 |
 64 | export type FilterType = 'raw' | 'fit' | 'bm25' | 'llm';
 65 |
 66 | export interface MarkdownEndpointOptions {
 67 |   url: string;
 68 |   f?: FilterType; // Filter type: raw, fit (default), bm25, llm
 69 |   q?: string; // Query string for bm25/llm filters
 70 |   c?: string; // Cache-bust parameter
 71 | }
 72 |
 73 | export interface MarkdownEndpointResponse {
 74 |   url: string;
 75 |   filter: string;
 76 |   query: string | null;
 77 |   cache: string;
 78 |   markdown: string;
 79 |   success: boolean;
 80 | }
 81 |
 82 | export interface LLMEndpointOptions {
 83 |   url: string;
 84 |   query: string;
 85 | }
 86 |
 87 | export interface LLMEndpointResponse {
 88 |   answer: string;
 89 | }
 90 |
 91 | export interface BatchCrawlOptions extends CrawlOptions {
 92 |   urls: string[];
 93 |   max_concurrent?: number;
 94 |   // New: Support per-URL configs array (0.7.3/0.7.4)
 95 |   configs?: Array<{
 96 |     url: string;
 97 |     browser_config?: BrowserConfig;
 98 |     crawler_config?: CrawlerConfig;
 99 |     extraction_strategy?: ExtractionStrategy;
100 |     table_extraction_strategy?: TableExtractionStrategy;
101 |     markdown_generator_options?: MarkdownGeneratorOptions;
102 |     matcher?: string | ((url: string) => boolean);
103 |   }>;
104 | }
105 |
106 | // Browser configuration options
107 | export interface BrowserConfig {
108 |   browser_type?: 'chromium' | 'firefox' | 'webkit' | 'undetected';
109 |   headless?: boolean;
110 |   viewport_width?: number;
111 |   viewport_height?: number;
112 |   user_agent?: string;
113 |   // Unified proxy config - accepts string or object format (new in 0.7.3/0.7.4)
114 |   proxy?:
115 |     | string
116 |     | {
117 |         server: string;
118 |         username?: string;
119 |         password?: string;
120 |       };
121 |   // Legacy field kept for backward compatibility
122 |   proxy_config?: {
123 |     server: string;
124 |     username?: string;
125 |     password?: string;
126 |   };
127 |   cookies?: Array<{
128 |     name: string;
129 |     value: string;
130 |     domain: string;
131 |     path?: string;
132 |   }>;
133 |   headers?: Record<string, string>;
134 |   extra_args?: string[];
135 | }
136 |
137 | // Virtual scroll configuration for sites like Twitter/Instagram
138 | export interface VirtualScrollConfig {
139 |   container_selector: string;
140 |   scroll_count?: number;
141 |   scroll_by?: string | number;
142 |   wait_after_scroll?: number;
143 | }
144 |
145 | // Crawler configuration options
146 | export interface CrawlerConfig {
147 |   // Content filtering
148 |   word_count_threshold?: number;
149 |   excluded_tags?: string[];
150 |   excluded_selector?: string;
151 |   remove_overlay_elements?: boolean;
152 |   only_text?: boolean;
153 |   remove_forms?: boolean;
154 |   keep_data_attributes?: boolean;
155 |
156 |   // JavaScript execution
157 |   js_code?: string | string[];
158 |   js_only?: boolean;
159 |   wait_for?: string;
160 |   wait_for_timeout?: number;
161 |
162 |   // Page navigation & timing
163 |   wait_until?: 'domcontentloaded' | 'networkidle' | 'load';
164 |   page_timeout?: number;
165 |   wait_for_images?: boolean;
166 |   ignore_body_visibility?: boolean;
167 |
168 |   // Dynamic content handling
169 |   delay_before_scroll?: number;
170 |   scroll_delay?: number;
171 |   scan_full_page?: boolean;
172 |   virtual_scroll_config?: VirtualScrollConfig;
173 |
174 |   // Content processing
175 |   process_iframes?: boolean;
176 |   exclude_external_links?: boolean;
177 |
178 |   // Media handling
179 |   screenshot?: boolean;
180 |   screenshot_wait_for?: number;
181 |   pdf?: boolean;
182 |   capture_mhtml?: boolean;
183 |   image_description_min_word_threshold?: number;
184 |   image_score_threshold?: number;
185 |   exclude_external_images?: boolean;
186 |
187 |   // Link filtering
188 |   exclude_social_media_links?: boolean;
189 |   exclude_domains?: string[];
190 |
191 |   // Page interaction
192 |   simulate_user?: boolean;
193 |   override_navigator?: boolean;
194 |   magic?: boolean;
195 |
196 |   // Session management
197 |   session_id?: string;
198 |
199 |   // Cache control
200 |   cache_mode?: 'ENABLED' | 'BYPASS' | 'DISABLED';
201 |
202 |   // Performance options
203 |   timeout?: number;
204 |   verbose?: boolean;
205 |
206 |   // Debug
207 |   log_console?: boolean;
208 |
209 |   // New parameters from 0.7.3/0.7.4
210 |   delay_before_return_html?: number; // Delay in ms before capturing final HTML
211 |   css_selector?: string; // CSS selector to extract specific elements
212 |   include_links?: boolean; // Whether to include links in the response
213 |   resolve_absolute_urls?: boolean; // Convert relative URLs to absolute ones
214 | }
215 |
216 | // Extraction strategy passthrough objects (new in 0.7.3/0.7.4)
217 | export interface ExtractionStrategy {
218 |   [key: string]: unknown;
219 | }
220 |
221 | export interface TableExtractionStrategy {
222 |   enable_chunking?: boolean;
223 |   thresholds?: Record<string, unknown>;
224 |   [key: string]: unknown;
225 | }
226 |
227 | export interface MarkdownGeneratorOptions {
228 |   include_links?: boolean;
229 |   [key: string]: unknown;
230 | }
231 |
232 | // Advanced crawl configuration combining browser and crawler configs
233 | export interface AdvancedCrawlConfig {
234 |   url?: string;
235 |   urls?: string[];
236 |   browser_config?: BrowserConfig;
237 |   crawler_config?: CrawlerConfig;
238 |   priority?: number;
239 |   extraction_strategy?: ExtractionStrategy;
240 |   table_extraction_strategy?: TableExtractionStrategy;
241 |   markdown_generator_options?: MarkdownGeneratorOptions;
242 | }
243 |
244 | // Session management types (used internally by MCP server)
245 | export interface SessionInfo {
246 |   id: string;
247 |   created_at: Date;
248 |   last_used: Date;
249 |   initial_url?: string;
250 |   metadata?: Record<string, unknown>;
251 | }
252 |
253 | // Crawl endpoint types
254 | export interface CrawlEndpointOptions {
255 |   urls: string[];
256 |   browser_config?: BrowserConfig;
257 |   crawler_config?: CrawlerConfig;
258 | }
259 |
260 | export interface CrawlMarkdownResult {
261 |   raw_markdown: string;
262 |   markdown_with_citations: string;
263 |   references_markdown: string;
264 |   fit_markdown: string;
265 |   fit_html: string;
266 | }
267 |
268 | export interface CrawlMediaResult {
269 |   images: Array<{
270 |     src?: string | null;
271 |     data?: string;
272 |     alt?: string | null;
273 |     desc?: string;
274 |     score?: number;
275 |     type?: string;
276 |     group_id?: number;
277 |     format?: string | null;
278 |     width?: number | null;
279 |   }>;
280 |   videos: Array<{
281 |     src?: string | null;
282 |     data?: string;
283 |     alt?: string | null;
284 |     desc?: string;
285 |     score?: number;
286 |     type?: string;
287 |     group_id?: number;
288 |     format?: string | null;
289 |     width?: number | null;
290 |   }>;
291 |   audios: Array<{
292 |     src?: string | null;
293 |     data?: string;
294 |     alt?: string | null;
295 |     desc?: string;
296 |     score?: number;
297 |     type?: string;
298 |     group_id?: number;
299 |     format?: string | null;
300 |     width?: number | null;
301 |   }>;
302 | }
303 |
304 | interface LinkItem {
305 |   href: string;
306 |   text: string;
307 |   title: string;
308 |   base_domain?: string | null;
309 |   head_data?: Record<string, unknown> | null;
310 |   head_extraction_status?: string | null;
311 |   head_extraction_error?: string | null;
312 |   intrinsic_score?: number;
313 |   contextual_score?: number | null;
314 |   total_score?: number | null;
315 | }
316 |
317 | export interface CrawlLinksResult {
318 |   internal: LinkItem[];
319 |   external: LinkItem[];
320 | }
321 |
322 | export interface CrawlResultItem {
323 |   url: string;
324 |   html: string;
325 |   cleaned_html: string;
326 |   fit_html: string;
327 |   success: boolean;
328 |   error_message?: string;
329 |   status_code: number;
330 |   response_headers: Record<string, unknown>;
331 |   redirected_url?: string;
332 |   session_id: string | null;
333 |   metadata: Record<string, unknown>;
334 |   links: CrawlLinksResult;
335 |   media: CrawlMediaResult;
336 |   markdown: CrawlMarkdownResult;
337 |   tables: unknown[];
338 |   extracted_content: unknown | null;
339 |   screenshot: string | null; // base64 PNG when screenshot: true
340 |   pdf: string | null; // base64 PDF when pdf: true
341 |   mhtml: string | null;
342 |   js_execution_result: {
343 |     success: boolean;
344 |     results: unknown[];
345 |   } | null;
346 |   downloaded_files: unknown | null;
347 |   network_requests: unknown | null;
348 |   console_messages: unknown | null;
349 |   ssl_certificate: unknown | null;
350 |   dispatch_result: unknown | null;
351 | }
352 |
353 | export interface CrawlEndpointResponse {
354 |   success: boolean;
355 |   results: CrawlResultItem[];
356 |   server_processing_time_s: number;
357 |   server_memory_delta_mb: number;
358 |   server_peak_memory_mb: number;
359 | }
360 |
```
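The request and response shapes above compose directly into `/crawl` payloads. An illustrative sketch (not a repository file) of building one from these types:

```typescript
import type { BrowserConfig, CrawlerConfig, CrawlEndpointOptions } from './types.js';

const browser_config: BrowserConfig = {
  browser_type: 'chromium',
  headless: true,
  viewport_width: 1280,
  viewport_height: 720,
};

const crawler_config: CrawlerConfig = {
  cache_mode: 'BYPASS',
  wait_until: 'networkidle',
  screenshot: true,
};

// This object is the body the service layer posts to the /crawl endpoint.
const payload: CrawlEndpointOptions = {
  urls: ['https://example.com'],
  browser_config,
  crawler_config,
};
```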
--------------------------------------------------------------------------------
/src/schemas/validation-schemas.ts:
--------------------------------------------------------------------------------

```typescript
  1 | import { z } from 'zod';
  2 | import { validateJavaScriptCode, createStatelessSchema } from './helpers.js';
  3 |
  4 | export const JsCodeSchema = z
  5 |   .union([
  6 |     z.string().refine(validateJavaScriptCode, {
  7 |       message:
  8 |         'Invalid JavaScript: Contains HTML entities (&quot;), literal \\n outside strings, or HTML tags. Use proper JS syntax with real quotes and newlines.',
  9 |     }),
 10 |     z.array(
 11 |       z.string().refine(validateJavaScriptCode, {
 12 |         message:
 13 |           'Invalid JavaScript: Contains HTML entities (&quot;), literal \\n outside strings, or HTML tags. Use proper JS syntax with real quotes and newlines.',
 14 |       }),
 15 |     ),
 16 |   ])
 17 |   .describe('JavaScript code as string or array of strings');
 18 |
 19 | export const VirtualScrollConfigSchema = z.object({
 20 |   container_selector: z.string(),
 21 |   scroll_count: z.number().optional(),
 22 |   scroll_by: z.union([z.string(), z.number()]).optional(),
 23 |   wait_after_scroll: z.number().optional(),
 24 | });
 25 |
 26 | const GetMarkdownBaseSchema = z.object({
 27 |   url: z.string().url(),
 28 |   filter: z.enum(['raw', 'fit', 'bm25', 'llm']).optional().default('fit'),
 29 |   query: z.string().optional(),
 30 |   cache: z.string().optional().default('0'),
 31 | });
 32 |
 33 | export const GetMarkdownSchema = createStatelessSchema(GetMarkdownBaseSchema, 'get_markdown').refine(
 34 |   (data) => {
 35 |     // If filter is bm25 or llm, query is required
 36 |     if ((data.filter === 'bm25' || data.filter === 'llm') && !data.query) {
 37 |       return false;
 38 |     }
 39 |     return true;
 40 |   },
 41 |   {
 42 |     message: 'Query parameter is required when using bm25 or llm filter',
 43 |     path: ['query'],
 44 |   },
 45 | );
 46 |
 47 | export const ExecuteJsSchema = createStatelessSchema(
 48 |   z.object({
 49 |     url: z.string().url(),
 50 |     scripts: JsCodeSchema,
 51 |   }),
 52 |   'execute_js',
 53 | );
 54 |
 55 | export const GetHtmlSchema = createStatelessSchema(
 56 |   z.object({
 57 |     url: z.string().url(),
 58 |   }),
 59 |   'get_html',
 60 | );
 61 |
 62 | export const CaptureScreenshotSchema = createStatelessSchema(
 63 |   z.object({
 64 |     url: z.string().url(),
 65 |     screenshot_wait_for: z.number().optional(),
 66 |     save_to_directory: z.string().optional().describe('Local directory to save screenshot file'),
 67 |     // output_path not exposed as MCP needs base64 data
 68 |   }),
 69 |   'capture_screenshot',
 70 | );
 71 |
 72 | export const GeneratePdfSchema = createStatelessSchema(
 73 |   z.object({
 74 |     url: z.string().url(),
 75 |     // Only url is supported - output_path not exposed as MCP needs base64 data
 76 |   }),
 77 |   'generate_pdf',
 78 | );
 79 |
 80 | export const ExtractWithLlmSchema = createStatelessSchema(
 81 |   z.object({
 82 |     url: z.string().url(),
 83 |     query: z.string(),
 84 |   }),
 85 |   'extract_with_llm',
 86 | );
 87 |
 88 | export const BatchCrawlSchema = createStatelessSchema(
 89 |   z.object({
 90 |     urls: z.array(z.string().url()),
 91 |     max_concurrent: z.number().optional(),
 92 |     remove_images: z.boolean().optional(),
 93 |     bypass_cache: z.boolean().optional(),
 94 |     // New: Support per-URL configs array (0.7.3/0.7.4)
 95 |     configs: z
 96 |       .array(
 97 |         z.object({
 98 |           url: z.string().url(),
 99 |           browser_config: z.record(z.unknown()).optional(),
100 |           crawler_config: z.record(z.unknown()).optional(),
101 |           extraction_strategy: z.record(z.unknown()).optional(),
102 |           table_extraction_strategy: z.record(z.unknown()).optional(),
103 |           markdown_generator_options: z.record(z.unknown()).optional(),
104 |           matcher: z.union([z.string(), z.function()]).optional(),
105 |         }),
106 |       )
107 |       .optional(),
108 |   }),
109 |   'batch_crawl',
110 | );
111 |
112 | export const SmartCrawlSchema = createStatelessSchema(
113 |   z.object({
114 |     url: z.string().url(),
115 |     max_depth: z.number().optional(),
116 |     follow_links: z.boolean().optional(),
117 |     bypass_cache: z.boolean().optional(),
118 |   }),
119 |   'smart_crawl',
120 | );
121 |
122 | export const ExtractLinksSchema = createStatelessSchema(
123 |   z.object({
124 |     url: z.string().url(),
125 |     categorize: z.boolean().optional().default(true),
126 |   }),
127 |   'extract_links',
128 | );
129 |
130 | export const CrawlRecursiveSchema = createStatelessSchema(
131 |   z.object({
132 |     url: z.string().url(),
133 |     max_depth: z.number().optional(),
134 |     max_pages: z.number().optional(),
135 |     include_pattern: z.string().optional(),
136 |     exclude_pattern: z.string().optional(),
137 |   }),
138 |   'crawl_recursive',
139 | );
140 |
141 | export const ParseSitemapSchema = createStatelessSchema(
142 |   z.object({
143 |     url: z.string().url(),
144 |     filter_pattern: z.string().optional(),
145 |   }),
146 |   'parse_sitemap',
147 | );
148 |
149 | // Unified session management schema
150 | export const ManageSessionSchema = z.discriminatedUnion('action', [
151 |   z.object({
152 |     action: z.literal('create'),
153 |     session_id: z.string().optional(),
154 |     initial_url: z.string().url().optional(),
155 |     browser_type: z.enum(['chromium', 'firefox', 'webkit']).optional(),
156 |   }),
157 |   z.object({
158 |     action: z.literal('clear'),
159 |     session_id: z.string(),
160 |   }),
161 |   z.object({
162 |     action: z.literal('list'),
163 |   }),
164 | ]);
165 |
166 | export const CrawlSchema = z
167 |   .object({
168 |     url: z.string().url(),
169 |
170 |     // Browser configuration
171 |     browser_type: z.enum(['chromium', 'firefox', 'webkit']).optional(),
172 |     viewport_width: z.number().optional(),
173 |     viewport_height: z.number().optional(),
174 |     user_agent: z.string().optional(),
175 |     proxy_server: z.string().optional(),
176 |     proxy_username: z.string().optional(),
177 |     proxy_password: z.string().optional(),
178 |     cookies: z
179 |       .array(
180 |         z.object({
181 |           name: z.string(),
182 |           value: z.string(),
183 |           domain: z.string(),
184 |           path: z.string().optional(),
185 |         }),
186 |       )
187 |       .optional(),
188 |     headers: z.record(z.string()).optional(),
189 |     extra_args: z.array(z.string()).optional(),
190 |
191 |     // Content filtering
192 |     word_count_threshold: z.number().optional(),
193 |     excluded_tags: z.array(z.string()).optional(),
194 |     excluded_selector: z.string().optional(),
195 |     remove_overlay_elements: z.boolean().optional(),
196 |     only_text: z.boolean().optional(),
197 |     remove_forms: z.boolean().optional(),
198 |     keep_data_attributes: z.boolean().optional(),
199 |
200 |     // JavaScript execution
201 |     js_code: JsCodeSchema.optional(),
202 |     js_only: z.boolean().optional(),
203 |     wait_for: z.string().optional(),
204 |     wait_for_timeout: z.number().optional(),
205 |
206 |     // Page navigation & timing
207 |     wait_until: z.enum(['domcontentloaded', 'networkidle', 'load']).optional(),
208 |     page_timeout: z.number().optional(),
209 |     wait_for_images: z.boolean().optional(),
210 |     ignore_body_visibility: z.boolean().optional(),
211 |
212 |     // Dynamic content
213 |     delay_before_scroll: z.number().optional(),
214 |     scroll_delay: z.number().optional(),
215 |     scan_full_page: z.boolean().optional(),
216 |     virtual_scroll_config: VirtualScrollConfigSchema.optional(),
217 |
218 |     // Content processing
219 |     process_iframes: z.boolean().optional(),
220 |     exclude_external_links: z.boolean().optional(),
221 |
222 |     // Media handling
223 |     screenshot: z.boolean().optional(),
224 |     screenshot_wait_for: z.number().optional(),
225 |     screenshot_directory: z
226 |       .string()
227 |       .optional()
228 |       .describe('Local directory to save screenshot file when screenshot=true'),
229 |     pdf: z.boolean().optional(),
230 |     capture_mhtml: z.boolean().optional(),
231 |     image_description_min_word_threshold: z.number().optional(),
232 |     image_score_threshold: z.number().optional(),
233 |     exclude_external_images: z.boolean().optional(),
234 |
235 |     // Link filtering
236 |     exclude_social_media_links: z.boolean().optional(),
237 |     exclude_domains: z.array(z.string()).optional(),
238 |
239 |     // Page interaction
240 |     simulate_user: z.boolean().optional(),
241 |     override_navigator: z.boolean().optional(),
242 |     magic: z.boolean().optional(),
243 |
244 |     // Session and cache
245 |     session_id: z.string().optional(),
246 |     cache_mode: z.enum(['ENABLED', 'BYPASS', 'DISABLED']).optional(),
247 |
248 |     // Performance options
249 |     timeout: z.number().optional(),
250 |     verbose: z.boolean().optional(),
251 |
252 |     // Debug
253 |     log_console: z.boolean().optional(),
254 |
255 |     // New parameters from 0.7.3/0.7.4
256 |     delay_before_return_html: z.number().optional(),
257 |     css_selector: z.string().optional(),
258 |     include_links: z.boolean().optional(),
259 |     resolve_absolute_urls: z.boolean().optional(),
260 |   })
261 |   .refine(
262 |     (data) => {
263 |       // js_only is for subsequent calls in same session, not first call
264 |       // Using it incorrectly causes server errors
265 |       if (data.js_only && !data.session_id) {
266 |         return false;
267 |       }
268 |       return true;
269 |     },
270 |     {
271 |       message:
272 |         "Error: js_only requires session_id (it's for continuing existing sessions).\n" +
273 |         'For first call with js_code, use: {js_code: [...], screenshot: true}\n' +
274 |         'For multi-step: First {js_code: [...], session_id: "x"}, then {js_only: true, session_id: "x"}',
275 |     },
276 |   )
277 |   .refine(
278 |     (data) => {
279 |       // Empty js_code array is not allowed
280 |       if (Array.isArray(data.js_code) && data.js_code.length === 0) {
281 |         return false;
282 |       }
283 |       return true;
284 |     },
285 |     {
286 |       message:
287 |         'Error: js_code array cannot be empty. Either provide JavaScript code to execute or remove the js_code parameter entirely.',
288 |     },
289 |   );
290 |
291 | // Re-export types we need
292 | export type { z };
293 |
```
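As an illustrative sketch (not a repository file), the two `.refine()` rules on `CrawlSchema` can be exercised directly with `safeParse`:

```typescript
import { CrawlSchema } from './validation-schemas.js';

// Minimal valid input.
console.log(CrawlSchema.safeParse({ url: 'https://example.com' }).success); // true

// js_only without session_id violates the first refinement.
console.log(CrawlSchema.safeParse({ url: 'https://example.com', js_only: true }).success); // false

// An empty js_code array violates the second refinement.
console.log(CrawlSchema.safeParse({ url: 'https://example.com', js_code: [] }).success); // false
```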
--------------------------------------------------------------------------------
/src/__tests__/schemas/validation-edge-cases.test.ts:
--------------------------------------------------------------------------------

```typescript
  1 | // import { jest } from '@jest/globals';
  2 | import { validateJavaScriptCode } from '../../schemas/helpers.js';
  3 | import { JsCodeSchema, CrawlSchema } from '../../schemas/validation-schemas.js';
  4 |
  5 | describe('JavaScript Validation Edge Cases', () => {
  6 |   describe('validateJavaScriptCode', () => {
  7 |     describe('Valid JavaScript that might look suspicious', () => {
  8 |       it('should accept strings containing HTML-like syntax in string literals', () => {
  9 |         const validCases = [
 10 |           `const html = '<div class="test">Hello</div>';`,
 11 |           `const template = \`<button onclick="alert('test')">Click</button>\`;`,
 12 |           `const regex = /<div[^>]*>/g;`,
 13 |           `const arrow = () => { return '<span>Arrow</span>'; }`,
 14 |           `const className = 'container';`,
 15 |         ];
 16 |
 17 |         validCases.forEach((code) => {
 18 |           expect(validateJavaScriptCode(code)).toBe(true);
 19 |         });
 20 |       });
 21 |
 22 |       it('should accept legitimate escape sequences', () => {
 23 |         const validCases = [
 24 |           `const str = "Line 1\\nLine 2";`, // Real newline escape
 25 |           `const tab = "Col1\\tCol2";`,
 26 |           `const quote = "He said \\"Hello\\"";`,
 27 |           `const unicode = "\\u0048\\u0065\\u006C\\u006C\\u006F";`,
 28 |           `const template = \`Multi
 29 | line
 30 | string\`;`, // Real newlines in template literals
 31 |         ];
 32 |
 33 |         validCases.forEach((code) => {
 34 |           expect(validateJavaScriptCode(code)).toBe(true);
 35 |         });
 36 |       });
 37 |
 38 |       it('should accept complex but valid JavaScript patterns', () => {
 39 |         const validCases = [
 40 |           // Nested template literals
 41 |           `const nested = \`Outer \${inner ? \`Inner: \${value}\` : 'None'}\`;`,
 42 |           // Regular expressions that might look like HTML
 43 |           `const htmlTag = /<([a-z]+)([^>]*)>/gi;`,
 44 |           // JSON strings without HTML entities
 45 |           `const json = '{"name": "Test", "value": "Some data"}';`,
 46 |           // Function with HTML in comments
 47 |           `function render() {
 48 |             // This creates div content
 49 |             return document.createElement('div');
 50 |           }`,
 51 |           // Complex string concatenation
 52 |           `const result = '<div' + ' class="' + className + '">' + content + '</div>';`,
 53 |         ];
 54 |
 55 |         validCases.forEach((code) => {
 56 |           expect(validateJavaScriptCode(code)).toBe(true);
 57 |         });
 58 |       });
 59 |
 60 |       it('should accept Unicode and special characters', () => {
 61 |         const validCases = [
 62 |           `const emoji = "Hello 👋 World 🌍";`,
 63 |           `const chinese = "你好世界";`,
 64 |           `const arabic = "مرحبا بالعالم";`,
 65 |           `const special = "©2024 Company™";`,
 66 |           `const math = "∑(n=1 to ∞) = π²/6";`,
 67 |         ];
 68 |
 69 |         validCases.forEach((code) => {
 70 |           expect(validateJavaScriptCode(code)).toBe(true);
 71 |         });
 72 |       });
 73 |     });
 74 |
 75 |     describe('Invalid JavaScript that should be rejected', () => {
 76 |       it('should reject HTML entities outside string literals', () => {
 77 |         const invalidCases = [
 78 |           `const value = &quot;test&quot;;`, // HTML entities as code
 79 |           `const text = &amp;&amp; true;`,
 80 |           `if (a &lt; b) { }`,
 81 |           `const escaped = `,
 82 |           `return &#39;hello&#39;;`,
 83 |         ];
 84 |
 85 |         invalidCases.forEach((code) => {
 86 |           expect(validateJavaScriptCode(code)).toBe(false);
 87 |         });
 88 |       });
 89 |
 90 |       it('should reject literal backslash-n outside strings', () => {
 91 |         const invalidCases = [
 92 |           `const text = "Hello";\\nconst world = "World";`, // Literal \n between statements
 93 |           `console.log("test");\\nconsole.log("more");`,
 94 |           `return value;\\nreturn other;`,
 95 |         ];
 96 |
 97 |         invalidCases.forEach((code) => {
 98 |           expect(validateJavaScriptCode(code)).toBe(false);
 99 |         });
100 |       });
101 |
102 |       it('should reject HTML tags outside string literals', () => {
103 |         const invalidCases = [
104 |           `<script>alert('test')</script>`,
105 |           `<!DOCTYPE html>`,
106 |           `<html><body>test</body></html>`,
107 |           `<style>body { color: red; }</style>`,
108 |         ];
109 |
110 |         invalidCases.forEach((code) => {
111 |           expect(validateJavaScriptCode(code)).toBe(false);
112 |         });
113 |       });
114 |     });
115 |
116 |     describe('Edge cases and boundaries', () => {
117 |       it('should handle empty and whitespace-only input', () => {
118 |         expect(validateJavaScriptCode('')).toBe(true);
119 |         expect(validateJavaScriptCode(' ')).toBe(true);
120 |         expect(validateJavaScriptCode('\n\n\n')).toBe(true);
121 |         expect(validateJavaScriptCode('\t')).toBe(true);
122 |       });
123 |
124 |       it('should handle very long valid strings', () => {
125 |         const longString = 'const x = "' + 'a'.repeat(10000) + '";';
126 |         expect(validateJavaScriptCode(longString)).toBe(true);
127 |       });
128 |
129 |       it('should handle nested quotes correctly', () => {
130 |         const validCases = [
131 |           `const x = "She said \\"Hello\\" to me";`,
132 |           `const y = 'It\\'s a nice day';`,
133 |           `const z = \`Template with "quotes" and 'apostrophes'\`;`,
134 |         ];
135 |
136 |         validCases.forEach((code) => {
137 |           expect(validateJavaScriptCode(code)).toBe(true);
138 |         });
139 |       });
140 |
141 |       it('should handle multiline strings correctly', () => {
142 |         const multiline = `
143 |       const longText = \`
144 |         This is a multiline
145 |         template literal with
146 |         multiple lines
147 |       \`;`;
148 |         expect(validateJavaScriptCode(multiline)).toBe(true);
149 |       });
150 |     });
151 |   });
152 |
153 |   describe('Schema Validation Edge Cases', () => {
154 |     describe('JsCodeSchema', () => {
155 |       it('should accept both string and array of strings', () => {
156 |         expect(() => JsCodeSchema.parse('return 1;')).not.toThrow();
157 |         expect(() => JsCodeSchema.parse(['return 1;', 'return 2;'])).not.toThrow();
158 |       });
159 |
160 |       it('should reject invalid JavaScript in arrays', () => {
161 |         expect(() => JsCodeSchema.parse(['valid();', '"invalid"'])).toThrow();
162 |       });
163 |
164 |       it('should handle empty arrays', () => {
165 |         expect(() => JsCodeSchema.parse([])).not.toThrow();
166 |       });
167 |     });
168 |
169 |     describe('CrawlSchema edge cases', () => {
170 |       it('should handle all optional parameters', () => {
171 |         const minimal = { url: 'https://example.com' };
172 |         expect(() => CrawlSchema.parse(minimal)).not.toThrow();
173 |       });
174 |
175 |       it('should validate js_only requires session_id', () => {
176 |         const invalid = {
177 |           url: 'https://example.com',
178 |           js_only: true,
179 |           // Missing session_id
180 |         };
181 |         expect(() => CrawlSchema.parse(invalid)).toThrow();
182 |       });
183 |
184 |       it('should reject empty js_code array', () => {
185 |         const invalid = {
186 |           url: 'https://example.com',
187 |           js_code: [],
188 |         };
189 |         expect(() => CrawlSchema.parse(invalid)).toThrow();
190 |       });
191 |
192 |       it('should accept all valid cache modes', () => {
193 |         const validModes = ['ENABLED', 'BYPASS', 'DISABLED'];
194 |         validModes.forEach((mode) => {
195 |           const config = { url: 'https://example.com', cache_mode: mode };
196 |           expect(() => CrawlSchema.parse(config)).not.toThrow();
197 |         });
198 |       });
199 |
200 |       it('should validate viewport dimensions', () => {
201 |         const validViewport = {
202 |           url: 'https://example.com',
203 |           viewport_width: 1920,
204 |           viewport_height: 1080,
205 |         };
206 |         expect(() => CrawlSchema.parse(validViewport)).not.toThrow();
207 |       });
208 |
209 |       it('should validate complex configurations', () => {
210 |         const complex = {
211 |           url: 'https://example.com',
212 |           browser_type: 'chromium',
213 |           viewport_width: 1280,
214 |           viewport_height: 720,
215 |           user_agent: 'Custom User Agent',
216 |           headers: { 'X-Custom': 'value' },
217 |           cookies: [{ name: 'session', value: '123', domain: '.example.com' }],
218 |           js_code: ['document.querySelector("button").click()'],
219 |           wait_for: '#loaded',
220 |           screenshot: true,
221 |           pdf: true,
222 |           session_id: 'test-session',
223 |           cache_mode: 'BYPASS',
224 |         };
225 |         expect(() => CrawlSchema.parse(complex)).not.toThrow();
226 |       });
227 |     });
228 |   });
229 |
230 |   describe('Property-based testing for regex patterns', () => {
231 |     // Generate random valid JavaScript-like strings
232 |     const generateValidJS = () => {
233 |       const templates = [
234 |         () => `const x = ${Math.random()};`,
235 |         () => `function test() { return "${Math.random()}"; }`,
236 |         () => `if (${Math.random() > 0.5}) { console.log("test"); }`,
237 |         () => `const arr = [${Math.random()}, ${Math.random()}];`,
238 |         () => `// Comment with ${Math.random()}`,
239 |       ];
240 |       return templates[Math.floor(Math.random() * templates.length)]();
241 |     };
242 |
243 |     it('should consistently validate generated valid JavaScript', () => {
244 |       for (let i = 0; i < 100; i++) {
245 |         const code = generateValidJS();
246 |         expect(validateJavaScriptCode(code)).toBe(true);
247 |       }
248 |     });
249 |
250 |     // Test boundary conditions with special characters
251 |     const specialChars = ['<', '>', '&', '"', "'", '\\', '\n', '\r', '\t'];
252 |
253 |     it('should handle special characters in string contexts correctly', () => {
254 |       specialChars.forEach((char) => {
255 |         const inString = `const x = "${char}";`;
256 |         const inTemplate = `const y = \`${char}\`;`;
257 |
258 |         // These should be valid (special chars inside strings)
259 |         expect(validateJavaScriptCode(inString)).toBe(true);
260 |         expect(validateJavaScriptCode(inTemplate)).toBe(true);
261 |       });
262 |     });
263 |   });
264 | });
265 |
```
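The contract these tests pin down, at a glance (an illustrative sketch, not a repository file; the import path assumes the `helpers.ts` module from the directory structure above):

```typescript
import { validateJavaScriptCode } from './src/schemas/helpers.js';

console.log(validateJavaScriptCode(`const html = '<div class="x">ok</div>';`)); // true: HTML lives inside a string
console.log(validateJavaScriptCode(`const value = &quot;test&quot;;`)); // false: HTML entity used as code
console.log(validateJavaScriptCode(`<script>alert('x')</script>`)); // false: raw HTML, not JavaScript
```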
--------------------------------------------------------------------------------
/src/handlers/utility-handlers.ts:
--------------------------------------------------------------------------------

```typescript
  1 | import { BaseHandler } from './base-handler.js';
  2 | import { JSExecuteEndpointOptions, JSExecuteEndpointResponse, CrawlResultItem } from '../types.js';
  3 |
  4 | export class UtilityHandlers extends BaseHandler {
  5 |   async executeJS(options: JSExecuteEndpointOptions) {
  6 |     try {
  7 |       // Check if scripts is provided
  8 |       if (!options.scripts || options.scripts === null) {
  9 |         throw new Error(
 10 |           'scripts is required. Please provide JavaScript code to execute. Use "return" statements to get values back.',
 11 |         );
 12 |       }
 13 |
 14 |       const result: JSExecuteEndpointResponse = await this.service.executeJS(options);
 15 |
 16 |       // Extract JavaScript execution results
 17 |       const jsResults = result.js_execution_result?.results || [];
 18 |       // Ensure scripts is always an array for mapping
 19 |       const scripts = Array.isArray(options.scripts) ? options.scripts : [options.scripts];
 20 |
 21 |       // Format results for display
 22 |       let formattedResults = '';
 23 |       if (jsResults.length > 0) {
 24 |         formattedResults = jsResults
 25 |           .map((res: unknown, idx: number) => {
 26 |             const script = scripts[idx] || 'Script ' + (idx + 1);
 27 |             // Handle the actual return value or success/error status
 28 |             let resultStr = '';
 29 |             if (res && typeof res === 'object' && 'success' in res) {
 30 |               // This is a status object (e.g., from null return or execution without return)
 31 |               const statusObj = res as { success: unknown; error?: unknown };
 32 |               resultStr = statusObj.success
 33 |                 ? 'Executed successfully (no return value)'
 34 |                 : `Error: ${statusObj.error || 'Unknown error'}`;
 35 |             } else {
 36 |               // This is an actual return value
 37 |               resultStr = JSON.stringify(res, null, 2);
 38 |             }
 39 |             return `Script: ${script}\nReturned: ${resultStr}`;
 40 |           })
 41 |           .join('\n\n');
 42 |       } else {
 43 |         formattedResults = 'No results returned';
 44 |       }
 45 |
 46 |       // Handle markdown content - can be string or object
 47 |       let markdownContent = '';
 48 |       if (result.markdown) {
 49 |         if (typeof result.markdown === 'string') {
 50 |           markdownContent = result.markdown;
 51 |         } else if (typeof result.markdown === 'object' && result.markdown.raw_markdown) {
 52 |           // Use raw_markdown from the object structure
 53 |           markdownContent = result.markdown.raw_markdown;
 54 |         }
 55 |       }
 56 |
 57 |       return {
 58 |         content: [
 59 |           {
 60 |             type: 'text',
 61 |             text: `JavaScript executed on: ${options.url}\n\nResults:\n${formattedResults}${markdownContent ? `\n\nPage Content After Execution:\n${markdownContent}` : ''}`,
 62 |           },
 63 |         ],
 64 |       };
 65 |     } catch (error) {
 66 |       throw this.formatError(error, 'execute JavaScript');
 67 |     }
 68 |   }
 69 |
 70 |   async extractLinks(options: { url: string; categorize?: boolean }) {
 71 |     try {
 72 |       // Use crawl endpoint instead of md to get full link data
 73 |       const response = await this.axiosClient.post('/crawl', {
 74 |         urls: [options.url],
 75 |         crawler_config: {
 76 |           cache_mode: 'bypass',
 77 |         },
 78 |       });
 79 |
 80 |       const results = response.data.results || [response.data];
 81 |       const result: CrawlResultItem = results[0] || {};
 82 |
 83 |       // Variables for manually extracted links
 84 |       let manuallyExtractedInternal: string[] = [];
 85 |       let manuallyExtractedExternal: string[] = [];
 86 |       let hasManuallyExtractedLinks = false;
 87 |
 88 |       // Check if the response is likely JSON or non-HTML content
 89 |       if (!result.links || (result.links.internal.length === 0 && result.links.external.length === 0)) {
 90 |         // Try to detect if this might be a JSON endpoint
 91 |         const markdownContent = result.markdown?.raw_markdown || result.markdown?.fit_markdown || '';
 92 |         const htmlContent = result.html || '';
 93 |
 94 |         // Check for JSON indicators
 95 |         if (
 96 |           // Check URL pattern
 97 |           options.url.includes('/api/') ||
 98 |           options.url.includes('/api.') ||
 99 |           // Check content type (often shown in markdown conversion)
100 |           markdownContent.includes('application/json') ||
101 |           // Check for JSON structure patterns
102 |           (markdownContent.startsWith('{') && markdownContent.endsWith('}')) ||
103 |           (markdownContent.startsWith('[') && markdownContent.endsWith(']')) ||
104 |           // Check HTML for JSON indicators
105 |           htmlContent.includes('application/json') ||
106 |           // Common JSON patterns
107 |           markdownContent.includes('"links"') ||
108 |           markdownContent.includes('"url"') ||
109 |           markdownContent.includes('"data"')
110 |         ) {
111 |           return {
112 |             content: [
113 |               {
114 |                 type: 'text',
115 |                 text: `Note: ${options.url} appears to return JSON data rather than HTML. The extract_links tool is designed for HTML pages with <a> tags. To extract URLs from JSON, you would need to parse the JSON structure directly.`,
116 |               },
117 |             ],
118 |           };
119 |         }
120 |         // If no links found but it's HTML, let's check the markdown content for href patterns
121 |         if (markdownContent && markdownContent.includes('href=')) {
122 |           // Extract links manually from markdown if server didn't provide them
123 |           const hrefPattern = /href=["']([^"']+)["']/g;
124 |           const foundLinks: string[] = [];
125 |           let match;
126 |           while ((match = hrefPattern.exec(markdownContent)) !== null) {
127 |             foundLinks.push(match[1]);
128 |           }
129 |           if (foundLinks.length > 0) {
130 |             hasManuallyExtractedLinks = true;
131 |             // Categorize found links
132 |             const currentDomain = new URL(options.url).hostname;
133 |
134 |             foundLinks.forEach((link) => {
135 |               try {
136 |                 const linkUrl = new URL(link, options.url);
137 |                 if (linkUrl.hostname === currentDomain) {
138 |                   manuallyExtractedInternal.push(linkUrl.href);
139 |                 } else {
140 |                   manuallyExtractedExternal.push(linkUrl.href);
141 |                 }
142 |               } catch {
143 |                 // Relative link
144 |                 manuallyExtractedInternal.push(link);
145 |               }
146 |             });
147 |           }
148 |         }
149 |       }
150 |
151 |       // Handle both cases: API-provided links and manually extracted links
152 |       let internalUrls: string[] = [];
153 |       let externalUrls: string[] = [];
154 |
155 |       if (result.links && (result.links.internal.length > 0 || result.links.external.length > 0)) {
156 |         // Use API-provided links
157 |         internalUrls = result.links.internal.map((link) => (typeof link === 'string' ? link : link.href));
158 |         externalUrls = result.links.external.map((link) => (typeof link === 'string' ? link : link.href));
159 |       } else if (hasManuallyExtractedLinks) {
160 |         // Use manually extracted links
161 |         internalUrls = manuallyExtractedInternal;
162 |         externalUrls = manuallyExtractedExternal;
163 |       }
164 |
165 |       const allUrls = [...internalUrls, ...externalUrls];
166 |
167 |       if (!options.categorize) {
168 |         return {
169 |           content: [
170 |             {
171 |               type: 'text',
172 |               text: `All links from ${options.url}:\n${allUrls.join('\n')}`,
173 |             },
174 |           ],
175 |         };
176 |       }
177 |
178 |       // Categorize links
179 |       const categorized: Record<string, string[]> = {
180 |         internal: [],
181 |         external: [],
182 |         social: [],
183 |         documents: [],
184 |         images: [],
185 |         scripts: [],
186 |       };
187 |
188 |       // Further categorize links
189 |       const socialDomains = ['facebook.com', 'twitter.com', 'linkedin.com', 'instagram.com', 'youtube.com'];
190 |       const docExtensions = ['.pdf', '.doc', '.docx', '.xls', '.xlsx', '.ppt', '.pptx'];
191 |       const imageExtensions = ['.jpg', '.jpeg', '.png', '.gif', '.svg', '.webp'];
192 |       const scriptExtensions = ['.js', '.css'];
193 |
194 |       // Categorize internal URLs
195 |       internalUrls.forEach((href: string) => {
196 |         if (docExtensions.some((ext) => href.toLowerCase().endsWith(ext))) {
197 |           categorized.documents.push(href);
198 |         } else if (imageExtensions.some((ext) => href.toLowerCase().endsWith(ext))) {
199 |           categorized.images.push(href);
200 |         } else if (scriptExtensions.some((ext) => href.toLowerCase().endsWith(ext))) {
201 |           categorized.scripts.push(href);
202 |         } else {
203 |           categorized.internal.push(href);
204 |         }
205 |       });
206 |
207 |       // Categorize external URLs
208 |       externalUrls.forEach((href: string) => {
209 |         if (socialDomains.some((domain) => href.includes(domain))) {
210 |           categorized.social.push(href);
211 |         } else if (docExtensions.some((ext) => href.toLowerCase().endsWith(ext))) {
212 |           categorized.documents.push(href);
213 |         } else if (imageExtensions.some((ext) => href.toLowerCase().endsWith(ext))) {
214 |           categorized.images.push(href);
215 |         } else if (scriptExtensions.some((ext) => href.toLowerCase().endsWith(ext))) {
216 |           categorized.scripts.push(href);
217 |         } else {
218 |           categorized.external.push(href);
219 |         }
220 |       });
221 |
222 |       // Return based on categorize option (defaults to true)
223 |       if (options.categorize) {
224 |         return {
225 |           content: [
226 |             {
227 |               type: 'text',
228 |               text: `Link analysis for ${options.url}:\n\n${Object.entries(categorized)
229 |                 .map(
230 |                   ([category, links]: [string, string[]]) =>
231 |                     `${category} (${links.length}):\n${links.slice(0, 10).join('\n')}${links.length > 10 ? '\n...' : ''}`,
232 |                 )
233 |                 .join('\n\n')}`,
234 |             },
235 |           ],
236 |         };
237 |       } else {
238 |         // Return simple list without categorization
239 |         const allLinks = [...internalUrls, ...externalUrls];
240 |         return {
241 |           content: [
242 |             {
243 |               type: 'text',
244 |               text: `All links from ${options.url} (${allLinks.length} total):\n\n${allLinks.slice(0, 50).join('\n')}${allLinks.length > 50 ? '\n...' : ''}`,
245 |             },
246 |           ],
247 |         };
248 |       }
249 |     } catch (error) {
250 |       throw this.formatError(error, 'extract links');
251 |     }
252 |   }
253 | }
254 |
```
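The extension-based bucketing used by `extractLinks` can be distilled into a small pure helper (illustrative only; these names are hypothetical and not part of the repository):

```typescript
const docExtensions = ['.pdf', '.doc', '.docx', '.xls', '.xlsx', '.ppt', '.pptx'];
const imageExtensions = ['.jpg', '.jpeg', '.png', '.gif', '.svg', '.webp'];
const scriptExtensions = ['.js', '.css'];

function bucketForHref(href: string): 'documents' | 'images' | 'scripts' | 'other' {
  const lower = href.toLowerCase();
  if (docExtensions.some((ext) => lower.endsWith(ext))) return 'documents';
  if (imageExtensions.some((ext) => lower.endsWith(ext))) return 'images';
  if (scriptExtensions.some((ext) => lower.endsWith(ext))) return 'scripts';
  return 'other';
}

console.log(bucketForHref('https://example.com/report.pdf')); // documents
console.log(bucketForHref('https://example.com/logo.svg')); // images
```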
--------------------------------------------------------------------------------
/src/crawl4ai-service.ts:
--------------------------------------------------------------------------------

```typescript
  1 | import axios, { AxiosInstance, AxiosError } from 'axios';
  2 | import {
  3 |   BatchCrawlOptions,
  4 |   AdvancedCrawlConfig,
  5 |   CrawlEndpointOptions,
  6 |   CrawlEndpointResponse,
  7 |   JSExecuteEndpointOptions,
  8 |   JSExecuteEndpointResponse,
  9 |   ScreenshotEndpointOptions,
 10 |   ScreenshotEndpointResponse,
 11 |   PDFEndpointOptions,
 12 |   PDFEndpointResponse,
 13 |   HTMLEndpointOptions,
 14 |   HTMLEndpointResponse,
 15 |   MarkdownEndpointOptions,
 16 |   MarkdownEndpointResponse,
 17 |   LLMEndpointOptions,
 18 |   LLMEndpointResponse,
 19 | } from './types.js';
 20 |
 21 | // Helper to validate JavaScript code
 22 | const validateJavaScriptCode = (code: string): boolean => {
 23 |   // Check for common HTML entities that shouldn't be in JS
 24 |   if (/&quot;|&amp;|&lt;|&gt;|&#\d+;|&\w+;/.test(code)) {
 25 |     return false;
 26 |   }
 27 |
 28 |   // Basic check to ensure it's not HTML
 29 |   if (/<(!DOCTYPE|html|body|head|script|style)\b/i.test(code)) {
 30 |     return false;
 31 |   }
 32 |
 33 |   // Check for literal \n, \t, \r outside of strings (common LLM mistake)
 34 |   // Look for patterns like: ;\n or }\n or )\n which suggest literal newlines
 35 |   if (/[;})]\s*\\n|\\n\s*[{(/]/.test(code)) {
 36 |     return false;
 37 |   }
 38 |
 39 |   // Check for obvious cases of literal \n between statements
 40 |   if (/[;})]\s*\\n\s*\w/.test(code)) {
 41 |     return false;
 42 |   }
 43 |
 44 |   return true;
 45 | };
 46 |
 47 | // Helper to validate URL format
 48 | const validateURL = (url: string): boolean => {
 49 |   try {
 50 |     new URL(url);
 51 |     return true;
 52 |   } catch {
 53 |     return false;
 54 |   }
 55 | };
 56 |
 57 | // Helper to handle axios errors consistently
 58 | const handleAxiosError = (error: unknown): never => {
 59 |   if (axios.isAxiosError(error)) {
 60 |     const axiosError = error as AxiosError;
 61 |
 62 |     // Handle timeout errors
 63 |     if (axiosError.code === 'ECONNABORTED') {
 64 |       throw new Error('Request timed out');
 65 |     }
 66 |
 67 |     if (axiosError.code === 'ETIMEDOUT') {
 68 |       throw new Error('Request timeout');
 69 |     }
 70 |
 71 |     // Handle network errors
 72 |     if (axiosError.code === 'ENOTFOUND') {
 73 |       throw new Error(`DNS resolution failed: ${axiosError.message}`);
 74 |     }
 75 |
 76 |     if (axiosError.code === 'ECONNREFUSED') {
 77 |       throw new Error(`Connection refused: ${axiosError.message}`);
 78 |     }
 79 |
 80 |     if (axiosError.code === 'ECONNRESET') {
 81 |       throw new Error(`Connection reset: ${axiosError.message}`);
 82 |     }
 83 |
 84 |     if (axiosError.code === 'ENETUNREACH') {
 85 |       throw new Error(`Network unreachable: ${axiosError.message}`);
 86 |     }
 87 |
 88 |     // Handle HTTP errors
 89 |     if (axiosError.response) {
 90 |       const status = axiosError.response.status;
 91 |       const data = axiosError.response.data as any; // eslint-disable-line @typescript-eslint/no-explicit-any
 92 |       const message = data?.error || data?.detail || data?.message || axiosError.message;
 93 |       throw new Error(`Request failed with status ${status}: ${message}`);
 94 |     }
 95 |
 96 |     // Handle request errors (e.g., invalid URL)
 97 |     if (axiosError.request) {
 98 |       throw new Error(`Request failed: ${axiosError.message}`);
 99 |     }
100 |   }
101 |
102 |   // Re-throw unknown errors
103 |   throw error;
104 | };
105 |
106 | export class Crawl4AIService {
107 |   private axiosClient: AxiosInstance;
108 |
109 |   constructor(baseURL: string, apiKey: string) {
110 |     this.axiosClient = axios.create({
111 |       baseURL,
112 |       headers: {
113 |         'X-API-Key': apiKey,
114 |         'Content-Type': 'application/json',
115 |       },
116 |       timeout: 120000,
117 |     });
118 |   }
119 |
120 |   async getMarkdown(options: MarkdownEndpointOptions): Promise<MarkdownEndpointResponse> {
121 |     // Validate URL
122 |     if (!validateURL(options.url)) {
123 |       throw new Error('Invalid URL format');
124 |     }
125 |
126 |     try {
127 |       const response = await this.axiosClient.post('/md', {
128 |         url: options.url,
129 |         f: options.f,
130 |         q: options.q,
131 |         c: options.c,
132 |       });
133 |
134 |       return response.data;
135 |     } catch (error) {
136 |       return handleAxiosError(error);
137 |     }
138 |   }
139 |
140 |   async captureScreenshot(options: ScreenshotEndpointOptions): Promise<ScreenshotEndpointResponse> {
141 |     // Validate URL
142 |     if (!validateURL(options.url)) {
143 |       throw new Error('Invalid URL format');
144 |     }
145 |
146 |     try {
147 |       const response = await this.axiosClient.post('/screenshot', {
148 |         url: options.url,
149 |         screenshot_wait_for: options.screenshot_wait_for,
150 |         // output_path is omitted to get base64 response
151 |       });
152 |
153 |       return response.data;
154 |     } catch (error) {
155 |       return handleAxiosError(error);
156 |     }
157 |   }
158 |
159 |   async generatePDF(options: PDFEndpointOptions): Promise<PDFEndpointResponse> {
160 |     // Validate URL
161 |     if (!validateURL(options.url)) {
162 |       throw new Error('Invalid URL format');
163 |     }
164 |
165 |     try {
166 |       const response = await this.axiosClient.post('/pdf', {
167 |         url: options.url,
168 |         // output_path is omitted to get base64 response
169 |       });
170 |
171 |       return response.data;
172 |     } catch (error) {
173 |       return handleAxiosError(error);
174 |     }
175 |   }
176 |
177 |   async executeJS(options: JSExecuteEndpointOptions): Promise<JSExecuteEndpointResponse> {
178 |     // Validate URL
179 |     if (!validateURL(options.url)) {
180 |       throw new Error('Invalid URL format');
181 |     }
182 |
183 |     // Ensure scripts is always an array
184 |     const scripts = Array.isArray(options.scripts) ? options.scripts : [options.scripts];
185 |
186 |     // Validate each script
187 |     for (const script of scripts) {
188 |       if (!validateJavaScriptCode(script)) {
189 |         throw new Error(
190 |           'Invalid JavaScript: Contains HTML entities (&quot;), literal \\n outside strings, or HTML tags. Use proper JS syntax with real quotes and newlines.',
191 |         );
192 |       }
193 |     }
194 |
195 |     try {
196 |       const response = await this.axiosClient.post('/execute_js', {
197 |         url: options.url,
198 |         scripts: scripts, // Always send as array
199 |         // Only url and scripts are supported by the endpoint
200 |       });
201 |
202 |       return response.data;
203 |     } catch (error) {
204 |       return handleAxiosError(error);
205 |     }
206 |   }
207 |
208 |   async batchCrawl(options: BatchCrawlOptions) {
209 |     // Validate URLs
210 |     if (!options.urls || options.urls.length === 0) {
211 |       throw new Error('URLs array cannot be empty');
212 |     }
213 |
214 |     // Build crawler config if needed
215 |     const crawler_config: Record<string, unknown> = {};
216 |
217 |     // Handle remove_images by using exclude_tags
218 |     if (options.remove_images) {
219 |       crawler_config.exclude_tags = ['img', 'picture', 'svg'];
220 |     }
221 |
222 |     if (options.bypass_cache) {
223 |       crawler_config.cache_mode = 'BYPASS';
224 |     }
225 |
226 |     try {
227 |       const response = await this.axiosClient.post('/crawl', {
228 |         urls: options.urls,
229 |         max_concurrent: options.max_concurrent,
230 |         crawler_config: Object.keys(crawler_config).length > 0 ? crawler_config : undefined,
231 |       });
232 |
233 |       return response.data;
234 |     } catch (error) {
235 |       return handleAxiosError(error);
236 |     }
237 |   }
238 |
239 |   async getHTML(options: HTMLEndpointOptions): Promise<HTMLEndpointResponse> {
240 |     // Validate URL
241 |     if (!validateURL(options.url)) {
242 |       throw new Error('Invalid URL format');
243 |     }
244 |
245 |     try {
246 |       const response = await this.axiosClient.post('/html', {
247 |         url: options.url,
248 |         // Only url is supported by the endpoint
249 |       });
250 |
251 |       return response.data;
252 |     } catch (error) {
253 |       return handleAxiosError(error);
254 |     }
255 |   }
256 |
257 |   async parseSitemap(url: string) {
258 |     try {
259 |       // Use axios directly without baseURL for fetching external URLs
260 |       const response = await axios.get(url);
261 |       return response.data;
262 |     } catch (error) {
263 |       return handleAxiosError(error);
264 |     }
265 |   }
266 |
267 |   async detectContentType(url: string): Promise<string> {
268 |     try {
269 |       // Use axios directly without baseURL for external URLs
270 |       const response = await axios.head(url);
271 |       return response.headers['content-type'] || '';
272 |     } catch {
273 |       return '';
274 |     }
275 |   }
276 |
277 |   async crawl(options: AdvancedCrawlConfig): Promise<CrawlEndpointResponse> {
278 |     // Validate JS code if present
279 |     if (options.crawler_config?.js_code) {
280 |       const scripts = Array.isArray(options.crawler_config.js_code)
281 |         ? options.crawler_config.js_code
282 |         : [options.crawler_config.js_code];
283 |
284 |       for (const script of scripts) {
285 |         if (!validateJavaScriptCode(script)) {
286 |           throw new Error(
287 |             'Invalid JavaScript: Contains HTML entities (&quot;), literal \\n outside strings, or HTML tags. Use proper JS syntax with real quotes and newlines.',
288 |           );
289 |         }
290 |       }
291 |     }
292 |
293 |     // Server only accepts urls array, not url string
294 |     const urls = options.url ? [options.url] : options.urls || [];
295 |
296 |     const requestBody: CrawlEndpointOptions & {
297 |       extraction_strategy?: unknown;
298 |       table_extraction_strategy?: unknown;
299 |       markdown_generator_options?: unknown;
300 |     } = {
301 |       urls,
302 |       browser_config: options.browser_config,
303 |       crawler_config: options.crawler_config || {}, // Always include crawler_config, even if empty
304 |     };
305 |
306 |     // Add extraction strategy passthrough fields if present
307 |     if (options.extraction_strategy) {
308 |       requestBody.extraction_strategy = options.extraction_strategy;
309 |     }
310 |     if (options.table_extraction_strategy) {
311 |       requestBody.table_extraction_strategy = options.table_extraction_strategy;
312 |     }
313 |     if (options.markdown_generator_options) {
314 |       requestBody.markdown_generator_options = options.markdown_generator_options;
315 |     }
316 |
317 |     try {
318 |       const response = await this.axiosClient.post('/crawl', requestBody);
319 |       return response.data;
320 |     } catch (error) {
321 |       return handleAxiosError(error);
322 |     }
323 |   }
324 |
325 |   async extractWithLLM(options: LLMEndpointOptions): Promise<LLMEndpointResponse> {
326 |     // Validate URL
327 |     if (!validateURL(options.url)) {
328 |       throw new Error('Invalid URL format');
329 |     }
330 |
331 |     try {
332 |       const encodedUrl = encodeURIComponent(options.url);
333 |       const encodedQuery = encodeURIComponent(options.query);
334 |       const response = await this.axiosClient.get(`/llm/${encodedUrl}?q=${encodedQuery}`);
335 |       return response.data;
336 |     } catch (error) {
337 |       // Special handling for LLM-specific errors
338 |       if (axios.isAxiosError(error)) {
339 |         const axiosError = error as AxiosError;
340 |         if (axiosError.code === 'ECONNABORTED' || axiosError.response?.status === 504) {
341 |           throw new Error('LLM extraction timed out. Try a simpler query or different URL.');
342 |         }
343 |         if (axiosError.response?.status === 401) {
344 |           throw new Error(
345 |             'LLM extraction failed: No LLM provider configured on server. Please ensure the server has an API key set.',
346 |           );
347 |         }
348 |       }
349 |       return handleAxiosError(error);
350 |     }
351 |   }
352 | }
353 |
```
Please ensure the server has an API key set.', 346 | ); 347 | } 348 | } 349 | return handleAxiosError(error); 350 | } 351 | } 352 | } 353 | ``` -------------------------------------------------------------------------------- /src/__tests__/request-handler.test.ts: -------------------------------------------------------------------------------- ```typescript 1 | import { jest } from '@jest/globals'; 2 | 3 | // Mock all dependencies before imports 4 | const mockGetMarkdown = jest.fn(); 5 | const mockCaptureScreenshot = jest.fn(); 6 | const mockGeneratePDF = jest.fn(); 7 | const mockExecuteJS = jest.fn(); 8 | const mockGetHTML = jest.fn(); 9 | const mockBatchCrawl = jest.fn(); 10 | const mockExtractWithLLM = jest.fn(); 11 | const mockCrawl = jest.fn(); 12 | const mockParseSitemap = jest.fn(); 13 | 14 | jest.unstable_mockModule('../crawl4ai-service.js', () => ({ 15 | Crawl4AIService: jest.fn().mockImplementation(() => ({ 16 | getMarkdown: mockGetMarkdown, 17 | captureScreenshot: mockCaptureScreenshot, 18 | generatePDF: mockGeneratePDF, 19 | executeJS: mockExecuteJS, 20 | getHTML: mockGetHTML, 21 | batchCrawl: mockBatchCrawl, 22 | extractWithLLM: mockExtractWithLLM, 23 | crawl: mockCrawl, 24 | parseSitemap: mockParseSitemap, 25 | })), 26 | })); 27 | 28 | // Mock axios 29 | const mockPost = jest.fn(); 30 | const mockAxiosCreate = jest.fn(() => ({ 31 | post: mockPost, 32 | })); 33 | 34 | jest.unstable_mockModule('axios', () => ({ 35 | default: { 36 | create: mockAxiosCreate, 37 | }, 38 | })); 39 | 40 | // Mock MCP SDK 41 | const mockSetRequestHandler = jest.fn(); 42 | const mockTool = jest.fn(); 43 | const mockConnect = jest.fn(); 44 | 45 | jest.unstable_mockModule('@modelcontextprotocol/sdk/server/index.js', () => ({ 46 | Server: jest.fn().mockImplementation(() => ({ 47 | setRequestHandler: mockSetRequestHandler, 48 | tool: mockTool, 49 | connect: mockConnect, 50 | })), 51 | })); 52 | 53 | // Mock the types module that exports the schemas 54 | const CallToolRequestSchema = { method: 'tools/call' }; 55 | const ListToolsRequestSchema = { method: 'tools/list' }; 56 | 57 | jest.unstable_mockModule('@modelcontextprotocol/sdk/types.js', () => ({ 58 | CallToolRequestSchema, 59 | ListToolsRequestSchema, 60 | })); 61 | 62 | jest.unstable_mockModule('@modelcontextprotocol/sdk/server/stdio.js', () => ({ 63 | StdioServerTransport: jest.fn(), 64 | })); 65 | 66 | // Now import the server after mocks are set up 67 | const { Crawl4AIServer } = await import('../server.js'); 68 | 69 | // Removed unused type definitions - using 'any' for test mocks 70 | 71 | describe('MCP Request Handler Direct Testing', () => { 72 | let server: any; // eslint-disable-line @typescript-eslint/no-explicit-any 73 | let requestHandler: any; // eslint-disable-line @typescript-eslint/no-explicit-any 74 | 75 | beforeEach(async () => { 76 | jest.clearAllMocks(); 77 | 78 | // Set up mock responses 79 | mockGetMarkdown.mockResolvedValue({ success: true, content: 'markdown content' }); 80 | mockCaptureScreenshot.mockResolvedValue({ success: true, screenshot: 'base64image' }); 81 | mockGeneratePDF.mockResolvedValue({ success: true, pdf: 'base64pdf' }); 82 | mockExecuteJS.mockResolvedValue({ js_execution_result: { results: [42] } }); 83 | mockGetHTML.mockResolvedValue({ success: true, html: '<html></html>' }); 84 | mockExtractWithLLM.mockResolvedValue({ answer: 'extracted answer' }); 85 | mockCrawl.mockResolvedValue({ 86 | success: true, 87 | results: [ 88 | { 89 | url: 'https://example.com', 90 | markdown: { raw_markdown: 'content' }, 91 | 
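// minimal per-result fields the handlers read; shape mirrors CrawlEndpointResponse.results[] from crawl4ai-service.ts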
success: true, 92 | status_code: 200, 93 | }, 94 | ], 95 | }); 96 | mockParseSitemap.mockResolvedValue(['https://example.com/page1']); 97 | mockPost.mockResolvedValue({ 98 | data: { 99 | results: [ 100 | { 101 | links: { internal: [], external: [] }, 102 | success: true, 103 | }, 104 | ], 105 | }, 106 | }); 107 | 108 | // Create server 109 | server = new Crawl4AIServer( 110 | process.env.CRAWL4AI_BASE_URL || 'http://test.example.com', 111 | process.env.CRAWL4AI_API_KEY || 'test-api-key', 112 | 'test-server', 113 | '1.0.0', 114 | ); 115 | await server.start(); 116 | 117 | // Get the request handler for CallToolRequestSchema 118 | const handlerCalls = mockSetRequestHandler.mock.calls; 119 | 120 | // Find the handler for CallToolRequestSchema (tools/call) 121 | for (const call of handlerCalls) { 122 | const [schema, handler] = call; 123 | if (schema && (schema as any).method === 'tools/call') { 124 | requestHandler = handler; 125 | break; 126 | } 127 | } 128 | }); 129 | 130 | describe('Tool Handler Coverage', () => { 131 | it('should handle all valid tool requests', async () => { 132 | expect(requestHandler).toBeDefined(); 133 | 134 | const validRequests = [ 135 | { name: 'get_markdown', arguments: { url: 'https://example.com' } }, 136 | { name: 'capture_screenshot', arguments: { url: 'https://example.com' } }, 137 | { name: 'generate_pdf', arguments: { url: 'https://example.com' } }, 138 | { name: 'execute_js', arguments: { url: 'https://example.com', scripts: 'return 1' } }, 139 | { name: 'batch_crawl', arguments: { urls: ['https://example.com'] } }, 140 | { name: 'smart_crawl', arguments: { url: 'https://example.com' } }, 141 | { name: 'get_html', arguments: { url: 'https://example.com' } }, 142 | { name: 'extract_links', arguments: { url: 'https://example.com' } }, 143 | { name: 'crawl_recursive', arguments: { url: 'https://example.com' } }, 144 | { name: 'parse_sitemap', arguments: { url: 'https://example.com/sitemap.xml' } }, 145 | { name: 'crawl', arguments: { url: 'https://example.com' } }, 146 | { name: 'manage_session', arguments: { action: 'create' } }, 147 | { name: 'manage_session', arguments: { action: 'clear', session_id: 'test' } }, 148 | { name: 'manage_session', arguments: { action: 'list' } }, 149 | { name: 'extract_with_llm', arguments: { url: 'https://example.com', prompt: 'test' } }, 150 | ]; 151 | 152 | for (const req of validRequests) { 153 | const result = await requestHandler({ 154 | method: 'tools/call', 155 | params: req, 156 | }); 157 | expect(result).toBeDefined(); 158 | expect(result.content).toBeDefined(); 159 | } 160 | }); 161 | 162 | it('should handle all validation error cases', async () => { 163 | const invalidRequests = [ 164 | { name: 'get_markdown', arguments: {}, expectedError: 'Invalid parameters for get_markdown' }, 165 | { name: 'capture_screenshot', arguments: {}, expectedError: 'Invalid parameters for capture_screenshot' }, 166 | { name: 'generate_pdf', arguments: {}, expectedError: 'Invalid parameters for generate_pdf' }, 167 | { 168 | name: 'execute_js', 169 | arguments: { url: 'https://example.com' }, 170 | expectedError: 'Invalid parameters for execute_js', 171 | }, 172 | { name: 'batch_crawl', arguments: {}, expectedError: 'Invalid parameters for batch_crawl' }, 173 | { name: 'smart_crawl', arguments: {}, expectedError: 'Invalid parameters for smart_crawl' }, 174 | { name: 'get_html', arguments: {}, expectedError: 'Invalid parameters for get_html' }, 175 | { name: 'extract_links', arguments: {}, expectedError: 'Invalid parameters for 
extract_links' }, 176 | { name: 'crawl_recursive', arguments: {}, expectedError: 'Invalid parameters for crawl_recursive' }, 177 | { name: 'parse_sitemap', arguments: {}, expectedError: 'Invalid parameters for parse_sitemap' }, 178 | { name: 'crawl', arguments: {}, expectedError: 'Invalid parameters for crawl' }, 179 | { name: 'manage_session', arguments: {}, expectedError: 'Invalid parameters for manage_session' }, 180 | { 181 | name: 'manage_session', 182 | arguments: { action: 'clear' }, 183 | expectedError: 'Invalid parameters for manage_session', 184 | }, 185 | { 186 | name: 'extract_with_llm', 187 | arguments: { url: 'https://example.com' }, 188 | expectedError: 'Invalid parameters for extract_with_llm', 189 | }, 190 | ]; 191 | 192 | for (const req of invalidRequests) { 193 | const result = await requestHandler({ 194 | method: 'tools/call', 195 | params: req, 196 | }); 197 | expect(result.content[0].text).toContain(req.expectedError); 198 | } 199 | }); 200 | 201 | it('should handle unknown tool', async () => { 202 | const result = await requestHandler({ 203 | method: 'tools/call', 204 | params: { 205 | name: 'unknown_tool', 206 | arguments: {}, 207 | }, 208 | }); 209 | expect(result.content[0].text).toContain('Error: Unknown tool: unknown_tool'); 210 | }); 211 | 212 | it('should handle non-ZodError exceptions', async () => { 213 | // Make the service method throw a non-Zod error 214 | mockGetMarkdown.mockRejectedValue(new Error('Service error')); 215 | 216 | const result = await requestHandler({ 217 | method: 'tools/call', 218 | params: { 219 | name: 'get_markdown', 220 | arguments: { url: 'https://example.com' }, 221 | }, 222 | }); 223 | 224 | expect(result.content[0].text).toContain('Error: Failed to get markdown: Service error'); 225 | }); 226 | 227 | it('should handle manage_session with create action', async () => { 228 | const result = await requestHandler({ 229 | method: 'tools/call', 230 | params: { 231 | name: 'manage_session', 232 | arguments: { 233 | action: 'create', 234 | session_id: 'test-session', 235 | initial_url: 'https://example.com', 236 | }, 237 | }, 238 | }); 239 | 240 | expect(result.content[0].text).toContain('Session created successfully'); 241 | expect(result.content[0].text).toContain('test-session'); 242 | }); 243 | 244 | it('should handle manage_session with clear action', async () => { 245 | // First create a session 246 | await requestHandler({ 247 | method: 'tools/call', 248 | params: { 249 | name: 'manage_session', 250 | arguments: { 251 | action: 'create', 252 | session_id: 'test-to-clear', 253 | }, 254 | }, 255 | }); 256 | 257 | // Then clear it 258 | const result = await requestHandler({ 259 | method: 'tools/call', 260 | params: { 261 | name: 'manage_session', 262 | arguments: { 263 | action: 'clear', 264 | session_id: 'test-to-clear', 265 | }, 266 | }, 267 | }); 268 | 269 | expect(result.content[0].text).toContain('Session cleared successfully'); 270 | }); 271 | 272 | it('should handle manage_session with list action', async () => { 273 | // First create a session 274 | await requestHandler({ 275 | method: 'tools/call', 276 | params: { 277 | name: 'manage_session', 278 | arguments: { 279 | action: 'create', 280 | session_id: 'test-list-session', 281 | }, 282 | }, 283 | }); 284 | 285 | // List sessions 286 | const result = await requestHandler({ 287 | method: 'tools/call', 288 | params: { 289 | name: 'manage_session', 290 | arguments: { action: 'list' }, 291 | }, 292 | }); 293 | 294 | expect(result.content[0].text).toContain('Active sessions'); 295 
| expect(result.content[0].text).toContain('test-list-session'); 296 | }); 297 | }); 298 | }); 299 | ``` -------------------------------------------------------------------------------- /src/__tests__/handlers/screenshot-saving.test.ts: -------------------------------------------------------------------------------- ```typescript 1 | import { jest } from '@jest/globals'; 2 | 3 | // Mock fs/promises 4 | const mockMkdir = jest.fn(); 5 | const mockWriteFile = jest.fn(); 6 | 7 | jest.unstable_mockModule('fs/promises', () => ({ 8 | mkdir: mockMkdir, 9 | writeFile: mockWriteFile, 10 | })); 11 | 12 | // Mock os 13 | const mockHomedir = jest.fn(); 14 | jest.unstable_mockModule('os', () => ({ 15 | homedir: mockHomedir, 16 | })); 17 | 18 | // Import after mocking 19 | const { ContentHandlers } = await import('../../handlers/content-handlers.js'); 20 | const { CrawlHandlers } = await import('../../handlers/crawl-handlers.js'); 21 | 22 | // Mock the service 23 | const mockService = { 24 | captureScreenshot: jest.fn(), 25 | crawl: jest.fn(), 26 | }; 27 | 28 | // Mock axios client 29 | const mockAxiosClient = { 30 | post: jest.fn(), 31 | }; 32 | 33 | describe('Screenshot Local Saving', () => { 34 | let contentHandlers: InstanceType<typeof ContentHandlers>; 35 | let crawlHandlers: InstanceType<typeof CrawlHandlers>; 36 | const testScreenshotBase64 = 37 | 'iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAYAAAAfFcSJAAAADUlEQVR42mNkYPhfDwAChwGA60e6kgAAAABJRU5ErkJggg=='; // 1x1 red pixel 38 | 39 | beforeEach(() => { 40 | jest.clearAllMocks(); 41 | contentHandlers = new ContentHandlers(mockService as never, mockAxiosClient as never, new Map()); 42 | crawlHandlers = new CrawlHandlers(mockService as never, mockAxiosClient as never, new Map()); 43 | 44 | // Default mock implementations 45 | mockMkdir.mockResolvedValue(undefined); 46 | mockWriteFile.mockResolvedValue(undefined); 47 | }); 48 | 49 | describe('ContentHandlers.captureScreenshot', () => { 50 | it('should save screenshot to local directory when save_to_directory is provided', async () => { 51 | const mockDate = new Date('2024-01-15T10:30:00Z'); 52 | jest.spyOn(globalThis, 'Date').mockImplementation(() => mockDate as never); 53 | 54 | mockService.captureScreenshot.mockResolvedValue({ 55 | success: true, 56 | screenshot: testScreenshotBase64, 57 | }); 58 | 59 | const result = await contentHandlers.captureScreenshot({ 60 | url: 'https://example.com', 61 | save_to_directory: '/tmp/screenshots', 62 | }); 63 | 64 | // Verify directory creation 65 | expect(mockMkdir).toHaveBeenCalledWith('/tmp/screenshots', { recursive: true }); 66 | 67 | // Verify file write 68 | const expectedFilename = 'example-com-2024-01-15T10-30-00.png'; 69 | const expectedPath = '/tmp/screenshots/' + expectedFilename; 70 | expect(mockWriteFile).toHaveBeenCalledWith(expectedPath, Buffer.from(testScreenshotBase64, 'base64')); 71 | 72 | // Verify response includes saved path 73 | expect(result.content[1].text).toContain(`Saved to: ${expectedPath}`); 74 | }); 75 | 76 | it('should handle directory creation failure gracefully', async () => { 77 | const consoleErrorSpy = jest.spyOn(console, 'error').mockImplementation(); 78 | mockMkdir.mockRejectedValue(new Error('Permission denied')); 79 | 80 | mockService.captureScreenshot.mockResolvedValue({ 81 | success: true, 82 | screenshot: testScreenshotBase64, 83 | }); 84 | 85 | const result = await contentHandlers.captureScreenshot({ 86 | url: 'https://example.com', 87 | save_to_directory: '/root/screenshots', 88 | }); 89 | 90 | // Should still return the 
screenshot 91 | expect(result.content[0].type).toBe('image'); 92 | expect(result.content[0].data).toBe(testScreenshotBase64); 93 | 94 | // Should not include saved path in text 95 | expect(result.content[1].text).not.toContain('Saved to:'); 96 | 97 | // Should log error 98 | expect(consoleErrorSpy).toHaveBeenCalledWith('Failed to save screenshot locally:', expect.any(Error)); 99 | 100 | consoleErrorSpy.mockRestore(); 101 | }); 102 | 103 | it('should handle file path instead of directory path', async () => { 104 | const mockDate = new Date('2024-01-15T10:30:00Z'); 105 | jest.spyOn(globalThis, 'Date').mockImplementation(() => mockDate as never); 106 | const consoleWarnSpy = jest.spyOn(console, 'warn').mockImplementation(); 107 | 108 | mockService.captureScreenshot.mockResolvedValue({ 109 | success: true, 110 | screenshot: testScreenshotBase64, 111 | }); 112 | 113 | await contentHandlers.captureScreenshot({ 114 | url: 'https://example.com', 115 | save_to_directory: '/tmp/screenshots/screenshot.png', 116 | }); 117 | 118 | // Should warn about file path 119 | expect(consoleWarnSpy).toHaveBeenCalledWith( 120 | 'Warning: save_to_directory should be a directory path, not a file path. Using parent directory.', 121 | ); 122 | 123 | // Should use parent directory 124 | expect(mockMkdir).toHaveBeenCalledWith('/tmp/screenshots', { recursive: true }); 125 | 126 | // Should still generate filename 127 | const expectedFilename = 'example-com-2024-01-15T10-30-00.png'; 128 | const expectedPath = '/tmp/screenshots/' + expectedFilename; 129 | expect(mockWriteFile).toHaveBeenCalledWith(expectedPath, Buffer.from(testScreenshotBase64, 'base64')); 130 | 131 | consoleWarnSpy.mockRestore(); 132 | }); 133 | 134 | it('should resolve home directory paths', async () => { 135 | const mockDate = new Date('2024-01-15T10:30:00Z'); 136 | jest.spyOn(globalThis, 'Date').mockImplementation(() => mockDate as never); 137 | mockHomedir.mockReturnValue('/Users/testuser'); 138 | 139 | mockService.captureScreenshot.mockResolvedValue({ 140 | success: true, 141 | screenshot: testScreenshotBase64, 142 | }); 143 | 144 | await contentHandlers.captureScreenshot({ 145 | url: 'https://example.com', 146 | save_to_directory: '~/Desktop/screenshots', 147 | }); 148 | 149 | // Should resolve ~ to home directory 150 | expect(mockMkdir).toHaveBeenCalledWith('/Users/testuser/Desktop/screenshots', { recursive: true }); 151 | 152 | const expectedPath = '/Users/testuser/Desktop/screenshots/example-com-2024-01-15T10-30-00.png'; 153 | expect(mockWriteFile).toHaveBeenCalledWith(expectedPath, Buffer.from(testScreenshotBase64, 'base64')); 154 | }); 155 | 156 | it('should not return large screenshots when saved locally', async () => { 157 | // Create a large fake screenshot (>800KB when decoded) 158 | const largeBase64 = 'A'.repeat(1200000); // ~900KB when decoded 159 | 160 | mockService.captureScreenshot.mockResolvedValue({ 161 | success: true, 162 | screenshot: largeBase64, 163 | }); 164 | 165 | const result = await contentHandlers.captureScreenshot({ 166 | url: 'https://example.com', 167 | save_to_directory: '/tmp', 168 | }); 169 | 170 | // Should not include image in response 171 | const imageContent = result.content.find((c) => c.type === 'image'); 172 | expect(imageContent).toBeUndefined(); 173 | 174 | // Should mention size in text 175 | const textContent = result.content.find((c) => c.type === 'text'); 176 | expect(textContent?.text).toContain('not returned due to size'); 177 | expect(textContent?.text).toContain('KB'); 178 | }); 179 | 180 | 
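// Note on the expected paths asserted in these tests: a sketch of the filename scheme
// the assertions imply (inferred from the tests, not code copied from the handlers):
//   const host = new URL(url).hostname.replace(/[^a-zA-Z0-9]/g, '-');
//   const stamp = new Date().toISOString().replace(/[:.]/g, '-').slice(0, 19);
//   const filename = `${host}-${stamp}.png`; // e.g. example-com-2024-01-15T10-30-00.png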
it('should sanitize filename for URLs with special characters', async () => { 181 | const mockDate = new Date('2024-01-15T10:30:00Z'); 182 | jest.spyOn(globalThis, 'Date').mockImplementation(() => mockDate as never); 183 | 184 | mockService.captureScreenshot.mockResolvedValue({ 185 | success: true, 186 | screenshot: testScreenshotBase64, 187 | }); 188 | 189 | await contentHandlers.captureScreenshot({ 190 | url: 'https://my-site.com:8080/path?query=value', 191 | save_to_directory: '/tmp/screenshots', 192 | }); 193 | 194 | const expectedFilename = 'my-site-com-2024-01-15T10-30-00.png'; 195 | const expectedPath = '/tmp/screenshots/' + expectedFilename; 196 | expect(mockWriteFile).toHaveBeenCalledWith(expectedPath, expect.any(Buffer)); 197 | }); 198 | }); 199 | 200 | describe('CrawlHandlers.crawl', () => { 201 | it('should save screenshot to local directory when screenshot_directory is provided', async () => { 202 | const mockDate = new Date('2024-01-15T10:30:00Z'); 203 | jest.spyOn(globalThis, 'Date').mockImplementation(() => mockDate as never); 204 | 205 | mockService.crawl.mockResolvedValue({ 206 | results: [ 207 | { 208 | url: 'https://example.com', 209 | success: true, 210 | screenshot: testScreenshotBase64, 211 | markdown: { raw_markdown: 'Test content' }, 212 | }, 213 | ], 214 | }); 215 | 216 | const result = await crawlHandlers.crawl({ 217 | url: 'https://example.com', 218 | screenshot: true, 219 | screenshot_directory: '/tmp/crawl-screenshots', 220 | }); 221 | 222 | // Verify directory creation 223 | expect(mockMkdir).toHaveBeenCalledWith('/tmp/crawl-screenshots', { recursive: true }); 224 | 225 | // Verify file write 226 | const expectedFilename = 'example-com-2024-01-15T10-30-00.png'; 227 | const expectedPath = '/tmp/crawl-screenshots/' + expectedFilename; 228 | expect(mockWriteFile).toHaveBeenCalledWith(expectedPath, Buffer.from(testScreenshotBase64, 'base64')); 229 | 230 | // Verify response includes saved path 231 | const textContent = result.content.find( 232 | (c) => c.type === 'text' && 'text' in c && c.text?.includes('Screenshot saved'), 233 | ); 234 | expect(textContent?.text).toContain(`Screenshot saved to: ${expectedPath}`); 235 | }); 236 | 237 | it('should handle file save failure gracefully in crawl', async () => { 238 | const consoleErrorSpy = jest.spyOn(console, 'error').mockImplementation(); 239 | mockMkdir.mockResolvedValue(undefined); // directory creation succeeds 240 | mockWriteFile.mockRejectedValue(new Error('Disk full')); // but file write fails 241 | 242 | mockService.crawl.mockResolvedValue({ 243 | results: [ 244 | { 245 | url: 'https://example.com', 246 | success: true, 247 | screenshot: testScreenshotBase64, 248 | markdown: { raw_markdown: 'Test content' }, 249 | }, 250 | ], 251 | }); 252 | 253 | const result = await crawlHandlers.crawl({ 254 | url: 'https://example.com', 255 | screenshot: true, 256 | screenshot_directory: '/tmp/crawl-screenshots', 257 | }); 258 | 259 | // Should still return the screenshot as image 260 | const imageContent = result.content.find((c) => c.type === 'image'); 261 | expect(imageContent?.data).toBe(testScreenshotBase64); 262 | 263 | // Should log error 264 | expect(consoleErrorSpy).toHaveBeenCalledWith('Failed to save screenshot locally:', expect.any(Error)); 265 | 266 | consoleErrorSpy.mockRestore(); 267 | }); 268 | 269 | it('should not attempt to save when screenshot_directory is not provided', async () => { 270 | mockService.crawl.mockResolvedValue({ 271 | results: [ 272 | { 273 | url: 'https://example.com', 274 | success: 
true, 275 | screenshot: testScreenshotBase64, 276 | markdown: { raw_markdown: 'Test content' }, 277 | }, 278 | ], 279 | }); 280 | 281 | await crawlHandlers.crawl({ 282 | url: 'https://example.com', 283 | screenshot: true, 284 | }); 285 | 286 | // Should not call fs methods 287 | expect(mockMkdir).not.toHaveBeenCalled(); 288 | expect(mockWriteFile).not.toHaveBeenCalled(); 289 | }); 290 | }); 291 | }); 292 | ``` -------------------------------------------------------------------------------- /src/__tests__/crawl4ai-service.network.test.ts: -------------------------------------------------------------------------------- ```typescript 1 | import { jest } from '@jest/globals'; 2 | 3 | // Mock axios before importing the service 4 | const mockAxiosInstance = { 5 | get: jest.fn() as jest.Mock, 6 | post: jest.fn() as jest.Mock, 7 | interceptors: { 8 | request: { use: jest.fn() as jest.Mock }, 9 | response: { use: jest.fn() as jest.Mock }, 10 | }, 11 | }; 12 | 13 | jest.unstable_mockModule('axios', () => ({ 14 | default: { 15 | create: jest.fn(() => mockAxiosInstance), 16 | isAxiosError: jest.fn((error: any) => error.isAxiosError === true), // eslint-disable-line @typescript-eslint/no-explicit-any 17 | get: jest.fn(), 18 | head: jest.fn(), 19 | }, 20 | isAxiosError: jest.fn((error: any) => error.isAxiosError === true), // eslint-disable-line @typescript-eslint/no-explicit-any 21 | })); 22 | 23 | // Import after mocking 24 | const { Crawl4AIService } = await import('../crawl4ai-service.js'); 25 | 26 | describe('Crawl4AI Service - Network Failures', () => { 27 | let service: any; // eslint-disable-line @typescript-eslint/no-explicit-any 28 | 29 | interface ErrorWithCode extends Error { 30 | code?: string; 31 | response?: { 32 | status: number; 33 | data?: any; // eslint-disable-line @typescript-eslint/no-explicit-any 34 | }; 35 | isAxiosError?: boolean; 36 | } 37 | 38 | beforeEach(() => { 39 | jest.clearAllMocks(); 40 | service = new Crawl4AIService('http://localhost:11235', 'test-api-key'); 41 | }); 42 | 43 | describe('Network Timeouts', () => { 44 | it('should handle request timeout', async () => { 45 | const timeoutError = new Error('timeout of 30000ms exceeded') as ErrorWithCode; 46 | timeoutError.code = 'ECONNABORTED'; 47 | timeoutError.isAxiosError = true; 48 | (mockAxiosInstance.post as jest.Mock).mockRejectedValue(timeoutError); 49 | 50 | await expect(service.getMarkdown({ url: 'https://example.com' })).rejects.toThrow('Request timed out'); 51 | }); 52 | 53 | it('should handle response timeout', async () => { 54 | const timeoutError = new Error('timeout of 30000ms exceeded') as ErrorWithCode; 55 | timeoutError.code = 'ETIMEDOUT'; 56 | timeoutError.isAxiosError = true; 57 | (mockAxiosInstance.post as jest.Mock).mockRejectedValue(timeoutError); 58 | 59 | await expect(service.getHTML({ url: 'https://example.com' })).rejects.toThrow('Request timeout'); 60 | }); 61 | }); 62 | 63 | describe('HTTP Error Responses', () => { 64 | it('should handle 401 Unauthorized', async () => { 65 | const error = { 66 | response: { 67 | status: 401, 68 | data: { error: 'Invalid API key' }, 69 | }, 70 | isAxiosError: true, 71 | }; 72 | (mockAxiosInstance.post as jest.Mock).mockRejectedValue(error); 73 | 74 | await expect(service.crawl({ urls: ['https://example.com'] })).rejects.toThrow( 75 | 'Request failed with status 401: Invalid API key', 76 | ); 77 | }); 78 | 79 | it('should handle 403 Forbidden', async () => { 80 | const error = { 81 | response: { 82 | status: 403, 83 | data: { error: 'Access denied' }, 84 | }, 
85 | isAxiosError: true, 86 | }; 87 | (mockAxiosInstance.post as jest.Mock).mockRejectedValue(error); 88 | 89 | await expect(service.captureScreenshot({ url: 'https://example.com' })).rejects.toThrow( 90 | 'Request failed with status 403: Access denied', 91 | ); 92 | }); 93 | 94 | it('should handle 404 Not Found', async () => { 95 | const error = { 96 | response: { 97 | status: 404, 98 | data: { error: 'Endpoint not found' }, 99 | }, 100 | isAxiosError: true, 101 | }; 102 | (mockAxiosInstance.post as jest.Mock).mockRejectedValue(error); 103 | 104 | await expect(service.generatePDF({ url: 'https://example.com' })).rejects.toThrow( 105 | 'Request failed with status 404: Endpoint not found', 106 | ); 107 | }); 108 | 109 | it('should handle 429 Too Many Requests', async () => { 110 | const error = { 111 | response: { 112 | status: 429, 113 | data: { error: 'Rate limit exceeded' }, 114 | headers: { 115 | 'retry-after': '60', 116 | }, 117 | }, 118 | isAxiosError: true, 119 | }; 120 | (mockAxiosInstance.post as jest.Mock).mockRejectedValue(error); 121 | 122 | await expect(service.executeJS({ url: 'https://example.com', scripts: ['return 1;'] })).rejects.toThrow( 123 | 'Request failed with status 429: Rate limit exceeded', 124 | ); 125 | }); 126 | 127 | it('should handle 500 Internal Server Error', async () => { 128 | const error = { 129 | response: { 130 | status: 500, 131 | data: { error: 'Internal server error' }, 132 | }, 133 | isAxiosError: true, 134 | }; 135 | (mockAxiosInstance.post as jest.Mock).mockRejectedValue(error); 136 | 137 | await expect(service.crawl({ urls: ['https://example.com'] })).rejects.toThrow( 138 | 'Request failed with status 500: Internal server error', 139 | ); 140 | }); 141 | 142 | it('should handle 502 Bad Gateway', async () => { 143 | const error = { 144 | response: { 145 | status: 502, 146 | data: 'Bad Gateway', 147 | }, 148 | isAxiosError: true, 149 | message: 'Request failed with status code 502', 150 | }; 151 | (mockAxiosInstance.post as jest.Mock).mockRejectedValue(error); 152 | 153 | await expect(service.getMarkdown({ url: 'https://example.com' })).rejects.toThrow( 154 | 'Request failed with status 502: Request failed with status code 502', 155 | ); 156 | }); 157 | 158 | it('should handle 503 Service Unavailable', async () => { 159 | const error = { 160 | response: { 161 | status: 503, 162 | data: { error: 'Service temporarily unavailable' }, 163 | }, 164 | isAxiosError: true, 165 | }; 166 | (mockAxiosInstance.get as jest.Mock).mockRejectedValue(error); 167 | 168 | await expect(service.extractWithLLM({ url: 'https://example.com', query: 'test' })).rejects.toThrow( 169 | 'Request failed with status 503: Service temporarily unavailable', 170 | ); 171 | }); 172 | 173 | it('should handle 504 Gateway Timeout', async () => { 174 | const error = { 175 | response: { 176 | status: 504, 177 | data: { error: 'Gateway timeout' }, 178 | }, 179 | isAxiosError: true, 180 | }; 181 | (mockAxiosInstance.post as jest.Mock).mockRejectedValue(error); 182 | 183 | await expect(service.getHTML({ url: 'https://example.com' })).rejects.toThrow( 184 | 'Request failed with status 504: Gateway timeout', 185 | ); 186 | }); 187 | }); 188 | 189 | describe('Network Connection Failures', () => { 190 | it('should handle DNS resolution failure', async () => { 191 | const error = new Error('getaddrinfo ENOTFOUND invalid.domain') as ErrorWithCode; 192 | error.code = 'ENOTFOUND'; 193 | error.isAxiosError = true; 194 | (mockAxiosInstance.post as jest.Mock).mockRejectedValue(error); 195 | 196 | await 
expect(service.getMarkdown({ url: 'https://invalid.domain' })).rejects.toThrow( 197 | 'DNS resolution failed: getaddrinfo ENOTFOUND invalid.domain', 198 | ); 199 | }); 200 | 201 | it('should handle connection refused', async () => { 202 | const error = new Error('connect ECONNREFUSED 127.0.0.1:11235') as ErrorWithCode; 203 | error.code = 'ECONNREFUSED'; 204 | error.isAxiosError = true; 205 | (mockAxiosInstance.post as jest.Mock).mockRejectedValue(error); 206 | 207 | await expect(service.crawl({ urls: ['https://example.com'] })).rejects.toThrow( 208 | 'Connection refused: connect ECONNREFUSED 127.0.0.1:11235', 209 | ); 210 | }); 211 | 212 | it('should handle connection reset', async () => { 213 | const error = new Error('socket hang up') as ErrorWithCode; 214 | error.code = 'ECONNRESET'; 215 | error.isAxiosError = true; 216 | (mockAxiosInstance.post as jest.Mock).mockRejectedValue(error); 217 | 218 | await expect(service.captureScreenshot({ url: 'https://example.com' })).rejects.toThrow( 219 | 'Connection reset: socket hang up', 220 | ); 221 | }); 222 | 223 | it('should handle network unreachable', async () => { 224 | const error = new Error('connect ENETUNREACH') as ErrorWithCode; 225 | error.code = 'ENETUNREACH'; 226 | error.isAxiosError = true; 227 | (mockAxiosInstance.post as jest.Mock).mockRejectedValue(error); 228 | 229 | await expect(service.executeJS({ url: 'https://example.com', scripts: ['return 1;'] })).rejects.toThrow( 230 | 'Network unreachable: connect ENETUNREACH', 231 | ); 232 | }); 233 | }); 234 | 235 | describe('Response Parsing Failures', () => { 236 | it('should handle invalid JSON response', async () => { 237 | // This test is not applicable anymore since we handle errors at axios level 238 | // The service will return whatever axios returns 239 | (mockAxiosInstance.post as jest.Mock).mockResolvedValue({ 240 | data: '<html>Not JSON</html>', 241 | headers: { 'content-type': 'text/html' }, 242 | }); 243 | 244 | const result = await service.getHTML({ url: 'https://example.com' }); 245 | expect(result).toBe('<html>Not JSON</html>'); 246 | }); 247 | 248 | it('should handle empty response', async () => { 249 | (mockAxiosInstance.post as jest.Mock).mockResolvedValue({ 250 | data: null, 251 | }); 252 | 253 | // The service returns null, which is valid 254 | const result = await service.crawl({ urls: ['https://example.com'] }); 255 | expect(result).toBeNull(); 256 | }); 257 | 258 | it('should handle malformed response structure', async () => { 259 | (mockAxiosInstance.post as jest.Mock).mockResolvedValue({ 260 | data: { unexpected: 'structure' }, 261 | }); 262 | 263 | // The service returns whatever the API returns 264 | const result = await service.crawl({ urls: ['https://example.com'] }); 265 | expect(result).toEqual({ unexpected: 'structure' }); 266 | }); 267 | }); 268 | 269 | describe('Request Configuration Errors', () => { 270 | it('should handle invalid URL format', async () => { 271 | await expect(service.getMarkdown({ url: 'not-a-valid-url' })).rejects.toThrow('Invalid URL format'); 272 | }); 273 | 274 | it('should handle missing required parameters', async () => { 275 | await expect(service.batchCrawl({ urls: [] })).rejects.toThrow('URLs array cannot be empty'); 276 | }); 277 | 278 | it('should handle oversized request payload', async () => { 279 | const error = new Error('Request Entity Too Large') as ErrorWithCode; 280 | error.response = { status: 413 }; 281 | error.isAxiosError = true; 282 | error.message = 'Request Entity Too Large'; 283 | (mockAxiosInstance.post as 
jest.Mock).mockRejectedValue(error); 284 | 285 | const hugeScript = 'x'.repeat(10 * 1024 * 1024); // 10MB 286 | await expect(service.executeJS({ url: 'https://example.com', scripts: [hugeScript] })).rejects.toThrow( 287 | 'Request failed with status 413: Request Entity Too Large', 288 | ); 289 | }); 290 | }); 291 | 292 | describe('Partial Response Handling', () => { 293 | it('should handle successful response with partial data', async () => { 294 | (mockAxiosInstance.post as jest.Mock).mockResolvedValue({ 295 | data: { 296 | results: [ 297 | { success: true, url: 'https://example.com', markdown: 'Content' }, 298 | { success: false, url: 'https://example.com/page2', error: 'Failed' }, 299 | ], 300 | }, 301 | }); 302 | 303 | const result = await service.crawl({ urls: ['https://example.com', 'https://example.com/page2'] }); 304 | expect(result.results).toHaveLength(2); 305 | expect(result.results[0].success).toBe(true); 306 | expect(result.results[1].success).toBe(false); 307 | }); 308 | 309 | it('should handle response with missing optional fields', async () => { 310 | (mockAxiosInstance.post as jest.Mock).mockResolvedValue({ 311 | data: { 312 | success: true, 313 | url: 'https://example.com', 314 | // Missing markdown field 315 | }, 316 | }); 317 | 318 | const result = await service.getMarkdown({ url: 'https://example.com' }); 319 | expect(result.url).toBe('https://example.com'); 320 | expect(result.markdown).toBeUndefined(); 321 | }); 322 | }); 323 | }); 324 | ``` -------------------------------------------------------------------------------- /src/__tests__/handlers/parameter-combinations.test.ts: -------------------------------------------------------------------------------- ```typescript 1 | import { jest } from '@jest/globals'; 2 | import { CrawlHandlers } from '../../handlers/crawl-handlers.js'; 3 | import { ContentHandlers } from '../../handlers/content-handlers.js'; 4 | 5 | type MockService = { 6 | crawl: jest.Mock; 7 | getMarkdown: jest.Mock; 8 | captureScreenshot: jest.Mock; 9 | }; 10 | 11 | type MockAxiosClient = { 12 | post: jest.Mock; 13 | get: jest.Mock; 14 | head: jest.Mock; 15 | }; 16 | 17 | describe('Optional Parameter Combinations', () => { 18 | let crawlHandlers: CrawlHandlers; 19 | let _contentHandlers: ContentHandlers; 20 | let mockService: MockService; 21 | let mockAxiosClient: MockAxiosClient; 22 | 23 | beforeEach(() => { 24 | jest.clearAllMocks(); 25 | 26 | mockService = { 27 | crawl: jest.fn(), 28 | getMarkdown: jest.fn(), 29 | captureScreenshot: jest.fn(), 30 | }; 31 | 32 | mockAxiosClient = { 33 | post: jest.fn(), 34 | get: jest.fn(), 35 | head: jest.fn(), 36 | }; 37 | 38 | crawlHandlers = new CrawlHandlers(mockService, mockAxiosClient, new Map()); 39 | _contentHandlers = new ContentHandlers(mockService, mockAxiosClient, new Map()); 40 | }); 41 | 42 | describe('Batch Crawl Parameter Combinations', () => { 43 | const testCases = [ 44 | { 45 | name: 'default parameters only', 46 | options: { urls: ['https://example.com'] }, 47 | expectedConfig: undefined, 48 | }, 49 | { 50 | name: 'remove_images only', 51 | options: { urls: ['https://example.com'], remove_images: true }, 52 | expectedConfig: { exclude_tags: ['img', 'picture', 'svg'] }, 53 | }, 54 | { 55 | name: 'bypass_cache only', 56 | options: { urls: ['https://example.com'], bypass_cache: true }, 57 | expectedConfig: { cache_mode: 'BYPASS' }, 58 | }, 59 | { 60 | name: 'both remove_images and bypass_cache', 61 | options: { urls: ['https://example.com'], remove_images: true, bypass_cache: true }, 62 | 
expectedConfig: { exclude_tags: ['img', 'picture', 'svg'], cache_mode: 'BYPASS' }, 63 | }, 64 | { 65 | name: 'with max_concurrent', 66 | options: { urls: ['https://example.com'], max_concurrent: 5, remove_images: true }, 67 | expectedConfig: { exclude_tags: ['img', 'picture', 'svg'] }, 68 | }, 69 | ]; 70 | 71 | testCases.forEach(({ name, options, expectedConfig }) => { 72 | it(`should handle ${name}`, async () => { 73 | mockAxiosClient.post.mockResolvedValue({ 74 | data: { results: [{ success: true }] }, 75 | }); 76 | 77 | await crawlHandlers.batchCrawl(options); 78 | 79 | expect(mockAxiosClient.post).toHaveBeenCalledWith('/crawl', { 80 | urls: options.urls, 81 | max_concurrent: options.max_concurrent, 82 | crawler_config: expectedConfig, 83 | }); 84 | }); 85 | }); 86 | }); 87 | 88 | describe('Smart Crawl Parameter Combinations', () => { 89 | const testCases = [ 90 | { 91 | name: 'minimal configuration', 92 | options: { url: 'https://example.com' }, 93 | expectedCacheMode: 'ENABLED', 94 | }, 95 | { 96 | name: 'with bypass_cache', 97 | options: { url: 'https://example.com', bypass_cache: true }, 98 | expectedCacheMode: 'BYPASS', 99 | }, 100 | { 101 | name: 'with max_depth', 102 | options: { url: 'https://example.com', max_depth: 5 }, 103 | expectedCacheMode: 'ENABLED', 104 | }, 105 | { 106 | name: 'with follow_links and bypass_cache', 107 | options: { url: 'https://example.com', follow_links: true, bypass_cache: true }, 108 | expectedCacheMode: 'BYPASS', 109 | }, 110 | ]; 111 | 112 | testCases.forEach(({ name, options, expectedCacheMode }) => { 113 | it(`should handle ${name}`, async () => { 114 | mockAxiosClient.head.mockResolvedValue({ headers: { 'content-type': 'text/html' } }); 115 | mockAxiosClient.post.mockResolvedValue({ 116 | data: { results: [{ success: true, markdown: { raw_markdown: 'Content' } }] }, 117 | }); 118 | 119 | await crawlHandlers.smartCrawl(options); 120 | 121 | expect(mockAxiosClient.post).toHaveBeenCalledWith('/crawl', { 122 | urls: [options.url], 123 | crawler_config: { 124 | cache_mode: expectedCacheMode, 125 | }, 126 | browser_config: { 127 | headless: true, 128 | browser_type: 'chromium', 129 | }, 130 | }); 131 | }); 132 | }); 133 | }); 134 | 135 | describe('Crawl Parameter Combinations', () => { 136 | // Table-driven tests for various parameter combinations 137 | const parameterSets = [ 138 | // Browser configuration combinations 139 | { 140 | name: 'browser type with viewport', 141 | params: { 142 | url: 'https://example.com', 143 | browser_type: 'firefox', 144 | viewport_width: 1920, 145 | viewport_height: 1080, 146 | }, 147 | }, 148 | { 149 | name: 'proxy with authentication', 150 | params: { 151 | url: 'https://example.com', 152 | proxy_server: 'http://proxy.example.com:8080', 153 | proxy_username: 'user', 154 | proxy_password: 'pass', 155 | }, 156 | }, 157 | { 158 | name: 'cookies and headers', 159 | params: { 160 | url: 'https://example.com', 161 | cookies: [{ name: 'session', value: '123', domain: '.example.com' }], 162 | headers: { 'X-Custom': 'value', Authorization: 'Bearer token' }, 163 | }, 164 | }, 165 | // Content filtering combinations 166 | { 167 | name: 'content filtering options', 168 | params: { 169 | url: 'https://example.com', 170 | word_count_threshold: 100, 171 | excluded_tags: ['script', 'style'], 172 | remove_overlay_elements: true, 173 | }, 174 | }, 175 | { 176 | name: 'text-only with form removal', 177 | params: { 178 | url: 'https://example.com', 179 | only_text: true, 180 | remove_forms: true, 181 | keep_data_attributes: false, 182 | 
}, 183 | }, 184 | // JavaScript execution combinations 185 | { 186 | name: 'js_code with wait conditions', 187 | params: { 188 | url: 'https://example.com', 189 | js_code: ['document.querySelector("button").click()'], 190 | wait_for: '#result', 191 | wait_for_timeout: 5000, 192 | }, 193 | }, 194 | { 195 | name: 'js_only with session', 196 | params: { 197 | url: 'https://example.com', 198 | js_only: true, 199 | session_id: 'test-session-123', 200 | }, 201 | }, 202 | // Dynamic content handling 203 | { 204 | name: 'scrolling configuration', 205 | params: { 206 | url: 'https://example.com', 207 | delay_before_scroll: 2000, 208 | scroll_delay: 500, 209 | scan_full_page: true, 210 | }, 211 | }, 212 | { 213 | name: 'virtual scroll for infinite feeds', 214 | params: { 215 | url: 'https://example.com', 216 | virtual_scroll_config: { 217 | container_selector: '.feed', 218 | scroll_count: 10, 219 | scroll_by: 500, 220 | wait_after_scroll: 1000, 221 | }, 222 | }, 223 | }, 224 | // Media handling combinations 225 | { 226 | name: 'screenshot with PDF', 227 | params: { 228 | url: 'https://example.com', 229 | screenshot: true, 230 | screenshot_wait_for: 3, 231 | pdf: true, 232 | capture_mhtml: true, 233 | }, 234 | }, 235 | { 236 | name: 'image filtering options', 237 | params: { 238 | url: 'https://example.com', 239 | image_description_min_word_threshold: 10, 240 | image_score_threshold: 0.5, 241 | exclude_external_images: true, 242 | }, 243 | }, 244 | // Link filtering combinations 245 | { 246 | name: 'link exclusion options', 247 | params: { 248 | url: 'https://example.com', 249 | exclude_social_media_links: true, 250 | exclude_domains: ['facebook.com', 'twitter.com'], 251 | exclude_external_links: true, 252 | }, 253 | }, 254 | // Page interaction combinations 255 | { 256 | name: 'stealth mode options', 257 | params: { 258 | url: 'https://example.com', 259 | simulate_user: true, 260 | override_navigator: true, 261 | magic: true, 262 | user_agent: 'Custom Bot 1.0', 263 | }, 264 | }, 265 | // Complex combinations 266 | { 267 | name: 'kitchen sink - many options', 268 | params: { 269 | url: 'https://example.com', 270 | browser_type: 'chromium', 271 | viewport_width: 1280, 272 | viewport_height: 720, 273 | word_count_threshold: 50, 274 | excluded_tags: ['nav', 'footer'], 275 | js_code: ['window.scrollTo(0, document.body.scrollHeight)'], 276 | wait_for: '.loaded', 277 | screenshot: true, 278 | exclude_external_links: true, 279 | session_id: 'complex-session', 280 | cache_mode: 'BYPASS', 281 | verbose: true, 282 | }, 283 | }, 284 | ]; 285 | 286 | parameterSets.forEach(({ name, params }) => { 287 | it(`should correctly process ${name}`, async () => { 288 | mockService.crawl.mockResolvedValue({ 289 | results: [ 290 | { 291 | url: params.url, 292 | success: true, 293 | markdown: { raw_markdown: 'Test content' }, 294 | }, 295 | ], 296 | }); 297 | 298 | const result = await crawlHandlers.crawl(params); 299 | 300 | // Verify the service was called 301 | expect(mockService.crawl).toHaveBeenCalled(); 302 | 303 | // Verify response structure 304 | expect(result.content).toBeDefined(); 305 | expect(result.content[0].type).toBe('text'); 306 | }); 307 | }); 308 | 309 | // Test parameter validation 310 | it('should handle invalid parameter combinations', async () => { 311 | const invalidParams = { 312 | url: 'https://example.com', 313 | js_only: true, 314 | // Missing required session_id when js_only is true 315 | }; 316 | 317 | await expect(crawlHandlers.crawl(invalidParams)).rejects.toThrow(); 318 | }); 319 | 320 | // 
Test default values 321 | it('should apply correct defaults when parameters are omitted', async () => { 322 | mockService.crawl.mockResolvedValue({ 323 | results: [ 324 | { 325 | url: 'https://example.com', 326 | success: true, 327 | markdown: { raw_markdown: 'Content' }, 328 | }, 329 | ], 330 | }); 331 | 332 | await crawlHandlers.crawl({ url: 'https://example.com' }); 333 | 334 | const call = mockService.crawl.mock.calls[0][0]; 335 | 336 | // Check browser_config defaults 337 | expect(call.browser_config).toBeDefined(); 338 | expect(call.browser_config.headless).toBe(true); 339 | 340 | // Check that optional configs are not included when not specified 341 | expect(call.crawler_config.word_count_threshold).toBeUndefined(); 342 | expect(call.crawler_config.excluded_tags).toBeUndefined(); 343 | }); 344 | }); 345 | 346 | describe('Parameter Priority and Conflicts', () => { 347 | it('should handle conflicting cache modes correctly', async () => { 348 | mockService.crawl.mockResolvedValue({ 349 | results: [{ success: true, markdown: { raw_markdown: 'Content' } }], 350 | }); 351 | 352 | // Test that explicit cache_mode takes precedence 353 | await crawlHandlers.crawl({ 354 | url: 'https://example.com', 355 | cache_mode: 'DISABLED', 356 | // Even with other params that might suggest caching 357 | session_id: 'test-session', 358 | }); 359 | 360 | const call = mockService.crawl.mock.calls[0][0]; 361 | expect(call.crawler_config.cache_mode).toBe('DISABLED'); 362 | }); 363 | 364 | it('should handle mutually exclusive options', async () => { 365 | mockService.crawl.mockResolvedValue({ 366 | results: [{ success: true, html: '<p>HTML</p>' }], 367 | }); 368 | 369 | // only_text should override other content options 370 | await crawlHandlers.crawl({ 371 | url: 'https://example.com', 372 | only_text: true, 373 | keep_data_attributes: true, // Should be ignored with only_text 374 | }); 375 | 376 | const call = mockService.crawl.mock.calls[0][0]; 377 | expect(call.crawler_config.only_text).toBe(true); 378 | expect(call.crawler_config.keep_data_attributes).toBe(true); // Still passed through 379 | }); 380 | }); 381 | 382 | describe('Edge Cases for Optional Parameters', () => { 383 | it('should handle empty arrays correctly', async () => { 384 | mockService.crawl.mockResolvedValue({ 385 | results: [{ success: true, markdown: { raw_markdown: 'Content' } }], 386 | }); 387 | 388 | await crawlHandlers.crawl({ 389 | url: 'https://example.com', 390 | excluded_tags: [], // Empty array 391 | exclude_domains: [], // Empty array 392 | cookies: [], // Empty array 393 | }); 394 | 395 | const call = mockService.crawl.mock.calls[0][0]; 396 | expect(call.crawler_config.excluded_tags).toEqual([]); 397 | expect(call.crawler_config.exclude_domains).toEqual([]); 398 | expect(call.browser_config.cookies).toEqual([]); 399 | }); 400 | 401 | it('should handle null vs undefined correctly', async () => { 402 | mockService.crawl.mockResolvedValue({ 403 | results: [{ success: true, markdown: { raw_markdown: 'Content' } }], 404 | }); 405 | 406 | // null js_code should throw error 407 | await expect( 408 | crawlHandlers.crawl({ 409 | url: 'https://example.com', 410 | js_code: null as unknown as string[], 411 | }), 412 | ).rejects.toThrow('js_code parameter is null'); 413 | 414 | // undefined js_code should be fine 415 | await crawlHandlers.crawl({ 416 | url: 'https://example.com', 417 | js_code: undefined, 418 | }); 419 | 420 | expect(mockService.crawl).toHaveBeenCalledTimes(1); 421 | }); 422 | 423 | it('should handle boolean flags in all 
combinations', async () => { 424 | const booleanFlags = [ 425 | 'remove_overlay_elements', 426 | 'process_iframes', 427 | 'exclude_external_links', 428 | 'screenshot', 429 | 'pdf', 430 | 'verbose', 431 | 'log_console', 432 | 'simulate_user', 433 | 'override_navigator', 434 | 'magic', 435 | ]; 436 | 437 | // Test all flags as true 438 | const allTrue = booleanFlags.reduce((acc, flag) => ({ ...acc, [flag]: true }), { 439 | url: 'https://example.com', 440 | }); 441 | 442 | mockService.crawl.mockResolvedValue({ 443 | results: [{ success: true, markdown: { raw_markdown: 'Content' } }], 444 | }); 445 | 446 | await crawlHandlers.crawl(allTrue); 447 | 448 | const call = mockService.crawl.mock.calls[0][0]; 449 | booleanFlags.forEach((flag) => { 450 | const config = call.crawler_config[flag] || call.browser_config[flag]; 451 | expect(config).toBe(true); 452 | }); 453 | }); 454 | }); 455 | }); 456 | ``` -------------------------------------------------------------------------------- /src/__tests__/index.test.ts: -------------------------------------------------------------------------------- ```typescript 1 | import { jest } from '@jest/globals'; 2 | import { z } from 'zod'; 3 | 4 | // Mock the MCP SDK 5 | jest.mock('@modelcontextprotocol/sdk/server/index.js'); 6 | jest.mock('@modelcontextprotocol/sdk/server/stdio.js'); 7 | 8 | describe('MCP Server Validation', () => { 9 | describe('Stateless tool validation', () => { 10 | // Test the createStatelessSchema helper 11 | const createStatelessSchema = <T extends z.ZodTypeAny>(schema: T, toolName: string) => { 12 | // Tool-specific guidance for common scenarios 13 | const toolGuidance: Record<string, string> = { 14 | capture_screenshot: 'To capture screenshots with sessions, use crawl(session_id, screenshot: true)', 15 | generate_pdf: 'To generate PDFs with sessions, use crawl(session_id, pdf: true)', 16 | execute_js: 'To run JavaScript with sessions, use crawl(session_id, js_code: [...])', 17 | get_html: 'To get HTML with sessions, use crawl(session_id)', 18 | extract_with_llm: 'To extract data with sessions, first use crawl(session_id) then extract from the response', 19 | }; 20 | 21 | const message = `${toolName} does not support session_id. This tool is stateless - each call creates a new browser. ${ 22 | toolGuidance[toolName] || 'For persistent operations, use crawl with session_id.' 
23 | }`; 24 | 25 | return z 26 | .object({ 27 | session_id: z.never({ message }).optional(), 28 | }) 29 | .passthrough() 30 | .and(schema) 31 | .transform((data) => { 32 | const { session_id, ...rest } = data as Record<string, unknown> & { session_id?: unknown }; 33 | if (session_id !== undefined) { 34 | throw new Error(message); 35 | } 36 | return rest; 37 | }); 38 | }; 39 | 40 | it('should reject session_id for stateless tools', () => { 41 | const ExecuteJsSchema = createStatelessSchema( 42 | z.object({ 43 | url: z.string().url(), 44 | js_code: z.union([z.string(), z.array(z.string())]), 45 | }), 46 | 'execute_js', 47 | ); 48 | 49 | // Should reject with session_id 50 | expect(() => { 51 | ExecuteJsSchema.parse({ 52 | url: 'https://example.com', 53 | js_code: 'return document.title', 54 | session_id: 'test-session', 55 | }); 56 | }).toThrow('execute_js does not support session_id'); 57 | }); 58 | 59 | it('should accept valid parameters without session_id', () => { 60 | const ExecuteJsSchema = createStatelessSchema( 61 | z.object({ 62 | url: z.string().url(), 63 | js_code: z.union([z.string(), z.array(z.string())]), 64 | }), 65 | 'execute_js', 66 | ); 67 | 68 | const result = ExecuteJsSchema.parse({ 69 | url: 'https://example.com', 70 | js_code: 'return document.title', 71 | }); 72 | 73 | expect(result).toEqual({ 74 | url: 'https://example.com', 75 | js_code: 'return document.title', 76 | }); 77 | }); 78 | 79 | it('should provide helpful error message when session_id is used', () => { 80 | const GetMarkdownSchema = createStatelessSchema( 81 | z.object({ 82 | url: z.string().url(), 83 | }), 84 | 'get_markdown', 85 | ); 86 | 87 | try { 88 | GetMarkdownSchema.parse({ 89 | url: 'https://example.com', 90 | session_id: 'my-session', 91 | }); 92 | } catch (error) { 93 | expect(error).toBeInstanceOf(z.ZodError); 94 | const zodError = error as z.ZodError; 95 | expect(zodError.errors[0].message).toContain('get_markdown does not support session_id'); 96 | expect(zodError.errors[0].message).toContain('For persistent operations, use crawl'); 97 | } 98 | }); 99 | 100 | it('should provide tool-specific guidance for common tools', () => { 101 | // Test capture_screenshot guidance 102 | const CaptureScreenshotSchema = createStatelessSchema(z.object({ url: z.string().url() }), 'capture_screenshot'); 103 | 104 | try { 105 | CaptureScreenshotSchema.parse({ url: 'https://example.com', session_id: 'test' }); 106 | } catch (error) { 107 | const zodError = error as z.ZodError; 108 | expect(zodError.errors[0].message).toContain('use crawl(session_id, screenshot: true)'); 109 | } 110 | 111 | // Test generate_pdf guidance 112 | const GeneratePdfSchema = createStatelessSchema(z.object({ url: z.string().url() }), 'generate_pdf'); 113 | 114 | try { 115 | GeneratePdfSchema.parse({ url: 'https://example.com', session_id: 'test' }); 116 | } catch (error) { 117 | const zodError = error as z.ZodError; 118 | expect(zodError.errors[0].message).toContain('use crawl(session_id, pdf: true)'); 119 | } 120 | 121 | // Test execute_js guidance 122 | const ExecuteJsSchema = createStatelessSchema(z.object({ url: z.string().url() }), 'execute_js'); 123 | 124 | try { 125 | ExecuteJsSchema.parse({ url: 'https://example.com', session_id: 'test' }); 126 | } catch (error) { 127 | const zodError = error as z.ZodError; 128 | expect(zodError.errors[0].message).toContain('use crawl(session_id, js_code: [...])'); 129 | } 130 | }); 131 | 132 | it('should validate all stateless tools', () => { 133 | const statelessTools = [ 134 | 'get_markdown', 
135 | 'capture_screenshot', 136 | 'generate_pdf', 137 | 'execute_js', 138 | 'batch_crawl', 139 | 'smart_crawl', 140 | 'get_html', 141 | 'extract_links', 142 | 'crawl_recursive', 143 | 'parse_sitemap', 144 | 'extract_with_llm', 145 | ]; 146 | 147 | statelessTools.forEach((toolName) => { 148 | const schema = createStatelessSchema( 149 | z.object({ 150 | url: z.string().url(), 151 | }), 152 | toolName, 153 | ); 154 | 155 | // Should reject session_id 156 | expect(() => { 157 | schema.parse({ 158 | url: 'https://example.com', 159 | session_id: 'test', 160 | }); 161 | }).toThrow(`${toolName} does not support session_id`); 162 | 163 | // Should accept without session_id 164 | const result = schema.parse({ 165 | url: 'https://example.com', 166 | }); 167 | expect(result).toEqual({ 168 | url: 'https://example.com', 169 | }); 170 | }); 171 | }); 172 | }); 173 | 174 | describe('Extract links tool', () => { 175 | it('should validate extract_links parameters', () => { 176 | const ExtractLinksSchema = z.object({ 177 | url: z.string().url(), 178 | categorize: z.boolean().optional().default(true), 179 | }); 180 | 181 | // Valid input with categorize true 182 | const result1 = ExtractLinksSchema.parse({ 183 | url: 'https://example.com', 184 | categorize: true, 185 | }); 186 | expect(result1.categorize).toBe(true); 187 | 188 | // Valid input with categorize false 189 | const result2 = ExtractLinksSchema.parse({ 190 | url: 'https://example.com', 191 | categorize: false, 192 | }); 193 | expect(result2.categorize).toBe(false); 194 | 195 | // Default categorize should be true 196 | const result3 = ExtractLinksSchema.parse({ 197 | url: 'https://example.com', 198 | }); 199 | expect(result3.categorize).toBe(true); 200 | }); 201 | }); 202 | 203 | describe('Session management tools', () => { 204 | it('should validate create_session parameters', () => { 205 | const CreateSessionSchema = z.object({ 206 | session_id: z.string(), 207 | initial_url: z.string().optional(), 208 | browser_type: z.string().optional(), 209 | }); 210 | 211 | // Valid input 212 | const result = CreateSessionSchema.parse({ 213 | session_id: 'my-session', 214 | initial_url: 'https://example.com', 215 | }); 216 | expect(result.session_id).toBe('my-session'); 217 | 218 | // Missing required session_id 219 | expect(() => { 220 | CreateSessionSchema.parse({ 221 | initial_url: 'https://example.com', 222 | }); 223 | }).toThrow(); 224 | }); 225 | 226 | it('should validate clear_session parameters', () => { 227 | const ClearSessionSchema = z.object({ 228 | session_id: z.string(), 229 | }); 230 | 231 | // Valid input 232 | const result = ClearSessionSchema.parse({ 233 | session_id: 'my-session', 234 | }); 235 | expect(result.session_id).toBe('my-session'); 236 | 237 | // Missing required session_id 238 | expect(() => { 239 | ClearSessionSchema.parse({}); 240 | }).toThrow(); 241 | }); 242 | }); 243 | 244 | describe('crawl validation', () => { 245 | it('should accept session_id for crawl', () => { 246 | const CrawlWithConfigSchema = z.object({ 247 | url: z.string().url(), 248 | session_id: z.string().optional(), 249 | js_code: z.union([z.string(), z.array(z.string())]).optional(), 250 | }); 251 | 252 | const result = CrawlWithConfigSchema.parse({ 253 | url: 'https://example.com', 254 | session_id: 'my-session', 255 | js_code: 'document.querySelector("button").click()', 256 | }); 257 | 258 | expect(result.session_id).toBe('my-session'); 259 | }); 260 | 261 | it('should work without session_id', () => { 262 | const CrawlWithConfigSchema = z.object({ 263 | 
        url: z.string().url(),
264 |         session_id: z.string().optional(),
265 |       });
266 | 
267 |       const result = CrawlWithConfigSchema.parse({
268 |         url: 'https://example.com',
269 |       });
270 | 
271 |       expect(result.session_id).toBeUndefined();
272 |     });
273 | 
274 |     it('should require js_only when using js_code with session_id WITHOUT output options', () => {
275 |       // Create a schema that mirrors the real one's refinement
276 |       const CrawlWithConfigSchema = z
277 |         .object({
278 |           url: z.string().url(),
279 |           session_id: z.string().optional(),
280 |           js_code: z.union([z.string(), z.array(z.string())]).optional(),
281 |           js_only: z.boolean().optional(),
282 |           screenshot: z.boolean().optional(),
283 |           pdf: z.boolean().optional(),
284 |         })
285 |         .refine(
286 |           (data) => {
287 |             // Only require js_only when using js_code + session_id WITHOUT any output options
288 |             if (data.js_code && data.session_id && !data.js_only && !data.screenshot && !data.pdf) {
289 |               return false;
290 |             }
291 |             return true;
292 |           },
293 |           {
294 |             message:
295 |               'When using js_code with session_id WITHOUT screenshot or pdf, you MUST set js_only: true to prevent server errors. If you want screenshots/PDFs, you can omit js_only. Correct usage: crawl({url, session_id, js_code: [...], js_only: true})',
296 |           },
297 |         );
298 | 
299 |       // Should fail without js_only when no output options
300 |       expect(() => {
301 |         CrawlWithConfigSchema.parse({
302 |           url: 'https://example.com',
303 |           session_id: 'test-session',
304 |           js_code: ['document.querySelector("button").click()'],
305 |         });
306 |       }).toThrow('When using js_code with session_id WITHOUT screenshot or pdf');
307 | 
308 |       // Should pass with js_only: true
309 |       const result = CrawlWithConfigSchema.parse({
310 |         url: 'https://example.com',
311 |         session_id: 'test-session',
312 |         js_code: ['document.querySelector("button").click()'],
313 |         js_only: true,
314 |       });
315 |       expect(result.js_only).toBe(true);
316 | 
317 |       // Should pass with screenshot (no js_only required)
318 |       const result2 = CrawlWithConfigSchema.parse({
319 |         url: 'https://example.com',
320 |         session_id: 'test-session',
321 |         js_code: ['document.querySelector("button").click()'],
322 |         screenshot: true,
323 |       });
324 |       expect(result2.screenshot).toBe(true);
325 |       expect(result2.js_only).toBeUndefined();
326 | 
327 |       // Should pass with pdf (no js_only required)
328 |       const result3 = CrawlWithConfigSchema.parse({
329 |         url: 'https://example.com',
330 |         session_id: 'test-session',
331 |         js_code: ['document.querySelector("button").click()'],
332 |         pdf: true,
333 |       });
334 |       expect(result3.pdf).toBe(true);
335 |       expect(result3.js_only).toBeUndefined();
336 | 
337 |       // Should pass without js_code
338 |       const result4 = CrawlWithConfigSchema.parse({
339 |         url: 'https://example.com',
340 |         session_id: 'test-session',
341 |       });
342 |       expect(result4.session_id).toBe('test-session');
343 | 
344 |       // Should pass without session_id
345 |       const result5 = CrawlWithConfigSchema.parse({
346 |         url: 'https://example.com',
347 |         js_code: ['document.querySelector("button").click()'],
348 |       });
349 |       expect(result5.js_code).toBeDefined();
350 |     });
351 |   });
352 | 
353 |   describe('JavaScript code validation', () => {
354 |     const validateJavaScriptCode = (code: string): boolean => {
355 |       if (/&quot;|&amp;|&lt;|&gt;|&#\d+;|&\w+;/.test(code)) {
356 |         return false;
357 |       }
358 |       if (/<(!DOCTYPE|html|body|head|script|style)\b/i.test(code)) {
359 |         return false;
360 |       }
361 |       if (/[;})]\s*\\n|\\n\s*[{(/]/.test(code)) {
362 |         return false;
363 |       }
364 |       if (/[;})]\s*\\n\s*\w/.test(code)) {
365 |         return false;
366 |       }
367 |       return true;
368 |     };
369 | 
370 |     const JsCodeSchema = z.union([
371 |       z.string().refine(validateJavaScriptCode, {
372 |         message:
373 |           'Invalid JavaScript: Contains HTML entities (&quot;), literal \\n outside strings, or HTML tags. Use proper JS syntax with real quotes and newlines.',
374 |       }),
375 |       z.array(
376 |         z.string().refine(validateJavaScriptCode, {
377 |           message:
378 |             'Invalid JavaScript: Contains HTML entities (&quot;), literal \\n outside strings, or HTML tags. Use proper JS syntax with real quotes and newlines.',
379 |         }),
380 |       ),
381 |     ]);
382 | 
383 |     it('should reject JavaScript with HTML entities', () => {
384 |       expect(() => {
385 |         JsCodeSchema.parse('document.querySelector(&quot;button&quot;).click()');
386 |       }).toThrow('Invalid JavaScript: Contains HTML entities');
387 |     });
388 | 
389 |     it('should reject JavaScript with literal \\n between statements', () => {
390 |       expect(() => {
391 |         JsCodeSchema.parse('console.log("line1");\\nconsole.log("line2")');
392 |       }).toThrow('Invalid JavaScript: Contains HTML entities');
393 |     });
394 | 
395 |     it('should accept valid JavaScript with \\n inside strings', () => {
396 |       const result = JsCodeSchema.parse('console.log("line1\\nline2")');
397 |       expect(result).toBe('console.log("line1\\nline2")');
398 |     });
399 | 
400 |     it('should accept valid multiline JavaScript', () => {
401 |       const code = `// Comment
402 | document.querySelector('button').click();
403 | return true;`;
404 |       const result = JsCodeSchema.parse(code);
405 |       expect(result).toBe(code);
406 |     });
407 | 
408 |     it('should validate arrays of JavaScript code', () => {
409 |       // Invalid array (HTML-entity-escaped quotes)
410 |       expect(() => {
411 |         JsCodeSchema.parse(['document.querySelector(&quot;input&quot;).value = &quot;test&quot;', 'form.submit()']);
412 |       }).toThrow('Invalid JavaScript: Contains HTML entities');
413 | 
414 |       // Valid array
415 |       const validArray = ['document.querySelector("input").value = "test"', 'form.submit()'];
416 |       const result = JsCodeSchema.parse(validArray);
417 |       expect(result).toEqual(validArray);
418 |     });
419 |   });
420 | });
421 | 
```

--------------------------------------------------------------------------------
/src/__tests__/handlers/crawl-handlers.test.ts:
--------------------------------------------------------------------------------

```typescript
1 | /* eslint-env jest */
2 | import { jest } from '@jest/globals';
3 | import { AxiosError } from 'axios';
4 | import type { CrawlHandlers as CrawlHandlersType } from '../../handlers/crawl-handlers.js';
5 | import type { Crawl4AIService } from '../../crawl4ai-service.js';
6 | 
7 | // Mock the service
8 | const mockCrawl = jest.fn();
9 | const mockService = {
10 |   crawl: mockCrawl,
11 | } as unknown as Crawl4AIService;
12 | 
13 | // Mock axios client
14 | const mockPost = jest.fn() as jest.Mock;
15 | const mockHead = jest.fn() as jest.Mock;
16 | const mockAxiosClient = {
17 |   post: mockPost,
18 |   head: mockHead,
19 | } as any; // eslint-disable-line @typescript-eslint/no-explicit-any
20 | 
21 | // Mock axios for parseSitemap
22 | const mockAxiosGet = jest.fn();
23 | jest.unstable_mockModule('axios', () => ({
24 |   default: {
25 |     get: mockAxiosGet,
26 |   },
27 |   AxiosError,
28 | }));
29 | 
30 | // Import after setting up mocks
31 | const { CrawlHandlers: CrawlHandlersClass } = await import('../../handlers/crawl-handlers.js');
32 | 
33 | describe('CrawlHandlers', () => {
34 |   let handler: CrawlHandlersType;
35 |   let sessions: Map<string, any>; // eslint-disable-line @typescript-eslint/no-explicit-any
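  // Note: jest.unstable_mockModule only affects modules loaded after it is
  // registered, which is why CrawlHandlers is pulled in via the top-level
  // `await import(...)` above rather than a static import — static imports
  // would be hoisted past the mock setup and bind the real axios module.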
36 | 37 | beforeEach(() => { 38 | jest.clearAllMocks(); 39 | sessions = new Map(); 40 | handler = new CrawlHandlersClass(mockService, mockAxiosClient, sessions); 41 | }); 42 | 43 | describe('batchCrawl', () => { 44 | it('should handle API errors gracefully', async () => { 45 | // Mock API error response 46 | (mockPost as jest.Mock).mockRejectedValue( 47 | new AxiosError('Request failed with status code 500', 'ERR_BAD_RESPONSE', undefined, undefined, { 48 | status: 500, 49 | statusText: 'Internal Server Error', 50 | data: 'Internal Server Error', 51 | headers: {}, 52 | config: {} as any, // eslint-disable-line @typescript-eslint/no-explicit-any 53 | } as any), // eslint-disable-line @typescript-eslint/no-explicit-any 54 | ); 55 | 56 | await expect( 57 | handler.batchCrawl({ 58 | urls: ['not-a-valid-url', 'https://invalid-domain.com'], 59 | max_concurrent: 2, 60 | }), 61 | ).rejects.toThrow('Failed to batch crawl: Internal Server Error'); 62 | }); 63 | 64 | it('should support per-URL configs array', async () => { 65 | (mockPost as jest.Mock).mockResolvedValue({ 66 | data: { 67 | results: [ 68 | { url: 'https://example1.com', success: true, markdown: { raw_markdown: 'Test 1' } }, 69 | { url: 'https://example2.com', success: true, markdown: { raw_markdown: 'Test 2' } }, 70 | ], 71 | }, 72 | }); 73 | 74 | const result = await handler.batchCrawl({ 75 | urls: ['https://example1.com', 'https://example2.com'], 76 | configs: [ 77 | { 78 | url: 'https://example1.com', 79 | browser_config: { browser_type: 'chromium' }, 80 | crawler_config: { screenshot: true }, 81 | }, 82 | { 83 | url: 'https://example2.com', 84 | browser_config: { browser_type: 'undetected' }, 85 | crawler_config: { pdf: true }, 86 | extraction_strategy: { provider: 'openai' }, 87 | }, 88 | ], 89 | max_concurrent: 2, 90 | }); 91 | 92 | // Verify the configs array was passed through 93 | expect(mockPost).toHaveBeenCalledWith( 94 | '/crawl', 95 | expect.objectContaining({ 96 | configs: expect.arrayContaining([ 97 | expect.objectContaining({ 98 | url: 'https://example1.com', 99 | browser_config: { browser_type: 'chromium' }, 100 | crawler_config: { screenshot: true }, 101 | }), 102 | expect.objectContaining({ 103 | url: 'https://example2.com', 104 | browser_config: { browser_type: 'undetected' }, 105 | crawler_config: { pdf: true }, 106 | extraction_strategy: { provider: 'openai' }, 107 | }), 108 | ]), 109 | max_concurrent: 2, 110 | }), 111 | ); 112 | 113 | expect(result.content[0].text).toContain('Batch crawl completed'); 114 | }); 115 | }); 116 | 117 | describe('smartCrawl', () => { 118 | it('should detect XML content type from HEAD request', async () => { 119 | // Mock HEAD response with XML content type 120 | (mockHead as jest.Mock).mockResolvedValue({ 121 | headers: { 122 | 'content-type': 'application/xml', 123 | }, 124 | }); 125 | 126 | // Mock crawl response 127 | (mockPost as jest.Mock).mockResolvedValue({ 128 | data: { 129 | results: [ 130 | { 131 | success: true, 132 | markdown: { 133 | raw_markdown: '<xml>Test content</xml>', 134 | }, 135 | }, 136 | ], 137 | }, 138 | }); 139 | 140 | const result = await handler.smartCrawl({ 141 | url: 'https://example.com/data.xml', 142 | }); 143 | 144 | expect(result.content[0].text).toContain('Smart crawl detected content type: sitemap'); 145 | expect(result.content[0].text).toContain('<xml>Test content</xml>'); 146 | }); 147 | 148 | it('should handle HEAD request failure gracefully', async () => { 149 | // Mock HEAD request failure 150 | (mockHead as jest.Mock).mockRejectedValue(new 
Error('HEAD request failed')); 151 | 152 | // Mock successful crawl 153 | (mockPost as jest.Mock).mockResolvedValue({ 154 | data: { 155 | results: [ 156 | { 157 | success: true, 158 | markdown: { 159 | raw_markdown: 'Test content', 160 | }, 161 | }, 162 | ], 163 | }, 164 | }); 165 | 166 | const result = await handler.smartCrawl({ 167 | url: 'https://example.com', 168 | }); 169 | 170 | expect(result.content[0].text).toContain('Smart crawl detected content type: html'); 171 | }); 172 | 173 | it('should follow links from sitemap when follow_links is true', async () => { 174 | // Mock successful HEAD request 175 | (mockHead as jest.Mock).mockResolvedValue({ 176 | headers: { 177 | 'content-type': 'application/xml', 178 | }, 179 | }); 180 | 181 | // Mock initial crawl with sitemap content 182 | (mockPost as jest.Mock).mockResolvedValueOnce({ 183 | data: { 184 | results: [ 185 | { 186 | success: true, 187 | markdown: `<?xml version="1.0" encoding="UTF-8"?> 188 | <urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9"> 189 | <url> 190 | <loc>https://example.com/page1</loc> 191 | </url> 192 | <url> 193 | <loc>https://example.com/page2</loc> 194 | </url> 195 | </urlset>`, 196 | }, 197 | ], 198 | }, 199 | }); 200 | 201 | // Mock follow-up crawl 202 | (mockPost as jest.Mock).mockResolvedValueOnce({ 203 | data: { 204 | results: [{ success: true }, { success: true }], 205 | }, 206 | }); 207 | 208 | const result = await handler.smartCrawl({ 209 | url: 'https://example.com/sitemap.xml', 210 | follow_links: true, 211 | max_depth: 2, 212 | }); 213 | 214 | expect(result.content[0].text).toContain('Smart crawl detected content type: sitemap'); 215 | expect(result.content[0].text).toContain('Followed 2 links:'); 216 | expect(result.content[0].text).toContain('https://example.com/page1'); 217 | expect(result.content[0].text).toContain('https://example.com/page2'); 218 | }); 219 | 220 | it('should handle smartCrawl API errors', async () => { 221 | (mockHead as jest.Mock).mockResolvedValue({ headers: {} }); 222 | // Mock crawl to get empty results first, then error on follow-up 223 | (mockPost as jest.Mock).mockResolvedValueOnce({ 224 | data: { 225 | results: [], 226 | }, 227 | }); 228 | 229 | const result = await handler.smartCrawl({ 230 | url: 'https://example.com', 231 | }); 232 | 233 | // With empty results, it should still return a response 234 | expect(result.content[0].text).toContain('Smart crawl detected content type: html'); 235 | expect(result.content[0].text).toContain('No content extracted'); 236 | }); 237 | }); 238 | 239 | describe('crawlRecursive', () => { 240 | it('should handle max_depth limit correctly', async () => { 241 | // Mock successful crawl with links 242 | (mockPost as jest.Mock).mockResolvedValueOnce({ 243 | data: { 244 | results: [ 245 | { 246 | success: true, 247 | markdown: { 248 | raw_markdown: 'Test content', 249 | }, 250 | links: { 251 | internal: [{ href: 'https://example.com/page1' }, { href: 'https://example.com/page2' }], 252 | external: [], 253 | }, 254 | }, 255 | ], 256 | }, 257 | }); 258 | 259 | // Mock second crawl for page1 260 | (mockPost as jest.Mock).mockResolvedValueOnce({ 261 | data: { 262 | results: [ 263 | { 264 | success: true, 265 | markdown: { 266 | raw_markdown: 'Page 1 content', 267 | }, 268 | links: { 269 | internal: [], 270 | external: [], 271 | }, 272 | }, 273 | ], 274 | }, 275 | }); 276 | 277 | // Mock third crawl for page2 278 | (mockPost as jest.Mock).mockResolvedValueOnce({ 279 | data: { 280 | results: [ 281 | { 282 | success: true, 283 | markdown: 
{ 284 | raw_markdown: 'Page 2 content', 285 | }, 286 | links: { 287 | internal: [], 288 | external: [], 289 | }, 290 | }, 291 | ], 292 | }, 293 | }); 294 | 295 | const result = await handler.crawlRecursive({ 296 | url: 'https://example.com', 297 | max_depth: 1, // Should crawl initial URL and one level deep 298 | }); 299 | 300 | expect(result.content[0].text).toContain('Pages crawled: 3'); // Initial + 2 pages at depth 1 301 | expect(result.content[0].text).toContain('Max depth reached: 1'); 302 | expect(mockPost).toHaveBeenCalledTimes(3); // Initial crawl + two more 303 | }); 304 | 305 | it('should handle invalid URLs in discovered links', async () => { 306 | // Mock crawl with invalid link 307 | (mockPost as jest.Mock).mockResolvedValue({ 308 | data: { 309 | results: [ 310 | { 311 | success: true, 312 | markdown: { 313 | raw_markdown: 'Test content', 314 | }, 315 | links: { 316 | internal: [ 317 | { href: 'javascript:void(0)' }, // Invalid URL 318 | { href: 'https://example.com/valid' }, // Valid URL 319 | ], 320 | external: [], 321 | }, 322 | }, 323 | ], 324 | }, 325 | }); 326 | 327 | const result = await handler.crawlRecursive({ 328 | url: 'https://example.com', 329 | max_depth: 1, 330 | }); 331 | 332 | // Should continue despite invalid URL 333 | expect(result.content[0].text).toContain('Pages crawled:'); 334 | }); 335 | 336 | it('should handle crawl failures during recursion', async () => { 337 | // First crawl succeeds 338 | (mockPost as jest.Mock).mockResolvedValueOnce({ 339 | data: { 340 | results: [ 341 | { 342 | success: true, 343 | markdown: { 344 | raw_markdown: 'Test content', 345 | }, 346 | links: { 347 | internal: [{ href: 'https://example.com/page1' }], 348 | external: [], 349 | }, 350 | }, 351 | ], 352 | }, 353 | }); 354 | 355 | // Second crawl fails 356 | (mockPost as jest.Mock).mockRejectedValueOnce(new Error('Crawl failed')); 357 | 358 | const result = await handler.crawlRecursive({ 359 | url: 'https://example.com', 360 | max_depth: 1, 361 | }); 362 | 363 | // Should continue despite failure 364 | expect(result.content[0].text).toContain('Pages crawled: 1'); 365 | }); 366 | 367 | it('should handle crawlRecursive API errors', async () => { 368 | (mockPost as jest.Mock).mockRejectedValue(new Error('API Error')); 369 | 370 | const result = await handler.crawlRecursive({ 371 | url: 'https://example.com', 372 | }); 373 | 374 | // When the initial crawl fails, it should return a result with no pages crawled 375 | expect(result.content[0].text).toContain('Pages crawled: 0'); 376 | expect(result.content[0].text).toContain('No pages could be crawled'); 377 | }); 378 | }); 379 | 380 | describe('parseSitemap', () => { 381 | it('should handle network errors gracefully', async () => { 382 | // Mock ENOTFOUND error 383 | const error = new Error('getaddrinfo ENOTFOUND not-a-real-domain-12345.com'); 384 | (error as { code?: string }).code = 'ENOTFOUND'; 385 | mockAxiosGet.mockRejectedValue(error); 386 | 387 | await expect( 388 | handler.parseSitemap({ 389 | url: 'https://not-a-real-domain-12345.com/sitemap.xml', 390 | }), 391 | ).rejects.toThrow('Failed to parse sitemap: getaddrinfo ENOTFOUND not-a-real-domain-12345.com'); 392 | }); 393 | }); 394 | 395 | describe('crawl', () => { 396 | it('should handle word_count_threshold parameter', async () => { 397 | (mockCrawl as jest.Mock).mockResolvedValue({ 398 | results: [ 399 | { 400 | success: true, 401 | markdown: { 402 | raw_markdown: 'Test content', 403 | }, 404 | }, 405 | ], 406 | }); 407 | 408 | const result = await handler.crawl({ 
409 | url: 'https://example.com', 410 | word_count_threshold: 100, 411 | }); 412 | 413 | expect(mockCrawl).toHaveBeenCalledWith( 414 | expect.objectContaining({ 415 | crawler_config: expect.objectContaining({ 416 | word_count_threshold: 100, 417 | }), 418 | }), 419 | ); 420 | expect(result.content[0].text).toBe('Test content'); 421 | }); 422 | 423 | it('should update session last_used time when using session_id', async () => { 424 | const sessionId = 'test-session'; 425 | const session = { 426 | id: sessionId, 427 | created_at: new Date(), 428 | last_used: new Date('2025-08-01'), 429 | }; 430 | sessions.set(sessionId, session); 431 | 432 | (mockCrawl as jest.Mock).mockResolvedValue({ 433 | results: [ 434 | { 435 | success: true, 436 | markdown: { 437 | raw_markdown: 'Test content', 438 | }, 439 | }, 440 | ], 441 | }); 442 | 443 | await handler.crawl({ 444 | url: 'https://example.com', 445 | session_id: sessionId, 446 | }); 447 | 448 | const updatedSession = sessions.get(sessionId) as { last_used: Date }; 449 | expect(updatedSession.last_used.getTime()).toBeGreaterThan(new Date('2025-08-01').getTime()); 450 | }); 451 | 452 | it('should handle image description parameters', async () => { 453 | (mockCrawl as jest.Mock).mockResolvedValue({ 454 | results: [ 455 | { 456 | success: true, 457 | markdown: { 458 | raw_markdown: 'Test content', 459 | }, 460 | }, 461 | ], 462 | }); 463 | 464 | await handler.crawl({ 465 | url: 'https://example.com', 466 | image_description_min_word_threshold: 10, 467 | image_score_threshold: 0.5, 468 | }); 469 | 470 | expect(mockCrawl).toHaveBeenCalledWith( 471 | expect.objectContaining({ 472 | crawler_config: expect.objectContaining({ 473 | image_description_min_word_threshold: 10, 474 | image_score_threshold: 0.5, 475 | }), 476 | }), 477 | ); 478 | }); 479 | 480 | it('should handle exclude_social_media_links parameter', async () => { 481 | (mockCrawl as jest.Mock).mockResolvedValue({ 482 | results: [ 483 | { 484 | success: true, 485 | markdown: { 486 | raw_markdown: 'Test content', 487 | }, 488 | }, 489 | ], 490 | }); 491 | 492 | await handler.crawl({ 493 | url: 'https://example.com', 494 | exclude_social_media_links: true, 495 | }); 496 | 497 | expect(mockCrawl).toHaveBeenCalledWith( 498 | expect.objectContaining({ 499 | crawler_config: expect.objectContaining({ 500 | exclude_social_media_links: true, 501 | }), 502 | }), 503 | ); 504 | }); 505 | 506 | it('should use extracted_content when available as string', async () => { 507 | (mockCrawl as jest.Mock).mockResolvedValue({ 508 | results: [ 509 | { 510 | success: true, 511 | extracted_content: 'Extracted text content', 512 | }, 513 | ], 514 | }); 515 | 516 | const result = await handler.crawl({ 517 | url: 'https://example.com', 518 | }); 519 | 520 | expect(result.content[0].text).toBe('Extracted text content'); 521 | }); 522 | 523 | it('should handle extracted_content as object', async () => { 524 | const extractedObj = { title: 'Test', body: 'Content' }; 525 | (mockCrawl as jest.Mock).mockResolvedValue({ 526 | results: [ 527 | { 528 | success: true, 529 | extracted_content: extractedObj, 530 | }, 531 | ], 532 | }); 533 | 534 | const result = await handler.crawl({ 535 | url: 'https://example.com', 536 | }); 537 | 538 | expect(result.content[0].text).toBe(JSON.stringify(extractedObj, null, 2)); 539 | }); 540 | 541 | it('should fallback to html when markdown is not available', async () => { 542 | (mockCrawl as jest.Mock).mockResolvedValue({ 543 | results: [ 544 | { 545 | success: true, 546 | html: '<html><body>HTML 
content</body></html>', 547 | }, 548 | ], 549 | }); 550 | 551 | const result = await handler.crawl({ 552 | url: 'https://example.com', 553 | }); 554 | 555 | expect(result.content[0].text).toBe('<html><body>HTML content</body></html>'); 556 | }); 557 | 558 | it('should fallback to fit_html when neither markdown nor html is available', async () => { 559 | (mockCrawl as jest.Mock).mockResolvedValue({ 560 | results: [ 561 | { 562 | success: true, 563 | fit_html: '<div>Fit HTML content</div>', 564 | }, 565 | ], 566 | }); 567 | 568 | const result = await handler.crawl({ 569 | url: 'https://example.com', 570 | }); 571 | 572 | expect(result.content[0].text).toBe('<div>Fit HTML content</div>'); 573 | }); 574 | 575 | it('should handle js_code as null error', async () => { 576 | await expect( 577 | handler.crawl({ 578 | url: 'https://example.com', 579 | js_code: null, 580 | }), 581 | ).rejects.toThrow( 582 | 'Failed to crawl: js_code parameter is null. Please provide JavaScript code as a string or array of strings.', 583 | ); 584 | }); 585 | }); 586 | }); 587 | ``` -------------------------------------------------------------------------------- /src/__tests__/crawl.test.ts: -------------------------------------------------------------------------------- ```typescript 1 | /* eslint-env jest */ 2 | import { jest } from '@jest/globals'; 3 | import type { AxiosResponse } from 'axios'; 4 | import type { MockAxiosInstance } from './types/mocks.js'; 5 | import type { Crawl4AIService as Crawl4AIServiceType } from '../crawl4ai-service.js'; 6 | 7 | // Manual mock for axios 8 | const mockAxios = { 9 | create: jest.fn(), 10 | }; 11 | 12 | jest.unstable_mockModule('axios', () => ({ 13 | default: mockAxios, 14 | })); 15 | 16 | // Import modules after mocking 17 | const { Crawl4AIService } = await import('../crawl4ai-service.js'); 18 | 19 | // Helper function to create a complete AxiosResponse object 20 | function createMockAxiosResponse<T>(data: T): AxiosResponse<T> { 21 | return { 22 | data, 23 | status: 200, 24 | statusText: 'OK', 25 | headers: {}, 26 | config: { 27 | url: '', 28 | method: 'post', 29 | headers: {}, 30 | }, 31 | } as AxiosResponse<T>; 32 | } 33 | 34 | describe('crawl parameter mapping', () => { 35 | let service: Crawl4AIServiceType; 36 | let mockAxiosInstance: MockAxiosInstance; 37 | 38 | beforeEach(() => { 39 | mockAxiosInstance = { 40 | post: jest.fn(), 41 | get: jest.fn(), 42 | head: jest.fn(), 43 | }; 44 | mockAxios.create.mockReturnValue(mockAxiosInstance); 45 | service = new Crawl4AIService('http://test.com', 'test-key'); 46 | }); 47 | 48 | afterEach(() => { 49 | jest.clearAllMocks(); 50 | }); 51 | 52 | describe('Browser configuration mapping', () => { 53 | it('should map all browser config parameters correctly', async () => { 54 | const mockResponse = createMockAxiosResponse({ results: [{ markdown: 'test' }] }); 55 | mockAxiosInstance.post.mockResolvedValueOnce(mockResponse); 56 | 57 | await service.crawl({ 58 | url: 'https://example.com', 59 | browser_config: { 60 | browser_type: 'firefox', 61 | headless: true, 62 | viewport_width: 1920, 63 | viewport_height: 1080, 64 | user_agent: 'Custom User Agent', 65 | proxy_config: { 66 | server: 'http://proxy.com:8080', 67 | username: 'proxyuser', 68 | password: 'proxypass', 69 | }, 70 | cookies: [{ name: 'session', value: 'abc123', domain: '.example.com', path: '/' }], 71 | headers: { 'X-Custom-Header': 'value' }, 72 | extra_args: ['--disable-gpu'], 73 | }, 74 | }); 75 | 76 | expect(mockAxiosInstance.post).toHaveBeenCalledWith('/crawl', { 77 | 
urls: ['https://example.com'], 78 | browser_config: { 79 | browser_type: 'firefox', 80 | headless: true, 81 | viewport_width: 1920, 82 | viewport_height: 1080, 83 | user_agent: 'Custom User Agent', 84 | proxy_config: { 85 | server: 'http://proxy.com:8080', 86 | username: 'proxyuser', 87 | password: 'proxypass', 88 | }, 89 | cookies: [{ name: 'session', value: 'abc123', domain: '.example.com', path: '/' }], 90 | headers: { 'X-Custom-Header': 'value' }, 91 | extra_args: ['--disable-gpu'], 92 | }, 93 | crawler_config: {}, 94 | }); 95 | }); 96 | 97 | it('should support undetected browser type', async () => { 98 | const mockResponse = createMockAxiosResponse({ results: [{ markdown: 'test' }] }); 99 | mockAxiosInstance.post.mockResolvedValueOnce(mockResponse); 100 | 101 | await service.crawl({ 102 | url: 'https://example.com', 103 | browser_config: { 104 | browser_type: 'undetected', 105 | headless: true, 106 | }, 107 | }); 108 | 109 | expect(mockAxiosInstance.post).toHaveBeenCalledWith('/crawl', { 110 | urls: ['https://example.com'], 111 | browser_config: { 112 | browser_type: 'undetected', 113 | headless: true, 114 | }, 115 | crawler_config: {}, 116 | }); 117 | }); 118 | 119 | it('should support unified proxy format (string)', async () => { 120 | const mockResponse = createMockAxiosResponse({ results: [{ markdown: 'test' }] }); 121 | mockAxiosInstance.post.mockResolvedValueOnce(mockResponse); 122 | 123 | await service.crawl({ 124 | url: 'https://example.com', 125 | browser_config: { 126 | proxy: 'http://user:[email protected]:8080', 127 | }, 128 | }); 129 | 130 | expect(mockAxiosInstance.post).toHaveBeenCalledWith('/crawl', { 131 | urls: ['https://example.com'], 132 | browser_config: { 133 | proxy: 'http://user:[email protected]:8080', 134 | }, 135 | crawler_config: {}, 136 | }); 137 | }); 138 | 139 | it('should support unified proxy format (object)', async () => { 140 | const mockResponse = createMockAxiosResponse({ results: [{ markdown: 'test' }] }); 141 | mockAxiosInstance.post.mockResolvedValueOnce(mockResponse); 142 | 143 | await service.crawl({ 144 | url: 'https://example.com', 145 | browser_config: { 146 | proxy: { 147 | server: 'http://proxy.example.com:8080', 148 | username: 'user', 149 | password: 'pass', 150 | }, 151 | }, 152 | }); 153 | 154 | expect(mockAxiosInstance.post).toHaveBeenCalledWith('/crawl', { 155 | urls: ['https://example.com'], 156 | browser_config: { 157 | proxy: { 158 | server: 'http://proxy.example.com:8080', 159 | username: 'user', 160 | password: 'pass', 161 | }, 162 | }, 163 | crawler_config: {}, 164 | }); 165 | }); 166 | }); 167 | 168 | describe('Crawler configuration mapping', () => { 169 | it('should map content filtering parameters', async () => { 170 | const mockResponse = createMockAxiosResponse({ results: [{ markdown: 'test' }] }); 171 | mockAxiosInstance.post.mockResolvedValueOnce(mockResponse); 172 | 173 | await service.crawl({ 174 | url: 'https://example.com', 175 | crawler_config: { 176 | word_count_threshold: 150, 177 | excluded_tags: ['nav', 'footer', 'aside'], 178 | excluded_selector: '#ads, .popup', 179 | remove_overlay_elements: true, 180 | only_text: true, 181 | remove_forms: true, 182 | keep_data_attributes: true, 183 | }, 184 | }); 185 | 186 | expect(mockAxiosInstance.post).toHaveBeenCalledWith('/crawl', { 187 | urls: ['https://example.com'], 188 | browser_config: undefined, 189 | crawler_config: { 190 | word_count_threshold: 150, 191 | excluded_tags: ['nav', 'footer', 'aside'], 192 | excluded_selector: '#ads, .popup', 193 | 
remove_overlay_elements: true, 194 | only_text: true, 195 | remove_forms: true, 196 | keep_data_attributes: true, 197 | }, 198 | }); 199 | }); 200 | 201 | it('should map JavaScript execution parameters', async () => { 202 | const mockResponse = createMockAxiosResponse({ results: [{ markdown: 'test' }] }); 203 | mockAxiosInstance.post.mockResolvedValueOnce(mockResponse); 204 | 205 | await service.crawl({ 206 | url: 'https://example.com', 207 | crawler_config: { 208 | js_code: ['document.querySelector(".load-more").click()', 'window.scrollTo(0, 1000)'], 209 | js_only: true, 210 | wait_for: '.content-loaded', 211 | wait_for_timeout: 10000, 212 | }, 213 | }); 214 | 215 | expect(mockAxiosInstance.post).toHaveBeenCalledWith('/crawl', { 216 | urls: ['https://example.com'], 217 | browser_config: undefined, 218 | crawler_config: { 219 | js_code: ['document.querySelector(".load-more").click()', 'window.scrollTo(0, 1000)'], 220 | js_only: true, 221 | wait_for: '.content-loaded', 222 | wait_for_timeout: 10000, 223 | }, 224 | }); 225 | }); 226 | 227 | it('should map page navigation and timing parameters', async () => { 228 | const mockResponse = createMockAxiosResponse({ results: [{ markdown: 'test' }] }); 229 | mockAxiosInstance.post.mockResolvedValueOnce(mockResponse); 230 | 231 | await service.crawl({ 232 | url: 'https://example.com', 233 | crawler_config: { 234 | wait_until: 'networkidle', 235 | page_timeout: 45000, 236 | wait_for_images: true, 237 | ignore_body_visibility: false, 238 | scan_full_page: true, 239 | delay_before_scroll: 2000, 240 | scroll_delay: 1000, 241 | }, 242 | }); 243 | 244 | expect(mockAxiosInstance.post).toHaveBeenCalledWith('/crawl', { 245 | urls: ['https://example.com'], 246 | browser_config: undefined, 247 | crawler_config: { 248 | wait_until: 'networkidle', 249 | page_timeout: 45000, 250 | wait_for_images: true, 251 | ignore_body_visibility: false, 252 | scan_full_page: true, 253 | delay_before_scroll: 2000, 254 | scroll_delay: 1000, 255 | }, 256 | }); 257 | }); 258 | 259 | it('should map media handling parameters', async () => { 260 | const mockResponse = createMockAxiosResponse({ results: [{ markdown: 'test' }] }); 261 | mockAxiosInstance.post.mockResolvedValueOnce(mockResponse); 262 | 263 | await service.crawl({ 264 | url: 'https://example.com', 265 | crawler_config: { 266 | screenshot: true, 267 | screenshot_wait_for: 2.5, 268 | pdf: true, 269 | capture_mhtml: true, 270 | image_description_min_word_threshold: 30, 271 | image_score_threshold: 5, 272 | exclude_external_images: true, 273 | }, 274 | }); 275 | 276 | expect(mockAxiosInstance.post).toHaveBeenCalledWith('/crawl', { 277 | urls: ['https://example.com'], 278 | browser_config: undefined, 279 | crawler_config: { 280 | screenshot: true, 281 | screenshot_wait_for: 2.5, 282 | pdf: true, 283 | capture_mhtml: true, 284 | image_description_min_word_threshold: 30, 285 | image_score_threshold: 5, 286 | exclude_external_images: true, 287 | }, 288 | }); 289 | }); 290 | 291 | it('should map link filtering parameters', async () => { 292 | const mockResponse = createMockAxiosResponse({ results: [{ markdown: 'test' }] }); 293 | mockAxiosInstance.post.mockResolvedValueOnce(mockResponse); 294 | 295 | await service.crawl({ 296 | url: 'https://example.com', 297 | crawler_config: { 298 | exclude_external_links: true, 299 | exclude_social_media_links: true, 300 | exclude_domains: ['ads.com', 'tracker.io', 'analytics.com'], 301 | }, 302 | }); 303 | 304 | expect(mockAxiosInstance.post).toHaveBeenCalledWith('/crawl', { 305 | urls: 
['https://example.com'], 306 | browser_config: undefined, 307 | crawler_config: { 308 | exclude_external_links: true, 309 | exclude_social_media_links: true, 310 | exclude_domains: ['ads.com', 'tracker.io', 'analytics.com'], 311 | }, 312 | }); 313 | }); 314 | 315 | it('should map page interaction parameters', async () => { 316 | const mockResponse = createMockAxiosResponse({ results: [{ markdown: 'test' }] }); 317 | mockAxiosInstance.post.mockResolvedValueOnce(mockResponse); 318 | 319 | await service.crawl({ 320 | url: 'https://example.com', 321 | crawler_config: { 322 | simulate_user: true, 323 | override_navigator: true, 324 | magic: true, 325 | process_iframes: true, 326 | }, 327 | }); 328 | 329 | expect(mockAxiosInstance.post).toHaveBeenCalledWith('/crawl', { 330 | urls: ['https://example.com'], 331 | browser_config: undefined, 332 | crawler_config: { 333 | simulate_user: true, 334 | override_navigator: true, 335 | magic: true, 336 | process_iframes: true, 337 | }, 338 | }); 339 | }); 340 | 341 | it('should map virtual scroll configuration', async () => { 342 | const mockResponse = createMockAxiosResponse({ results: [{ markdown: 'test' }] }); 343 | mockAxiosInstance.post.mockResolvedValueOnce(mockResponse); 344 | 345 | await service.crawl({ 346 | url: 'https://example.com', 347 | crawler_config: { 348 | virtual_scroll_config: { 349 | container_selector: '#timeline', 350 | scroll_count: 20, 351 | scroll_by: 'container_height', 352 | wait_after_scroll: 1.5, 353 | }, 354 | }, 355 | }); 356 | 357 | expect(mockAxiosInstance.post).toHaveBeenCalledWith('/crawl', { 358 | urls: ['https://example.com'], 359 | browser_config: undefined, 360 | crawler_config: { 361 | virtual_scroll_config: { 362 | container_selector: '#timeline', 363 | scroll_count: 20, 364 | scroll_by: 'container_height', 365 | wait_after_scroll: 1.5, 366 | }, 367 | }, 368 | }); 369 | }); 370 | 371 | // Note: Extraction strategies removed - not supported via REST API 372 | // Use extract_with_llm tool instead for structured data extraction 373 | 374 | it('should map session and cache parameters', async () => { 375 | const mockResponse = createMockAxiosResponse({ results: [{ markdown: 'test' }] }); 376 | mockAxiosInstance.post.mockResolvedValueOnce(mockResponse); 377 | 378 | await service.crawl({ 379 | url: 'https://example.com', 380 | crawler_config: { 381 | session_id: 'test-session-123', 382 | cache_mode: 'DISABLED', 383 | }, 384 | }); 385 | 386 | expect(mockAxiosInstance.post).toHaveBeenCalledWith('/crawl', { 387 | urls: ['https://example.com'], 388 | browser_config: undefined, 389 | crawler_config: { 390 | session_id: 'test-session-123', 391 | cache_mode: 'DISABLED', 392 | }, 393 | }); 394 | }); 395 | 396 | it('should map new crawler parameters', async () => { 397 | const mockResponse = createMockAxiosResponse({ results: [{ markdown: 'test' }] }); 398 | mockAxiosInstance.post.mockResolvedValueOnce(mockResponse); 399 | 400 | await service.crawl({ 401 | url: 'https://example.com', 402 | crawler_config: { 403 | delay_before_return_html: 2000, 404 | css_selector: '.main-content', 405 | include_links: true, 406 | resolve_absolute_urls: true, 407 | }, 408 | }); 409 | 410 | expect(mockAxiosInstance.post).toHaveBeenCalledWith('/crawl', { 411 | urls: ['https://example.com'], 412 | browser_config: undefined, 413 | crawler_config: { 414 | delay_before_return_html: 2000, 415 | css_selector: '.main-content', 416 | include_links: true, 417 | resolve_absolute_urls: true, 418 | }, 419 | }); 420 | }); 421 | 422 | it('should map performance 
and debug parameters', async () => { 423 | const mockResponse = createMockAxiosResponse({ results: [{ markdown: 'test' }] }); 424 | mockAxiosInstance.post.mockResolvedValueOnce(mockResponse); 425 | 426 | await service.crawl({ 427 | url: 'https://example.com', 428 | crawler_config: { 429 | timeout: 90000, 430 | verbose: true, 431 | log_console: true, 432 | }, 433 | }); 434 | 435 | expect(mockAxiosInstance.post).toHaveBeenCalledWith('/crawl', { 436 | urls: ['https://example.com'], 437 | browser_config: undefined, 438 | crawler_config: { 439 | timeout: 90000, 440 | verbose: true, 441 | log_console: true, 442 | }, 443 | }); 444 | }); 445 | }); 446 | 447 | describe('Extraction strategies', () => { 448 | it('should support extraction_strategy passthrough', async () => { 449 | const mockResponse = createMockAxiosResponse({ results: [{ markdown: 'test' }] }); 450 | mockAxiosInstance.post.mockResolvedValueOnce(mockResponse); 451 | 452 | await service.crawl({ 453 | url: 'https://example.com', 454 | extraction_strategy: { 455 | provider: 'openai', 456 | api_key: 'sk-test', 457 | model: 'gpt-4', 458 | temperature: 0.7, 459 | }, 460 | }); 461 | 462 | expect(mockAxiosInstance.post).toHaveBeenCalledWith('/crawl', { 463 | urls: ['https://example.com'], 464 | browser_config: undefined, 465 | crawler_config: {}, 466 | extraction_strategy: { 467 | provider: 'openai', 468 | api_key: 'sk-test', 469 | model: 'gpt-4', 470 | temperature: 0.7, 471 | }, 472 | }); 473 | }); 474 | 475 | it('should support table_extraction_strategy passthrough', async () => { 476 | const mockResponse = createMockAxiosResponse({ results: [{ markdown: 'test' }] }); 477 | mockAxiosInstance.post.mockResolvedValueOnce(mockResponse); 478 | 479 | await service.crawl({ 480 | url: 'https://example.com', 481 | table_extraction_strategy: { 482 | enable_chunking: true, 483 | thresholds: { 484 | min_rows: 5, 485 | max_columns: 20, 486 | }, 487 | }, 488 | }); 489 | 490 | expect(mockAxiosInstance.post).toHaveBeenCalledWith('/crawl', { 491 | urls: ['https://example.com'], 492 | browser_config: undefined, 493 | crawler_config: {}, 494 | table_extraction_strategy: { 495 | enable_chunking: true, 496 | thresholds: { 497 | min_rows: 5, 498 | max_columns: 20, 499 | }, 500 | }, 501 | }); 502 | }); 503 | 504 | it('should support markdown_generator_options passthrough', async () => { 505 | const mockResponse = createMockAxiosResponse({ results: [{ markdown: 'test' }] }); 506 | mockAxiosInstance.post.mockResolvedValueOnce(mockResponse); 507 | 508 | await service.crawl({ 509 | url: 'https://example.com', 510 | markdown_generator_options: { 511 | include_links: true, 512 | preserve_formatting: true, 513 | }, 514 | }); 515 | 516 | expect(mockAxiosInstance.post).toHaveBeenCalledWith('/crawl', { 517 | urls: ['https://example.com'], 518 | browser_config: undefined, 519 | crawler_config: {}, 520 | markdown_generator_options: { 521 | include_links: true, 522 | preserve_formatting: true, 523 | }, 524 | }); 525 | }); 526 | }); 527 | 528 | describe('Combined configurations', () => { 529 | it('should handle both browser and crawler configs together', async () => { 530 | const mockResponse = createMockAxiosResponse({ results: [{ markdown: 'test' }] }); 531 | mockAxiosInstance.post.mockResolvedValueOnce(mockResponse); 532 | 533 | await service.crawl({ 534 | url: 'https://example.com', 535 | browser_config: { 536 | viewport_width: 1920, 537 | viewport_height: 1080, 538 | user_agent: 'Custom Bot', 539 | }, 540 | crawler_config: { 541 | word_count_threshold: 100, 542 | 
js_code: 'document.querySelector(".accept").click()', 543 | wait_for: '.content', 544 | screenshot: true, 545 | session_id: 'test-session', 546 | cache_mode: 'BYPASS', 547 | }, 548 | }); 549 | 550 | expect(mockAxiosInstance.post).toHaveBeenCalledWith('/crawl', { 551 | urls: ['https://example.com'], 552 | browser_config: { 553 | viewport_width: 1920, 554 | viewport_height: 1080, 555 | user_agent: 'Custom Bot', 556 | }, 557 | crawler_config: { 558 | word_count_threshold: 100, 559 | js_code: 'document.querySelector(".accept").click()', 560 | wait_for: '.content', 561 | screenshot: true, 562 | session_id: 'test-session', 563 | cache_mode: 'BYPASS', 564 | }, 565 | }); 566 | }); 567 | }); 568 | 569 | describe('Edge cases', () => { 570 | it('should handle undefined values correctly', async () => { 571 | const mockResponse = createMockAxiosResponse({ results: [{ markdown: 'test' }] }); 572 | mockAxiosInstance.post.mockResolvedValueOnce(mockResponse); 573 | 574 | await service.crawl({ 575 | url: 'https://example.com', 576 | crawler_config: { 577 | word_count_threshold: 0, // Should be included (falsy but defined) 578 | excluded_tags: undefined, // Should not be included 579 | remove_overlay_elements: false, // Should be included 580 | only_text: undefined, // Should not be included 581 | }, 582 | }); 583 | 584 | expect(mockAxiosInstance.post).toHaveBeenCalledWith('/crawl', { 585 | urls: ['https://example.com'], 586 | browser_config: undefined, 587 | crawler_config: { 588 | word_count_threshold: 0, 589 | excluded_tags: undefined, 590 | remove_overlay_elements: false, 591 | only_text: undefined, 592 | }, 593 | }); 594 | }); 595 | 596 | it('should handle empty arrays correctly', async () => { 597 | const mockResponse = createMockAxiosResponse({ results: [{ markdown: 'test' }] }); 598 | mockAxiosInstance.post.mockResolvedValueOnce(mockResponse); 599 | 600 | await service.crawl({ 601 | url: 'https://example.com', 602 | crawler_config: { 603 | excluded_tags: [], 604 | exclude_domains: [], 605 | }, 606 | }); 607 | 608 | expect(mockAxiosInstance.post).toHaveBeenCalledWith('/crawl', { 609 | urls: ['https://example.com'], 610 | browser_config: undefined, 611 | crawler_config: { 612 | excluded_tags: [], 613 | exclude_domains: [], 614 | }, 615 | }); 616 | }); 617 | }); 618 | }); 619 | ```
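A note on the contract these mapping tests pin down: `Crawl4AIService.crawl()` takes a single `url` plus optional `browser_config`/`crawler_config` objects (and passthrough fields such as `extraction_strategy`), and POSTs them to `/crawl` with the URL wrapped in a `urls` array, `crawler_config` defaulting to `{}`, and an absent `browser_config` left `undefined`. A minimal sketch of that request shape, using only the field names asserted above — the class name, auth header, and option typing are illustrative assumptions, not the actual implementation in `src/crawl4ai-service.ts`:

```typescript
import axios, { AxiosInstance } from 'axios';

// Illustrative option bag: the named fields asserted by the tests above, plus
// a passthrough index for extraction_strategy / table_extraction_strategy /
// markdown_generator_options. Not the repo's real types.
interface CrawlOptions {
  url: string;
  browser_config?: Record<string, unknown>;
  crawler_config?: Record<string, unknown>;
  [extra: string]: unknown;
}

class MinimalCrawlClient {
  private readonly http: AxiosInstance;

  constructor(baseURL: string, apiKey?: string) {
    // Header name is an assumption for this sketch; the real client is
    // configured from CRAWL4AI_BASE_URL and CRAWL4AI_API_KEY.
    this.http = axios.create({
      baseURL,
      headers: apiKey ? { 'X-API-Key': apiKey } : {},
    });
  }

  async crawl({ url, browser_config, crawler_config = {}, ...rest }: CrawlOptions) {
    // The /crawl endpoint takes an array of URLs even for a single page;
    // both config objects and any strategy fields pass through unmodified.
    const response = await this.http.post('/crawl', {
      urls: [url],
      browser_config,
      crawler_config,
      ...rest,
    });
    return response.data; // e.g. { results: [{ markdown, html, ... }] }
  }
}

// Usage sketch:
// const client = new MinimalCrawlClient('http://localhost:11235', 'test-key');
// await client.crawl({ url: 'https://example.com', crawler_config: { screenshot: true } });
```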