This is page 4 of 4. Use http://codebase.md/omgwtfwow/mcp-crawl4ai-ts?lines=true&page={x} to view the full context.

# Directory Structure

```
├── .env.example
├── .github
│   ├── CI.md
│   ├── copilot-instructions.md
│   └── workflows
│       └── ci.yml
├── .gitignore
├── .prettierignore
├── .prettierrc.json
├── CHANGELOG.md
├── eslint.config.mjs
├── jest.config.cjs
├── jest.setup.cjs
├── LICENSE
├── package-lock.json
├── package.json
├── README.md
├── src
│   ├── __tests__
│   │   ├── crawl.test.ts
│   │   ├── crawl4ai-service.network.test.ts
│   │   ├── crawl4ai-service.test.ts
│   │   ├── handlers
│   │   │   ├── crawl-handlers.test.ts
│   │   │   ├── parameter-combinations.test.ts
│   │   │   ├── screenshot-saving.test.ts
│   │   │   ├── session-handlers.test.ts
│   │   │   └── utility-handlers.test.ts
│   │   ├── index.cli.test.ts
│   │   ├── index.npx.test.ts
│   │   ├── index.server.test.ts
│   │   ├── index.test.ts
│   │   ├── integration
│   │   │   ├── batch-crawl.integration.test.ts
│   │   │   ├── capture-screenshot.integration.test.ts
│   │   │   ├── crawl-advanced.integration.test.ts
│   │   │   ├── crawl-handlers.integration.test.ts
│   │   │   ├── crawl-recursive.integration.test.ts
│   │   │   ├── crawl.integration.test.ts
│   │   │   ├── execute-js.integration.test.ts
│   │   │   ├── extract-links.integration.test.ts
│   │   │   ├── extract-with-llm.integration.test.ts
│   │   │   ├── generate-pdf.integration.test.ts
│   │   │   ├── get-html.integration.test.ts
│   │   │   ├── get-markdown.integration.test.ts
│   │   │   ├── parse-sitemap.integration.test.ts
│   │   │   ├── session-management.integration.test.ts
│   │   │   ├── smart-crawl.integration.test.ts
│   │   │   └── test-utils.ts
│   │   ├── request-handler.test.ts
│   │   ├── schemas
│   │   │   └── validation-edge-cases.test.ts
│   │   ├── types
│   │   │   └── mocks.ts
│   │   └── utils
│   │       └── javascript-validation.test.ts
│   ├── crawl4ai-service.ts
│   ├── handlers
│   │   ├── base-handler.ts
│   │   ├── content-handlers.ts
│   │   ├── crawl-handlers.ts
│   │   ├── session-handlers.ts
│   │   └── utility-handlers.ts
│   ├── index.ts
│   ├── schemas
│   │   ├── helpers.ts
│   │   └── validation-schemas.ts
│   ├── server.ts
│   └── types.ts
├── tsconfig.build.json
└── tsconfig.json
```

# Files

--------------------------------------------------------------------------------
/src/__tests__/integration/crawl.integration.test.ts:
--------------------------------------------------------------------------------

```typescript
/* eslint-env jest */
import { Client } from '@modelcontextprotocol/sdk/client/index.js';
import {
  createTestClient,
  cleanupTestClient,
  generateSessionId,
  expectSuccessfulCrawl,
  expectScreenshot,
  delay,
  TEST_TIMEOUTS,
} from './test-utils.js';

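// Local mirror of the MCP tool-call result content shape; kept local so these
// tests don't depend on SDK-internal types.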
interface ToolResult {
  content: Array<{
    type: string;
    text?: string;
    data?: string;
    mimeType?: string;
  }>;
}

describe('crawl Integration Tests', () => {
  let client: Client;

  beforeAll(async () => {
    client = await createTestClient();
  }, TEST_TIMEOUTS.medium);

  afterAll(async () => {
    if (client) {
      await cleanupTestClient(client);
    }
  });

  describe('Basic Crawling', () => {
    it(
      'should crawl a simple page with basic configuration',
      async () => {
        const result = await client.callTool({
          name: 'crawl',
          arguments: {
            url: 'https://httpbin.org/html',
            cache_mode: 'BYPASS',
            word_count_threshold: 50,
          },
        });

        await expectSuccessfulCrawl(result);
      },
      TEST_TIMEOUTS.short,
    );

    it(
      'should handle invalid URL gracefully',
      async () => {
        const result = await client.callTool({
          name: 'crawl',
          arguments: {
            url: 'not-a-valid-url',
            cache_mode: 'BYPASS',
          },
        });

        const content = (result as ToolResult).content;
        expect(content).toHaveLength(1);
        expect(content[0].type).toBe('text');
        expect(content[0].text).toContain('Error');
        // Our Zod validation catches this before it reaches the API
        expect(content[0].text).toContain('Invalid parameters for crawl');
        expect(content[0].text).toContain('Invalid url');
      },
      TEST_TIMEOUTS.short,
    );

    it(
      'should handle non-existent domain gracefully',
      async () => {
        const result = await client.callTool({
          name: 'crawl',
          arguments: {
            url: 'https://this-domain-definitely-does-not-exist-12345.com',
            cache_mode: 'BYPASS',
          },
        });

        const content = (result as ToolResult).content;
        expect(content).toHaveLength(1);
        expect(content[0].type).toBe('text');
        expect(content[0].text).toContain('Error');
        // Could be DNS error, connection error, or "Internal Server Error"
        expect(content[0].text).toMatch(/Failed to crawl|Internal Server Error|DNS|connection/i);
      },
      TEST_TIMEOUTS.medium,
    );

    it(
      'should handle browser configuration',
      async () => {
        const result = await client.callTool({
          name: 'crawl',
          arguments: {
            url: 'https://httpbin.org/user-agent',
            viewport_width: 1920,
            viewport_height: 1080,
            user_agent: 'MCP Integration Test Bot',
            cache_mode: 'DISABLED',
          },
        });

        await expectSuccessfulCrawl(result);
      },
      TEST_TIMEOUTS.short,
    );
  });

  describe('Dynamic Content Tests', () => {
    it(
      'should execute JavaScript on page',
      async () => {
        const result = await client.callTool({
          name: 'crawl',
          arguments: {
            url: 'https://httpbin.org/html',
            js_code: ['return document.querySelectorAll("a").length', 'return document.title'],
            wait_after_js: 1000,
            cache_mode: 'BYPASS',
            word_count_threshold: 10,
          },
        });

        await expectSuccessfulCrawl(result);
        const textContent = (result as ToolResult).content.find((c) => c.type === 'text');
        expect(textContent?.text).toBeTruthy();
        // httpbin.org/html contains links and a title
        expect(textContent?.text?.toLowerCase()).toMatch(/herman|melville|moby/); // Content from the page
      },
      TEST_TIMEOUTS.medium,
    );

    it(
      'should wait for specific elements',
      async () => {
        const result = await client.callTool({
          name: 'crawl',
          arguments: {
            url: 'https://httpbin.org/delay/2',
            wait_for: 'body',
            wait_for_timeout: 5000,
            cache_mode: 'BYPASS',
          },
        });

        await expectSuccessfulCrawl(result);
      },
      TEST_TIMEOUTS.medium,
    );

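    // virtual_scroll_config drives Crawl4AI's virtual-scroll capture: it scrolls
    // the matched container `scroll_count` times (here by one container height per
    // step, pausing `wait_after_scroll` seconds between steps) so lazily rendered
    // items end up in the extracted content.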
    it(
      'should handle virtual scrolling for infinite feeds',
      async () => {
        const result = await client.callTool({
          name: 'crawl',
          arguments: {
            url: 'https://github.com/trending',
            virtual_scroll_config: {
              container_selector: '.Box-row',
              scroll_count: 3,
              scroll_by: 'container_height',
              wait_after_scroll: 1.0,
            },
            cache_mode: 'BYPASS',
            wait_for: '.Box-row',
            word_count_threshold: 50,
          },
        });

        await expectSuccessfulCrawl(result);
        const textContent = (result as ToolResult).content.find((c) => c.type === 'text');
        // Should have captured multiple trending repos after scrolling
        expect(textContent?.text).toBeTruthy();
        expect(textContent?.text?.length).toBeGreaterThan(1000);
      },
      TEST_TIMEOUTS.long,
    );
  });

  describe('Session Management Tests', () => {
    it(
      'should create and use a session',
      async () => {
        const sessionId = generateSessionId();

        // First crawl with session
        const result1 = await client.callTool({
          name: 'crawl',
          arguments: {
            url: 'https://github.com',
            session_id: sessionId,
            cache_mode: 'BYPASS',
          },
        });

        await expectSuccessfulCrawl(result1);

        // Second crawl reusing session
        const result2 = await client.callTool({
          name: 'crawl',
          arguments: {
            url: 'https://github.com/features',
            session_id: sessionId,
            cache_mode: 'BYPASS',
          },
        });

        await expectSuccessfulCrawl(result2);
      },
      TEST_TIMEOUTS.medium,
    );

    it(
      'should handle cookies in session',
      async () => {
        const sessionId = generateSessionId();

        const result = await client.callTool({
          name: 'crawl',
          arguments: {
            url: 'https://github.com',
            session_id: sessionId,
            cookies: [
              {
                name: 'test_cookie',
                value: 'test_value',
                domain: '.github.com',
                path: '/',
              },
            ],
            cache_mode: 'BYPASS',
          },
        });

        await expectSuccessfulCrawl(result);
      },
      TEST_TIMEOUTS.medium,
    );
  });

  describe('Content Extraction Tests', () => {
    it.skip(
      'should extract content using CSS selectors - SKIPPED: Not supported via REST API',
      async () => {
        // CSS extraction is not supported via the REST API due to Python class serialization limitations
        // This test is kept for documentation purposes but skipped
        const result = await client.callTool({
          name: 'crawl',
          arguments: {
            url: 'https://www.google.com',
            extraction_type: 'css',
            css_selectors: {
              title: 'title',
              search_button: 'input[type="submit"]',
              logo: 'img[alt*="Google"]',
            },
            cache_mode: 'BYPASS',
            word_count_threshold: 10,
          },
        });

        await expectSuccessfulCrawl(result);
        const textContent = (result as ToolResult).content.find((c) => c.type === 'text');
        expect(textContent?.text).toBeTruthy();
      },
      TEST_TIMEOUTS.short,
    );

    it(
      'should extract content using LLM via extract_with_llm tool',
      async () => {
        // Note: This test requires the Crawl4AI server to have an LLM provider configured
        try {
          const result = await client.callTool({
            name: 'extract_with_llm',
            arguments: {
              url: 'https://httpbin.org/html',
              query: 'Extract the main page title and any author names mentioned',
            },
          });

          expect(result).toBeTruthy();
          const textContent = (result as ToolResult).content.find((c) => c.type === 'text');
          expect(textContent?.text).toBeTruthy();

          // The response should be JSON with an "answer" field
          try {
            const parsed = JSON.parse(textContent?.text || '{}');
            expect(parsed).toHaveProperty('answer');
            expect(typeof parsed.answer).toBe('string');
            expect(parsed.answer.length).toBeGreaterThan(0);
          } catch {
            // If parsing fails, at least check we got text
            expect(textContent?.text?.length || 0).toBeGreaterThan(0);
          }
        } catch (error) {
          // If the server doesn't have LLM configured, it will return an error
          if (error instanceof Error && error.message?.includes('No LLM provider configured')) {
            console.log('⚠️ LLM extraction test skipped: Server needs LLM provider configured');
            return;
          }
          throw error;
        }
      },
      TEST_TIMEOUTS.long,
    );
  });

  describe('Media Handling Tests', () => {
    it(
      'should capture screenshots',
      async () => {
        const result = await client.callTool({
          name: 'crawl',
          arguments: {
            url: 'https://httpbin.org/html',
            screenshot: true,
            screenshot_wait_for: 1.0,
            cache_mode: 'BYPASS',
          },
        });

        await expectSuccessfulCrawl(result);
        await expectScreenshot(result);
      },
      TEST_TIMEOUTS.medium,
    );

    it(
      'should generate PDF',
      async () => {
        const result = await client.callTool({
          name: 'crawl',
          arguments: {
            url: 'https://httpbin.org/html',
            pdf: true,
            cache_mode: 'BYPASS',
          },
        });

        await expectSuccessfulCrawl(result);
        // PDF generation should return some content
        const textContent = (result as ToolResult).content.find((c) => c.type === 'text');
        expect(textContent?.text).toBeTruthy();
        // Should contain some content from the page
        expect(textContent?.text?.toLowerCase()).toContain('herman');
      },
      TEST_TIMEOUTS.medium,
    );

    it(
      'should handle image filtering',
      async () => {
        const result = await client.callTool({
          name: 'crawl',
          arguments: {
            url: 'https://github.com',
            exclude_external_images: true,
            image_description_min_word_threshold: 20,
            image_score_threshold: 5,
            cache_mode: 'BYPASS',
          },
        });

        await expectSuccessfulCrawl(result);
      },
      TEST_TIMEOUTS.medium,
    );
  });

  describe('Performance & Caching Tests', () => {
    it(
      'should respect cache modes',
      async () => {
        const url = 'https://httpbin.org/html'; // Use a simpler page for cache testing

        // First request - populate cache with ENABLED mode
        const result1 = await client.callTool({
          name: 'crawl',
          arguments: {
            url,
            cache_mode: 'ENABLED',
            word_count_threshold: 10,
          },
        });
        await expectSuccessfulCrawl(result1);
        const content1 = (result1 as ToolResult).content.find((c) => c.type === 'text')?.text;

        // Wait a bit to ensure cache is saved
        await delay(500);

        // Second request - should use cache (ENABLED mode)
        const startTime = Date.now();
        const result2 = await client.callTool({
          name: 'crawl',
          arguments: {
            url,
            cache_mode: 'ENABLED',
            word_count_threshold: 10,
          },
        });
        const cacheTime = Date.now() - startTime;
        await expectSuccessfulCrawl(result2);
        const content2 = (result2 as ToolResult).content.find((c) => c.type === 'text')?.text;

        // Content should be identical if cache was used
        expect(content2).toBe(content1);

        // Third request - bypass cache
        const bypassStartTime = Date.now();
        const result3 = await client.callTool({
          name: 'crawl',
          arguments: {
            url,
            cache_mode: 'BYPASS',
            word_count_threshold: 10,
          },
        });
        const bypassTime = Date.now() - bypassStartTime;
        await expectSuccessfulCrawl(result3);

        // Cache hit should typically be faster, but we'll make this test more lenient
        // Just verify all requests succeeded
        expect(cacheTime).toBeGreaterThan(0);
        expect(bypassTime).toBeGreaterThan(0);

        // Fourth request - DISABLED mode should not use cache
        const result4 = await client.callTool({
          name: 'crawl',
          arguments: {
            url,
            cache_mode: 'DISABLED',
            word_count_threshold: 10,
          },
        });
        await expectSuccessfulCrawl(result4);
      },
      TEST_TIMEOUTS.long,
    );

    it(
      'should handle timeout configuration',
      async () => {
        const result = await client.callTool({
          name: 'crawl',
          arguments: {
            url: 'https://httpbin.org/delay/1',
            timeout: 20000,
            page_timeout: 15000,
            cache_mode: 'BYPASS',
          },
        });

        await expectSuccessfulCrawl(result);
      },
      TEST_TIMEOUTS.short,
    );
  });

  describe('Content Filtering Tests', () => {
    it(
      'should filter content by tags',
      async () => {
        const result = await client.callTool({
          name: 'crawl',
          arguments: {
            url: 'https://httpbin.org/html', // Simpler page for testing
            excluded_tags: ['script', 'style', 'nav', 'footer', 'header'],
            word_count_threshold: 10,
            cache_mode: 'BYPASS',
            only_text: true, // Force text-only output
            remove_overlay_elements: true,
          },
        });

        await expectSuccessfulCrawl(result);
        const textContent = (result as ToolResult).content.find((c) => c.type === 'text');
        expect(textContent?.text).toBeTruthy();

        // Just verify we got content back - the server's filtering behavior may vary
        // With all the filters applied, content might be minimal
        expect(textContent?.text?.length).toBeGreaterThan(10);

        // Should contain some text from the page
        expect(textContent?.text).toBeTruthy();
      },
      TEST_TIMEOUTS.short,
    );

    it(
      'should filter content by selectors',
      async () => {
        const result = await client.callTool({
          name: 'crawl',
          arguments: {
            url: 'https://github.com',
            excluded_selector: '.footer, .header-nav, [aria-label="Advertisement"]',
            remove_overlay_elements: true,
            cache_mode: 'BYPASS',
          },
        });

        await expectSuccessfulCrawl(result);
      },
      TEST_TIMEOUTS.medium,
    );

    it(
      'should handle link filtering',
      async () => {
        const result = await client.callTool({
          name: 'crawl',
          arguments: {
            url: 'https://github.com',
            exclude_external_links: true,
            exclude_social_media_links: true,
            exclude_domains: ['twitter.com', 'facebook.com', 'linkedin.com'],
            cache_mode: 'BYPASS',
          },
        });

        await expectSuccessfulCrawl(result);
        const textContent = (result as ToolResult).content.find((c) => c.type === 'text');
        // Should not contain social media domains
        expect(textContent?.text).not.toMatch(/twitter\.com|facebook\.com/);
      },
      TEST_TIMEOUTS.medium,
    );
  });

  describe('Bot Detection Avoidance Tests', () => {
    it(
      'should simulate user behavior',
      async () => {
        const result = await client.callTool({
          name: 'crawl',
          arguments: {
            url: 'https://github.com',
            simulate_user: true,
            override_navigator: true,
            magic: true,
            delay_before_scroll: 1000,
            scroll_delay: 500,
            cache_mode: 'BYPASS',
          },
        });

        await expectSuccessfulCrawl(result);
      },
      TEST_TIMEOUTS.medium,
    );

    it(
      'should use custom headers and user agent',
      async () => {
        const result = await client.callTool({
          name: 'crawl',
          arguments: {
            url: 'https://httpbin.org/headers',
            user_agent: 'Mozilla/5.0 (compatible; MCP Test Bot)',
            headers: {
              'Accept-Language': 'en-US,en;q=0.9',
              'Accept-Encoding': 'gzip, deflate, br',
              'X-Custom-Header': 'MCP-Test',
            },
            cache_mode: 'BYPASS',
          },
        });

        await expectSuccessfulCrawl(result);
        const textContent = (result as ToolResult).content.find((c) => c.type === 'text');
        // httpbin returns headers in response
        expect(textContent?.text).toContain('MCP Test Bot');
        expect(textContent?.text).toContain('X-Custom-Header');
      },
      TEST_TIMEOUTS.medium,
    );
  });

  describe('Error Handling Tests', () => {
    it(
      'should handle invalid URLs gracefully',
      async () => {
        const result = await client.callTool({
          name: 'crawl',
          arguments: {
            url: 'not-a-valid-url',
            cache_mode: 'BYPASS',
          },
        });

        const textContent = (result as ToolResult).content.find((c) => c.type === 'text');
        expect(textContent?.text).toContain('Error');
      },
      TEST_TIMEOUTS.short,
    );

    it(
      'should handle non-existent domains',
      async () => {
        const result = await client.callTool({
          name: 'crawl',
          arguments: {
            url: 'https://this-domain-definitely-does-not-exist-123456789.com',
            cache_mode: 'BYPASS',
          },
        });

        const textContent = (result as ToolResult).content.find((c) => c.type === 'text');
        expect(textContent?.text?.toLowerCase()).toMatch(/error|failed/);
      },
      TEST_TIMEOUTS.short,
    );

    it(
      'should handle JavaScript errors gracefully',
      async () => {
        const result = await client.callTool({
          name: 'crawl',
          arguments: {
            url: 'https://httpbin.org/html',
            js_code: 'throw new Error("Test error")',
            cache_mode: 'BYPASS',
          },
        });

        // Should still return content even if JS fails
        const textContent = (result as ToolResult).content.find((c) => c.type === 'text');
        expect(textContent).toBeDefined();
      },
      TEST_TIMEOUTS.short,
    );
  });

  describe('Advanced Configurations', () => {
    it(
      'should handle complex multi-feature crawl',
      async () => {
        const sessionId = generateSessionId();

        const result = await client.callTool({
          name: 'crawl',
          arguments: {
            url: 'https://httpbin.org/html',
            // Browser config
            viewport_width: 1920,
            viewport_height: 1080,
            user_agent: 'MCP Advanced Test Bot',
            // Session
            session_id: sessionId,
            // JavaScript
            js_code: 'return document.querySelectorAll("h1").length',
            wait_after_js: 1000,
            // Content filtering
            excluded_tags: ['script', 'style'],
            word_count_threshold: 50,
            remove_overlay_elements: true,
            // Media
            screenshot: true,
            screenshot_wait_for: 1.0,
            // Performance
            cache_mode: 'DISABLED',
            timeout: 60000,
            // Bot detection
            simulate_user: true,
            override_navigator: true,
          },
        });

        await expectSuccessfulCrawl(result);
        // Screenshot might not always be returned in complex multi-feature crawls
        // especially with httpbin.org which is a simple HTML page
        const textContent = (result as ToolResult).content.find((c) => c.type === 'text');
        expect(textContent).toBeDefined();
      },
      TEST_TIMEOUTS.long,
    );

    it(
      'should handle proxy configuration',
      async () => {
        // Test that proxy configuration is accepted, even without a real proxy
        const result = await client.callTool({
          name: 'crawl',
          arguments: {
            url: 'https://httpbin.org/ip',
            proxy_server: 'http://example-proxy.com:8080',
            proxy_username: 'testuser',
            proxy_password: 'testpass',
            cache_mode: 'BYPASS',
            word_count_threshold: 10,
          },
        });

        // The request should complete (even if proxy doesn't exist, the config should be accepted)
        expect(result).toBeDefined();
        const textContent = (result as ToolResult).content.find((c) => c.type === 'text');
        expect(textContent).toBeDefined();
      },
      TEST_TIMEOUTS.medium,
    );

    it(
      'should process iframes',
      async () => {
        const result = await client.callTool({
          name: 'crawl',
          arguments: {
            url: 'https://www.w3schools.com/html/html_iframe.asp',
            process_iframes: true,
            cache_mode: 'BYPASS',
          },
        });

        await expectSuccessfulCrawl(result);
      },
      TEST_TIMEOUTS.medium,
    );
  });

  describe('Browser Configuration Tests', () => {
    describe('Cookie handling', () => {
      it(
        'should set and send cookies correctly',
        async () => {
          const result = await client.callTool({
            name: 'crawl',
            arguments: {
              url: 'https://httpbin.org/cookies',
              cookies: [
                {
                  name: 'test_cookie',
                  value: 'test_value',
                  domain: '.httpbin.org',
                  path: '/',
                },
              ],
              cache_mode: 'BYPASS',
            },
          });

          await expectSuccessfulCrawl(result);
          const textContent = (result as ToolResult).content.find((c) => c.type === 'text');
          expect(textContent?.text).toBeTruthy();
          // httpbin returns cookies as JSON in the response
          expect(textContent?.text).toContain('test_cookie');
          expect(textContent?.text).toContain('test_value');
        },
        TEST_TIMEOUTS.short,
      );

      it(
        'should handle multiple cookies',
        async () => {
          const result = await client.callTool({
            name: 'crawl',
            arguments: {
              url: 'https://httpbin.org/cookies',
              cookies: [
                {
                  name: 'session_id',
                  value: 'abc123',
                  domain: '.httpbin.org',
                  path: '/',
                },
                {
                  name: 'user_pref',
                  value: 'dark_mode',
                  domain: '.httpbin.org',
                  path: '/',
                },
              ],
              cache_mode: 'BYPASS',
            },
          });

          await expectSuccessfulCrawl(result);
          const textContent = (result as ToolResult).content.find((c) => c.type === 'text');
          expect(textContent?.text).toBeTruthy();
          // Verify both cookies are present
          expect(textContent?.text).toContain('session_id');
          expect(textContent?.text).toContain('abc123');
          expect(textContent?.text).toContain('user_pref');
          expect(textContent?.text).toContain('dark_mode');
        },
        TEST_TIMEOUTS.short,
      );
    });

    describe('Custom headers', () => {
      it(
        'should send custom headers',
        async () => {
          const result = await client.callTool({
            name: 'crawl',
            arguments: {
              url: 'https://httpbin.org/headers',
              headers: {
                'X-Custom-Header': 'test-value',
                'X-Request-ID': '12345',
                'Accept-Language': 'en-US,en;q=0.9',
              },
              cache_mode: 'BYPASS',
            },
          });

          await expectSuccessfulCrawl(result);
          const textContent = (result as ToolResult).content.find((c) => c.type === 'text');
          expect(textContent?.text).toBeTruthy();
          // httpbin returns headers in the response
          expect(textContent?.text).toContain('X-Custom-Header');
          expect(textContent?.text).toContain('test-value');
          // Note: Some headers may be filtered by the browser
          // Just verify our custom header got through
        },
        TEST_TIMEOUTS.short,
      );
    });

    describe('User-Agent configuration', () => {
      it(
        'should set custom user agent',
        async () => {
          const customUserAgent = 'MCP-Crawl4AI-Test/1.0 (Integration Tests)';
          const result = await client.callTool({
            name: 'crawl',
            arguments: {
              url: 'https://httpbin.org/user-agent',
              user_agent: customUserAgent,
              cache_mode: 'BYPASS',
            },
          });

          await expectSuccessfulCrawl(result);
          const textContent = (result as ToolResult).content.find((c) => c.type === 'text');
          expect(textContent?.text).toBeTruthy();
          // httpbin returns the user-agent in the response
          expect(textContent?.text).toContain(customUserAgent);
        },
        TEST_TIMEOUTS.short,
      );
    });

    describe('Viewport sizes and screenshots', () => {
      it(
        'should capture screenshot at mobile size (375x667)',
        async () => {
          const result = await client.callTool({
            name: 'crawl',
            arguments: {
              url: 'https://httpbin.org/html',
              viewport_width: 375,
              viewport_height: 667,
              screenshot: true,
              screenshot_wait_for: 1,
              cache_mode: 'BYPASS',
            },
          });

          await expectSuccessfulCrawl(result);
          await expectScreenshot(result);

          // Check screenshot was captured
          const imageContent = (result as ToolResult).content.find((c) => c.type === 'image');
          expect(imageContent).toBeDefined();
          expect(imageContent?.data).toBeTruthy();

          // Verify reasonable data size for mobile screenshot
          const dataLength = imageContent?.data?.length || 0;
          expect(dataLength).toBeGreaterThan(10000); // At least 10KB
          expect(dataLength).toBeLessThan(3000000); // Less than 3MB for mobile (base64 encoded)
        },
        TEST_TIMEOUTS.medium,
      );

      it(
        'should capture screenshot at tablet size (768x1024)',
        async () => {
          const result = await client.callTool({
            name: 'crawl',
            arguments: {
              url: 'https://httpbin.org/html',
              viewport_width: 768,
              viewport_height: 1024,
              screenshot: true,
              screenshot_wait_for: 1,
              cache_mode: 'BYPASS',
            },
          });

          await expectSuccessfulCrawl(result);
          await expectScreenshot(result);

          // Check screenshot was captured
          const imageContent = (result as ToolResult).content.find((c) => c.type === 'image');
          expect(imageContent).toBeDefined();
          expect(imageContent?.data).toBeTruthy();

          // Verify reasonable data size for tablet screenshot
          const dataLength = imageContent?.data?.length || 0;
          expect(dataLength).toBeGreaterThan(15000); // At least 15KB
          expect(dataLength).toBeLessThan(3000000); // Less than 3MB for tablet (base64 encoded)
        },
        TEST_TIMEOUTS.medium,
      );

      it(
        'should capture screenshot at HD size (1280x720)',
        async () => {
          const result = await client.callTool({
            name: 'crawl',
            arguments: {
              url: 'https://httpbin.org/html',
              viewport_width: 1280,
              viewport_height: 720,
              screenshot: true,
              screenshot_wait_for: 1,
              cache_mode: 'BYPASS',
            },
          });

          await expectSuccessfulCrawl(result);
          await expectScreenshot(result);

          // Check screenshot was captured
          const imageContent = (result as ToolResult).content.find((c) => c.type === 'image');
          expect(imageContent).toBeDefined();
          expect(imageContent?.data).toBeTruthy();

          // Verify reasonable data size for HD screenshot
          const dataLength = imageContent?.data?.length || 0;
          expect(dataLength).toBeGreaterThan(20000); // At least 20KB
          expect(dataLength).toBeLessThan(3000000); // Less than 3MB for HD (base64 encoded)
        },
        TEST_TIMEOUTS.medium,
      );

      it(
        'should fail gracefully for very large viewport (1920x1080)',
        async () => {
          const result = await client.callTool({
            name: 'crawl',
            arguments: {
              url: 'https://httpbin.org/html',
              viewport_width: 1920,
              viewport_height: 1080,
              screenshot: true,
              screenshot_wait_for: 1,
              cache_mode: 'BYPASS',
            },
          });

          // This should either timeout or return an error based on testing
          // We expect either an error or no screenshot data
          const textContent = (result as ToolResult).content.find((c) => c.type === 'text');
          const imageContent = (result as ToolResult).content.find((c) => c.type === 'image');

          // If we got text but no image, that's expected for large viewports
          if (textContent && !imageContent) {
            expect(textContent).toBeDefined();
          } else if (textContent?.text?.includes('Error') || textContent?.text?.includes('timeout')) {
            // Expected error for large viewport
            expect(textContent.text).toMatch(/Error|timeout/i);
          }
        },
        TEST_TIMEOUTS.long,
      );
    });

    describe('Combined browser configurations', () => {
      it(
        'should handle cookies, headers, and custom viewport together',
        async () => {
          const result = await client.callTool({
            name: 'crawl',
            arguments: {
              url: 'https://httpbin.org/anything',
              viewport_width: 768,
              viewport_height: 1024,
              user_agent: 'MCP-Test-Bot/2.0',
              cookies: [
                {
                  name: 'auth_token',
                  value: 'secret123',
                  domain: '.httpbin.org',
                  path: '/',
                },
              ],
              headers: {
                'X-Test-Header': 'combined-test',
              },
              cache_mode: 'BYPASS',
            },
          });

          await expectSuccessfulCrawl(result);
          const textContent = (result as ToolResult).content.find((c) => c.type === 'text');
          expect(textContent?.text).toBeTruthy();

          // httpbin/anything endpoint returns all request data
          // Verify all configurations were applied
          expect(textContent?.text).toContain('MCP-Test-Bot/2.0');
          expect(textContent?.text).toContain('auth_token');
          expect(textContent?.text).toContain('X-Test-Header');
          expect(textContent?.text).toContain('combined-test');
        },
        TEST_TIMEOUTS.medium,
      );
    });
  });

  describe('Crawler Configuration Advanced Tests', () => {
    describe('Content filtering parameters', () => {
      it(
        'should remove forms when remove_forms is true',
        async () => {
          const result = await client.callTool({
            name: 'crawl',
            arguments: {
              url: 'https://httpbin.org/forms/post',
              remove_forms: true,
              cache_mode: 'BYPASS',
            },
          });

          await expectSuccessfulCrawl(result);
          const textContent = (result as ToolResult).content.find((c) => c.type === 'text');
          expect(textContent?.text).toBeTruthy();
          // Forms should be removed, so no form-related text should appear
          expect(textContent?.text).not.toContain('<form');
          expect(textContent?.text).not.toContain('type="submit"');
          expect(textContent?.text).not.toContain('input type=');
        },
        TEST_TIMEOUTS.short,
      );

      it(
        'should keep forms when remove_forms is false',
        async () => {
          const result = await client.callTool({
            name: 'crawl',
            arguments: {
              url: 'https://httpbin.org/forms/post',
              remove_forms: false,
              cache_mode: 'BYPASS',
              word_count_threshold: 10,
            },
          });

          await expectSuccessfulCrawl(result);
          const textContent = (result as ToolResult).content.find((c) => c.type === 'text');
          expect(textContent?.text).toBeTruthy();
          // Forms should be present - check for form-related keywords
          const text = textContent?.text?.toLowerCase() || '';
          // httpbin forms page should have form elements
          expect(text.length).toBeGreaterThan(100);
        },
        TEST_TIMEOUTS.short,
      );

      it(
        'should preserve data attributes when keep_data_attributes is true',
        async () => {
          const result = await client.callTool({
            name: 'crawl',
            arguments: {
              url: 'https://getbootstrap.com/docs/4.0/components/alerts/',
              keep_data_attributes: true,
              cache_mode: 'BYPASS',
              word_count_threshold: 10,
            },
          });

          await expectSuccessfulCrawl(result);
          const textContent = (result as ToolResult).content.find((c) => c.type === 'text');
          expect(textContent?.text).toBeTruthy();
          // Should contain alert content
          expect(textContent?.text).toContain('alert');
        },
        TEST_TIMEOUTS.medium,
      );
    });

    describe('JavaScript execution parameters', () => {
      it(
        'should return only JS results when js_only is true',
        async () => {
          const result = await client.callTool({
            name: 'crawl',
            arguments: {
              url: 'https://httpbin.org/html',
              js_code: ['return document.title', 'return document.querySelectorAll("p").length'],
              js_only: true,
              cache_mode: 'BYPASS',
            },
          });

          await expectSuccessfulCrawl(result);
          const textContent = (result as ToolResult).content.find((c) => c.type === 'text');
          expect(textContent?.text).toBeTruthy();

          // Should contain JS execution results but not the full HTML content
          // The result should be much shorter than full page content
          expect(textContent?.text?.length).toBeLessThan(1000);
          // Should not contain the full Moby Dick text from the page
          expect(textContent?.text).not.toContain('Herman Melville');
        },
        TEST_TIMEOUTS.short,
      );

      it(
        'should handle js_only with session_id',
        async () => {
          const sessionId = generateSessionId();
          const result = await client.callTool({
            name: 'crawl',
            arguments: {
              url: 'https://httpbin.org/html',
              session_id: sessionId,
              js_code: 'return window.location.href',
              js_only: true,
              cache_mode: 'BYPASS',
            },
          });

          await expectSuccessfulCrawl(result);
        },
        TEST_TIMEOUTS.short,
      );
    });

    describe('Page visibility parameters', () => {
      it(
        'should extract content when body is hidden and ignore_body_visibility is true',
        async () => {
          const result = await client.callTool({
            name: 'crawl',
            arguments: {
              url: 'https://httpbin.org/html',
              js_code: 'document.body.style.visibility = "hidden"; return "body hidden"',
              ignore_body_visibility: true,
              cache_mode: 'BYPASS',
              word_count_threshold: 10,
            },
          });

          await expectSuccessfulCrawl(result);
          const textContent = (result as ToolResult).content.find((c) => c.type === 'text');
          expect(textContent?.text).toBeTruthy();
          // Should still extract content despite hidden body
          expect(textContent?.text).toContain('Herman Melville');
        },
        TEST_TIMEOUTS.short,
      );

      it(
        'should respect body visibility when ignore_body_visibility is false',
        async () => {
          const result = await client.callTool({
            name: 'crawl',
            arguments: {
              url: 'https://httpbin.org/html',
              js_code: 'document.body.style.visibility = "hidden"; return "body hidden"',
              ignore_body_visibility: false,
              cache_mode: 'BYPASS',
              word_count_threshold: 10,
            },
          });

          await expectSuccessfulCrawl(result);
          // Content extraction behavior may vary when body is hidden
        },
        TEST_TIMEOUTS.short,
      );
    });

    describe('Debug and logging parameters', () => {
      it(
        'should capture console logs when log_console is true',
        async () => {
          const result = await client.callTool({
            name: 'crawl',
            arguments: {
              url: 'https://httpbin.org/html',
              js_code: [
                'console.log("Test log message 1")',
                'console.warn("Test warning")',
                'console.error("Test error")',
                'return "logs executed"',
              ],
              log_console: true,
              cache_mode: 'BYPASS',
            },
          });

          await expectSuccessfulCrawl(result);
          // Note: Console logs may be captured in a separate field or in verbose output
        },
        TEST_TIMEOUTS.short,
      );

      it(
        'should provide verbose output when verbose is true',
        async () => {
          const result = await client.callTool({
            name: 'crawl',
            arguments: {
              url: 'https://httpbin.org/html',
              verbose: true,
              cache_mode: 'BYPASS',
              word_count_threshold: 50,
            },
          });

          await expectSuccessfulCrawl(result);
          // Verbose output may include additional debugging information
        },
        TEST_TIMEOUTS.short,
      );
    });

    describe('Media filtering parameters', () => {
      it(
        'should exclude external images when exclude_external_images is true',
        async () => {
          // First, let's create a page with external images via JS
          const result = await client.callTool({
            name: 'crawl',
            arguments: {
              url: 'https://httpbin.org/html',
              js_code: `
                const img1 = document.createElement('img');
                img1.src = 'https://httpbin.org/image/png';
                img1.alt = 'External PNG';
                document.body.appendChild(img1);

                const img2 = document.createElement('img');
                img2.src = '/local-image.png';
                img2.alt = 'Local image';
                document.body.appendChild(img2);

                return document.images.length;
              `,
              exclude_external_images: true,
              cache_mode: 'BYPASS',
            },
          });

          await expectSuccessfulCrawl(result);
          const textContent = (result as ToolResult).content.find((c) => c.type === 'text');
          expect(textContent?.text).toBeTruthy();
          // The external image references should be filtered out
        },
        TEST_TIMEOUTS.medium,
      );

      it(
        'should include external images when exclude_external_images is false',
        async () => {
          const result = await client.callTool({
            name: 'crawl',
            arguments: {
              url: 'https://httpbin.org/html',
              exclude_external_images: false,
              cache_mode: 'BYPASS',
            },
          });

          await expectSuccessfulCrawl(result);
        },
        TEST_TIMEOUTS.short,
      );
    });

    describe('Combined crawler configuration tests', () => {
      it(
        'should handle multiple filtering options together',
        async () => {
          const result = await client.callTool({
            name: 'crawl',
            arguments: {
              url: 'https://httpbin.org/forms/post',
              remove_forms: true,
              exclude_external_links: true,
              exclude_external_images: true,
              only_text: true,
              word_count_threshold: 10,
              cache_mode: 'BYPASS',
            },
          });

          await expectSuccessfulCrawl(result);
          const textContent = (result as ToolResult).content.find((c) => c.type === 'text');
          expect(textContent?.text).toBeTruthy();
          // Should have filtered content
          expect(textContent?.text).not.toContain('<form');
          expect(textContent?.text).not.toContain('type="submit"');
        },
        TEST_TIMEOUTS.short,
      );

      it(
        'should handle debug options with content extraction',
        async () => {
          const result = await client.callTool({
            name: 'crawl',
            arguments: {
              url: 'https://httpbin.org/html',
              verbose: true,
              log_console: true,
              js_code: 'console.log("Debug test"); return document.title',
              keep_data_attributes: true,
              cache_mode: 'BYPASS',
            },
          });

          await expectSuccessfulCrawl(result);
        },
        TEST_TIMEOUTS.short,
      );
    });

    describe('New crawler parameters (0.7.3/0.7.4)', () => {
      it(
        'should accept undetected browser type',
        async () => {
          const result = await client.callTool({
            name: 'crawl',
            arguments: {
              url: 'https://httpbin.org/user-agent',
              browser_type: 'undetected',
            },
          });

          await expectSuccessfulCrawl(result);
          const textContent = (result as ToolResult).content.find((c) => c.type === 'text');
          expect(textContent?.text).toBeTruthy();
          // The undetected browser should mask automation indicators
          // but we can at least verify the request was accepted
        },
        TEST_TIMEOUTS.short,
      );

      it(
        'should filter content using css_selector',
        async () => {
          const result = await client.callTool({
            name: 'crawl',
            arguments: {
              url: 'https://example.com',
              css_selector: 'h1',
              cache_mode: 'BYPASS',
            },
          });

          await expectSuccessfulCrawl(result);
          const textContent = (result as ToolResult).content.find((c) => c.type === 'text');
          expect(textContent?.text).toBeTruthy();
          // css_selector returns ONLY the selected element content
          expect(textContent?.text?.toLowerCase()).toContain('example domain');
          // Should NOT contain the paragraph text that's outside the h1
          expect(textContent?.text).not.toContain('use in illustrative examples');
        },
        TEST_TIMEOUTS.short,
      );

      it(
        'should include links when include_links is true',
        async () => {
          const result = await client.callTool({
            name: 'crawl',
            arguments: {
              url: 'https://example.com',
              include_links: true,
            },
          });

          await expectSuccessfulCrawl(result);
          // Check if links section is included
          const hasLinksInfo = (result as ToolResult).content.some(
            (item) => item.type === 'text' && item.text?.includes('Links:'),
          );
          expect(hasLinksInfo).toBe(true);
        },
        TEST_TIMEOUTS.short,
      );

      it(
        'should respect delay_before_return_html parameter',
        async () => {
          const startTime = Date.now();

          const result = await client.callTool({
            name: 'crawl',
            arguments: {
              url: 'https://httpbin.org/delay/1', // 1 second delay from server
              delay_before_return_html: 2, // Additional 2 second delay (in seconds, not ms)
              cache_mode: 'BYPASS',
            },
          });

          const elapsed = Date.now() - startTime;

          await expectSuccessfulCrawl(result);
          // Total time should be at least 3 seconds (1s from endpoint + 2s delay)
          expect(elapsed).toBeGreaterThanOrEqual(3000);
        },
        TEST_TIMEOUTS.medium,
      );

      it(
        'should convert relative URLs when resolve_absolute_urls is true',
        async () => {
          const result = await client.callTool({
            name: 'crawl',
            arguments: {
              url: 'https://example.com',
              resolve_absolute_urls: true,
              include_links: true,
              cache_mode: 'BYPASS',
            },
          });

          await expectSuccessfulCrawl(result);

          // Links should be in a separate content item
          const linksContent = (result as ToolResult).content.find(
            (c) => c.type === 'text' && c.text?.includes('Links:'),
          );

          // With include_links=true, links info should be present
          expect(linksContent).toBeDefined();
          expect(linksContent?.text).toContain('External: 1');
        },
        TEST_TIMEOUTS.short,
      );
    });
  });
});
```
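
The integration suite above depends on shared helpers imported from `./test-utils.js` (listed in the directory tree as `src/__tests__/integration/test-utils.ts`, but not included on this page). For orientation only, here is a minimal sketch of what those helpers could look like, inferred purely from how they are called in the tests; the timeout values, transport wiring, and assertion details are assumptions, not the project's actual implementation:

```typescript
// Hypothetical sketch of test-utils.ts, reconstructed from call sites above.
import { Client } from '@modelcontextprotocol/sdk/client/index.js';
import { StdioClientTransport } from '@modelcontextprotocol/sdk/client/stdio.js';

// Assumed values; the real timeouts may differ.
export const TEST_TIMEOUTS = { short: 30_000, medium: 60_000, long: 120_000 };

export async function createTestClient(): Promise<Client> {
  // Assumption: tests talk to the built MCP server over stdio.
  const transport = new StdioClientTransport({ command: 'node', args: ['dist/index.js'] });
  const client = new Client({ name: 'integration-test-client', version: '1.0.0' });
  await client.connect(transport);
  return client;
}

export async function cleanupTestClient(client: Client): Promise<void> {
  await client.close();
}

export function generateSessionId(): string {
  return `test-session-${Date.now()}-${Math.random().toString(36).slice(2, 8)}`;
}

export const delay = (ms: number): Promise<void> => new Promise((resolve) => setTimeout(resolve, ms));

export async function expectSuccessfulCrawl(result: unknown): Promise<void> {
  // Assumption: success means at least one non-empty text item without an error marker.
  const { content } = result as { content: Array<{ type: string; text?: string }> };
  expect(content.length).toBeGreaterThan(0);
  const text = content.find((c) => c.type === 'text');
  expect(text?.text).toBeTruthy();
  expect(text?.text).not.toContain('Error');
}

export async function expectScreenshot(result: unknown): Promise<void> {
  // Assumption: a screenshot arrives as an image content item with base64 data.
  const { content } = result as { content: Array<{ type: string; data?: string }> };
  expect(content.some((c) => c.type === 'image' && Boolean(c.data))).toBe(true);
}
```

Keeping the assertions in one helper module is what lets each test body stay a thin `client.callTool(...)` invocation followed by one or two intent-revealing checks.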

--------------------------------------------------------------------------------
/src/__tests__/index.server.test.ts:
--------------------------------------------------------------------------------

```typescript
/* eslint-env jest */
import { jest } from '@jest/globals';
import { describe, it, expect, beforeEach } from '@jest/globals';

// Create mock functions
const mockGetMarkdown = jest.fn();
const mockCaptureScreenshot = jest.fn();
const mockGeneratePDF = jest.fn();
const mockExecuteJS = jest.fn();
const mockGetHTML = jest.fn();
const mockBatchCrawl = jest.fn();
const mockExtractWithLLM = jest.fn();
const mockCrawl = jest.fn();
const mockParseSitemap = jest.fn();

// Mock the Crawl4AIService module
jest.unstable_mockModule('../crawl4ai-service.js', () => ({
  Crawl4AIService: jest.fn().mockImplementation(() => ({
    getMarkdown: mockGetMarkdown,
    captureScreenshot: mockCaptureScreenshot,
    generatePDF: mockGeneratePDF,
    executeJS: mockExecuteJS,
    getHTML: mockGetHTML,
    batchCrawl: mockBatchCrawl,
    extractWithLLM: mockExtractWithLLM,
    crawl: mockCrawl,
    parseSitemap: mockParseSitemap,
  })),
}));

// Mock MCP SDK
const mockSetRequestHandler = jest.fn();
const mockTool = jest.fn();
const mockConnect = jest.fn();

jest.unstable_mockModule('@modelcontextprotocol/sdk/server/index.js', () => ({
  Server: jest.fn().mockImplementation(() => ({
    setRequestHandler: mockSetRequestHandler,
    tool: mockTool,
    connect: mockConnect,
  })),
}));

// Mock the types module that exports the schemas
const CallToolRequestSchema = { method: 'tools/call' };
const ListToolsRequestSchema = { method: 'tools/list' };

jest.unstable_mockModule('@modelcontextprotocol/sdk/types.js', () => ({
  CallToolRequestSchema,
  ListToolsRequestSchema,
}));

jest.unstable_mockModule('@modelcontextprotocol/sdk/server/stdio.js', () => ({
  StdioServerTransport: jest.fn(),
}));

// Mock axios
const mockPost = jest.fn();
const mockGet = jest.fn();
const mockHead = jest.fn();

jest.unstable_mockModule('axios', () => ({
  default: {
    create: jest.fn(() => ({
      post: mockPost,
      get: mockGet,
      head: mockHead,
    })),
    get: mockGet,
  },
}));

// Now dynamically import the modules after mocks are set up
const { Crawl4AIServer } = await import('../server.js');
const {
  GetMarkdownSchema,
  CrawlSchema,
  BatchCrawlSchema,
  CaptureScreenshotSchema: _CaptureScreenshotSchema,
  GeneratePdfSchema: _GeneratePdfSchema,
  ExecuteJsSchema: _ExecuteJsSchema,
  ExtractWithLlmSchema: _ExtractWithLlmSchema,
  SmartCrawlSchema: _SmartCrawlSchema,
  CrawlRecursiveSchema: _CrawlRecursiveSchema,
} = await import('../schemas/validation-schemas.js');
const { Crawl4AIService } = await import('../crawl4ai-service.js');

// Import types statically (these are removed at compile time)
import type {
  MarkdownEndpointResponse,
  ScreenshotEndpointResponse,
  PDFEndpointResponse,
  HTMLEndpointResponse,
  CrawlEndpointResponse,
} from '../types.js';

// Define types for test results
interface ContentItem {
  type: string;
  text?: string;
  data?: string;
  resource?: {
    uri: string;
    mimeType: string;
    blob: string;
  };
}

interface ToolResult {
  content: ContentItem[];
}

type RequestHandler = (request: { method: string; params: unknown }) => Promise<ToolResult>;

// Removed TestServerMethods interface - no longer needed since we use 'any' type

describe('Crawl4AIServer Tool Handlers', () => {
  let server: any; // eslint-disable-line @typescript-eslint/no-explicit-any
  let requestHandler: RequestHandler;

  beforeEach(async () => {
    jest.clearAllMocks();

    // Reset all mock functions
    mockGetMarkdown.mockReset();
    mockCaptureScreenshot.mockReset();
    mockGeneratePDF.mockReset();
    mockExecuteJS.mockReset();
    mockGetHTML.mockReset();
    mockBatchCrawl.mockReset();
    mockExtractWithLLM.mockReset();
    mockCrawl.mockReset();
    mockParseSitemap.mockReset();
    mockPost.mockReset();
    mockGet.mockReset();
    mockHead.mockReset();

    // Create server instance - the mock will be used automatically
    server = new Crawl4AIServer(
      process.env.CRAWL4AI_BASE_URL || 'http://test.example.com',
      process.env.CRAWL4AI_API_KEY || 'test-api-key',
      'test-server',
      '1.0.0',
    );

    // Start the server to register handlers
    await server.start();

    // Get the request handler for CallToolRequestSchema
    const handlerCalls = mockSetRequestHandler.mock.calls;

    // Find the handler for CallToolRequestSchema (tools/call)
    for (const call of handlerCalls) {
      const [schema, handler] = call;
      if (schema && schema.method === 'tools/call') {
        requestHandler = handler;
        break;
      }
    }

    // Debug: Check if we found the handler
    if (!requestHandler) {
      console.log('Handler calls:', handlerCalls.length);
      handlerCalls.forEach((call, i) => {
        console.log(`Call ${i}:`, call[0], typeof call[1]);
      });
    }
  });

  // Add a simple test to verify mocking works
  it('should use the mocked service', () => {
    const MockedService = Crawl4AIService as jest.MockedClass<typeof Crawl4AIService>;
    expect(MockedService).toHaveBeenCalledTimes(1);
    expect(MockedService).toHaveBeenCalledWith('http://localhost:11235', 'test-api-key');
  });

  describe('Constructor and setup', () => {
    it('should initialize with correct configuration', () => {
      expect(server).toBeDefined();
      expect(server.service).toBeDefined();
      expect(server.sessions).toBeDefined();
    });

    it('should set up handlers on construction', () => {
      expect(mockSetRequestHandler).toHaveBeenCalled();
      expect(mockSetRequestHandler.mock.calls.length).toBeGreaterThan(0);
    });
  });

  describe('Tool Handler Success Cases', () => {
    describe('get_markdown', () => {
      it('should handle successful markdown extraction', async () => {
        const mockResponse: MarkdownEndpointResponse = {
          url: 'https://example.com',
          filter: 'fit',
          query: null,
          cache: 'false',
          markdown: '# Example Page\n\nThis is example content.',
          success: true,
        };

        mockGetMarkdown.mockResolvedValue(mockResponse);

        const result: ToolResult = await server.getMarkdown({
          url: 'https://example.com',
        });

        expect(result.content).toHaveLength(1);
        expect(result.content[0].type).toBe('text');
        expect(result.content[0].text).toContain('# Example Page');
        expect(result.content[0].text).toContain('URL: https://example.com');
        expect(result.content[0].text).toContain('Filter: fit');
      });

      it('should handle markdown with query', async () => {
        const mockResponse: MarkdownEndpointResponse = {
          url: 'https://example.com',
          filter: 'bm25',
          query: 'test query',
          cache: 'false',
          markdown: 'Filtered content',
          success: true,
        };

        mockGetMarkdown.mockResolvedValue(mockResponse);

        const result: ToolResult = await server.getMarkdown({
          url: 'https://example.com',
          filter: 'bm25',
          query: 'test query',
        });

        expect(mockGetMarkdown).toHaveBeenCalledWith({
          url: 'https://example.com',
          f: 'bm25',
          q: 'test query',
        });
        expect(result.content[0].text).toContain('Query: test query');
      });
    });

    describe('capture_screenshot', () => {
      it('should handle successful screenshot capture', async () => {
        const mockResponse: ScreenshotEndpointResponse = {
          success: true,
          screenshot: 'base64-encoded-screenshot-data',
        };

        mockCaptureScreenshot.mockResolvedValue(mockResponse);

        const result: ToolResult = await server.captureScreenshot({
          url: 'https://example.com',
        });

        expect(result.content).toHaveLength(2);
        expect(result.content[0].type).toBe('image');
        expect(result.content[0].data).toBe('base64-encoded-screenshot-data');
        expect(result.content[1].type).toBe('text');
        expect(result.content[1].text).toBe('Screenshot captured for: https://example.com');
      });
    });

    describe('generate_pdf', () => {
      it('should handle successful PDF generation', async () => {
        const mockResponse: PDFEndpointResponse = {
          success: true,
          pdf: 'base64-encoded-pdf-data',
        };

        mockGeneratePDF.mockResolvedValue(mockResponse);

        const result: ToolResult = await server.generatePDF({
          url: 'https://example.com',
        });

        expect(result.content).toHaveLength(2);
        expect(result.content[0].type).toBe('resource');
        expect(result.content[0].resource.blob).toBeDefined();
        expect(result.content[1].type).toBe('text');
        expect(result.content[1].text).toContain('PDF generated for: https://example.com');
      });
    });

    describe('execute_js', () => {
      it('should handle successful JS execution', async () => {
        const mockResponse = {
          markdown: 'Page content',
          js_execution_result: {
            success: true,
            results: ['Title: Example', 'Link count: 5'],
          },
        };

        mockExecuteJS.mockResolvedValue(mockResponse);

        const result: ToolResult = await server.executeJS({
          url: 'https://example.com',
          scripts: ['return document.title', 'return document.links.length'],
        });

        expect(result.content).toHaveLength(1);
        expect(result.content[0].type).toBe('text');
        expect(result.content[0].text).toContain('JavaScript executed on: https://example.com');
        expect(result.content[0].text).toContain('Title: Example');
        expect(result.content[0].text).toContain('Link count: 5');
      });

      it('should handle JS execution without results', async () => {
        const mockResponse = {
          markdown: 'Page content',
          js_execution_result: null,
        };

        mockExecuteJS.mockResolvedValue(mockResponse);

        const result: ToolResult = await server.executeJS({
          url: 'https://example.com',
          scripts: 'console.log("test")',
        });

        expect(result.content[0].text).toContain('JavaScript executed on: https://example.com');
        expect(result.content[0].text).toContain('No results returned');
      });

      it('should handle JS execution with error status', async () => {
        const mockResponse = {
          markdown: 'Page content',
          js_execution_result: {
            success: true,
            results: [
              {
                success: false,
                error: 'Error: Test error',
                stack: 'Error: Test error\n    at eval (eval at evaluate (:291:30), <anonymous>:4:43)',
              },
            ],
          },
        };

        mockExecuteJS.mockResolvedValue(mockResponse);

        const result: ToolResult = await server.executeJS({
          url: 'https://example.com',
          scripts: 'throw new Error("Test error")',
        });

        expect(result.content[0].text).toContain('JavaScript executed on: https://example.com');
        expect(result.content[0].text).toContain('Script: throw new Error("Test error")');
        expect(result.content[0].text).toContain('Returned: Error: Error: Test error');
      });

      it('should handle JS execution with no return value', async () => {
        const mockResponse = {
          markdown: 'Page content',
          js_execution_result: {
            success: true,
            results: [{ success: true }],
          },
        };

        mockExecuteJS.mockResolvedValue(mockResponse);

        const result: ToolResult = await server.executeJS({
          url: 'https://example.com',
          scripts: 'console.log("hello")',
        });

        expect(result.content[0].text).toContain('JavaScript executed on: https://example.com');
        expect(result.content[0].text).toContain('Returned: Executed successfully (no return value)');
      });
    });

    describe('get_html', () => {
      it('should handle successful HTML retrieval', async () => {
        const mockResponse: HTMLEndpointResponse = {
          html: '<html><body><h1>Example</h1></body></html>',
          url: 'https://example.com',
          success: true,
        };

        mockGetHTML.mockResolvedValue(mockResponse);

        const result: ToolResult = await server.getHTML({
          url: 'https://example.com',
        });

        expect(result.content).toHaveLength(1);
        expect(result.content[0].type).toBe('text');
        expect(result.content[0].text).toBe('<html><body><h1>Example</h1></body></html>');
      });
    });

    describe('batch_crawl', () => {
      it('should handle successful batch crawl', async () => {
        const mockResponse = {
          results: [
            { url: 'https://example1.com', markdown: { raw_markdown: 'Content 1' }, success: true },
            { url: 'https://example2.com', markdown: { raw_markdown: 'Content 2' }, success: true },
          ],
          success: true,
        };

        // Mock axios response since batchCrawl uses axiosClient directly
        mockPost.mockResolvedValue({ data: mockResponse });

        const result: ToolResult = await server.batchCrawl({
          urls: ['https://example1.com', 'https://example2.com'],
        });

        expect(result.content).toHaveLength(1);
        expect(result.content[0].text).toContain('Batch crawl completed');
        expect(result.content[0].text).toContain('Processed 2 URLs');
      });

      it('should handle batch crawl with remove_images', async () => {
        // Mock axios response since batchCrawl uses axiosClient directly
        mockPost.mockResolvedValue({ data: { results: [] } });

        const result: ToolResult = await server.batchCrawl({
          urls: ['https://example.com'],
          remove_images: true,
        });

        expect(mockPost).toHaveBeenCalledWith('/crawl', {
          urls: ['https://example.com'],
          crawler_config: {
            exclude_tags: ['img', 'picture', 'svg'],
          },
        });
        expect(result.content[0].text).toContain('Batch crawl completed');
      });
    });

    describe('crawl', () => {
      it('should handle successful crawl with all options', async () => {
        const mockResponse: CrawlEndpointResponse = {
 437 |         success: true,
 438 |         results: [
 439 |           {
 440 |             url: 'https://example.com',
 441 |             html: '<html>...</html>',
 442 |             cleaned_html: '<html>clean</html>',
 443 |             fit_html: '<html>fit</html>',
 444 |             success: true,
 445 |             status_code: 200,
 446 |             response_headers: {},
 447 |             session_id: 'test-session',
 448 |             metadata: { title: 'Example' },
 449 |             links: { internal: [], external: [] },
 450 |             media: { images: [], videos: [], audios: [] },
 451 |             markdown: {
 452 |               raw_markdown: '# Example',
 453 |               markdown_with_citations: '# Example [1]',
 454 |               references_markdown: '[1]: https://example.com',
 455 |               fit_markdown: '# Example',
 456 |               fit_html: '<h1>Example</h1>',
 457 |             },
 458 |             tables: [],
 459 |             extracted_content: null,
 460 |             screenshot: 'screenshot-data',
 461 |             pdf: 'pdf-data',
 462 |             mhtml: null,
 463 |             js_execution_result: { success: true, results: ['JS result'] },
 464 |             downloaded_files: null,
 465 |             network_requests: null,
 466 |             console_messages: ['Console log'],
 467 |             ssl_certificate: null,
 468 |             dispatch_result: null,
 469 |           },
 470 |         ],
 471 |         server_processing_time_s: 1.5,
 472 |         server_memory_delta_mb: 10,
 473 |         server_peak_memory_mb: 100,
 474 |       };
 475 | 
 476 |       mockCrawl.mockResolvedValue(mockResponse);
 477 | 
 478 |       const result: ToolResult = await server.crawl({
 479 |         url: 'https://example.com',
 480 |         screenshot: true,
 481 |         pdf: true,
 482 |         js_code: 'return document.title',
 483 |         session_id: 'test-session',
 484 |       });
 485 | 
 486 |       expect(result.content.length).toBeGreaterThan(0); // Multiple content types
 487 |       // Check text content
 488 |       const textContent = result.content.find((c) => c.type === 'text' && c.text?.includes('# Example'));
 489 |       expect(textContent).toBeDefined();
 490 |       // Check screenshot
 491 |       const screenshotContent = result.content.find((c) => c.type === 'image');
 492 |       expect(screenshotContent?.data).toBe('screenshot-data');
 493 |     });
 494 | 
 495 |     it('should handle crawl with proxy configuration', async () => {
 496 |       const mockResponse: CrawlEndpointResponse = {
 497 |         success: true,
 498 |         results: [
 499 |           {
 500 |             url: 'https://example.com',
 501 |             markdown: { raw_markdown: 'Proxied content' },
 502 |             success: true,
 503 |             status_code: 200,
 504 |           },
 505 |         ],
 506 |       };
 507 | 
 508 |       mockCrawl.mockResolvedValue(mockResponse);
 509 | 
 510 |       await server.crawl({
 511 |         url: 'https://example.com',
 512 |         proxy_server: 'http://proxy.example.com:8080',
 513 |         proxy_username: 'user',
 514 |         proxy_password: 'pass',
 515 |       });
 516 | 
 517 |       expect(mockCrawl).toHaveBeenCalledWith(
 518 |         expect.objectContaining({
 519 |           browser_config: expect.objectContaining({
 520 |             proxy_config: {
 521 |               server: 'http://proxy.example.com:8080',
 522 |               username: 'user',
 523 |               password: 'pass',
 524 |             },
 525 |           }),
 526 |         }),
 527 |       );
 528 |     });
 529 | 
 530 |     it('should handle crawl with cookies and headers', async () => {
 531 |       const mockResponse: CrawlEndpointResponse = {
 532 |         success: true,
 533 |         results: [
 534 |           {
 535 |             url: 'https://example.com',
 536 |             markdown: { raw_markdown: 'Content with auth' },
 537 |             success: true,
 538 |             status_code: 200,
 539 |           },
 540 |         ],
 541 |       };
 542 | 
 543 |       mockCrawl.mockResolvedValue(mockResponse);
 544 | 
 545 |       await server.crawl({
 546 |         url: 'https://example.com',
 547 |         cookies: [{ name: 'session', value: 'abc123' }],
 548 |         headers: { Authorization: 'Bearer token123' },
 549 |       });
 550 | 
 551 |       expect(mockCrawl).toHaveBeenCalledWith(
 552 |         expect.objectContaining({
 553 |           browser_config: expect.objectContaining({
 554 |             cookies: [{ name: 'session', value: 'abc123' }],
 555 |             headers: { Authorization: 'Bearer token123' },
 556 |           }),
 557 |         }),
 558 |       );
 559 |     });
 560 | 
 561 |     it('should handle virtual scroll configuration', async () => {
 562 |       const mockResponse: CrawlEndpointResponse = {
 563 |         success: true,
 564 |         results: [
 565 |           {
 566 |             url: 'https://example.com',
 567 |             markdown: { raw_markdown: 'Scrolled content' },
 568 |             success: true,
 569 |             status_code: 200,
 570 |           },
 571 |         ],
 572 |       };
 573 | 
 574 |       mockCrawl.mockResolvedValue(mockResponse);
 575 | 
 576 |       await server.crawl({
 577 |         url: 'https://example.com',
 578 |         virtual_scroll_config: {
 579 |           enabled: true,
 580 |           scroll_step: 100,
 581 |           max_scrolls: 10,
 582 |         },
 583 |       });
 584 | 
 585 |       expect(mockCrawl).toHaveBeenCalledWith(
 586 |         expect.objectContaining({
 587 |           crawler_config: expect.objectContaining({
 588 |             virtual_scroll_config: {
 589 |               enabled: true,
 590 |               scroll_step: 100,
 591 |               max_scrolls: 10,
 592 |             },
 593 |           }),
 594 |         }),
 595 |       );
 596 |     });
 597 | 
 598 |     it('should handle js_code as null error', async () => {
 599 |       await expect(
 600 |         server.crawl({
 601 |           url: 'https://example.com',
 602 |           js_code: null,
 603 |         }),
 604 |       ).rejects.toThrow('js_code parameter is null');
 605 |     });
 606 |   });
 607 | 
 608 |   describe('extract_with_llm', () => {
 609 |     it('should handle successful LLM extraction', async () => {
 610 |       mockExtractWithLLM.mockResolvedValue({
 611 |         answer: 'The main topic is JavaScript testing.',
 612 |       });
 613 | 
 614 |       const result: ToolResult = await server.extractWithLLM({
 615 |         url: 'https://example.com',
 616 |         query: 'What is the main topic?',
 617 |       });
 618 | 
 619 |       expect(result.content).toHaveLength(1);
 620 |       expect(result.content[0].text).toBe('The main topic is JavaScript testing.');
 621 |     });
 622 |   });
 623 | 
 624 |   describe('extract_links', () => {
 625 |     it('should extract and categorize links', async () => {
 626 |       mockPost.mockResolvedValue({
 627 |         data: {
 628 |           results: [
 629 |             {
 630 |               links: {
 631 |                 internal: [
 632 |                   { href: '/page1', text: 'Page 1' },
 633 |                   { href: '/page2', text: 'Page 2' },
 634 |                 ],
 635 |                 external: [{ href: 'https://external.com', text: 'External' }],
 636 |               },
 637 |             },
 638 |           ],
 639 |         },
 640 |       });
 641 | 
 642 |       const result: ToolResult = await server.extractLinks({
 643 |         url: 'https://example.com',
 644 |         categorize: true,
 645 |       });
 646 | 
 647 |       expect(result.content[0].text).toContain('Link analysis for https://example.com:');
 648 |       expect(result.content[0].text).toContain('internal (2)');
 649 |       expect(result.content[0].text).toContain('/page1');
 650 |       expect(result.content[0].text).toContain('external (1)');
 651 |     });
 652 | 
 653 |     it('should categorize external links (social, images, scripts)', async () => {
 654 |       mockPost.mockResolvedValue({
 655 |         data: {
 656 |           results: [
 657 |             {
 658 |               links: {
 659 |                 internal: [],
 660 |                 external: [
 661 |                   'https://facebook.com/profile',
 662 |                   'https://example.com/image.jpg',
 663 |                   'https://cdn.com/script.js',
 664 |                 ],
 665 |               },
 666 |             },
 667 |           ],
 668 |         },
 669 |       });
 670 | 
 671 |       const result: ToolResult = await server.extractLinks({
 672 |         url: 'https://example.com',
 673 |         categorize: true,
 674 |       });
 675 | 
 676 |       expect(result.content[0].text).toContain('social (1)');
 677 |       expect(result.content[0].text).toContain('images (1)');
 678 |       expect(result.content[0].text).toContain('scripts (1)');
 679 |       expect(result.content[0].text).toContain('external (0)');
 680 |     });
 681 |   });
 682 | 
 683 |   describe('crawl_recursive', () => {
 684 |     it('should crawl recursively with depth limit', async () => {
 685 |       // Ensure mock is clean before setting up
 686 |       mockPost.mockReset();
 687 | 
 688 |       mockPost
 689 |         .mockResolvedValueOnce({
 690 |           data: {
 691 |             results: [
 692 |               {
 693 |                 url: 'https://example.com',
 694 |                 links: {
 695 |                   internal: [{ href: 'https://example.com/page1', text: 'Page 1' }],
 696 |                 },
 697 |                 markdown: { raw_markdown: 'Home page' },
 698 |                 success: true,
 699 |               },
 700 |             ],
 701 |           },
 702 |         })
 703 |         .mockResolvedValueOnce({
 704 |           data: {
 705 |             results: [
 706 |               {
 707 |                 url: 'https://example.com/page1',
 708 |                 links: { internal: [] },
 709 |                 markdown: { raw_markdown: 'Page 1 content' },
 710 |                 success: true,
 711 |               },
 712 |             ],
 713 |           },
 714 |         });
 715 | 
 716 |       const result: ToolResult = await server.crawlRecursive({
 717 |         url: 'https://example.com',
 718 |         max_depth: 2,
 719 |       });
 720 | 
 721 |       expect(result.content[0].text).toContain('Recursive crawl completed:');
 722 |       expect(result.content[0].text).toContain('Pages crawled: 2');
 723 |       expect(result.content[0].text).toContain('https://example.com');
 724 |       expect(result.content[0].text).toContain('https://example.com/page1');
 725 |     });
 726 |   });
 727 | 
 728 |   describe('parse_sitemap', () => {
 729 |     it('should parse sitemap successfully', async () => {
 730 |       mockGet.mockResolvedValue({
 731 |         data: `<?xml version="1.0" encoding="UTF-8"?>
 732 | <urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
 733 | <url><loc>https://example.com/</loc></url>
 734 | <url><loc>https://example.com/page1</loc></url>
 735 | <url><loc>https://example.com/page2</loc></url>
 736 | </urlset>`,
 737 |       });
 738 | 
 739 |       const result: ToolResult = await server.parseSitemap({
 740 |         url: 'https://example.com/sitemap.xml',
 741 |       });
 742 | 
 743 |       expect(result.content[0].text).toContain('Sitemap parsed successfully:');
 744 |       expect(result.content[0].text).toContain('Total URLs found: 3');
 745 |       expect(result.content[0].text).toContain('https://example.com/');
 746 |       expect(result.content[0].text).toContain('https://example.com/page1');
 747 |     });
 748 |   });
 749 | 
 750 |   describe('smart_crawl', () => {
 751 |     it('should handle smart crawl for HTML content', async () => {
 752 |       mockHead.mockResolvedValue({
 753 |         headers: { 'content-type': 'text/html' },
 754 |       });
 755 |       mockPost.mockResolvedValue({
 756 |         data: {
 757 |           results: [
 758 |             {
 759 |               markdown: { raw_markdown: 'HTML content' },
 760 |               links: { internal: [], external: [] },
 761 |             },
 762 |           ],
 763 |         },
 764 |       });
 765 | 
 766 |       const result: ToolResult = await server.smartCrawl({
 767 |         url: 'https://example.com',
 768 |       });
 769 | 
 770 |       expect(result.content[0].text).toContain('Smart crawl detected content type');
 771 |       // For text/html responses the detected type resolves to 'html'
 772 |     });
 773 | 
 774 |     it('should handle smart crawl for PDF content', async () => {
 775 |       mockHead.mockResolvedValue({
 776 |         headers: { 'content-type': 'application/pdf' },
 777 |       });
 778 | 
 779 |       // Mock the crawl response for PDF
 780 |       mockPost.mockResolvedValue({
 781 |         data: {
 782 |           results: [
 783 |             {
 784 |               markdown: { raw_markdown: 'PDF content extracted' },
 785 |               links: { internal: [], external: [] },
 786 |             },
 787 |           ],
 788 |         },
 789 |       });
 790 | 
 791 |       const result: ToolResult = await server.smartCrawl({
 792 |         url: 'https://example.com/doc.pdf',
 793 |       });
 794 | 
 795 |       expect(result.content[0].text).toContain('Smart crawl detected content type');
 796 |       expect(result.content[0].text).toContain('PDF content extracted');
 797 |     });
 798 |   });
 799 | });
 800 | 
 801 | describe('Tool Handler Error Cases', () => {
 802 |   describe('Service errors', () => {
 803 |     it('should handle service error for get_markdown', async () => {
 804 |       mockGetMarkdown.mockRejectedValue(new Error('Network error'));
 805 | 
 806 |       await expect(server.getMarkdown({ url: 'https://example.com' })).rejects.toThrow(
 807 |         'Failed to get markdown: Network error',
 808 |       );
 809 |     });
 810 | 
 811 |     it('should handle axios error with response detail', async () => {
 812 |       const axiosError = {
 813 |         response: {
 814 |           data: {
 815 |             detail: 'Invalid API key',
 816 |           },
 817 |         },
 818 |       };
 819 |       mockCaptureScreenshot.mockRejectedValue(axiosError);
 820 | 
 821 |       await expect(server.captureScreenshot({ url: 'https://example.com' })).rejects.toThrow(
 822 |         'Failed to capture screenshot: Invalid API key',
 823 |       );
 824 |     });
 825 | 
 826 |     it('should handle missing screenshot data', async () => {
 827 |       mockCaptureScreenshot.mockResolvedValue({
 828 |         success: false,
 829 |         screenshot: '',
 830 |       });
 831 | 
 832 |       await expect(server.captureScreenshot({ url: 'https://example.com' })).rejects.toThrow(
 833 |         'Screenshot capture failed - no screenshot data in response',
 834 |       );
 835 |     });
 836 | 
 837 |     it('should handle missing PDF data', async () => {
 838 |       mockGeneratePDF.mockResolvedValue({
 839 |         success: true,
 840 |         pdf: '',
 841 |       });
 842 | 
 843 |       await expect(server.generatePDF({ url: 'https://example.com' })).rejects.toThrow(
 844 |         'PDF generation failed - no PDF data in response',
 845 |       );
 846 |     });
 847 |   });
 848 | 
 849 |   describe('Validation errors', () => {
 850 |     it('should handle missing scripts for execute_js', async () => {
 851 |       await expect(
 852 |         server.executeJS({ url: 'https://example.com', scripts: null as unknown as string }),
 853 |       ).rejects.toThrow('scripts is required');
 854 |     });
 855 | 
 856 |     it('should handle empty crawl options', async () => {
 857 |       await expect(server.crawl(null as unknown as Parameters<typeof server.crawl>[0])).rejects.toThrow(
 858 |         'crawl requires options object with at least a url parameter',
 859 |       );
 860 |     });
 861 | 
 862 |     it('should handle crawl_recursive errors', async () => {
 863 |       // Setup the mock to fail - crawlRecursive catches the error internally
 864 |       mockPost.mockRejectedValue(new Error('API error'));
 865 | 
 866 |       const result: ToolResult = await server.crawlRecursive({ url: 'https://example.com' });
 867 | 
 868 |       // The method catches errors and returns a message about no pages crawled
 869 |       expect(result.content[0].text).toContain('Pages crawled: 0');
 870 |       expect(result.content[0].text).toContain('No pages could be crawled');
 871 |     });
 872 | 
 873 |     it('should handle parse_sitemap errors', async () => {
 874 |       mockGet.mockRejectedValue(new Error('Failed to fetch sitemap'));
 875 | 
 876 |       await expect(server.parseSitemap({ url: 'https://example.com/sitemap.xml' })).rejects.toThrow(
 877 |         'Failed to parse sitemap: Failed to fetch sitemap',
 878 |       );
 879 |     });
 880 |   });
 881 | 
 882 |   describe('Edge cases', () => {
 883 |     it('should handle batch crawl with no results', async () => {
 884 |       mockPost.mockResolvedValue({
 885 |         data: {
 886 |           results: [],
 887 |         },
 888 |       });
 889 | 
 890 |       const result: ToolResult = await server.batchCrawl({
 891 |         urls: ['https://example.com'],
 892 |       });
 893 | 
 894 |       expect(result.content[0].text).toContain('Batch crawl completed');
 895 |       expect(result.content[0].text).toContain('Processed 0 URLs');
 896 |     });
 897 | 
 898 |     it('should handle extract_links with no links', async () => {
 899 |       mockPost.mockResolvedValue({
 900 |         data: {
 901 |           results: [
 902 |             {
 903 |               links: {
 904 |                 internal: [],
 905 |                 external: [],
 906 |               },
 907 |             },
 908 |           ],
 909 |         },
 910 |       });
 911 | 
 912 |       const result: ToolResult = await server.extractLinks({
 913 |         url: 'https://example.com',
 914 |       });
 915 | 
 916 |       expect(result.content[0].text).toContain('All links from https://example.com:');
 917 |       expect(result.content[0].text).toMatch(/\n\s*$/);
 918 |     });
 919 | 
 920 |     it('should handle smart crawl with HEAD request failure', async () => {
 921 |       mockHead.mockRejectedValue(new Error('HEAD failed'));
 922 |       // Fallback to HTML crawl
 923 |       mockPost.mockResolvedValue({
 924 |         data: {
 925 |           results: [
 926 |             {
 927 |               markdown: { raw_markdown: 'Fallback content' },
 928 |               links: { internal: [], external: [] },
 929 |             },
 930 |           ],
 931 |         },
 932 |       });
 933 | 
 934 |       const result: ToolResult = await server.smartCrawl({
 935 |         url: 'https://example.com',
 936 |       });
 937 | 
 938 |       expect(result.content[0].text).toContain('Smart crawl detected content type');
 939 |     });
 940 |   });
 941 | 
 942 |   describe('ZodError validation tests', () => {
 943 |     it('should validate get_markdown parameters', () => {
 944 |       // Valid case
 945 |       expect(() => {
 946 |         GetMarkdownSchema.parse({ url: 'https://example.com' });
 947 |       }).not.toThrow();
 948 | 
 949 |       // Invalid - missing url
 950 |       expect(() => {
 951 |         GetMarkdownSchema.parse({ filter: 'fit' });
 952 |       }).toThrow();
 953 | 
 954 |       // Invalid - bm25 without query
 955 |       expect(() => {
 956 |         GetMarkdownSchema.parse({ url: 'https://example.com', filter: 'bm25' });
 957 |       }).toThrow('Query parameter is required when using bm25 or llm filter');
 958 |     });
 959 | 
 960 |     it('should validate crawl parameters', () => {
 961 |       // Valid case
 962 |       expect(() => {
 963 |         CrawlSchema.parse({ url: 'https://example.com' });
 964 |       }).not.toThrow();
 965 | 
 966 |       // Invalid - js_only without session_id
 967 |       expect(() => {
 968 |         CrawlSchema.parse({ url: 'https://example.com', js_only: true });
 969 |       }).toThrow('js_only requires session_id');
 970 | 
 971 |       // Invalid - empty js_code array
 972 |       expect(() => {
 973 |         CrawlSchema.parse({ url: 'https://example.com', js_code: [] });
 974 |       }).toThrow('js_code array cannot be empty');
 975 |     });
 976 | 
 977 |     it('should validate batch_crawl parameters', () => {
 978 |       // Valid case
 979 |       expect(() => {
 980 |         BatchCrawlSchema.parse({ urls: ['https://example.com'] });
 981 |       }).not.toThrow();
 982 | 
 983 |       // Invalid - not an array
 984 |       expect(() => {
 985 |         BatchCrawlSchema.parse({ urls: 'not-an-array' });
 986 |       }).toThrow();
 987 |     });
 988 |   });
 989 | 
 990 |   describe('Parameter validation edge cases', () => {
 991 |     // These tests require proper schema validation which happens at the handler level
 992 |     // Skipping direct method calls as they bypass validation
 993 |   });
 994 | 
 995 |   describe('Additional coverage tests', () => {
 996 |     it('should handle crawl with media extraction', async () => {
 997 |       mockCrawl.mockResolvedValue({
 998 |         success: true,
 999 |         results: [
1000 |           {
1001 |             url: 'https://example.com',
1002 |             markdown: { raw_markdown: 'Content' },
1003 |             media: {
1004 |               images: [
1005 |                 { src: 'https://example.com/img1.jpg', alt: 'Image 1' },
1006 |                 { src: 'https://example.com/img2.jpg', alt: 'Image 2' },
1007 |               ],
1008 |               videos: [{ src: 'https://example.com/video.mp4', type: 'video/mp4' }],
1009 |               audios: [],
1010 |             },
1011 |             success: true,
1012 |             status_code: 200,
1013 |           },
1014 |         ],
1015 |       });
1016 | 
1017 |       const result: ToolResult = await server.crawl({
1018 |         url: 'https://example.com',
1019 |         media_handling: { images: true, videos: true },
1020 |       });
1021 | 
1022 |       expect(result.content.length).toBeGreaterThan(0);
1023 |       expect(result.content[0].type).toBe('text');
1024 |       expect(result.content[0].text).toBe('Content');
1025 |     });
1026 | 
1027 |     it('should handle crawl with tables extraction', async () => {
1028 |       mockCrawl.mockResolvedValue({
1029 |         success: true,
1030 |         results: [
1031 |           {
1032 |             url: 'https://example.com',
1033 |             markdown: { raw_markdown: 'Content' },
1034 |             tables: [
1035 |               {
1036 |                 headers: ['Name', 'Age'],
1037 |                 rows: [
1038 |                   ['John', '30'],
1039 |                   ['Jane', '25'],
1040 |                 ],
1041 |                 markdown: '| Name | Age |\n|------|-----|\n| John | 30 |\n| Jane | 25 |',
1042 |               },
1043 |             ],
1044 |             success: true,
1045 |             status_code: 200,
1046 |           },
1047 |         ],
1048 |       });
1049 | 
1050 |       const result: ToolResult = await server.crawl({
1051 |         url: 'https://example.com',
1052 |       });
1053 | 
1054 |       expect(result.content.length).toBeGreaterThan(0);
1055 |       expect(result.content[0].type).toBe('text');
1056 |       expect(result.content[0].text).toBe('Content');
1057 |     });
1058 | 
1059 |     it('should handle crawl with network_requests', async () => {
1060 |       mockCrawl.mockResolvedValue({
1061 |         success: true,
1062 |         results: [
1063 |           {
1064 |             url: 'https://example.com',
1065 |             markdown: { raw_markdown: 'Content' },
1066 |             network_requests: [
1067 |               { url: 'https://api.example.com/data', method: 'GET', status: 200 },
1068 |               { url: 'https://api.example.com/post', method: 'POST', status: 201 },
1069 |             ],
1070 |             success: true,
1071 |             status_code: 200,
1072 |           },
1073 |         ],
1074 |       });
1075 | 
1076 |       const result: ToolResult = await server.crawl({
1077 |         url: 'https://example.com',
1078 |         network_requests: true,
1079 |       });
1080 | 
1081 |       expect(result.content.length).toBeGreaterThan(0);
1082 |       expect(result.content[0].type).toBe('text');
1083 |       expect(result.content[0].text).toBe('Content');
1084 |     });
1085 | 
1086 |     it('should handle crawl with mhtml output', async () => {
1087 |       mockCrawl.mockResolvedValue({
1088 |         success: true,
1089 |         results: [
1090 |           {
1091 |             url: 'https://example.com',
1092 |             markdown: { raw_markdown: 'Content' },
1093 |             mhtml: 'MHTML content here',
1094 |             success: true,
1095 |             status_code: 200,
1096 |           },
1097 |         ],
1098 |       });
1099 | 
1100 |       const result: ToolResult = await server.crawl({
1101 |         url: 'https://example.com',
1102 |         mhtml: true,
1103 |       });
1104 | 
1105 |       expect(result.content.length).toBeGreaterThan(0);
1106 |       expect(result.content[0].type).toBe('text');
1107 |       expect(result.content[0].text).toBe('Content');
1108 |     });
1109 | 
1110 |     it('should handle crawl with downloaded_files', async () => {
1111 |       mockCrawl.mockResolvedValue({
1112 |         success: true,
1113 |         results: [
1114 |           {
1115 |             url: 'https://example.com',
1116 |             markdown: { raw_markdown: 'Content' },
1117 |             downloaded_files: {
1118 |               'file1.pdf': 'base64content1',
1119 |               'file2.doc': 'base64content2',
1120 |             },
1121 |             success: true,
1122 |             status_code: 200,
1123 |           },
1124 |         ],
1125 |       });
1126 | 
1127 |       const result: ToolResult = await server.crawl({
1128 |         url: 'https://example.com',
1129 |         download_files: true,
1130 |       });
1131 | 
1132 |       expect(result.content.length).toBeGreaterThan(0);
1133 |       expect(result.content[0].type).toBe('text');
1134 |       expect(result.content[0].text).toBe('Content');
1135 |     });
1136 | 
1137 |     it('should handle crawl with ssl_certificate', async () => {
1138 |       mockCrawl.mockResolvedValue({
1139 |         success: true,
1140 |         results: [
1141 |           {
1142 |             url: 'https://example.com',
1143 |             markdown: { raw_markdown: 'Content' },
1144 |             ssl_certificate: {
1145 |               issuer: "Let's Encrypt",
1146 |               subject: '*.example.com',
1147 |               validFrom: '2024-01-01',
1148 |               validTo: '2024-12-31',
1149 |               protocol: 'TLSv1.3',
1150 |             },
1151 |             success: true,
1152 |             status_code: 200,
1153 |           },
1154 |         ],
1155 |       });
1156 | 
1157 |       const result: ToolResult = await server.crawl({
1158 |         url: 'https://example.com',
1159 |         ssl_certificate: true,
1160 |       });
1161 | 
1162 |       expect(result.content.length).toBeGreaterThan(0);
1163 |       expect(result.content[0].type).toBe('text');
1164 |       expect(result.content[0].text).toBe('Content');
1165 |     });
1166 | 
1167 |     it('should handle crawl with wait_for conditions', async () => {
1168 |       mockCrawl.mockResolvedValue({
1169 |         success: true,
1170 |         results: [
1171 |           {
1172 |             url: 'https://example.com',
1173 |             markdown: { raw_markdown: 'Dynamic content loaded' },
1174 |             success: true,
1175 |             status_code: 200,
1176 |           },
1177 |         ],
1178 |       });
1179 | 
1180 |       await server.crawl({
1181 |         url: 'https://example.com',
1182 |         wait_for: {
1183 |           selector: '.dynamic-content',
1184 |           timeout: 5000,
1185 |         },
1186 |       });
1187 | 
1188 |       expect(mockCrawl).toHaveBeenCalledWith(
1189 |         expect.objectContaining({
1190 |           crawler_config: expect.objectContaining({
1191 |             wait_for: {
1192 |               selector: '.dynamic-content',
1193 |               timeout: 5000,
1194 |             },
1195 |           }),
1196 |         }),
1197 |       );
1198 |     });
1199 | 
1200 |     it('should handle crawl error scenarios', async () => {
1201 |       mockCrawl.mockResolvedValue({
1202 |         success: false,
1203 |         results: [
1204 |           {
1205 |             url: 'https://example.com',
1206 |             success: false,
1207 |             error: 'Page load timeout',
1208 |             status_code: 0,
1209 |           },
1210 |         ],
1211 |       });
1212 | 
1213 |       const result: ToolResult = await server.crawl({
1214 |         url: 'https://example.com',
1215 |       });
1216 | 
1217 |       expect(result.content[0].text).toBe('No content extracted');
1218 |     });
1219 | 
1220 |     it('should handle extract_links with categorized output', async () => {
1221 |       mockPost.mockResolvedValue({
1222 |         data: {
1223 |           results: [
1224 |             {
1225 |               links: {
1226 |                 internal: [
1227 |                   { href: '/page1', text: 'Page 1' },
1228 |                   { href: '/page2', text: 'Page 2' },
1229 |                 ],
1230 |                 external: [{ href: 'https://external.com', text: 'External' }],
1231 |                 social: [{ href: 'https://twitter.com/example', text: 'Twitter' }],
1232 |                 documents: [{ href: '/file.pdf', text: 'PDF Document' }],
1233 |                 images: [{ href: '/image.jpg', text: 'Image' }],
1234 |               },
1235 |             },
1236 |           ],
1237 |         },
1238 |       });
1239 | 
1240 |       const result: ToolResult = await server.extractLinks({
1241 |         url: 'https://example.com',
1242 |         categorize: true,
1243 |       });
1244 | 
1245 |       expect(result.content[0].text).toContain('internal (2)');
1246 |       expect(result.content[0].text).toContain('external (1)');
1247 |       expect(result.content[0].text).toContain('social (0)'); // No social links in internal/external
1248 |       expect(result.content[0].text).toContain('documents (0)'); // No documents in internal/external
1249 |       expect(result.content[0].text).toContain('images (0)'); // No images in internal/external
1250 |     });
1251 | 
1252 |     it('should handle smart_crawl for sitemap', async () => {
1253 |       // Set up axios client mock for the server instance
1254 |       const axiosClientMock = {
1255 |         head: jest.fn().mockResolvedValue({
1256 |           headers: { 'content-type': 'application/xml' },
1257 |         }),
1258 |         post: jest.fn().mockResolvedValue({
1259 |           data: {
1260 |             results: [
1261 |               {
1262 |                 url: 'https://example.com/sitemap.xml',
1263 |                 markdown: { raw_markdown: 'Sitemap content' },
1264 |                 success: true,
1265 |                 status_code: 200,
1266 |               },
1267 |             ],
1268 |           },
1269 |         }),
1270 |       };
1271 |       server.axiosClientForTesting = axiosClientMock;
1272 | 
1273 |       const result: ToolResult = await server.smartCrawl({
1274 |         url: 'https://example.com/sitemap.xml',
1275 |       });
1276 | 
1277 |       expect(result.content[0].text).toContain('Smart crawl detected content type: sitemap');
1278 |       expect(result.content[0].text).toContain('Sitemap content');
1279 |       expect(axiosClientMock.post).toHaveBeenCalledWith(
1280 |         '/crawl',
1281 |         expect.objectContaining({
1282 |           urls: ['https://example.com/sitemap.xml'],
1283 |           crawler_config: expect.objectContaining({
1284 |             cache_mode: 'ENABLED',
1285 |           }),
1286 |           browser_config: expect.objectContaining({
1287 |             headless: true,
1288 |             browser_type: 'chromium',
1289 |           }),
1290 |         }),
1291 |       );
1292 |     });
1293 | 
1294 |     it('should handle smart_crawl for RSS feed', async () => {
1295 |       const axiosClientMock = {
1296 |         head: jest.fn().mockResolvedValue({
1297 |           headers: { 'content-type': 'application/rss+xml' },
1298 |         }),
1299 |         post: jest.fn().mockResolvedValue({
1300 |           data: {
1301 |             results: [
1302 |               {
1303 |                 url: 'https://example.com/feed.rss',
1304 |                 markdown: { raw_markdown: 'RSS feed content' },
1305 |                 success: true,
1306 |                 status_code: 200,
1307 |               },
1308 |             ],
1309 |           },
1310 |         }),
1311 |       };
1312 |       server.axiosClientForTesting = axiosClientMock;
1313 | 
1314 |       const result: ToolResult = await server.smartCrawl({
1315 |         url: 'https://example.com/feed.rss',
1316 |       });
1317 | 
1318 |       expect(result.content[0].text).toContain('Smart crawl detected content type: rss');
1319 |       expect(result.content[0].text).toContain('RSS feed content');
1320 |       expect(axiosClientMock.post).toHaveBeenCalledWith(
1321 |         '/crawl',
1322 |         expect.objectContaining({
1323 |           urls: ['https://example.com/feed.rss'],
1324 |           crawler_config: expect.objectContaining({
1325 |             cache_mode: 'ENABLED',
1326 |           }),
1327 |           browser_config: expect.objectContaining({
1328 |             headless: true,
1329 |             browser_type: 'chromium',
1330 |           }),
1331 |         }),
1332 |       );
1333 |     });
1334 | 
1335 |     it('should handle smart_crawl for JSON content', async () => {
1336 |       const axiosClientMock = {
1337 |         head: jest.fn().mockResolvedValue({
1338 |           headers: { 'content-type': 'application/json' },
1339 |         }),
1340 |         post: jest.fn().mockResolvedValue({
1341 |           data: {
1342 |             results: [
1343 |               {
1344 |                 url: 'https://example.com/data.json',
1345 |                 markdown: { raw_markdown: 'JSON content' },
1346 |                 success: true,
1347 |                 status_code: 200,
1348 |               },
1349 |             ],
1350 |           },
1351 |         }),
1352 |       };
1353 |       server.axiosClientForTesting = axiosClientMock;
1354 | 
1355 |       const result: ToolResult = await server.smartCrawl({
1356 |         url: 'https://example.com/data.json',
1357 |       });
1358 | 
1359 |       expect(result.content[0].text).toContain('Smart crawl detected content type: json');
1360 |       expect(result.content[0].text).toContain('JSON content');
1361 |       expect(axiosClientMock.post).toHaveBeenCalledWith(
1362 |         '/crawl',
1363 |         expect.objectContaining({
1364 |           urls: ['https://example.com/data.json'],
1365 |           crawler_config: expect.objectContaining({
1366 |             cache_mode: 'ENABLED',
1367 |           }),
1368 |           browser_config: expect.objectContaining({
1369 |             headless: true,
1370 |             browser_type: 'chromium',
1371 |           }),
1372 |         }),
1373 |       );
1374 |     });
1375 | 
1376 |     it('should correctly categorize internal documents and images', async () => {
1377 |       mockPost.mockResolvedValue({
1378 |         data: {
1379 |           results: [
1380 |             {
1381 |               links: {
1382 |                 internal: [
1383 |                   { href: '/page1', text: 'Page 1' },
1384 |                   { href: '/docs/manual.pdf', text: 'Manual' },
1385 |                   { href: '/images/logo.png', text: 'Logo' },
1386 |                   { href: '/assets/style.css', text: 'Styles' },
1387 |                 ],
1388 |                 external: [{ href: 'https://example.com/report.pdf', text: 'External Report' }],
1389 |               },
1390 |             },
1391 |           ],
1392 |         },
1393 |       });
1394 | 
1395 |       const result: ToolResult = await server.extractLinks({
1396 |         url: 'https://example.com',
1397 |         categorize: true,
1398 |       });
1399 | 
1400 |       expect(result.content[0].text).toContain('internal (1)'); // Only /page1 remains as internal
1401 |       expect(result.content[0].text).toContain('external (0)'); // External PDF moved to documents
1402 |       expect(result.content[0].text).toContain('documents (2)'); // Both PDFs
1403 |       expect(result.content[0].text).toContain('images (1)'); // The PNG
1404 |       expect(result.content[0].text).toContain('scripts (1)'); // The CSS
1405 |     });
1406 | 
1407 |     it('should handle smart_crawl for plain text', async () => {
1408 |       const axiosClientMock = {
1409 |         head: jest.fn().mockResolvedValue({
1410 |           headers: { 'content-type': 'text/plain' },
1411 |         }),
1412 |         post: jest.fn().mockResolvedValue({
1413 |           data: {
1414 |             results: [
1415 |               {
1416 |                 url: 'https://example.com/file.txt',
1417 |                 markdown: { raw_markdown: 'This is plain text content' },
1418 |                 success: true,
1419 |                 status_code: 200,
1420 |               },
1421 |             ],
1422 |           },
1423 |         }),
1424 |       };
1425 |       server.axiosClientForTesting = axiosClientMock;
1426 | 
1427 |       const result: ToolResult = await server.smartCrawl({
1428 |         url: 'https://example.com/file.txt',
1429 |       });
1430 | 
1431 |       expect(result.content[0].text).toContain('Smart crawl detected content type: text');
1432 |       expect(result.content[0].text).toContain('This is plain text content');
1433 |       expect(axiosClientMock.post).toHaveBeenCalledWith(
1434 |         '/crawl',
1435 |         expect.objectContaining({
1436 |           urls: ['https://example.com/file.txt'],
1437 |           crawler_config: expect.objectContaining({
1438 |             cache_mode: 'ENABLED',
1439 |           }),
1440 |           browser_config: expect.objectContaining({
1441 |             headless: true,
1442 |             browser_type: 'chromium',
1443 |           }),
1444 |         }),
1445 |       );
1446 |     });
1447 |   });
1448 | 
1449 |   describe('Additional Method Tests', () => {
1450 |     it('should handle parse_sitemap', async () => {
1451 |       // Mock axios.get to return sitemap XML
1452 |       mockGet.mockResolvedValue({
1453 |         data: `<?xml version="1.0" encoding="UTF-8"?>
1454 | <urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
1455 | <url><loc>https://example.com/page1</loc></url>
1456 | <url><loc>https://example.com/page2</loc></url>
1457 | <url><loc>https://example.com/page3</loc></url>
1458 | </urlset>`,
1459 |       });
1460 | 
1461 |       const result: ToolResult = await server.parseSitemap({
1462 |         url: 'https://example.com/sitemap.xml',
1463 |       });
1464 | 
1465 |       expect(result.content[0].text).toContain('Sitemap parsed successfully');
1466 |       expect(result.content[0].text).toContain('Total URLs found: 3');
1467 |     });
1468 | 
1469 |     it('should handle parse_sitemap with filter', async () => {
1470 |       // Mock axios.get to return sitemap XML with blog URLs
1471 |       mockGet.mockResolvedValue({
1472 |         data: `<?xml version="1.0" encoding="UTF-8"?>
1473 | <urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
1474 | <url><loc>https://example.com/page1</loc></url>
1475 | <url><loc>https://example.com/blog/post1</loc></url>
1476 | <url><loc>https://example.com/blog/post2</loc></url>
1477 | <url><loc>https://example.com/page2</loc></url>
1478 | </urlset>`,
1479 |       });
1480 | 
1481 |       const result: ToolResult = await server.parseSitemap({
1482 |         url: 'https://example.com/sitemap.xml',
1483 |         filter_pattern: '.*blog.*',
1484 |       });
1485 | 
1486 |       expect(result.content[0].text).toContain('Total URLs found: 4');
1487 |       expect(result.content[0].text).toContain('Filtered URLs: 2');
1488 |     });
1489 | 
1490 |     it('should handle crawl_recursive', async () => {
1491 |       mockCrawl.mockResolvedValue({
1492 |         success: true,
1493 |         results: [
1494 |           {
1495 |             url: 'https://example.com',
1496 |             markdown: { raw_markdown: 'Content' },
1497 |             links: { internal: [], external: [] },
1498 |             success: true,
1499 |             status_code: 200,
1500 |           },
1501 |         ],
1502 |       });
1503 | 
1504 |       const result: ToolResult = await server.crawlRecursive({
1505 |         url: 'https://example.com',
1506 |       });
1507 | 
1508 |       expect(result.content[0].text).toContain('Recursive crawl completed');
1509 |     });
1510 | 
1511 |     it('should handle parse_sitemap error', async () => {
1512 |       mockParseSitemap.mockRejectedValue(new Error('Network error'));
1513 | 
1514 |       await expect(
1515 |         server.parseSitemap({
1516 |           url: 'https://example.com/sitemap.xml',
1517 |         }),
1518 |       ).rejects.toThrow('Failed to parse sitemap');
1519 |     });
1520 | 
1521 |     it('should handle crawl with error result', async () => {
1522 |       mockCrawl.mockResolvedValue({
1523 |         success: false,
1524 |         results: [],
1525 |       });
1526 | 
1527 |       await expect(
1528 |         server.crawl({
1529 |           url: 'https://example.com',
1530 |         }),
1531 |       ).rejects.toThrow('Invalid response from server');
1532 |     });
1533 | 
1534 |     it('should handle crawl with metadata and links', async () => {
1535 |       mockCrawl.mockResolvedValue({
1536 |         success: true,
1537 |         results: [
1538 |           {
1539 |             url: 'https://example.com',
1540 |             markdown: { raw_markdown: 'Content' },
1541 |             metadata: { title: 'Test Page', description: 'Test' },
1542 |             links: { internal: ['/page1'], external: ['https://external.com'] },
1543 |             js_execution_result: { results: [42, 'test'] },
1544 |             success: true,
1545 |             status_code: 200,
1546 |           },
1547 |         ],
1548 |       });
1549 | 
1550 |       const result: ToolResult = await server.crawl({
1551 |         url: 'https://example.com',
1552 |       });
1553 | 
1554 |       expect(result.content.length).toBeGreaterThan(1);
1555 |       expect(result.content.some((c) => c.text?.includes('Metadata'))).toBe(true);
1556 |       expect(result.content.some((c) => c.text?.includes('Links'))).toBe(true);
1557 |       expect(result.content.some((c) => c.text?.includes('JavaScript Execution Results'))).toBe(true);
1558 |     });
1559 | 
1560 |     it('should handle executeJS with no scripts', async () => {
1561 |       await expect(
1562 |         server.executeJS({
1563 |           url: 'https://example.com',
1564 |           scripts: null,
1565 |         }),
1566 |       ).rejects.toThrow('scripts is required');
1567 |     });
1568 | 
1569 |     it('should handle executeJS with array of scripts', async () => {
1570 |       mockExecuteJS.mockResolvedValue({
1571 |         content: [{ type: 'text', text: 'JS executed' }],
1572 |       });
1573 | 
1574 |       const result: ToolResult = await server.executeJS({
1575 |         url: 'https://example.com',
1576 |         scripts: ['return 1', 'return 2'],
1577 |       });
1578 | 
1579 |       expect(result.content[0].text).toContain('JavaScript executed on:');
1580 |     });
1581 | 
1582 |     it('should handle batchCrawl with cache bypass', async () => {
1583 |       mockPost.mockResolvedValue({
1584 |         data: {
1585 |           results: [{ success: true }, { success: false }],
1586 |         },
1587 |       });
1588 | 
1589 |       const result: ToolResult = await server.batchCrawl({
1590 |         urls: ['https://example.com/1', 'https://example.com/2'],
1591 |         bypass_cache: true,
1592 |         remove_images: true,
1593 |       });
1594 | 
1595 |       expect(result.content[0].text).toContain('Batch crawl completed');
1596 |       expect(mockPost).toHaveBeenCalledWith(
1597 |         '/crawl',
1598 |         expect.objectContaining({
1599 |           crawler_config: expect.objectContaining({
1600 |             cache_mode: 'BYPASS',
1601 |             exclude_tags: ['img', 'picture', 'svg'],
1602 |           }),
1603 |         }),
1604 |       );
1605 |     });
1606 | 
1607 |     it('should handle smart_crawl with follow_links', async () => {
1608 |       const axiosClientMock = {
1609 |         head: jest.fn().mockResolvedValue({
1610 |           headers: { 'content-type': 'application/xml' },
1611 |         }),
1612 |         post: jest.fn().mockResolvedValue({
1613 |           data: {
1614 |             results: [
1615 |               {
1616 |                 url: 'https://example.com/sitemap.xml',
1617 |                 markdown: { raw_markdown: '<url><loc>https://example.com/page1</loc></url>' },
1618 |                 success: true,
1619 |                 status_code: 200,
1620 |               },
1621 |             ],
1622 |           },
1623 |         }),
1624 |       };
1625 |       server.axiosClientForTesting = axiosClientMock;
1626 | 
1627 |       const result: ToolResult = await server.smartCrawl({
1628 |         url: 'https://example.com/sitemap.xml',
1629 |         follow_links: true,
1630 |       });
1631 | 
1632 |       expect(result.content[0].text).toContain('Smart crawl detected content type: sitemap');
1633 |     });
1634 | 
1635 |     it('should handle smart_crawl with HEAD request failure', async () => {
1636 |       const axiosClientMock = {
1637 |         head: jest.fn().mockRejectedValue({ response: { status: 500 } }),
1638 |         post: jest.fn().mockResolvedValue({
1639 |           data: {
1640 |             results: [
1641 |               {
1642 |                 url: 'https://example.com',
1643 |                 markdown: { raw_markdown: 'Content from crawl' },
1644 |                 success: true,
1645 |                 status_code: 200,
1646 |               },
1647 |             ],
1648 |           },
1649 |         }),
1650 |       };
1651 |       server.axiosClientForTesting = axiosClientMock;
1652 | 
1653 |       const result: ToolResult = await server.smartCrawl({
1654 |         url: 'https://example.com',
1655 |       });
1656 | 
1657 |       // Should continue despite HEAD failure
1658 |       expect(result.content[0].text).toContain('Smart crawl detected content type: html');
1659 |       expect(result.content[0].text).toContain('Content from crawl');
1660 |     });
1661 | 
1662 |     it('should handle extractLinks with no links', async () => {
1663 |       mockPost.mockResolvedValue({
1664 |         data: {
1665 |           results: [
1666 |             {
1667 |               markdown: 'Content without links',
1668 |             },
1669 |           ],
1670 |         },
1671 |       });
1672 | 
1673 |       const result: ToolResult = await server.extractLinks({
1674 |         url: 'https://example.com',
1675 |         categorize: false,
1676 |       });
1677 | 
1678 |       expect(result.content[0].text).toContain('All links from');
1679 |     });
1680 | 
1681 |     it('should handle extractLinks with manually extracted links', async () => {
1682 |       mockPost.mockResolvedValue({
1683 |         data: {
1684 |           results: [
1685 |             {
1686 |               markdown: 'Check out <a href="/page1">Page 1</a>',
1687 |             },
1688 |           ],
1689 |         },
1690 |       });
1691 | 
1692 |       const result: ToolResult = await server.extractLinks({
1693 |         url: 'https://example.com',
1694 |       });
1695 | 
1696 |       expect(result.content[0].text).toContain('All links from');
1697 |     });
1698 | 
1699 |     it('should handle MCP request handler for all tools', async () => {
1700 |       // Request handler should be available from beforeEach
1701 |       expect(requestHandler).toBeDefined();
1702 | 
1703 |       // Test various tools through the request handler
1704 |       const tools = [
1705 |         { name: 'get_markdown', args: { url: 'https://example.com' } },
1706 |         { name: 'capture_screenshot', args: { url: 'https://example.com' } },
1707 |         { name: 'generate_pdf', args: { url: 'https://example.com' } },
1708 |         { name: 'execute_js', args: { url: 'https://example.com', scripts: 'return 1' } },
1709 |         { name: 'batch_crawl', args: { urls: ['https://example.com'] } },
1710 |         { name: 'smart_crawl', args: { url: 'https://example.com' } },
1711 |         { name: 'get_html', args: { url: 'https://example.com' } },
1712 |         { name: 'extract_links', args: { url: 'https://example.com' } },
1713 |         { name: 'crawl_recursive', args: { url: 'https://example.com' } },
1714 |         { name: 'parse_sitemap', args: { url: 'https://example.com/sitemap.xml' } },
1715 |         { name: 'crawl', args: { url: 'https://example.com' } },
1716 |         { name: 'manage_session', args: { action: 'create' } },
1717 |         { name: 'manage_session', args: { action: 'clear', session_id: 'test' } },
1718 |         { name: 'manage_session', args: { action: 'list' } },
1719 |         { name: 'extract_with_llm', args: { url: 'https://example.com', prompt: 'test' } },
1720 |       ];
1721 | 
1722 |       // Mock all service methods to return success
1723 |       mockGetMarkdown.mockResolvedValue({ content: [{ type: 'text', text: 'markdown' }] });
1724 |       mockCaptureScreenshot.mockResolvedValue({ content: [{ type: 'text', text: 'screenshot' }] });
1725 |       mockGeneratePDF.mockResolvedValue({ content: [{ type: 'text', text: 'pdf' }] });
1726 |       mockExecuteJS.mockResolvedValue({ content: [{ type: 'text', text: 'js' }] });
1727 |       mockBatchCrawl.mockResolvedValue({ content: [{ type: 'text', text: 'batch' }] });
1728 |       mockGetHTML.mockResolvedValue({ content: [{ type: 'text', text: 'html' }] });
1729 |       mockExtractWithLLM.mockResolvedValue({ content: [{ type: 'text', text: 'llm' }] });
1730 |       mockCrawl.mockResolvedValue({
1731 |         success: true,
1732 |         results: [
1733 |           {
1734 |             url: 'https://example.com',
1735 |             markdown: { raw_markdown: 'content' },
1736 |             success: true,
1737 |             status_code: 200,
1738 |           },
1739 |         ],
1740 |       });
1741 |       mockPost.mockResolvedValue({
1742 |         data: {
1743 |           results: [
1744 |             {
1745 |               links: { internal: [], external: [] },
1746 |             },
1747 |           ],
1748 |         },
1749 |       });
1750 | 
1751 |       mockParseSitemap.mockResolvedValue(['https://example.com/page1']);
1752 | 
1753 |       // Test each tool
1754 |       for (const tool of tools) {
1755 |         const result = await requestHandler({
1756 |           method: 'tools/call',
1757 |           params: {
1758 |             name: tool.name,
1759 |             arguments: tool.args,
1760 |           },
1761 |         });
1762 |         expect(result).toBeDefined();
1763 |         expect(result.content).toBeDefined();
1764 |       }
1765 | 
1766 |       // Test unknown tool
1767 |       const unknownResult = await requestHandler({
1768 |         method: 'tools/call',
1769 |         params: {
1770 |           name: 'unknown_tool',
1771 |           arguments: {},
1772 |         },
1773 |       });
1774 |       expect(unknownResult.content[0].text).toContain('Error: Unknown tool');
1775 | 
1776 |       // The handler only handles tools/call requests,
1777 |       // so we don't test other methods here
1778 |     });
1779 | 
1780 |     it('should handle MCP request handler validation errors', async () => {
1781 |       expect(requestHandler).toBeDefined();
1782 | 
1783 |       // Test validation errors for various tools
1784 |       const invalidRequests = [
1785 |         { name: 'get_markdown', args: {} }, // missing url
1786 |         { name: 'capture_screenshot', args: {} }, // missing url
1787 |         { name: 'generate_pdf', args: {} }, // missing url
1788 |         { name: 'execute_js', args: { url: 'https://example.com' } }, // missing scripts
1789 |         { name: 'batch_crawl', args: {} }, // missing urls
1790 |         { name: 'smart_crawl', args: {} }, // missing url
1791 |         { name: 'get_html', args: {} }, // missing url
1792 |         { name: 'extract_links', args: {} }, // missing url
1793 |         { name: 'crawl_recursive', args: {} }, // missing url
1794 |         { name: 'parse_sitemap', args: {} }, // missing url
1795 |         { name: 'crawl', args: {} }, // missing url
1796 |         { name: 'manage_session', args: {} }, // missing action
1797 |         { name: 'manage_session', args: { action: 'clear' } }, // missing session_id for clear
1798 |         { name: 'manage_session', args: { action: 'invalid' } }, // invalid action
1799 |         { name: 'extract_with_llm', args: { url: 'https://example.com' } }, // missing prompt
1800 |       ];
1801 | 
1802 |       for (const req of invalidRequests) {
1803 |         const result = await requestHandler({
1804 |           method: 'tools/call',
1805 |           params: {
1806 |             name: req.name,
1807 |             arguments: req.args,
1808 |           },
1809 |         });
1810 |         expect(result.content[0].text).toContain(`Error: Invalid parameters for ${req.name}`);
1811 |       }
1812 |     });
1813 | 
1814 |     it('should handle crawl with all output types', async () => {
1815 |       mockCrawl.mockResolvedValue({
1816 |         success: true,
1817 |         results: [
1818 |           {
1819 |             url: 'https://example.com',
1820 |             extracted_content: { data: 'extracted' },
1821 |             screenshot: 'base64screenshot',
1822 |             pdf: 'base64pdf',
1823 |             success: true,
1824 |             status_code: 200,
1825 |           },
1826 |         ],
1827 |       });
1828 | 
1829 |       const result: ToolResult = await server.crawl({
1830 |         url: 'https://example.com',
1831 |         screenshot: true,
1832 |         pdf: true,
1833 |       });
1834 | 
1835 |       expect(result.content.some((c) => c.type === 'text')).toBe(true);
1836 |       expect(result.content.some((c) => c.type === 'image')).toBe(true);
1837 |       expect(result.content.some((c) => c.type === 'resource' && c.resource?.mimeType === 'application/pdf')).toBe(
1838 |         true,
1839 |       );
1840 |     });
1841 |   });
1842 | 
1843 |   describe('MCP Protocol Handler Tests', () => {
1844 |     it('should handle tools/list request', async () => {
1845 |       // Find the tools/list handler
1846 |       const toolsListHandler = mockSetRequestHandler.mock.calls.find(
1847 |         (call) => (call[0] as any).method === 'tools/list',
1848 |       )?.[1];
1849 | 
1850 |       expect(toolsListHandler).toBeDefined();
1851 | 
1852 |       const result = await (toolsListHandler as any)({ method: 'tools/list', params: {} }); // eslint-disable-line @typescript-eslint/no-explicit-any
1853 |       expect(result).toBeDefined();
1854 |       expect(result.tools).toBeDefined();
1855 |       expect(result.tools.length).toBe(13); // Should have 13 tools
1856 |     });
1857 | 
1858 |     it('should handle get_markdown query functionality', async () => {
1859 |       mockGetMarkdown.mockResolvedValue({
1860 |         url: 'https://example.com',
1861 |         filter: 'fit',
1862 |         query: 'What products are listed?',
1863 |         cache: 'false',
1864 |         markdown: 'Page content about products',
1865 |         success: true,
1866 |       });
1867 | 
1868 |       const result: ToolResult = await server.getMarkdown({
1869 |         url: 'https://example.com',
1870 |         query: 'What products are listed?',
1871 |       });
1872 | 
1873 |       expect(result.content[0].text).toContain('Query: What products are listed?');
1874 |       expect(result.content[0].text).toContain('Page content about products');
1875 |     });
1876 |   });
1877 | });
1878 | ```
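The ZodError tests above assert cross-field rules ('Query parameter is required when using bm25 or llm filter', 'js_only requires session_id', 'js_code array cannot be empty'), which go beyond plain shape checks. A minimal sketch of how such rules can be expressed with zod's `refine` — the field names mirror the tests, but the real definitions live in `src/schemas/validation-schemas.ts` and may differ:

```typescript
import { z } from 'zod';

// Hypothetical reconstruction of the cross-field rules the tests pin down.
export const GetMarkdownSchemaSketch = z
  .object({
    url: z.string().url(),
    filter: z.enum(['raw', 'fit', 'bm25', 'llm']).optional(),
    query: z.string().optional(),
  })
  // Content-dependent filters need a query to rank against.
  .refine((v) => !(v.filter === 'bm25' || v.filter === 'llm') || v.query !== undefined, {
    message: 'Query parameter is required when using bm25 or llm filter',
  });

export const CrawlSchemaSketch = z
  .object({
    url: z.string().url(),
    session_id: z.string().optional(),
    js_only: z.boolean().optional(),
    // A string, or a non-empty array of scripts.
    js_code: z.union([z.string(), z.array(z.string()).min(1, 'js_code array cannot be empty')]).optional(),
  })
  // js_only re-uses an existing browser page, so a session must be named.
  .refine((v) => !v.js_only || v.session_id !== undefined, {
    message: 'js_only requires session_id',
  });
```

With these shapes, `CrawlSchemaSketch.parse({ url: 'https://example.com', js_only: true })` throws a ZodError whose message contains 'js_only requires session_id', which is what `.toThrow('js_only requires session_id')` matches on.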
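The smart_crawl tests pin down a detection order: issue a HEAD request, map the `content-type` header to sitemap/rss/json/text, and fall back to html when the header is missing or the HEAD request fails. A rough sketch of that flow, under the assumption that XML responses are disambiguated by the rss marker in the type or URL; the actual logic lives in `src/handlers/crawl-handlers.ts` and may weigh URL suffixes differently:

```typescript
import axios from 'axios';

type DetectedType = 'sitemap' | 'rss' | 'json' | 'text' | 'html';

// Hypothetical reconstruction of the detection order the tests exercise.
async function detectContentType(url: string): Promise<DetectedType> {
  let contentType = '';
  try {
    const head = await axios.head(url);
    contentType = String(head.headers['content-type'] ?? '');
  } catch {
    // A failed HEAD falls through to the html default, matching the
    // 'should handle smart_crawl with HEAD request failure' test.
  }
  if (contentType.includes('xml')) {
    // 'application/rss+xml' (or an .rss URL) reads as a feed; other XML as a sitemap.
    return contentType.includes('rss') || url.endsWith('.rss') ? 'rss' : 'sitemap';
  }
  if (contentType.includes('json')) return 'json';
  if (contentType.includes('text/plain')) return 'text';
  return 'html';
}
```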
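The extract_links tests likewise imply a re-categorization pass over the server's internal/external buckets: known social hosts win first, then file extensions (documents, images, scripts — note that `.css` is counted under scripts), and only then does the internal/external split apply. A sketch of one categorizer consistent with those assertions; the social-host list beyond facebook.com and twitter.com is an assumption:

```typescript
type LinkCategory = 'social' | 'images' | 'scripts' | 'documents' | 'internal' | 'external';

// Assumed host list; only facebook.com and twitter.com appear in the tests.
const SOCIAL_HOSTS = ['facebook.com', 'twitter.com'];

function categorizeLink(href: string): LinkCategory {
  const isAbsolute = /^https?:\/\//.test(href);
  if (isAbsolute && SOCIAL_HOSTS.some((h) => new URL(href).hostname.endsWith(h))) return 'social';
  if (/\.(jpe?g|png|gif|svg|webp)$/i.test(href)) return 'images';
  if (/\.(js|css)$/i.test(href)) return 'scripts'; // the CSS case from the tests
  if (/\.(pdf|docx?)$/i.test(href)) return 'documents';
  return isAbsolute ? 'external' : 'internal';
}
```

This reproduces the 'should correctly categorize internal documents and images' expectations: `/page1` stays internal, both PDFs land in documents, the PNG in images, and the stylesheet in scripts, leaving external at zero.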
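Finally, several tests assert the exact payload posted to the Crawl4AI `/crawl` endpoint (`urls` plus a `crawler_config` that carries `cache_mode: 'BYPASS'` and `exclude_tags: ['img', 'picture', 'svg']`). A sketch of a client call that would satisfy those assertions — the base-URL environment variable is a placeholder, and the response shape is assumed from the mocks:

```typescript
import axios from 'axios';

// Hypothetical standalone version of the batch_crawl request the mocks assert against.
async function batchCrawlSketch(urls: string[], opts: { removeImages?: boolean; bypassCache?: boolean } = {}) {
  const crawler_config: Record<string, unknown> = {};
  if (opts.bypassCache) crawler_config.cache_mode = 'BYPASS';
  if (opts.removeImages) crawler_config.exclude_tags = ['img', 'picture', 'svg'];

  // CRAWL4AI_BASE_URL is a placeholder; the server wires a preconfigured axios client instead.
  const { data } = await axios.post(`${process.env.CRAWL4AI_BASE_URL}/crawl`, { urls, crawler_config });
  return data.results as Array<{ success: boolean }>;
}
```

Because the tests call `expect(mockPost).toHaveBeenCalledWith('/crawl', {...})` with an exact object, any extra keys in `crawler_config` would fail them, which is why the sketch only sets the two options conditionally.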